# IESB - Graduacao - CIA028 - Costa Rica

In [None]:
#Importando Bibliotecas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import scikitplot as skplt
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

In [None]:
# Load the competition's train and test CSV files (train first, then test)
df, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/costa-rican-household-poverty-prediction/train.csv',
        '/kaggle/input/costa-rican-household-poverty-prediction/test.csv',
    )
)

# Display (rows, columns) of both frames
df.shape, test.shape

In [None]:
# Stack the train and test frames so the cleaning steps below are applied
# once to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Original row labels are kept, and rows coming from `test` get
# NaN in any column that only `df` has (the later split on a null `Target`
# relies on this).
df_all = pd.concat([df, test])

df_all.shape

In [None]:
# Check the combined frame's size and column dtypes
df_all.info()

In [None]:
# Preview the columns whose dtype is object
df_all.select_dtypes('object').head()

In [None]:
# Look at the distinct values of the dependency column and how often each occurs
df_all['dependency'].value_counts()

In [None]:
# Inspect the distinct values of the edjefa column and their frequencies
df_all['edjefa'].value_counts()

In [None]:
# Inspect the distinct values of the edjefe column and their frequencies
df_all['edjefe'].value_counts()

In [None]:
# Encode 'yes' as 1 and 'no' as 0 in the edjefa and edjefe columns,
# then cast both columns to integers
mapeamento = {'yes': 1, 'no': 0}

for head_col in ('edjefa', 'edjefe'):
    df_all[head_col] = df_all[head_col].replace(mapeamento).astype(int)

In [None]:
# Re-check which columns are still of dtype object
df_all.select_dtypes('object').head()

In [None]:
# Look at the values of the dependency column again
df_all['dependency'].value_counts()

In [None]:
# Encode 'yes' as 1 and 'no' as 0 in the dependency column
# (reusing the mapeamento dict), then cast the column to float
df_all['dependency'] = df_all['dependency'].replace(mapeamento).astype(float)

In [None]:
# Re-check which columns are still of dtype object
df_all.select_dtypes('object').head()

In [None]:
# Show the info() summary again after the conversions
df_all.info()

In [None]:
# Count the null values in each column
df_all.isnull().sum()

In [None]:
 # Count the missing rent values (v2a1) among heads of household (parentesco1 == 1)
df_all[df_all['parentesco1'] == 1]['v2a1'].isnull().sum()

In [None]:
# See what the values of v18q look like
df_all['v18q'].value_counts()

In [None]:
# Fill the null values of v2a1 with -1.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form works on an intermediate object and does
# not update df_all when pandas Copy-on-Write is active.
df_all['v2a1'] = df_all['v2a1'].fillna(-1)

In [None]:
# Fill the null values of v18q1 with 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form works on an intermediate object and does
# not update df_all when pandas Copy-on-Write is active.
df_all['v18q1'] = df_all['v18q1'].fillna(0)

In [None]:
# Count the null values per column, sorted in ascending order
df_all.isnull().sum().sort_values()

In [None]:
# Fill the null values of SQBmeaned, meaneduc and rez_esc with -1.
# Each filled column is assigned back instead of calling
# fillna(inplace=True) on the selected column: the in-place form works on an
# intermediate object and does not update df_all under pandas Copy-on-Write.
for null_col in ('SQBmeaned', 'meaneduc', 'rez_esc'):
    df_all[null_col] = df_all[null_col].fillna(-1)

In [None]:
# Check how many rows each Target class has
df_all.Target.value_counts()

In [None]:
# Plot the number of rows per Target class
import seaborn as sns
ax = sns.countplot(x='Target',  data=df_all)

In [None]:
# Feature columns used for training: every column except the row id,
# the idhogar column and the label
excluded_cols = {'Id', 'idhogar', 'Target'}
feats = [col for col in df_all.columns if col not in excluded_cols]

In [None]:
# Split the combined frame back apart: rows with a Target value form the
# training set, rows with a null Target form the test set.
# .copy() makes both frames independent of df_all, so adding or overwriting
# columns on them later (e.g. test['Target'] = ...) is a well-defined
# operation and does not raise SettingWithCopyWarning.
has_target = df_all['Target'].notnull()
train, test = df_all[has_target].copy(), df_all[~has_target].copy()

train.shape, test.shape

In [None]:
# Random Forest: build the classifier, fit it on the training rows and
# write the test-set predictions to submission.csv
rf = RandomForestClassifier(
    n_estimators=701,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    min_impurity_decrease=0.0007,
    class_weight='balanced',
    n_jobs=4,
    random_state=42,
    verbose=0,
)

rf.fit(train[feats], train['Target'])

# Store the predicted class (as int) in the test frame's Target column
test_predictions = rf.predict(test[feats])
test['Target'] = test_predictions.astype(int)

# Only the Id and Target columns go into the submission file
submission = test[['Id', 'Target']]
submission.to_csv('submission.csv', index=False)

# **Score Encontrado = 0.44049**

# Agora vamos balancear as classes!

In [None]:
# Split the data into one frame per Target class (1, 2, 3 and 4)
df_a, df_b, df_c, df_d = (
    df_all[df_all['Target'] == label] for label in (1, 2, 3, 4)
)

In [None]:
# Over-sampling: draw rows with replacement from classes 1-3 until each
# has as many rows as df_d
oversampled_size = len(df_d)
df_a_os, df_b_os, df_c_os = (
    resample(class_df, replace=True, n_samples=oversampled_size, random_state=42)
    for class_df in (df_a, df_b, df_c)
)

# Concatenate the resampled classes with the untouched df_d
df_os = pd.concat([df_a_os, df_b_os, df_c_os, df_d])

In [None]:
# Check the class counts after over-sampling
df_os['Target'].value_counts()

In [None]:
# Plot the number of rows per Target class after over-sampling
import seaborn as sns
ax = sns.countplot(x='Target',  data=df_os)

In [None]:
# Re-running the Random Forest with over-sampling
#
# The hold-out split is taken from the original labelled rows BEFORE any
# over-sampling. Splitting the already over-sampled df_os would place copies
# of the same row on both sides of the split, so the model would be scored on
# rows it had seen during training and the accuracy would be inflated.
labelled = df_all[df_all['Target'].notnull()]

# Stratify so the hold-out keeps the original class proportions
train, test = train_test_split(
    labelled, test_size=0.2, random_state=42, stratify=labelled['Target']
)

# Over-sample the training portion only: every class is drawn (with
# replacement) up to the size of the largest class, which is kept as-is
largest_class_size = train['Target'].value_counts().max()
train = pd.concat([
    class_rows if len(class_rows) == largest_class_size
    else resample(class_rows, replace=True,
                  n_samples=largest_class_size, random_state=42)
    for _, class_rows in train.groupby('Target')
])

# Train the model
rf.fit(train[feats], train['Target'])

# Predictions on the hold-out set
preds_test = rf.predict(test[feats])

# Measure the accuracy
accuracy_score(test['Target'], preds_test)

In [None]:
# Plot the confusion matrix of the true Target values against preds_test
skplt.metrics.plot_confusion_matrix(test['Target'], preds_test)

In [None]:
# Under-sampling: draw rows without replacement from classes 2-4 so each
# is reduced to as many rows as df_a
undersampled_size = len(df_a)
df_b_us, df_c_us, df_d_us = (
    resample(class_df, replace=False, n_samples=undersampled_size, random_state=42)
    for class_df in (df_b, df_c, df_d)
)

# Concatenate the untouched df_a with the reduced classes
df_us = pd.concat([df_a, df_b_us, df_c_us, df_d_us])

In [None]:
# Check the class counts after under-sampling
df_us['Target'].value_counts()

In [None]:
# Plot the number of rows per Target class after under-sampling
import seaborn as sns
ax = sns.countplot(x='Target',  data=df_us)

In [None]:
# Re-running the Random Forest on the under-sampled data (df_us)

# Hold out 20% of df_us for evaluation
train, test = train_test_split(df_us, random_state=42, test_size=0.2)

# Fit the model on the training portion
X_train, y_train = train[feats], train['Target']
rf.fit(X_train, y_train)

# Predict the hold-out portion
preds_test = rf.predict(test[feats])

# Accuracy of the predictions against the true labels
accuracy_score(y_true=test['Target'], y_pred=preds_test)

In [None]:
# Plot the confusion matrix of the true Target values against preds_test
skplt.metrics.plot_confusion_matrix(test['Target'], preds_test)