In [None]:
!pip install scikit-plot
!pip install catboost


In [None]:
## Libraries used in this notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
import seaborn as sns
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier


In [None]:
# Kaggle: read the competition's train and test CSV files.
df_train, df_test = (
    pd.read_csv(caminho)
    for caminho in (
        "../input/costa-rican-household-poverty-prediction/train.csv",
        "../input/costa-rican-household-poverty-prediction/test.csv",
    )
)
# Cell output: the shape of each frame.
df_train.shape, df_test.shape

In [None]:
# ## Colab
# from google.colab import drive
# drive.mount('/content/drive')

# df_train = pd.read_csv("/content/drive/MyDrive/00 - Dados/05_MINER_II/train.csv")

# df_train.shape

In [None]:
# Print pandas' summary of the training frame.
df_train.info()

## Tratando dados faltantes

In [None]:
# Report the raw shape, then list only the columns with at least one missing value,
# ordered from the most to the fewest missing entries.
print("O dataset original é",df_train.shape, '\n')
df_train.isnull().sum().sort_values(ascending=False).loc[lambda nulos: nulos >= 1]

In [None]:
# Names of the columns that contain missing values, most-missing first.
faltantes = (
    df_train.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda nulos: nulos >= 1]
    .index
    .tolist()
)

In [None]:
# Summary restricted to the columns that have missing values.
df_train[faltantes].info()

In [None]:
# Histogram of each column that has missing values.
df_train[faltantes].hist(figsize = (20,7), grid = False)
plt.show()

In [None]:
# Move the first three entries of `faltantes` (the columns with the most missing values,
# given the descending sort used to build it) into `drop`; `faltantes` keeps the rest.
# This mutates `faltantes`, so the cell is meant to run exactly once per kernel session.
drop = []
while len(drop) < 3:
    drop.append(faltantes.pop(0))


In [None]:
## Drop the columns with too many missing values
# Reassign instead of mutating in place and ignore columns that are already gone, so that
# re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns=drop, errors="ignore")
df_train.shape

In [None]:
# Fill the remaining columns that have gaps with their own column mean.
medias = {coluna: df_train[coluna].mean() for coluna in faltantes}
df_train = df_train.fillna(value=medias)

In [None]:
# Columns that still contain missing values (expected to be empty after the imputation).
df_train.isnull().sum().loc[lambda nulos: nulos >= 1]

## Análise exploratória

In [None]:
# Preview the columns stored with object dtype.
df_train.select_dtypes('object').head()

In [None]:
# Replace the textual 'yes'/'no' markers with numeric codes and cast each column to a
# numeric dtype.
mapeamento = {'yes': 0, 'no': -1}

for coluna, tipo in (("dependency", float), ("edjefe", int), ("edjefa", int)):
    df_train[coluna] = df_train[coluna].replace(mapeamento).astype(tipo)


In [None]:
# Candidate predictors: every int64 column except the label. The label is excluded by
# name instead of by position, so the result does not depend on 'Target' being the last
# int64 column of the frame.
variaveis = [
    coluna
    for coluna in df_train.select_dtypes('int64').columns
    if coluna != 'Target'
]

In [None]:
# Add the float64 columns to the predictor list. Columns already present are skipped so
# that re-running this cell does not append duplicates.
colunas_float = df_train.select_dtypes('float64').columns
variaveis = variaveis + [coluna for coluna in colunas_float if coluna not in variaveis]

In [None]:
## Identify the binary variables
# A column is binary when its distinct values are exactly {0, 1}. Comparing the set of
# values (instead of checking that the unique values sum to 1) avoids false positives
# such as a constant column of ones or a column whose values are {-1, 0, 2}.
binarias = [
    coluna
    for coluna in variaveis
    if set(df_train[coluna].unique()) == {0, 1}
]

In [None]:
## Every predictor that was not classified as binary, in its original order
discretas = list(filter(lambda coluna: coluna not in binarias, variaveis))


In [None]:
# Shapes of the non-binary and the binary feature groups.
df_train[discretas].shape, df_train[binarias].shape

### Variáveis Discretas

In [None]:
## Histograms of every non-binary variable
ax = df_train[discretas].hist(
    bins=25,
    grid=False,
    figsize=(20, 20),
    zorder=2,
    rwidth=0.9,
)
plt.suptitle("Histogramas das variáveis discretas", fontsize=30)

In [None]:
# Missing-value count per non-binary column, in ascending order.
df_train[discretas].isnull().sum().sort_values()

In [None]:
# Annotated correlation heatmap of the non-binary variables.
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    df_train[discretas].corr(),
    ax=ax,
    annot=True,
    cmap="YlGnBu",
)

### Variáveis Binárias

In [None]:
# Count how often each value occurs in every binary column. The counts are kept as Series
# sorted by value, so the rows of `df_binarios` are indexed by the actual value (0 / 1).
# Taking the raw `.values` of `value_counts()` would order each column by frequency, and the
# row labels 0 / 1 would then no longer correspond to the values they appear to name.
valores = {coluna: df_train[coluna].value_counts().sort_index() for coluna in binarias}
df_binarios = pd.DataFrame(valores)

In [None]:
## Pie chart for each binary variable
# Size the grid from the number of columns instead of hard-coding 21 rows: a fixed grid
# raises IndexError when there are more columns than axes and leaves empty frames when
# there are fewer. The figure height keeps the original 90 / 21 inches per row.
colunas_grade = 5
total_graficos = len(df_binarios.columns)
linhas_grade = max(1, -(-total_graficos // colunas_grade))  # ceiling division
fig, ax = plt.subplots(
    linhas_grade, colunas_grade, figsize=(20, 90 / 21 * linhas_grade)
)
ax = np.ravel(ax)

for i, cat in enumerate(df_binarios.columns):
    df_binarios[cat].plot.pie(ax=ax[i])
    ax[i].set_title(cat, fontweight='bold', fontsize=15)
    ax[i].set_ylabel('')

# Hide the axes left unused in the last row.
for eixo_vazio in ax[total_graficos:]:
    eixo_vazio.set_visible(False)
 

In [None]:
# Missing-value count per binary column, in ascending order.
df_train[binarias].isnull().sum().sort_values()

## Tratando classes desbalanceadas

In [None]:
## Original class proportions (before any resampling)
Target = df_train['Target']
Target.value_counts(normalize = True)

In [None]:


# Resample the features and the label with imblearn's RandomOverSampler (seeded for
# reproducibility) to counter the class imbalance shown above.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(
    X=df_train[discretas + binarias],
    y=df_train["Target"],
)


In [None]:
# Rebuild a single frame holding the resampled features plus the resampled label.
df_balance = pd.DataFrame(X_ros, columns=discretas + binarias)
df_balance = df_balance.assign(Target=y_ros)

In [None]:
# Class counts after resampling.
df_balance["Target"].value_counts()

## Treinando Modelos de árvores

In [None]:
## Data split
# Split the ORIGINAL rows first and oversample only the training part. Splitting a frame
# that was already oversampled places exact copies of the same row on both sides of the
# split, which leaks training data into the evaluation set and inflates every score
# computed on `test`.
colunas_modelo = discretas + binarias
train_bruto, test = train_test_split(
    df_train[colunas_modelo + ['Target']],
    random_state=42,
    stratify=df_train['Target'],  # keep the class proportions in both parts
)
# Independent copy: later cells add a 'Pred' column to `test`.
test = test.copy()

# Balance the classes of the training part only.
X_treino, y_treino = RandomOverSampler(random_state=42).fit_resample(
    train_bruto[colunas_modelo], train_bruto['Target']
)
train = pd.DataFrame(X_treino, columns=colunas_modelo)
train['Target'] = y_treino

train.shape, test.shape

### Random Forest

In [None]:
# Build a 200-tree random forest (all cores, out-of-bag scoring enabled) and fit it on
# the training split; the fitted estimator is the cell output.
rf = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(train[binarias + discretas], train['Target'])
rf

In [None]:
# Predict Target for the hold-out rows with the fitted forest, keep the predictions in
# test['Pred'] and report the accuracy.
test['Pred'] = preds = rf.predict(test[binarias + discretas]).astype(int)

accuracy_score(y_true=test['Target'], y_pred=preds)

In [None]:
## 5-fold cross-validation of the forest on the training split; shows the mean fold score
scores = cross_val_score(
    estimator=rf,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
# Percentage of hold-out rows predicted correctly (True) and incorrectly (False).
test['Target'].eq(test['Pred']).value_counts(normalize=True).mul(100)

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (random forest predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

In [None]:
# Class proportions among the model's predictions
test['Pred'].value_counts(normalize = True)

In [None]:
# Class proportions among the true labels
test['Target'].value_counts(normalize = True)

### Decision tree

In [None]:
# Target is a class label, so fit a classification tree. A regression tree predicts
# averaged numeric values, and its default cross-validation score is R^2 rather than
# accuracy, which makes it incomparable with the other classifiers in this notebook.
dt = DecisionTreeClassifier(random_state=42)

dt.fit(train[binarias + discretas], train['Target'])

In [None]:
# Predict the hold-out rows with the decision tree, cast the output to int, store it in
# test['Pred'] and report the accuracy.
test['Pred'] = preds = dt.predict(test[binarias + discretas]).astype(int)

accuracy_score(y_true=test['Target'], y_pred=preds)

In [None]:
## 5-fold cross-validation of the decision tree on the training split (mean fold score)
scores = cross_val_score(
    estimator=dt,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (decision tree predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

### Gradient Boosting

In [None]:
## Fit a gradient boosting classifier: 200 depth-1 trees with learning rate 1.0
gbm = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=1.0,
    max_depth=1,
    random_state=42,
).fit(train[binarias + discretas], train['Target'])
gbm

In [None]:
# Predict the hold-out rows with gradient boosting, store the result in test['Pred'] and
# report the accuracy.
test['Pred'] = preds = gbm.predict(test[binarias + discretas])

accuracy_score(y_true=test['Target'], y_pred=preds)

In [None]:
## 5-fold cross-validation of gradient boosting on the training split (mean fold score)
scores = cross_val_score(
    estimator=gbm,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (gradient boosting predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

### Xgboost

In [None]:
# Fit an XGBoost classifier with 200 trees and learning rate 0.09.
# NOTE(review): recent xgboost releases are reported to require class labels that start
# at 0 - confirm that the values in Target are accepted by the installed version.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.09,
    random_state=42,
).fit(train[binarias + discretas], train['Target'])
xgb

In [None]:
# Predict the hold-out rows with XGBoost, store the result in test['Pred'] and report
# the accuracy.
test['Pred'] = preds = xgb.predict(test[binarias + discretas])

accuracy_score(y_true=test['Target'], y_pred=preds)

In [None]:
## 5-fold cross-validation of XGBoost on the training split (mean fold score)
scores = cross_val_score(
    estimator=xgb,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (XGBoost predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

### AdaBoost

In [None]:
# Fit an AdaBoost classifier with 200 estimators and learning rate 1.0.
abc = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=1.0,
    random_state=42,
).fit(train[binarias + discretas], train['Target'])
abc

In [None]:
# Predict the hold-out rows with AdaBoost, store the result in test['Pred'] and report
# the accuracy.
test['Pred'] = preds = abc.predict(test[binarias + discretas])

accuracy_score(y_true=test['Target'], y_pred=preds)

In [None]:
## 5-fold cross-validation of AdaBoost on the training split (mean fold score)
scores = cross_val_score(
    estimator=abc,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (AdaBoost predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

### CatBoost

In [None]:
# Fit a CatBoost classifier with default hyper-parameters. verbose=0 silences the
# per-iteration training log, which would otherwise flood the notebook output (here and
# again for every fold of the cross-validation below).
cbc = CatBoostClassifier(random_state=42, verbose=0)

cbc.fit(train[binarias + discretas], train['Target'])

In [None]:
# Predict the hold-out rows with CatBoost. For multiclass models CatBoost returns the
# labels as an (n, 1) column array, so flatten it to 1-D before using it as a DataFrame
# column and as input to the metrics.
preds = np.ravel(cbc.predict(test[binarias + discretas]))
test['Pred'] = preds

accuracy_score(test['Target'], preds)

In [None]:
## 5-fold cross-validation of CatBoost on the training split (mean fold score)
scores = cross_val_score(
    estimator=cbc,
    X=train[binarias + discretas],
    y=train['Target'],
    cv=5,
    n_jobs=-1,
)

scores.mean()

In [None]:
## Confusion matrix: rows are the true Target, columns are the predicted class
pd.crosstab(test['Target'],test['Pred'])

In [None]:
# Confusion matrix plot - validation data (CatBoost predictions)

skplt.metrics.plot_confusion_matrix(test['Target'], preds, figsize=(15,7))

## Aplicando melhor modelo nos Dados de teste → Random Forest

In [None]:
# Apply to the test set the same 'yes'/'no' encoding and dtype casts used for training.
mapeamento = {'yes': 0, 'no': -1}

for coluna, tipo in (("dependency", float), ("edjefe", int), ("edjefa", int)):
    df_test[coluna] = df_test[coluna].replace(mapeamento).astype(tipo)

In [None]:
## Drop from the test set the same high-missing columns removed from the training set
# Reassign instead of mutating in place and ignore columns that are already gone, so that
# re-running this cell does not raise a KeyError.
df_test = df_test.drop(columns=drop, errors="ignore")
df_test.shape

In [None]:
# Impute the test set with the TRAINING means, so the model sees the same fill values it
# was trained with and no statistic is estimated from the test data. The training columns
# were already filled with their own mean, which leaves that mean unchanged.
for coluna in faltantes:
    df_test[coluna] = df_test[coluna].fillna(df_train[coluna].mean())

In [None]:
# Score the competition test set with the fitted random forest and store the predicted
# class in a new 'Target' column.
df_test['Target'] = preds = rf.predict(df_test[binarias + discretas]).astype(int)



In [None]:
# Write the submission file containing only the row id and the predicted class.
df_test.to_csv('submission.csv', columns=['Id', 'Target'], index=False)