# IESB - Miner II - Random Forest

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, plot_confusion_matrix
import scikitplot as skplt

from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training set, the test set and the codebook from the Kaggle input directory.
df, test, dic = (
    pd.read_csv('/kaggle/input/costa-rican-household-poverty-prediction/train.csv'),
    pd.read_csv('/kaggle/input/costa-rican-household-poverty-prediction/test.csv'),
    pd.read_csv('/kaggle/input/costa-rican-household-poverty-prediction/codebook.csv'),
)

# Show the shape of each of the three frames.
tuple(frame.shape for frame in (df, test, dic))

In [None]:
# Give the codebook columns short names ('name', 'description') for easier handling.
dic = dic.rename(
    {'Variable name': 'name', 'Variable description': 'description'},
    axis='columns',
)
dic.head(5)

In [None]:
# Inspect the data: one row per training column with its dtype and the values
# found in the first and last rows of the training frame.
df_dtypes = (
    pd.DataFrame(df.dtypes, columns=['dtypes'])
    .reset_index()
    .rename(columns={'index': 'name'})
    .assign(**{
        'first value': df.loc[0].values,
        'last value': df.loc[len(df) - 1].values,
    })
)
# Attach the codebook entries to each column (left join keeps columns without an entry).
preview = pd.merge(df_dtypes, dic, how='left', on='name')
# Show the first thirty variables.
preview.iloc[:30]

In [None]:
# Show variables 30 to 59 of the preview table.
preview.iloc[30:60]

In [None]:
# Show variables 60 to 89 of the preview table.
preview.iloc[60:90]

In [None]:
# Show variables 90 to 119 of the preview table.
preview.iloc[90:120]

In [None]:
# Show the remaining variables of the preview table (index 120 onward).
preview.iloc[120:]

In [None]:
# Print a summary of the training frame.
df.info()

In [None]:
# Increase the number of columns that info() is allowed to show
df.info(max_cols=145)

In [None]:
# Which columns of the training frame have dtype object?
df.select_dtypes(include='object').head(5)

In [None]:
# Impute missing values in the training frame.
# These columns are filled with their own median.
for column in ('meaneduc', 'SQBmeaned', 'v2a1'):
    df[column] = df[column].fillna(df[column].median())
# These columns are filled with -1.
for column in ('v18q1', 'rez_esc'):
    df[column] = df[column].fillna(-1)

In [None]:
# Distribution of the target: absolute counts (left) and share per class (right).
fig, (ax_count, ax_share) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

sns.countplot(data=df, x='Target', ax=ax_count)
ax_count.set_title("Frequência")

df['Target'].value_counts().plot(kind='pie', ax=ax_share, autopct='%1.2f%%')
ax_share.set_title("Porcentagem")

plt.show()

In [None]:
# Names of the model input columns: every training column except the ones
# listed here (the same columns that are dropped when the feature matrix is built).
feats = df.columns.drop(
    ['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa', 'Target']
).tolist()

## Encontrado problema de classes desbalanceadas

In [None]:
# Separate the features used to train the models from the target.
# The listed columns are not used as model inputs; 'Target' is the label.
X = df.drop(columns=['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa', 'Target'])
y = df['Target']

# Hold out 20% of the rows for evaluation.
# - random_state fixes the shuffle, so the split (and every score computed
#   from it) is the same on each run.
# - stratify=y keeps the class proportions equal in both partitions, which
#   matters here because the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Run random oversampling on the training split to address the class imbalance
# (only X_train / y_train are resampled; the hold-out split is not touched here).
ros = RandomOverSampler(random_state=42)
X_ros,y_ros= ros.fit_resample(X_train,y_train)
# Check the resulting class counts
y_ros.value_counts()

## Executando Random Forest com classes desbalanceadas

In [None]:
# Random forest fitted on the original (not oversampled) training split.
rfd = RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=42)
rfd.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with the forest trained on the original data
# and measure its accuracy.
predict_rfd = rfd.predict(X_test)
accuracy_rfd = accuracy_score(y_true=y_test, y_pred=predict_rfd)
accuracy_rfd

In [None]:
# Horizontal bar chart of the feature importances of the forest trained on
# the original data, sorted in ascending order.
fig = plt.figure(figsize=(15, 20))
pd.Series(data=rfd.feature_importances_, index=feats).sort_values(ascending=True).plot(kind='barh')

In [None]:
# Confusion matrix on the hold-out split for the forest trained on the original data.
skplt.metrics.plot_confusion_matrix(y_test, predict_rfd)

## Executando Random Forest com classes balanceadas

In [None]:
# Random forest with the same settings, fitted on the oversampled training data.
rfb = RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=42)
rfb.fit(X_ros, y_ros)

In [None]:
# Predict the hold-out split with the forest trained on the oversampled data
# and measure its accuracy.
predict_rfb = rfb.predict(X_test)
accuracy_rfb = accuracy_score(y_true=y_test, y_pred=predict_rfb)
accuracy_rfb

In [None]:
# Horizontal bar chart of the feature importances of the forest trained on
# the oversampled data, sorted in ascending order.
fig = plt.figure(figsize=(15, 20))
pd.Series(data=rfb.feature_importances_, index=feats).sort_values(ascending=True).plot(kind='barh')

In [None]:
# Confusion matrix on the hold-out split for the forest trained on the
# oversampled data. This must use predict_rfb (this model's predictions);
# plotting predict_rfd would just repeat the unbalanced model's matrix.
skplt.metrics.plot_confusion_matrix(y_test, predict_rfb)

## Executando XGBoost Classifier

In [None]:
# XGBoost classifier (100 depth-1 trees, learning rate 1) fitted on the oversampled training data.
# NOTE(review): newer xgboost releases reportedly require class labels to start
# at 0 — confirm that the Target labels are accepted by the installed version.
xgbc = XGBClassifier(n_estimators=100, learning_rate=1, random_state=42, max_depth=1)
xgbc.fit(X_ros, y_ros);

In [None]:
# Predict the hold-out split with the XGBoost model and measure its accuracy.
predict_xgbc = xgbc.predict(X_test)
accuracy_xgbc = accuracy_score(y_true=y_test, y_pred=predict_xgbc)
accuracy_xgbc

In [None]:
# Importance of each input column according to the XGBoost model,
# drawn as a horizontal bar chart sorted in ascending order.
fig = plt.figure(figsize=(15, 20))
pd.Series(data=xgbc.feature_importances_, index=feats).sort_values(ascending=True).plot(kind='barh')

In [None]:
# Confusion matrix on the hold-out split for the XGBoost model.
skplt.metrics.plot_confusion_matrix(y_test, predict_xgbc)

## Executando Cat Boost Classifier

In [None]:
# CatBoost classifier (library defaults apart from the seed) fitted on the oversampled training data.
cbc = CatBoostClassifier(random_state=42)
cbc.fit(X_ros, y_ros)

In [None]:
# Predict the hold-out split with the CatBoost model and measure its accuracy.
predict_cbc = cbc.predict(X_test)
accuracy_cbc = accuracy_score(y_true=y_test, y_pred=predict_cbc)
accuracy_cbc

In [None]:
# Importance of each input column according to the CatBoost model,
# drawn as a horizontal bar chart sorted in ascending order.
fig = plt.figure(figsize=(15, 20))
pd.Series(data=cbc.feature_importances_, index=feats).sort_values(ascending=True).plot(kind='barh')

In [None]:
# Confusion matrix on the hold-out split for the CatBoost model.
skplt.metrics.plot_confusion_matrix(y_test, predict_cbc)

## Executando GradientBoostingClassifier

In [None]:
# Gradient boosting (200 depth-1 trees, learning rate 1.0) fitted on the oversampled training data.
gbm = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=42)
gbm.fit(X_ros, y_ros)

In [None]:
# Predict the hold-out split with the gradient boosting model and measure its accuracy.
predict_gbm = gbm.predict(X_test)
accuracy_gbm = accuracy_score(y_true=y_test, y_pred=predict_gbm)
accuracy_gbm

In [None]:
# Importance of each input column according to the gradient boosting model,
# drawn as a horizontal bar chart sorted in ascending order.
fig = plt.figure(figsize=(15, 20))
pd.Series(data=gbm.feature_importances_, index=feats).sort_values(ascending=True).plot(kind='barh')

In [None]:
# Confusion matrix on the hold-out split for the gradient boosting model.
skplt.metrics.plot_confusion_matrix(y_test, predict_gbm)

In [None]:
# Hold-out accuracy of the four models that were trained on the oversampled data.
print('RandomForestClassifier 		=', accuracy_rfb)
print('XGBClassifier 				=', accuracy_xgbc)
print('CatBoostClassifier			=', accuracy_cbc)
print('GradientBoostingClassifier	=', accuracy_gbm)

In [None]:
# Impute missing values in the test set.
# The median columns are filled with the medians of the training frame (df),
# not with the medians of the test set.
for column in ('meaneduc', 'SQBmeaned', 'v2a1'):
    test[column] = test[column].fillna(df[column].median())
# These columns are filled with -1.
for column in ('v18q1', 'rez_esc'):
    test[column] = test[column].fillna(-1)

# Feature matrix for prediction: the test set without the columns that are not model inputs.
featsT = test.drop(columns=['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa'])
featsT.head(5)

In [None]:
# Predict the test set with the XGBoost model and store the result, cast to int, as 'Target'.
test['Target'] = xgbc.predict(featsT).astype(int)
test

In [None]:
# Show the two columns that make up the submission.
test.loc[:, ['Id', 'Target']]

In [None]:
# Write the submission file with the 'Id' and 'Target' columns, without the row index.
test.to_csv('submission.csv', columns=['Id', 'Target'], index=False)