# Costa Rican Household Poverty Level Prediction
## Exploratory Data Analysis and Predictive Modeling


Developed by **jhonnatan.torres.suarez@gmail.com**

**Notes:** 
- English is not my native language, so my apologies in advance for any typo or grammar mistake
- The model used in this kernel was not optimized using GridSearch or RandomSearch
- LB Score **0.393**
________________________________________________________________________________________________________________________________________________________________________________________________________________

First of all, the usual/custom libraries are imported

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import os
# Show which files are present in the ../input directory.
print(os.listdir("../input"))

Setting the max columns options of Pandas to 150 (in order to visualize the tables with all the features / variables)

In [None]:
# Let pandas render up to 150 columns so wide tables are shown in full.
pd.options.display.max_columns = 150

Reading the data

In [None]:
# tr: training data, ts: data to predict for the submission file.
tr, ts = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

Checking the structure, variables, rows and columns

In [None]:
# Preview the first rows of the training data.
tr.head()

In [None]:
# Preview the first rows of the submission data.
ts.head()

The training dataset is smaller than the submission dataset

In [None]:
# (rows, columns) of the training frame, then of the submission frame.
for frame in (tr, ts):
    print(frame.shape)

In [None]:
# Column names of both frames, separated by the same blank lines as three
# consecutive print calls would produce.
print(tr.columns.values, '\n', ts.columns.values, sep='\n')

Including a mock/dummy value in order to concatenate the training and submission datasets

In [None]:
# Placeholder label: marks the submission rows with the value 5 so they can be
# separated again after both frames have been concatenated.
ts = ts.assign(Target=5)

In [None]:
# Stack the training and submission rows into one frame so the same cleaning
# steps are applied to both. ignore_index=True gives every row a unique label;
# without it the row labels of tr and ts would both be kept and repeat in df.
df = pd.concat([tr, ts], axis=0, ignore_index=True)

In [None]:
# Summary statistics for every column of the combined frame (include='all'
# asks for all columns rather than only the numeric ones).
df.describe(include='all')

Checking the blank values

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# 'Vol' is a helper column of ones: summing it in a pivot table yields the
# number of rows for each distinct value of the pivoted column.
df['Vol'] = 1
df.pivot_table(values='Vol', columns='dependency', aggfunc='sum')

In [None]:
# Row count for each distinct value of 'edjefe'.
df.pivot_table(values='Vol', columns='edjefe', aggfunc='sum')

In [None]:
# Row count for each distinct value of 'edjefa'.
df.pivot_table(values='Vol', columns='edjefa', aggfunc='sum')

It seems that some numbers were recorded instead of **yes**; the next step is to replace these numbers with **yes**

In [None]:
# Build a clean yes/no flag from 'dependency'. Numeric entries are treated as
# 'yes'; only an explicit 'no' (or a missing value) becomes 'no'. Comparing
# against 'yes' instead would silently turn every numeric entry into 'no'.
dependency_is_no = df['dependency'].isna() | df['dependency'].eq('no')
df['dependency_f'] = np.where(dependency_is_no, 'no', 'yes')
df['dependency_f'].value_counts()

In [None]:
# Build a clean yes/no flag from 'edjefe'. Numeric entries are treated as
# 'yes'; only an explicit 'no' (or a missing value) becomes 'no'. Comparing
# against 'yes' instead would silently turn every numeric entry into 'no'.
edjefe_is_no = df['edjefe'].isna() | df['edjefe'].eq('no')
df['edjefe_f'] = np.where(edjefe_is_no, 'no', 'yes')
df['edjefe_f'].value_counts()

In [None]:
# Build a clean yes/no flag from 'edjefa'. Numeric entries are treated as
# 'yes'; only an explicit 'no' (or a missing value) becomes 'no'. Comparing
# against 'yes' instead would silently turn every numeric entry into 'no'.
edjefa_is_no = df['edjefa'].isna() | df['edjefa'].eq('no')
df['edjefa_f'] = np.where(edjefa_is_no, 'no', 'yes')
df['edjefa_f'].value_counts()

Removing the columns that mix numbers and categories, as well as other columns with a high volume of zeros (0)

In [None]:
# The three mixed text/number columns have been replaced by their *_f flags;
# v2a1, v18q1 and rez_esc are discarded as well.
df = df.drop(
    columns=['dependency', 'edjefa', 'edjefe', 'v2a1', 'v18q1', 'rez_esc'],
)

In [None]:
# Plot appearance: seaborn's 'notebook' scaling context and 'darkgrid' style.
sns.set_context('notebook')
sns.set_style('darkgrid')

The next step is to create a couple of histograms of **SQBmeaned** and **meaneduc**

In [None]:
# Summary statistics and a 30-bin histogram of SQBmeaned.
# (describe's `include` argument is ignored for a single column, so it is omitted.)
sqb_meaned = df['SQBmeaned']
print(sqb_meaned.describe())
sqb_meaned.plot.hist(bins=30)
plt.grid(alpha=0.25)

In [None]:
# Summary statistics and a 30-bin histogram of meaneduc.
# (describe's `include` argument is ignored for a single column, so it is omitted.)
mean_educ = df['meaneduc']
print(mean_educ.describe())
mean_educ.plot.hist(bins=30)
plt.grid(alpha=0.25)

Replacing the null values with the mean/average

In [None]:
# Replace missing SQBmeaned values with the column mean (NaNs are skipped when
# the mean is computed). The result is assigned back to the column: calling
# fillna(inplace=True) on df['SQBmeaned'] works on an intermediate object and
# is not guaranteed to modify df itself.
df['SQBmeaned'] = df['SQBmeaned'].fillna(df['SQBmeaned'].mean())

In [None]:
# Replace missing meaneduc values with the column mean (NaNs are skipped when
# the mean is computed). The result is assigned back to the column: calling
# fillna(inplace=True) on df['meaneduc'] works on an intermediate object and
# is not guaranteed to modify df itself.
df['meaneduc'] = df['meaneduc'].fillna(df['meaneduc'].mean())

In [None]:
# Re-check the number of missing values per column after the imputation.
df.isna().sum()

In [None]:
# Summary statistics of every remaining column.
df.describe(include='all')

In [None]:
# Frequency of each value in two of the flag columns.
for column_name in ('edjefa_f', 'elimbasu4'):
    print(df[column_name].value_counts())

Most of the variables included in the dataset have been binarized; there are even pairs of columns, one for males and another for females, hence some columns can be removed

In [None]:
# Discard the identifier columns, the 'Vol' helper column and one column out of
# each group of indicator columns listed here.
redundant_columns = [
    'Id', 'r4h3', 'r4m3', 'r4t3', 'paredother', 'pisoother', 'techootro',
    'energcocinar4', 'elimbasu4', 'elimbasu5', 'elimbasu6', 'epared3',
    'etecho3', 'eviv3', 'male', 'estadocivil7', 'parentesco12', 'idhogar',
    'instlevel9', 'tipovivi5', 'lugar6', 'area2', 'Vol',
]
df = df.drop(columns=redundant_columns)

Replacing **yes** and **no** with **1** and **0**

In [None]:
# Encode the three yes/no flag columns as integers: 'yes' -> 1, anything else -> 0.
for flag_column in ('edjefe_f', 'edjefa_f', 'dependency_f'):
    df[flag_column] = np.where(df[flag_column] == 'yes', 1, 0)

Scaling the numeric features

In [None]:
# List the 30 columns with the largest maximum value; these are the candidates
# for scaling. The 'max' row is selected by label: its position in the summary
# table shifts whenever a non-numeric column adds extra rows to it.
sm = df.describe(include='all')
sm.loc['max'].sort_values(ascending=False).head(30).index

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale each listed numeric column to the [0, 1] range.
# dfs is the scaler output: a plain array without column labels.
numeric_columns = [
    'agesq', 'SQBage', 'SQBmeaned', 'SQBedjefe', 'SQBescolari',
    'SQBhogar_total', 'SQBovercrowding', 'SQBhogar_nin', 'age',
    'SQBdependency', 'meaneduc', 'escolari', 'tamviv', 'rooms',
    'hogar_total', 'hhsize', 'overcrowding', 'tamhog', 'qmobilephone',
    'r4t2', 'hogar_nin', 'r4t1', 'bedrooms', 'hogar_adul', 'r4h2', 'r4m2',
    'r4m1', 'r4h1', 'hogar_mayor',
]
min_max_scaler = MinMaxScaler()
dfs = min_max_scaler.fit_transform(df[numeric_columns])

In [None]:
# Wrap the scaled array in a DataFrame, restoring the column names in the same
# order in which the columns were passed to the scaler.
scaled_column_names = [
    'agesq', 'SQBage', 'SQBmeaned', 'SQBedjefe', 'SQBescolari',
    'SQBhogar_total', 'SQBovercrowding', 'SQBhogar_nin', 'age',
    'SQBdependency', 'meaneduc', 'escolari', 'tamviv', 'rooms',
    'hogar_total', 'hhsize', 'overcrowding', 'tamhog', 'qmobilephone',
    'r4t2', 'hogar_nin', 'r4t1', 'bedrooms', 'hogar_adul', 'r4h2', 'r4m2',
    'r4m1', 'r4h1', 'hogar_mayor',
]
dfsc = pd.DataFrame(dfs, columns=scaled_column_names)
dfsc.describe()

A boxplot of the numeric features

In [None]:
# One box per scaled feature; tick labels are rotated 60 degrees for legibility.
g = dfsc.plot.box(figsize=(20, 4))
plt.setp(g.get_xticklabels(), rotation=60)

Removing the original numeric features

In [None]:
# Remove the unscaled versions of the numeric columns from df; their scaled
# counterparts live in dfsc.
unscaled_columns = [
    'agesq', 'SQBage', 'SQBmeaned', 'SQBedjefe', 'SQBescolari',
    'SQBhogar_total', 'SQBovercrowding', 'SQBhogar_nin', 'age',
    'SQBdependency', 'meaneduc', 'escolari', 'tamviv', 'rooms',
    'hogar_total', 'hhsize', 'overcrowding', 'tamhog', 'qmobilephone',
    'r4t2', 'hogar_nin', 'r4t1', 'bedrooms', 'hogar_adul', 'r4h2', 'r4m2',
    'r4m1', 'r4h1', 'hogar_mayor',
]
df = df.drop(columns=unscaled_columns)

Including the scaled numeric features into the original dataframe

In [None]:
# Give the scaled frame the same row labels as df so the column-wise
# concatenation pairs the rows up one to one.
dfsc.index = df.index
ndf = pd.concat([df, dfsc], axis=1)
# Shapes before and after the merge: df, dfsc, then the combined frame.
for frame in (df, dfsc, ndf):
    print(frame.shape)
ndf.head()
ndf['Target'].value_counts()

Filtering to keep only the training rows (the rows whose **Target** is not the mock value 5)

In [None]:
# Keep the rows that do not carry the mock Target value 5.
has_real_target = ndf['Target'].ne(5)
dw = ndf[has_real_target]
print(dw.shape)
dw['Target'].value_counts()

Correlation heatmap of the dataset features

In [None]:
# Pairwise correlations between all columns of dw, drawn as a heatmap.
correlations = dw.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, cmap='coolwarm')

# Machine Learning Stage

Importing the models/algorithms to be used

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import scikitplot as skplt

Decision Tree (used to filter the most important features)

In [None]:
# Despite its name, rfc is a single decision tree; it is used only to rank the
# features by importance.
rfc = DecisionTreeClassifier(
    criterion='entropy',
    class_weight='balanced',
    random_state=1234,
)

In [None]:
# Fit the tree on every column of dw except the label.
tree_features = dw.drop(columns='Target')
tree_labels = dw['Target']
rfc.fit(tree_features, tree_labels)

In [None]:
# One importance value per feature, indexed by feature name and plotted from
# the most to the least important.
feature_names = dw.drop(columns='Target').columns
fi = pd.DataFrame({'Importance': rfc.feature_importances_}, index=feature_names)
ranked_importance = fi.sort_values(by='Importance', ascending=False)
ranked_importance.plot.bar(figsize=(20, 4))
plt.grid(alpha=0.25)

Top 15 features

In [None]:
# Names of the 15 features with the highest importance.
fi.sort_values(by='Importance',ascending=False).head(15).index

In [None]:
# feats: the 15 most important feature names, used for every model below.
importance_ranking = fi.sort_values(by='Importance', ascending=False)
feats = importance_ranking.head(15).index

In [None]:
from sklearn.model_selection import train_test_split

Boxplot of features

In [None]:
# Box plot of the selected features; tick labels are rotated 60 degrees.
g = dw[feats].plot.box(figsize=(18, 4))
plt.setp(g.get_xticklabels(), rotation=60)

80% - 20% ratio was used

In [None]:
# Hold out 20% of the labelled rows for evaluation; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dw[feats],
    dw['Target'],
    test_size=0.20,
    random_state=1234,
)

Simple bagging classifier

In [None]:
# Bagging ensemble of 2500 base estimators (the library's default base
# estimator is used); the out-of-bag score is computed during fitting.
model = BaggingClassifier(
    n_estimators=2500,
    random_state=1234,
    warm_start=False,
    verbose=1,
    oob_score=True,
)

In [None]:
# Train the bagging ensemble on the 80% training split.
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [None]:
# Predicted labels for the held-out 20% split.
prds = model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 on the held-out split, followed by a
# normalized confusion matrix of the same predictions.
print(classification_report(y_test,prds))
skplt.metrics.plot_confusion_matrix(y_test,prds,normalize=True)

In [None]:
#cvs= cross_val_score(model,dw[feats],dw['Target'],cv=5,scoring='f1_macro')

In [None]:
#print(cvs)
#print(np.mean(cvs))
#print(np.median(cvs))

In [None]:
# Submission rows are the ones carrying the mock Target value 5. The label
# column is dropped on the filtered result in one expression, so no in-place
# modification is made on a slice of ndf.
ds = ndf[ndf['Target'] == 5].drop(columns='Target')
print(ds.shape)

In [None]:
# Bagging-model predictions for the submission rows, using the selected features.
bagprds = model.predict(ds[feats])

In [None]:
# Single-column frame holding the predicted labels.
pdf = pd.DataFrame({'Target': bagprds})

In [None]:
# Align the predictions with the submission rows and pair them with their Id.
pdf.index = ts.index
gsub = pd.concat([ts['Id'], pdf], axis=1)
# Distribution of the predicted classes. The series is passed as x= explicitly
# so that it is not taken for the positional `data` argument.
sns.countplot(x=gsub['Target'])
plt.grid()

In [None]:
# Write the bagging-model submission file; the row index is not written.
gsub.to_csv('bag_model_submit.csv', index=False)

In [None]:
# Class distribution over all rows, including the mock value 5 carried by the
# submission rows. The series is passed as x= explicitly so that it is not
# taken for the positional `data` argument.
sns.countplot(x=ndf['Target'])

In [None]:
# Box plot of every feature column of dw; tick labels are rotated 60 degrees.
k = dw.drop(columns='Target').plot.box(figsize=(40, 10))
plt.setp(k.get_xticklabels(), rotation=60)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

An XGB classifier was implemented

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees: 7500 boosting rounds, tree depth limited to 7.
xgbc = XGBClassifier(
    n_estimators=7500,
    random_state=1234,
    max_depth=7,
)

In [None]:
# Train the XGB classifier on the same training split as the bagging model.
# NOTE(review): y_train is passed with the Target labels unchanged; recent
# xgboost releases reportedly expect class labels starting at 0 — confirm
# against the installed version.
xgbc.fit(X_train,y_train)

In [None]:
# Predict once and reuse the result for both the report and the confusion
# matrix, instead of running the 7500-round model over X_test twice.
xgb_prds = xgbc.predict(X_test)
print(classification_report(y_test, xgb_prds))
skplt.metrics.plot_confusion_matrix(y_test, xgb_prds, normalize=True)

A voting classifier was implemented using the bagging and XGB models

In [None]:
# Majority-vote ('hard') ensemble of the bagging and XGB classifiers.
vc = VotingClassifier(
    estimators=[
        ('bag', model),
        ('xgb', xgbc),
    ],
    voting='hard',
)

In [None]:
# Train the voting ensemble on the training split.
vc.fit(X_train,y_train)

In [None]:
# Predict once and reuse the result for both the report and the confusion
# matrix, instead of running both ensemble members over X_test twice.
vc_prds = vc.predict(X_test)
print(classification_report(y_test, vc_prds))
skplt.metrics.plot_confusion_matrix(y_test, vc_prds, normalize=True)

Cross validation to assess the model (voting) performance 

In [None]:
# 5-fold cross-validation of the voting ensemble on all labelled rows, scored
# with macro-averaged F1; the per-fold scores and their mean/median are printed.
cvvs = cross_val_score(vc, dw[feats], dw['Target'], cv=5, scoring='f1_macro')
print(cvvs)
score_summaries = (
    ("Average Cross Validation Score: ", np.mean),
    ("Median Cross Validation Score: ", np.median),
)
for caption, summarize in score_summaries:
    print(caption, summarize(cvvs))

Predicting using the submission data and creating the submission file

In [None]:
# Voting-ensemble predictions for the submission rows.
bagprdss = vc.predict(ds[feats])
pdfs = pd.DataFrame(data=bagprdss, columns=['Target'])

# Align the predictions with the submission rows and pair them with their Id.
pdfs.index = ts.index
gsubs = pd.concat([ts['Id'], pdfs], axis=1)
# Distribution of the predicted classes. The series is passed as x= explicitly
# so that it is not taken for the positional `data` argument.
sns.countplot(x=gsubs['Target'])
plt.grid()

# Write the submission file; the row index is not written.
gsubs.to_csv('voter_submit.csv', index=False)