In [None]:
# Sources:
# https://www.kaggle.com/shivamb/costa-rica-poverty-exploration-kernel
# https://www.kaggle.com/willkoehrsen/a-complete-introduction-and-walkthrough
# https://www.kaggle.com/willkoehrsen/featuretools-for-good


import numpy as np 
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import seaborn as sns
from scipy.stats import spearmanr
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
import shap

import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Single place to edit when the notebook is run outside Kaggle.
DATA_DIR = '/kaggle/input/costa-rican-household-poverty-prediction'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
codebook = pd.read_csv(os.path.join(DATA_DIR, 'codebook.csv'))

# Take an independent copy: `test` is heavily modified below, and adding a
# prediction column to a plain slice would trigger SettingWithCopyWarning.
submission = test[['Id', 'idhogar']].copy()

pd.options.display.max_columns = 150
pd.options.display.max_rows = 200


import warnings
warnings.filterwarnings("ignore", message="FixedFormatter should only be used together with FixedLocator")
warnings.filterwarnings("ignore", message="Pass the following variables as keyword args: x, y. From version 0.12, the only valid")

In [None]:
codebook

In [None]:
train.head()

In [None]:
train.info()

# We're dealing with a 9557x143 dataframe using 10.4MB of memory, which is fairly large by Kaggle standards
# Most values are int, some are categorical and float

In [None]:
train.describe()

# Many columns are one-hot encoded categorical features,
# which is visible from their min, 75% and max values

In [None]:
train.duplicated().sum()

# No duplicated values in the dataset

In [None]:
# Count NaNs per column and report only the columns that actually have gaps.
cols_nan = train.isna().sum()
print(cols_nan[cols_nan > 0].astype('int32'))

# Only a few columns with missing data. Since there are 9557 entries total, the columns with over 6000 missing entries will likely be dropped
# The columns with 5 missing entries will be further investigated

In [None]:
# First, lets check v2a1
# The column represents the amount of rent payed for the household
# One possibility is that the values are null because the residents own the house, so have no rent to pay
# The columns tipovivi 1 - tipovivi 5 shows if the house is owned by the residents or is paying for it monthly somehow

# Boolean flag per column name: True for the tipovivi* ownership dummies.
own_series = train.columns.to_series().str.contains('tipo')
own_var = own_series[own_series].index.to_list()

# Ownership status counts restricted to the rows that have no rent value.
rent_missing = train['v2a1'].isnull()
train.loc[rent_missing, own_var].sum().plot.bar(figsize=(7, 5))
plt.xticks(
    list(range(5)),
    ['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other'],
    rotation=60,
)
plt.title('Home Ownership Status for Households Missing Rent Payments', size=15);

# So it seems that most houses that pays no rent are houses owned by the residents. This makes sense


In [None]:
# We can fill the NaN values as 0 for tipovivi1
# The other 2 columns will be imputed as 0 as well, since it is the most likely value
# But since for them we're taking a guess, its good practice to create a impute flag column

# Households that fully own their home pay no rent by definition.
train.loc[train['tipovivi1'] == 1, 'v2a1'] = 0
test.loc[test['tipovivi1'] == 1, 'v2a1'] = 0

# Whatever is still missing is a guess, so flag it before filling.
train['v2a1-missing'] = train['v2a1'].isnull()
test['v2a1-missing'] = test['v2a1'].isnull()

# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the in-place form acts on a temporary object, is deprecated, and does
# nothing once pandas Copy-on-Write is enabled.
train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)

In [None]:
# Now for the next missing value, v18q1
# v18q1 refers to number of tablets in the household
# Since its a household feature, it only makes sense to all residents in a household have the same value

# Count distinct v18q1 values per household, treating NaN as a value so that
# all-NaN households count as consistent (1 distinct value) rather than 0.
tablet_values = train.groupby('idhogar')['v18q1'].nunique(dropna=False)
check_tablet = tablet_values == 1
dif_tablet = check_tablet[~check_tablet]
# Number of households whose members disagree on the tablet count.
# (Summing the selected values would always give 0, since they are all False.)
print(len(dif_tablet))

# No distinct tablet count inside a same household, as expected
# So we`ll use only the values for the heads of households

heads = train.loc[train['parentesco1'] == 1]

In [None]:
# Bar chart of how many households (heads only) own each number of tablets.
ax = heads['v18q1'].value_counts().plot.bar()
ax.set_xlabel('Number of tablets')
ax.set_ylabel('Number of households')

# So the most common value is 1 tablet per household, but its important to remember that the graph doesnt include NaNs
# The most plausible idea is that the NaNs represents houses with no tablets


In [None]:
# The v18q column indicates whether the family owns a tablet or not
# So comparing this column to the NaNs at v18q1, we must get to a conclusion

# Missing v18q1 entries per value of the owns-a-tablet flag. The result is
# printed explicitly because it is not the last expression of the cell and
# would otherwise never be displayed.
print(heads.groupby('v18q')['v18q1'].apply(lambda x: x.isnull().sum()))

# This shows that all NaN values fall under the Dont Own a Tablet category
# So we can simply fill the NaN values with 0

train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)

In [None]:
# The third feature with missing values is rez_esc
# It represents the years behind in school the individual has
# The most plausible hypothesis is that the NaN values represents individuals that are not behind in school

# Age distribution of the individuals that do have a rez_esc value.
train.loc[train['rez_esc'].notnull(), 'age'].describe()

In [None]:
# Age distribution of the individuals whose rez_esc value is missing.
train.loc[train['rez_esc'].isnull(), 'age'].describe()

# The competition describes the rez_esc feature only taking in consideration people between the ages 7-19
# Anything under 7 or over 19 will be assumed to have no years behind in school

In [None]:
# Outside the 7-19 age range rez_esc is not defined, so a missing value means 0.
train.loc[((train['age'] > 19) | (train['age'] < 7)) & (train['rez_esc'].isnull()), 'rez_esc'] = 0
test.loc[((test['age'] > 19) | (test['age'] < 7)) & (test['rez_esc'].isnull()), 'rez_esc'] = 0

# For those in the 7 to 19 range and still have missing value, it makes sense to imput with 0 as well
# But its good practice to create the imputed flag column, to flag our assumption

train['rez_esc-missing'] = train['rez_esc'].isnull()
test['rez_esc-missing'] = test['rez_esc'].isnull()

# Now that the flag column has been created, we can finish filling all the missing values.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: the
# in-place form is chained assignment, deprecated, and a no-op under Copy-on-Write.

train['rez_esc'] = train['rez_esc'].fillna(0)
test['rez_esc'] = test['rez_esc'].fillna(0)

# The competition also describes as the maximum years behind school is 5. So anything over it will be considered an outlier

train.loc[train['rez_esc'] > 5, 'rez_esc'] = 5
test.loc[test['rez_esc'] > 5, 'rez_esc'] = 5

In [None]:
# Recount NaNs per column and report only the columns that still have gaps.
cols_nan = train.isna().sum()
print(cols_nan[cols_nan > 0].astype('int32'))

# Now all thats left is meaneduc and SQBmeaned
# meaneduc shows the average years of education on adults (18+)
# SQBmeaned shows the square of the mean years of education of adults (18+) in the household

In [None]:
# First lets check if the NaN values in both columns correspond to the same indexes

# Row labels where meaneduc / SQBmeaned are missing, printed side by side.
meaneduc_missing = train['meaneduc'].isna()
nan_index = train.index[meaneduc_missing]
print(nan_index)
print(train.index[train['SQBmeaned'].isna()])

# Same indexes, so they refer to the same individuals

# Households of the affected rows, with the number of affected members each.
house_with_nan = train.loc[meaneduc_missing, 'idhogar'].value_counts()
separator = '-' * 25

print(separator)
print(house_with_nan)

# Ages of everyone living in those households.
print(separator)
in_affected_house = train['idhogar'].isin(house_with_nan.index)
print(train.loc[in_affected_house, ['age']])

# All individuals with null values are 18 or 19 years old
# They are the only individuals in the household



In [None]:
# Lets check the level of education of these 5 individuals

# Boolean flag per column name: True for the instlevel* education dummies.
edu_series = train.columns.to_series().str.contains('instlevel')
edu_var = edu_series[edu_series].index.to_list()

education_labels = [
    'No Education', 'Incomplete Primary', 'Complete Primary',
    'Incomplete Second Level', 'Complete Second Level',
    'Incomplete Technical Level', 'Complete Technical Level',
    'Undergraduated and Higher', 'Postgraduate and Higher',
]

# Education level counts restricted to the rows missing meaneduc.
meaneduc_absent = train['meaneduc'].isnull()
train.loc[meaneduc_absent, edu_var].sum().plot.bar(figsize=(7, 5))
plt.xticks(list(range(9)), education_labels, rotation=60)
plt.title('Level of education of individuals with NaN meaneduc value', size=15);

# So we have a variety of education levels to these 5 individuals

In [None]:
# For each of them, we'll impute the median value for that specific education level

def impute_median(index, df, edu_cols=None):
    """Fill meaneduc and SQBmeaned of one row with education-level medians.

    The row labelled `index` is looked up in `df`; its first education dummy
    equal to 1 selects the reference group (all rows of `df` with that same
    dummy set), and the group's medians are written into the row, in place.

    Parameters
    ----------
    index : label
        Index label of the row to impute.
    df : pandas.DataFrame
        Frame to modify; must contain 'meaneduc', 'SQBmeaned' and the
        education dummy columns.
    edu_cols : list of str, optional
        Names of the education dummy columns. Defaults to the notebook-level
        `edu_var` list.

    Returns
    -------
    None
    """
    if edu_cols is None:
        edu_cols = edu_var

    row_mask = df.index == index

    # Education dummies of the target row, one entry per dummy column.
    flags = df.loc[row_mask, edu_cols].transpose()
    instlevel = flags[flags == 1].dropna().index.values

    if len(instlevel) > 0:
        reference = df.loc[df[instlevel[0]] == 1]
    else:
        # No education dummy is set for this row: fall back to the medians
        # of the whole frame instead of failing with an IndexError.
        reference = df

    df.loc[row_mask, 'meaneduc'] = reference['meaneduc'].median()
    df.loc[row_mask, 'SQBmeaned'] = reference['SQBmeaned'].median()

In [None]:
# Impute meaneduc / SQBmeaned for every row still missing meaneduc, in both frames.
for df in [train, test]: 
    # Collect the affected row labels up front; impute_median writes into df.
    nan_index = df[df['meaneduc'].isna()].index
    for j in nan_index:
        impute_median(j, df)

In [None]:
# Final NaN recount; an empty Series means nothing is missing any more.
cols_nan = train.isna().sum()
print(cols_nan[cols_nan > 0].astype('int32'))

# All NaN values in the dataset has been dealt with

In [None]:
# In the dataframe, each row represents one individual. The predictions must be made based on households
# Each individual has its household identified by the idhogar column, and the head of the household is identified as parentesco1 = 1
# Every individual in a same household must share the same Target feature in the train database, since its household based

# True for households whose members all share a single Target value.
check_target = train.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)
print(check_target)
print('-'*25)
# Households with conflicting Target values (kept for the correction step below).
dif_target = check_target[check_target != True] 
print(len(dif_target))
print('-'*25)
# Total number of rows flagged as head of household.
print(train['parentesco1'].sum())
print('-'*25)
# Number of households with more than one row flagged as head.
print(train.groupby('idhogar')['parentesco1'].apply(lambda x: x.sum() > 1).sum())
# There are 2988 total households in the dataset
# 85 of these households have more than 1 target value for the residents
# There are 2988 different households and 2973 heads, so we have 15 households without a household head
# There are no household with multiple heads

In [None]:
# One way to deal with the multiple targets per idhogar is to assign the household head value to all others
# But in order to do that, we must make sure there aren't households with both multiple target values and no household head

# True for households that have exactly one row flagged as head.
heads_per_house = train.groupby('idhogar')['parentesco1'].sum()
house_has_head = heads_per_house == 1
house_no_head = house_has_head[~house_has_head]

# Among the households lacking a single head, count those with conflicting targets.
train_no_head = train.loc[train['idhogar'].isin(house_no_head.index)]
targets_per_house = train_no_head.groupby('idhogar')['Target'].nunique()
dif_no_head = (targets_per_house > 1).sum()
print(dif_no_head)

# This proves there are no households without head and with multiple targets
# So we can apply the previous solution

In [None]:
# Example household before the correction.
print(train.loc[train['idhogar'] == '0172ab1d9', 'Target'])

# Target of each household head, indexed by household id. groupby().first()
# guarantees a unique index, which Series.map requires.
head_target = train.loc[train['parentesco1'] == 1].groupby('idhogar')['Target'].first()

# Overwrite only the households that disagree internally AND have a head to
# copy from. This replaces a per-household loop that called int() on a
# Series, which is deprecated and fails for a household without a head.
needs_fix = train['idhogar'].isin(dif_target.index) & train['idhogar'].isin(head_target.index)
train.loc[needs_fix, 'Target'] = train.loc[needs_fix, 'idhogar'].map(head_target)

# Re-run the consistency check and report how many conflicts remain.
check_target = train.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)
dif_target = check_target[~check_target]
print(len(dif_target))

# Same example household after the correction.
print(train.loc[train['idhogar'] == '0172ab1d9', 'Target'])

# Correction has been made, now all residents of a same household has the same target value

In [None]:
# Now that all the missing values have been dealt with, and some corrections made, lets further investigate our features
# Lets try to group the overly correlated features, first by analysing their definition

# tipovivi1 - tipovivi5 (own a house or not) and v2a1 (rent payed)
# hacdor, hacapo, overcrowding, SQBovercrowding (whether the house is overcrowded or not)
# rooms, v14a, tamhog, hhsize, pared^, piso^, techo^, cielorazo, sanitario1-6, energcocinar1-6, epared1-3, etecho1-3, eviv1-3, bedrooms, area1-2 are related to house characteristics
# r4h1-3, r4m1-3, r4t1-3, tamviv, dis, male, female, hogar_nin, hogar_adul, hogar_mayor, hogar_total, dependency, SQBhogar_total, SQBhogar_nin, age, agesq, estadocivil1-7, parentesco1-12 are related to number of persons in the house
# refrig, v18q, v18q1, sanitario1-6, computer, television, mobilephone, qmobilephone related to things that are in the house
# abastagua^, public, planpri, noelec, coopele, elimbasu1-6, lugar1-6, related to structure outside the house
# escolari, rez_esc, edjefe, edjefa, instlevel1-9, meaneduc, SQBescolari, SQBedjefe, SQBmeaned, related to education levels

In [None]:
# There are a series of squared feature columns
# These can be useful when using a simple linear model, but to more complex models, it tends to be prejudicial by making the model overfit
# So these features will be dropped

# Squared helper features shipped with the dataset; removed from both frames.
sqbcol = [
    'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
    'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq',
]
train = train.drop(columns=sqbcol)
test = test.drop(columns=sqbcol)

In [None]:
# There are many columns that are one hot encoded features, and others that are clearly correlated
# For the hot one encoded, we'll try to create a single ordinal column, when we can define a clear order for the values
# We'll also focus on features house related, since the resident related must be aggregated into house related as well

def mc(dic, colname, frames=None):
    """Collapse one-hot columns into a single coded column, in place.

    For every frame and every (source column, code) pair in `dic`, rows where
    the source column equals 1 get `code` written into `colname`.

    Parameters
    ----------
    dic : dict
        Maps a one-hot column name to the code stored for it.
    colname : str
        Name of the column that receives the codes.
    frames : iterable of pandas.DataFrame, optional
        Frames to modify. Defaults to the notebook-level `train` and `test`,
        resolved at call time.

    Returns
    -------
    None
    """
    if frames is None:
        frames = [train, test]
    for df in frames:
        for source_col, code in dic.items():
            df.loc[df[source_col] == 1, colname] = code

In [None]:
# First lets join the wall type columns

# Code assigned to each wall-material dummy.
wall_dt = {
    'paredother': 1,
    'pareddes': 1,
    'paredfibras': 1,
    'paredzinc': 2,
    'paredzocalo': 3,
    'paredmad': 4,
    'paredpreb': 5,
    'paredblolad': 6
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['walltype'] = 0

mc(wall_dt, 'walltype')

# The dummies are now redundant with the coded column.
encoded_cols = list(wall_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)
    
# pareddes, paredfibras and paredother have too few entries to stand on their own
# Since they all represent low-quality walls, they are merged into a single category (1) to avoid overfitting; paredzinc keeps its own category (2)

In [None]:
# Wall Quality

# Code assigned to each wall-quality dummy.
wallq_dt = {
    'epared1': 1,
    'epared2': 2,
    'epared3': 3,
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['wallquality'] = 0

mc(wallq_dt, 'wallquality')

# The dummies are now redundant with the coded column.
encoded_cols = list(wallq_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Now for the floor type columns

# Code assigned to each floor-material dummy.
floor_dt = {
    'pisonotiene': 1,
    'pisoother': 2,
    'pisonatur': 2,
    'pisocemento': 3,
    'pisomadera': 4,
    'pisomoscer': 5,
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['floortype'] = 0

mc(floor_dt, 'floortype')

# The dummies are now redundant with the coded column.
encoded_cols = list(floor_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)
    
# pisoother and pisonatur have too few entries to stand on their own
# Since both represent low-quality floors, they are merged into a single category (2) to avoid overfitting; pisonotiene keeps its own category (1)

In [None]:
# Floor Quality

# Code assigned to each eviv* dummy.
floorq_dt = {
    'eviv1': 1,
    'eviv2': 2,
    'eviv3': 3
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['floorqual'] = 0

mc(floorq_dt, 'floorqual')

# The dummies are now redundant with the coded column.
encoded_cols = list(floorq_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Now for the roof

# cielorazo gives 1 if a house HAS a roof. We want a column that is True when it DOES NOT

# Invert cielorazo: the new flag is 1 exactly where cielorazo is 0.
for frame in (train, test):
    frame['nothaveroof'] = (frame['cielorazo'] == 0).astype('int64')

train = train.drop(columns='cielorazo')
test = test.drop(columns='cielorazo')

# Code assigned to each roof dummy (including the flag built above).
roof_dt = {
    'nothaveroof': 1,
    'techootro': 2,
    'techocane': 2,
    'techoentrepiso': 3,
    'techozinc': 4
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['rooftype'] = 0

mc(roof_dt, 'rooftype')

# The dummies are now redundant with the coded column.
encoded_cols = list(roof_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

# The number of entries that matches techootro and techocane are too few
# Since all of them represent low quality ceiling, to avoid overfitting they will be joined in a single category

In [None]:
# Roof quality

# Code assigned to each roof-quality dummy.
roofq_dt = {
    'etecho1': 1,
    'etecho2': 2,
    'etecho3': 3,
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['roofqual'] = 0

mc(roofq_dt, 'roofqual')

# The dummies are now redundant with the coded column.
encoded_cols = list(roofq_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Now for water provision

# Code assigned to each water-provision dummy.
water_dt = {
    'abastaguano' : 1,
    'abastaguafuera' : 2,
    'abastaguadentro' : 3
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['waterprov'] = 0

mc(water_dt, 'waterprov')

# The dummies are now redundant with the coded column.
encoded_cols = list(water_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)
    


In [None]:
# Electricity Source

# Code assigned to each electricity-source dummy.
elec_dt = {
    'noelec' : 1,
    'planpri' : 2,
    'coopele' : 2,
    'public' : 3,
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['elecsource'] = 0

mc(elec_dt, 'elecsource')

# The dummies are now redundant with the coded column.
encoded_cols = list(elec_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Toilet Dwelling

# Code assigned to each sanitario* dummy.
toilet_dt = {
    'sanitario1' : 1,
    'sanitario5' : 2,
    'sanitario6' : 3,
    'sanitario3' : 3,
    'sanitario2' : 4
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['toiletdwel'] = 0

mc(toilet_dt, 'toiletdwel')

# The dummies are now redundant with the coded column.
encoded_cols = list(toilet_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Cooking Energy Source

# Code assigned to each energcocinar* dummy.
cook_dt = {
    'energcocinar1' : 1,
    'energcocinar4' : 2,
    'energcocinar3' : 3,
    'energcocinar2' : 4,
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['cookingsource'] = 0

mc(cook_dt, 'cookingsource')

# The dummies are now redundant with the coded column.
encoded_cols = list(cook_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)
    

In [None]:
# Rubbish Disposal

# Code assigned to each elimbasu* dummy.
rubbish_dt = {
    'elimbasu6' : 1,
    'elimbasu5' : 2,
    'elimbasu4' : 1,
    'elimbasu3' : 3,
    'elimbasu2' : 1,
    'elimbasu1' : 4
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['rubbishdisp'] = 0

mc(rubbish_dt, 'rubbishdisp')

# The dummies are now redundant with the coded column.
encoded_cols = list(rubbish_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Own House Status

# This feature will be divided into owned or not, and the precarious feature will remain as a separated one

# tipovivi4 survives as its own flag under a clearer name.
precarious_rename = {'tipovivi4': 'isprecarious'}
train = train.rename(columns=precarious_rename)
test = test.rename(columns=precarious_rename)

# The remaining ownership dummies collapse to a binary owned / not-owned code.
houseown_dt = {
    'tipovivi5' : 0,
    'tipovivi3' : 0,
    'tipovivi2' : 1,
    'tipovivi1' : 1,
}

for frame in (train, test):
    frame['houseowned'] = 0

mc(houseown_dt, 'houseowned')

# The dummies are now redundant with the coded column.
encoded_cols = list(houseown_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Location

# Code assigned to each lugar* dummy.
region_dt = {
    'lugar1' : 1,
    'lugar2' : 2,
    'lugar3' : 3,
    'lugar4' : 4,
    'lugar5' : 5,
    'lugar6' : 6
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['region'] = 0

mc(region_dt, 'region')

# The dummies are now redundant with the coded column.
encoded_cols = list(region_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
# Removing reduntant area column

# Keep area1 under a descriptive name and discard area2 in both frames.
train = train.rename(columns={'area1': 'isurban'}).drop(columns='area2')
test = test.rename(columns={'area1': 'isurban'}).drop(columns='area2')

In [None]:
train.head()

In [None]:
train[['dependency', 'edjefe', 'edjefa']].head(10)

In [None]:

train['dependency'].value_counts()

# The dependency column seems really weird
# The challenge describes dependency as (number of residents under 19 or over 64 years old) / (number of residents between 19 and 64 years old)
# Therefore it doesnt make sense to be 'yes' or 'no' values in this column
# Luckly we have the age of each resident in the database, so we can recalculate this column


In [None]:
# Per-person flags: dependents are younger than 19 or older than 64.
is_dependent = (train['age'] < 19) | (train['age'] > 64)
is_independent = (train['age'] >= 19) & (train['age'] <= 64)

# Household totals broadcast back onto every member row. A grouped transform
# replaces the per-household loop, which rescanned the frame once per
# household and called int() on a 1-element array (deprecated in NumPy).
train['n_depend'] = is_dependent.groupby(train['idhogar']).transform('sum').astype('int64')
train['n_indep'] = is_independent.groupby(train['idhogar']).transform('sum').astype('int64')

# Households without any independent member divide by zero; cap them at 10.
train['dependencynew'] = train['n_depend'] / train['n_indep']
train.loc[train['dependencynew'] == np.inf, 'dependencynew'] = 10

train[['idhogar', 'n_depend', 'n_indep', 'age']].head(10)

In [None]:
# The helper counts and the original dependency column are no longer needed.
train = train.drop(columns=['n_depend', 'n_indep', 'dependency'])

In [None]:
# Helper columns start at 0 and are filled per household in the next cells.
for helper_col in ('n_depend', 'n_indep'):
    test[helper_col] = 0

test_ages = test[['age', 'idhogar']]
dependent_rows = (test['age'] < 19) | (test['age'] > 64)
independent_rows = (test['age'] >= 19) & (test['age'] <= 64)

# Members per household in each age group (the count lands in the 'age' column).
n_dep = test_ages.loc[dependent_rows].groupby('idhogar').count()
n_ind = test_ages.loc[independent_rows].groupby('idhogar').count()

In [None]:
# Look up each row's household in n_dep; households absent from it have no
# dependents and get 0. One vectorized map replaces a loop that rescanned the
# frame per household and called int() on a 1-element array (deprecated).
test['n_depend'] = test['idhogar'].map(n_dep['age']).fillna(0).astype('int64')


In [None]:
# Look up each row's household in n_ind; households absent from it have no
# independent members and get 0. One vectorized map replaces a loop that
# rescanned the frame per household and called int() on a 1-element array.
test['n_indep'] = test['idhogar'].map(n_ind['age']).fillna(0).astype('int64')

In [None]:
# Dependents per independent member; a division by zero (inf) is capped at 10.
dependency_ratio = test['n_depend'] / test['n_indep']
test['dependencynew'] = dependency_ratio.replace(np.inf, 10)

test[['idhogar', 'n_depend', 'n_indep', 'age']].head(10)

In [None]:
# The helper counts and the original dependency column are no longer needed.
test = test.drop(columns=['n_depend', 'n_indep', 'dependency'])

In [None]:
train[['edjefe', 'edjefa']].value_counts()

# edjefe and edjefa seem to have the same weird problem, with both columns
# We'll remake these two columns as well
# They will be converted into one

In [None]:
# Years of schooling of the household head; every other row gets 0.
train['headescolari'] = train['escolari'].where(train['parentesco1'] == 1, 0)
test['headescolari'] = test['escolari'].where(test['parentesco1'] == 1, 0)

# The new column replaces the two original head-education columns.
head_edu_cols = ['edjefe', 'edjefa']
train = train.drop(columns=head_edu_cols)
test = test.drop(columns=head_edu_cols)


In [None]:
train.dtypes

In [None]:
# We converted into ordinal all columns household related where the trend was clear
# Now lets do the same for columns in individual level

# The r4* variables are clearly correlated with, and redundant to, other columns
# Sex and Age are already given by different columns
# A column showing whether someone is under or over 12 could be useful though
# So all columns will be dropped except r4t1 and r4t3

# Every r4* column except r4t1 and r4t3 is removed from both frames.
drop_cols = [f'r4{sex}{group}' for sex in ('h', 'm') for group in (1, 2, 3)] + ['r4t2']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
# Civil State

# Code assigned to each estadocivil* dummy.
civil_dt = {
    'estadocivil1' : 1,
    'estadocivil2' : 3,
    'estadocivil3' : 4,
    'estadocivil4' : 5,
    'estadocivil5' : 5,
    'estadocivil6' : 6,
    'estadocivil7' : 2
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['civilstate'] = 0

mc(civil_dt, 'civilstate')

# The dummies are now redundant with the coded column.
encoded_cols = list(civil_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
train['civilstate'].value_counts()

In [None]:
# We'll drop all the columns of kinship besides parentesco1, since they are individual features without huge relevance

# parentesco2 .. parentesco12 are removed; parentesco1 is kept and renamed.
drop_cols = [f'parentesco{k}' for k in range(2, 13)]
head_rename = {'parentesco1': 'ishousehead'}

train = train.drop(columns=drop_cols).rename(columns=head_rename)
test = test.drop(columns=drop_cols).rename(columns=head_rename)

In [None]:
# Individual education

# instlevel1 .. instlevel9 map to the codes 1 .. 9.
educ_dt = {
    'instlevel1' : 1,
    'instlevel2' : 2,
    'instlevel3' : 3,
    'instlevel4' : 4,
    'instlevel5' : 5,
    'instlevel6' : 6,
    'instlevel7' : 7,
    'instlevel8' : 8,
    'instlevel9' : 9
}

# Rows matching none of the dummies keep the default code 0.
for frame in (train, test):
    frame['education'] = 0

mc(educ_dt, 'education')

# The dummies are now redundant with the coded column.
encoded_cols = list(educ_dt)
train = train.drop(columns=encoded_cols)
test = test.drop(columns=encoded_cols)

In [None]:
train['education'].value_counts()

In [None]:
# Rows with no instlevel flag (code 0) are folded into level 1. Apply this to
# both frames so that train and test share the same encoding.
train.loc[train['education'] == 0, 'education'] = 1
test.loc[test['education'] == 0, 'education'] = 1

In [None]:
train.columns

In [None]:
# There are a few columns related to electronics the household has
# Lets try to aggregate these columns in order to increase their correlation with target

# refrig, v18q, v18q1, computer, television, mobilephone, qmobilephone
# Its expected to be a linear correlation

In [None]:
def plot_frac(col_plot, x_vals, x_label, gtype):
    """Draw a stacked bar chart of the Target class fractions per feature value.

    For every value in `x_vals`, the rows of the notebook-level `train` frame
    where `col_plot` equals that value are selected, and the share of each
    Target class (1-4) among them becomes one stacked bar.

    Parameters
    ----------
    col_plot : str
        Column of `train` to condition on.
    x_vals : iterable
        Values of `col_plot` to draw, one bar each, in order.
    x_label : iterable
        Tick labels for the bars, in the same order as `x_vals`.
    gtype : str
        Text appended to the plot title.

    Returns
    -------
    None
    """
    class_names = {1: 'ExtermePoverty', 2: 'ModeratePoverty', 3: 'Vulnerable', 4: 'NonVulnerable'}
    x_vals = list(x_vals)
    labels = [str(label) for label in x_label]

    # One Series of class fractions per requested value. The rows are gathered
    # in a list and combined once, because DataFrame.append no longer exists
    # in pandas >= 2.0.
    rows = []
    for value in x_vals:
        subset = train[train[col_plot] == value]
        class_counts = subset.groupby(['Target']).count()[col_plot]
        rows.append(class_counts / subset[col_plot].count())

    # reindex guarantees all four class columns exist even when a class never
    # occurs for the requested values; absent combinations become 0.
    df_plot = (
        pd.DataFrame(rows)
        .reset_index(drop=True)
        .reindex(columns=list(class_names))
        .fillna(0)
        .rename(columns=class_names)
    )

    fig, ax = plt.subplots()
    # Stack the classes: each bar starts where the previous classes ended.
    bottom = np.zeros(len(df_plot))
    for name in class_names.values():
        heights = df_plot[name].to_numpy(dtype=float)
        ax.bar(df_plot.index, heights, label=name, bottom=bottom)
        bottom = bottom + heights
    ax.legend()

    # One tick per label, positioned at the bar indices.
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels)
    plt.xticks(rotation=30)
    plt.yticks(np.linspace(0, 1, 11))
    plt.title('Target Fraction based on ' + gtype)
    plt.ylabel('Fraction of Target')
    plt.show()
    return None

def calc_corrs(x, y):
    """Print the Spearman and Pearson correlations of x and y, rounded to 2 decimals."""
    spearman = spearmanr(x, y).correlation
    pearson = np.corrcoef(x, y)[0, 1]

    # Fixed left padding so the line sits under the plot it accompanies.
    padding = ' ' * 14
    print(f'{padding}Spearman: {round(spearman, 2)}; Pearson: {round(pearson, 2)}')

In [None]:
plot_frac('refrig', [0,1], ['Not Have', 'Have'], 'Having Refrigerators')

# Correlate the plotted feature (refrig) with Target; the rent column v2a1
# used here before was a copy-paste slip.
calc_corrs(train['refrig'], train['Target'])

In [None]:
plot_frac('v18q', [0,1], ['Not Have', 'Have'], 'Having Tablets')

calc_corrs(train['v18q'], train['Target'])

In [None]:
plot_frac('v18q1', range(0,7), range(0,7), 'Number of Tablets')

calc_corrs(train['v18q1'], train['Target'])

In [None]:
plot_frac('computer', [0,1], ['Not Have', 'Have'], 'Having Computer')

calc_corrs(train['computer'], train['Target'])

In [None]:
plot_frac('television', [0,1], ['Not Have', 'Have'], 'Having Television')

calc_corrs(train['television'], train['Target'])

In [None]:
plot_frac('mobilephone', [0,1], ['Not Have', 'Have'], 'Having Mobile Phone')

calc_corrs(train['mobilephone'], train['Target'])

In [None]:
plot_frac('qmobilephone', range(0,11), range(0,11), 'Number of Mobile Phones')

calc_corrs(train['qmobilephone'], train['Target'])

In [None]:
# It seems that Having a Tablet, a Computer, Number of Tablets and of Mobile Phones are the variables with higher correlation with Target
# Lets try aggregating the number of computers, tablets and mobile phones into a new feature and see if it represents a higher correlation

# Row-wise total of mobile phones, tablets and the computer flag.
electronics_cols = ['qmobilephone', 'v18q1', 'computer']
train['sumelectronics'] = train[electronics_cols].sum(axis=1)
test['sumelectronics'] = test[electronics_cols].sum(axis=1)

In [None]:
plot_frac('sumelectronics', range(0,15), range(0,15), 'Number of Electronics')

calc_corrs(train['sumelectronics'], train['Target'])

In [None]:
# We managed to create a feature with higher correlation than the others separately, so this feature will substitute the others

# The individual electronics columns are removed from both frames.
drop_cols = ['refrig', 'v18q', 'v18q1', 'computer', 'television', 'mobilephone', 'qmobilephone']

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
train.columns

In [None]:
# Report how often the candidate duplicate columns agree before dropping them.
# (The bare frame expressions that stood here were never displayed, since
# only the last expression of a cell is rendered.)
print('tamhog == hhsize on', (train['tamhog'] == train['hhsize']).mean(), 'of the rows')
print('tamviv == hogar_total on', (train['tamviv'] == train['hogar_total']).mean(), 'of the rows')

train = train.drop(['hhsize', 'hogar_total'], axis=1)
test = test.drop(['hhsize', 'hogar_total'], axis=1)


# Duplicated Column

In [None]:
# Dependency column already makes the relation between dependent and independent individuals in the household
# The columns indicating the number of individuals of each age will be dropped

# Per-age-group member counts are removed from both frames.
drop_cols = ['hogar_nin', 'hogar_adul', 'hogar_mayor']

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
train[['hacdor', 'rooms', 'hacapo', 'tamviv', 'bedrooms', 'overcrowding']].head(20)

In [None]:
train['hacapo'].value_counts()

In [None]:
train.loc[train['hacapo'] == 1]

In [None]:
train[['overcrowding', 'hacapo', 'hacdor', 'tamviv', 'rooms', 'tamhog']]

In [None]:
train['rooms'].value_counts()

In [None]:
train['tamhog'].value_counts()

In [None]:
# There are some columns related to the size of the house and the number of persons in the house
# We'll aggregate all these columns in a new feature that represents the number of people/size of the house

# Pairwise correlations between the size / crowding columns and Target.
overcrowd_cols = ['bedrooms', 'hacdor', 'rooms', 'hacapo', 'tamviv', 'tamhog', 'overcrowding', 'Target']
overcrowd_df = train[overcrowd_cols]
corr_matrix = overcrowd_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# No combination were able to create a feature with higher correlation than overcrowding
# So it will be preserved
# The others will be dropped

col_drops = ['bedrooms', 'hacdor', 'rooms', 'hacapo', 'tamviv', 'tamhog']

# Round overcrowding to one decimal place, value by value.
train['overcrowding'] = train['overcrowding'].map(lambda value: round(value, 1))
test['overcrowding'] = test['overcrowding'].map(lambda value: round(value, 1))

train = train.drop(columns=col_drops)
test = test.drop(columns=col_drops)

In [None]:
train.head()

In [None]:
# We will rebuild the dis column into a feature that sums the total of disable people in the house

# Household sum of the `dis` flag, broadcast onto every member row. A grouped
# transform replaces the per-household loop, which rescanned the frame once
# per household and called int() on a 1-element array (deprecated in NumPy).
train['totaldisabled'] = train.groupby('idhogar')['dis'].transform('sum').astype('int64')

In [None]:
# Print the correlation explicitly: it is not the last expression of the cell,
# so it would otherwise never be displayed.
print(train[['totaldisabled', 'Target']].corr())

train = train.drop('totaldisabled', axis=1)

# The correlation is still too low to be worth keeping the column

In [None]:
# We'll do the same to rez_esc

# Household sum of rez_esc, broadcast onto every member row. A grouped
# transform replaces the per-household loop, which rescanned the frame once
# per household and called int() on a 1-element array (deprecated in NumPy).
train['totalrezesc'] = train.groupby('idhogar')['rez_esc'].transform('sum').astype('int64')

In [None]:
# Print the correlation explicitly: it is not the last expression of the cell,
# so it would otherwise never be displayed.
print(train[['totalrezesc', 'Target']].corr())

train = train.drop('totalrezesc', axis=1)

# The correlation is still too low to be worth keeping the column

In [None]:
train.head()

In [None]:
# We now will drop the last of the features individual related

# Remaining person-level columns, removed from both frames.
drop_cols = [
    'escolari', 'rez_esc', 'rez_esc-missing', 'dis', 'male', 'female',
    'age', 'civilstate', 'education',
]

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
# Now that all features are household related, we can drop all rows that aren't the household head row

# Keep one row per household (the head), then discard the now-constant flag.
train = train.loc[train['ishousehead'] == 1].drop(columns='ishousehead')
test = test.loc[test['ishousehead'] == 1].drop(columns='ishousehead')

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# We already built enough features, now lets analyse the correlation of each one to target and keep dropping redundant features

# `train` still holds the string columns Id and idhogar at this point;
# numeric_only=True keeps DataFrame.corr from raising on them in pandas >= 2.0.
corr_matrix = train.corr(numeric_only=True)
print(corr_matrix['Target'].sort_values(ascending=False))

# We will drop most of the features that is between 0.2 and -0.2, unless we have a good reason to keep it


drop_cols = ['cookingsource', 'v2a1', 'toiletdwel', 'rubbishdisp', 'isurban', 'waterprov', 'v14a', 'houseowned', 'rooftype', 'elecsource', 'r4t3', 'v2a1-missing', 'Id','idhogar']

train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

# Give the two surviving raw columns descriptive names.
train = train.rename(columns = {'r4t1': 'numchilds', 'dependencynew' : 'dependency'})
test = test.rename(columns = {'r4t1': 'numchilds', 'dependencynew' : 'dependency'})

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Separate the label column from the feature matrix and show both.
X = train.drop(columns='Target')
y = train['Target']
print(y)
print(X)

In [None]:
# Here we initialize all the modeling methods we'll test
# Macro-averaged F1 wrapped as a scikit-learn scorer (used for model selection).
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')
scaler = MinMaxScaler()

kneigh = KNeighborsClassifier()
dectree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)
adab = AdaBoostClassifier(random_state=42)
# XGBoost's eval_metric takes a metric name or a callable of (y_true, y_pred).
# A scikit-learn scorer is called as (estimator, X, y), so passing `scorer`
# here would fail as soon as an eval_set is evaluated. Use the built-in
# multiclass log-loss instead.
gb = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)


# Create the train_test_split for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, stratify=y , random_state=42)
y_test = y_test.to_numpy()

In [None]:
def bayes_search(model, param_grid, n_iter=5, n_splits=5, features=None, labels=None, random_state=42):
    """Run a Bayesian hyper-parameter search and print the best score and parameters.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier to tune.
    param_grid : dict
        Search space, mapping parameter names to skopt dimensions.
    n_iter : int, default 5
        Number of parameter settings sampled by the search.
    n_splits : int, default 5
        Number of stratified cross-validation folds. Previously this was tied to
        `n_iter` through a single variable; both keep the old value of 5 by default.
    features, labels : optional
        Data to search on. Defaults to the notebook-level `X` and `y`.
        NOTE: `X`/`y` contain the rows later held out as `X_test`, so parameters tuned
        on them make the hold-out score optimistic; pass `X_train`/`y_train` to avoid it.
    random_state : int, default 42
        Seed for the search itself, so repeated runs propose the same candidates.

    Returns
    -------
    BayesSearchCV
        The fitted search object (earlier versions returned None).
    """
    if features is None:
        features = X
    if labels is None:
        labels = y

    # Initialize the cross validation method
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Execute the bayes search, scored with the notebook-level macro-F1 `scorer`
    bsearch = BayesSearchCV(
        model,
        param_grid,
        n_iter=n_iter,
        scoring=scorer,
        cv=cv,
        verbose=True,
        random_state=random_state,
    )
    bsearch.fit(features, labels)

    # Print the values to be used in each parameter for best result in the final fitting
    print(' ',bsearch.best_score_)
    print(' ',bsearch.best_params_)

    return bsearch

In [None]:
# Searching for KNeighbors

'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_neighbors': Integer(2, 20),
              'weights': Categorical(['uniform','distance']),
              'leaf_size': Integer(10, 100)}

bayes_search(kneigh, param_grid)
'''

# Results: 
# ('leaf_size', 69), ('n_neighbors', 18), ('weights', 'distance')


# NOTE(review): KNN is distance-based and the features are fed in unscaled here (the
# MinMaxScaler created earlier is not applied in this section) - consider scaling.
kneigh = KNeighborsClassifier(leaf_size=69, n_neighbors=18, weights='distance')
kneigh.fit(X_train, y_train)
y_pred = kneigh.predict(X_test)
# f1_score expects (y_true, y_pred); macro F1 is unchanged by swapping them, so the
# score recorded below still applies.
result = f1_score(y_test, y_pred, average='macro')
print(result)

# Result:
# macro F1: 0.2989191209747389

In [None]:
# Searching for DecisionTreeClassifier

'''
# Define the parameters to be tested in the bayes search
param_grid = {'criterion': Categorical(['gini','entropy']),
              'splitter': Categorical(['best','random']),
              'max_depth': Integer(10, 200),
              'min_samples_split': Integer(5, 50),
              'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(dectree, param_grid)
'''

# Results:
# ('criterion', 'gini'), ('max_depth', 69), ('min_samples_split', 43), ('splitter', 'random')

# Refit with the tuned parameters on the training split and score on the hold-out split
dectree = DecisionTreeClassifier(criterion='gini', max_depth=69, min_samples_split=43, splitter='random', random_state=42)
dectree.fit(X_train, y_train)
y_pred = dectree.predict(X_test)
# f1_score expects (y_true, y_pred); macro F1 is unchanged by swapping them, so the
# score recorded below still applies.
result = f1_score(y_test, y_pred, average='macro')
print(result)

# Result:
# macro F1 0.31521827047080664

In [None]:
# Searching for RandomForestClassifier

# The disabled grid below used to list 'max_leaf_nodes' twice; in a dict literal the
# last entry wins, so only the effective Integer(10, 200) range is kept.
'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(100, 2000),
              'criterion': Categorical(['gini','entropy']),
              'min_samples_split': Integer(5, 50),
              'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(forest, param_grid)
'''

# Results:
# ('criterion', 'entropy'), ('max_leaf_nodes', 154), ('min_samples_split', 15), ('n_estimators', 682)

# Refit with the tuned parameters on the training split and score on the hold-out split
forest = RandomForestClassifier(criterion='entropy', max_leaf_nodes=154, min_samples_split=15, n_estimators=682, random_state=42)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
# f1_score expects (y_true, y_pred); macro F1 is unchanged by swapping them, so the
# score recorded below still applies.
result = f1_score(y_test, y_pred, average='macro')
print(result)

# Result
# macro F1 0.2880104516588272


In [None]:
# Searching for AdaBoostClassifier

'''
# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(50, 1000),
              'learning_rate': Real(0.01, 1, prior='log-uniform')
              }

bayes_search(adab, param_grid)
'''

# Results:
# ('learning_rate', 0.3935549480126014), ('n_estimators', 336)

# Refit with the (rounded) tuned parameters on the training split and score on the hold-out split
adab = AdaBoostClassifier(learning_rate=0.394, n_estimators=336 ,random_state=42)
adab.fit(X_train, y_train)
y_pred = adab.predict(X_test)
# f1_score expects (y_true, y_pred); macro F1 is unchanged by swapping them, so the
# score recorded below still applies.
result = f1_score(y_test, y_pred, average='macro')
print(result)

# Result
# macro F1 0.327066295591346


In [None]:
# Searching for XGBoost

'''
# Define the parameters to be tested in the bayes search
param_grid = {'max_depth': Integer(1, 90),
              'learning_rate': Real(0.01, 1, prior='log-uniform'),
              'reg_alpha': Real(0.01, 100),
              'colsample_bytree': Real(0.2e0, 0.8e0),
              'subsample': Real(0.2e0, 0.8e0),
              'n_estimators': Integer(50, 200)}

bayes_search(gb, param_grid)
'''

# Results:
# ('colsample_bytree', 0.3641610047267118), ('learning_rate', 0.8433396554938758), ('max_depth', 3), ('n_estimators', 94), ('reg_alpha', 7.175005707630737), ('subsample', 0.681950796538646)


# Refit with the (rounded) tuned parameters on the training split and score on the hold-out split
gb = xgb.XGBClassifier(colsample_bytree=0.36, learning_rate=0.84, max_depth=3, n_estimators=94, reg_alpha=7.18, subsample=0.68, random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
# f1_score expects (y_true, y_pred); macro F1 is unchanged by swapping them, so the
# score recorded below still applies.
result = f1_score(y_test, y_pred, average='macro')
print(result)

# Result:
# macro F1 0.36306483376974086

In [None]:
# XGBClassifier gave the best hold-out score of all the models tried, so it is refit
# here on the full training data (X, y) and used to predict the test frame.

gb = xgb.XGBClassifier(
    colsample_bytree=0.36,
    learning_rate=0.84,
    max_depth=3,
    n_estimators=94,
    reg_alpha=7.18,
    subsample=0.68,
    random_state=42,
)
y_pred = gb.fit(X, y).predict(test)

In [None]:
# Let's analyse the results of the xgboost model on the hold-out split

# `gb` was last fitted on all of X, which contains the X_test rows, so scoring it on
# X_test would evaluate on training data. Refit an unfitted copy with the same
# hyper-parameters on the training split only and evaluate that instead.
from sklearn.base import clone

gb_eval = clone(gb).fit(X_train, y_train)
y_pred_conf = gb_eval.predict(X_test)
labels_cm = ['ExtermePoverty','ModeratePoverty','Vulnerable','NonVulnerable'] 
# Rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred_conf)

df_cm = pd.DataFrame(cm, index=labels_cm, columns=labels_cm)
plt.figure(figsize = (8,6))
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap='YlGnBu')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# NOTE(review): the observations below were written when this matrix was computed with
# the model fitted on all of X (i.e. including the X_test rows); re-check them against
# the held-out matrix above.
# The model manages to predict fairly well people on the ExtremePoverty and NonVulnerable targets
# It doesn't work so well on ModeratePoverty

In [None]:
# Let's see the relevance the model gave to each feature

# print the JS visualization code to the notebook
shap.initjs()

# use Tree SHAP (shap.TreeExplainer) on the fitted XGBoost model `gb` to compute
# SHAP values for the rows of X_train
explainer = shap.TreeExplainer(gb)
shap_values = explainer.shap_values(X_train)

In [None]:
# Summarize the effects of all the features, one summary plot per target class.
# The headers are listed in class order (index 0..3 of the model's outputs).
class_headers = [
    '---------------------------------ExtermePoverty----------------------------------',
    '---------------------------------ModeratePoverty---------------------------------',
    '-----------------------------------Vulnerable------------------------------------',
    '---------------------------------NonVulnerable-----------------------------------',
]

for class_idx, header in enumerate(class_headers):
    print(header)
    if isinstance(shap_values, list):
        # Older SHAP releases: a list with one (n_samples, n_features) array per class
        class_shap = shap_values[class_idx]
    else:
        # Newer SHAP releases: a single (n_samples, n_features, n_classes) array
        class_shap = shap_values[:, :, class_idx]
    shap.summary_plot(class_shap, X_train)

In [None]:
# `y_pred` has one prediction per household-head row of `test`, but the submission loop
# consumes one prediction per distinct household id. Pad the difference with class 4
# (presumably NonVulnerable - TODO confirm) instead of a hard-coded count of 18, so the
# cell adapts to the data and does not keep growing y_pred when it is re-run.
n_households = len(submission['idhogar'].unique())
n_missing = max(n_households - len(y_pred), 0)
y_pred = np.append(y_pred, np.full(n_missing, 4))

In [None]:
pd.set_option('mode.chained_assignment', None)

# Class given to households that have no household-head row in `test` (and therefore no
# model prediction); same value that is used to pad y_pred.
DEFAULT_TARGET = 4

# `test` holds one row per household head and no longer has an 'idhogar' column, so the
# household ids are recovered from `submission` through the row labels.
# Assumes `test` still carries the index labels of the original test frame that
# `submission` was sliced from - TODO confirm nothing upstream reset the index after
# dropping rows.
if not test.index.isin(submission.index).all():
    raise ValueError("test index labels are not all present in submission; cannot map predictions to households")
if len(y_pred) < len(test):
    raise ValueError("y_pred has fewer entries than test has household-head rows")

head_households = submission.loc[test.index, 'idhogar'].to_numpy()
# Only the first len(test) entries are model predictions; anything after is padding.
head_preds = np.asarray(y_pred)[:len(head_households)]

# Map predictions by household id rather than by position: assigning y_pred[k] to the
# k-th distinct household shifts every later prediction as soon as one household
# without a head row appears anywhere but at the end.
pred_by_household = dict(zip(head_households, head_preds))

household_target = submission['idhogar'].map(pred_by_household)
submission = submission.assign(Target=household_target.fillna(DEFAULT_TARGET).astype(int))

submission = submission.drop('idhogar', axis=1)

submission.to_csv("submission.csv", index = False)