In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

1. Understand the problem (we're almost there already)
2. Exploratory Data Analysis
3. Feature engineering to create a dataset for machine learning
4. Compare several baseline machine learning models
5. Try more complex machine learning models
6. Optimize the selected model
7. Investigate model predictions in context of problem
8. Draw conclusions and lay out next steps

**Exploratory Data Analysis**

Start with imports

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
# Set a few plotting defaults
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18
plt.rcParams['patch.edgecolor'] = 'k'
from scipy.stats import spearmanr

In [None]:
pd.options.display.max_columns = 150

# Read in data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()

Taking a look into items

In [None]:
train.info()

In [None]:
test.info()


Let us find the number of unique values in each integer column

In [None]:
train.select_dtypes(np.int64).nunique().value_counts().sort_index().plot.bar(color = 'blue', 
                                                                             figsize = (8, 6),
                                                                            edgecolor = 'k', linewidth = 2);
plt.xlabel('Number of Unique Values'); plt.ylabel('Count');
plt.title('Count of Unique Values in Integer Columns');

For floats columns

In [None]:
# One KDE panel per float column, with one curve per poverty level.
plt.figure(figsize=(20, 16))
plt.style.use('fivethirtyeight')

# Poverty level -> line colour / readable label (both lookups are reused by later cells)
colors = OrderedDict({1: 'red', 2: 'orange', 3: 'blue', 4: 'green'})
poverty_mapping = OrderedDict({1: 'extreme', 2: 'moderate', 3: 'vulnerable', 4: 'non vulnerable'})

for position, col in enumerate(train.select_dtypes('float').columns, start=1):
    ax = plt.subplot(4, 2, position)
    for level in colors:
        level_values = train.loc[train['Target'] == level, col].dropna()
        sns.kdeplot(level_values, ax=ax, color=colors[level], label=poverty_mapping[level])
    # Title and axis labels belong to the panel, so they are set once per column
    plt.title(f'{col.capitalize()} Distribution')
    plt.xlabel(f'{col}')
    plt.ylabel('Density')

plt.subplots_adjust(top=2)
    

These graphs give a basic understanding of the data. For example, in SQBmeaned we see higher values for non vulnerable households; similarly, for overcrowding we see that non vulnerable households are not found at the higher overcrowding values.

**object columns**

In [None]:
train.select_dtypes('object').head()

thus 4 columns with object dtypes dependency,edjefe,edjefa seem to be a mix of values

We replace "yes" with 1 and "no" with 0, then convert these columns to floats.

In [None]:
# "yes"/"no" strings become 1/0 so the three mixed columns can be stored as floats
maps = {'yes': 1, 'no': 0}
for df in [train, test]:
    for mixed_col in ('dependency', 'edjefa', 'edjefe'):
        df[mixed_col] = df[mixed_col].replace(maps).astype(np.float64)

In [None]:
train.select_dtypes('object').head()

In [None]:
train[['dependency', 'edjefa', 'edjefe']].describe()


In [None]:
# Poverty level -> line colour / readable label for the three converted columns
color_map = {1: 'Red', 2: 'orange', 3: 'blue', 4: 'green'}
poverty_map = {1: 'extreme', 2: 'moderate', 3: 'vulnerable', 4: 'non vulnerable'}

plt.figure(figsize=(16, 12))
for panel, col in enumerate(['dependency', 'edjefa', 'edjefe'], start=1):
    ax = plt.subplot(3, 1, panel)
    for level in color_map:
        level_values = train.loc[train['Target'] == level, col].dropna()
        sns.kdeplot(level_values, ax=ax, color=color_map[level], label=poverty_map[level])
    # Title and axis labels are set once per panel
    plt.title(f'{col.capitalize()} Distribution')
    plt.xlabel(f'{col}')
    plt.ylabel('Density')

plt.subplots_adjust(top=2)
        

We add a null Target to the test data and append it to the train data, because we want to apply the same feature engineering to both. Later we can separate test from train based on the NaN values in Target.

In [None]:
# Test rows get a NaN label so train and test can be stacked and engineered together;
# the NaN in Target is what later tells the two apart.
test['Target'] = np.nan
# pd.concat replaces DataFrame.append, which newer pandas releases no longer provide
data = pd.concat([train, test], ignore_index=True)

**Exploring Label Distribution**

We use only the rows where parentesco1 == 1, because that person is the head of the household.

In [None]:
# Rows describing the head of each household
heads = data.loc[data['parentesco1'] == 1]

# Labels for training: heads of household that have a known Target
train_labels = data.loc[(data['Target'].notnull()) & (data['parentesco1'] == 1), ['Target', 'idhogar']]

# Number of households at each poverty level, ordered by level
label_counts = train_labels['Target'].value_counts().sort_index()

# Bar plot, one colour per level (a list, because a dict view is not a colour sequence)
label_counts.plot.bar(figsize=(8, 6), color=list(colors.values()), edgecolor='k', linewidth=2)

# Formatting: tick positions and labels both come from `poverty_mapping`, so the
# cell no longer depends on a loop-local dict left behind by another cell
plt.xlabel("levels of poverty")
plt.ylabel("count")
plt.xticks(list(range(len(poverty_mapping))),
           list(poverty_mapping.values()), rotation=60)
plt.title("Poverty level")
print(label_counts)

Thus we have an imbalanced class problem: there are far more non vulnerable households than extreme ones.
This makes the extreme class difficult to predict, because there are far fewer examples of it.

Also, from the previous graphs we can determine that there are cases of wrong labels: members of the same household have different levels of poverty, which is wrong. We are told to use the label of the head of household as the true label.

**Identification**

In [None]:
# Count the distinct labels inside every household; exactly one means the members agree
targets_per_household = train.groupby('idhogar')['Target'].nunique()
all_equal = targets_per_household == 1

# Keep only the households whose members disagree
not_equal = all_equal[~all_equal]
print('There are {} households where the family members do not all have the same target.'.format(len(not_equal)))

Families without heads of household

In [None]:
# Number of head-of-household rows per household
households_leader = train.groupby('idhogar')['parentesco1'].sum()

# Households whose head count is zero, and all the rows that belong to them
headless_ids = households_leader.index[households_leader == 0]
households_no_head = train.loc[train['idhogar'].isin(headless_ids), :]

headless_total = households_no_head['idhogar'].nunique()
print('There are {} households without a head.'.format(headless_total))

Now we try to find households without a head but different labels

In [None]:
# For the headless households, check whether all members share one label
households_no_head_equal = households_no_head.groupby('idhogar')['Target'].nunique().eq(1)
mismatched_households = int((~households_no_head_equal).sum())
print('{} Households with no head have different labels.'.format(mismatched_households))

Thus there are no households with no heads and different targets

**Correction**

We start with correcting households with target level different than household leader

In [None]:
# not_equal -- households whose members do not all share one label.
# Every member of such a household receives the label of its head (parentesco1 == 1).
for household in not_equal.index:
    head_targets = train.loc[(train['idhogar'] == household) & (train['parentesco1'] == 1.0), 'Target']
    if head_targets.empty:
        # No head row to copy from: leave this household's labels untouched
        continue
    # .iloc[0] works for one or several head rows; calling int() on the whole
    # selection only works when it holds exactly one element
    true_target = int(head_targets.iloc[0])
    train.loc[train['idhogar'] == household, 'Target'] = true_target
    # `data` was stacked from `train` before this cell runs, so the correction is
    # mirrored onto its labelled rows; otherwise downstream frames keep the old labels
    data.loc[(data['idhogar'] == household) & data['Target'].notnull(), 'Target'] = true_target

# Now check again
all_equal = train.groupby("idhogar")["Target"].apply(lambda x: x.nunique() == 1)
not_equal = all_equal[all_equal != True]
print('There are {} households where the family members do not all have the same target.'.format(len(not_equal)))

**Missing values**

In [None]:
# Missing-value count and share for every column of the combined data
missing = pd.DataFrame({'total': data.isnull().sum()})
missing["percentage"] = missing['total'] / len(data)
# Target is NaN for every test row by construction, so it is removed *before* taking
# the top 10; dropping it afterwards fails whenever Target is not among those 10 rows
missing.drop(index="Target").sort_values('percentage', ascending=False).head(10)

We make a function to count values for different columns

In [None]:
def plot_value_counts(df,col,heads_only=False):
    """Draw a bar chart of how often each value of one column occurs.

    Args:
        df: DataFrame holding the column (and `parentesco1` when heads_only is True).
        col: name of the column whose values are counted.
        heads_only: when True, only rows with parentesco1 == 1 are counted.

    The bars are ordered by value (index order), not by frequency, and the
    figure is shown immediately.
    """
    subset = df.loc[df["parentesco1"] == 1].copy() if heads_only else df

    plt.figure(figsize=(10,7))
    counts = subset[col].value_counts().sort_index()
    counts.plot.bar(color="blue",edgecolor='k',linewidth=2)

    plt.xlabel(f'{col}')
    plt.title(f'{col} value counts')
    plt.ylabel('Count')
    plt.show()

In [None]:
#print(heads)
plot_value_counts(heads, 'v18q1')


v18q1 is the number of tablets a household owns. We could replace NaN with 1; however, NaN may also mean that the family does not own a tablet. **v18q** indicates whether or not a family owns a tablet, so we combine the two columns to see if our hypothesis is right.

In [None]:
heads.groupby('v18q')['v18q1'].apply(lambda x:x.isnull().sum())

Thus we see that every family with no tablet has NaN for v18q1, so we simply fill the NaNs in v18q1 with 0.

In [None]:
data['v18q1'] = data['v18q1'].fillna(0)


Next feature is **v2a1** i.e. ***Monthly rent payments***  

Where v2a1 is NaN there may be no rent to pay, possibly because the household owns the house; tipovivi1 = 1 marks a house that is owned and fully paid.


In [None]:
ownership_cols=[x for x in data if x.startswith('tipo')]

data.loc[data['v2a1'].isnull(),ownership_cols].sum().plot.bar(figsize=(10,8),color='blue',edgecolor='k',linewidth=2)
plt.xticks([0,1,2,3,4],['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other'],rotation=60)
plt.title("Home ownership")

**Thus we see that the households that own their house are the ones with NaN as rent, i.e. they pay no rent. We set v2a1 to 0 where tipovivi1 == 1; the remaining missing values can be imputed later, and we add a flag marking them.**

In [None]:
# Households with tipovivi1 == 1 (owned and paid off, per the plot labels above) get a rent of 0
data.loc[data['tipovivi1']==1,'v2a1']=0

# Boolean flag for the rent values that are still missing after the fill above
data["v2a1-missing"]=data["v2a1"].isnull()
data["v2a1-missing"].value_counts()

Now we go to next column rez_esc which signifies years behind school

For this column it may be that the household has no children. We check the maximum age of anyone who has a value for this column.

In [None]:
data.loc[data['rez_esc'].notnull()]['age'].describe()

This tells us that the oldest age without a missing value is 17, i.e. the oldest person going to school is 17.
Next we look at the ages of those where the value is NaN.

In [None]:
data.loc[data['rez_esc'].isnull()]['age'].describe()


 For this variable, if the individual is over 19 and they have a missing value, or if they are younger than 7 and have a missing value we can set it to zero. For anyone else, we'll leave the value to be imputed and add a boolean flag.

In [None]:
data.loc[((data['age'] > 19) | (data['age'] < 7)) & (data['rez_esc'].isnull()), 'rez_esc'] = 0

# Add a flag for those between 7 and 19 with a missing value
data['rez_esc-missing'] = data['rez_esc'].isnull()

any values above 5 should be set to 5.


In [None]:
data.loc[data['rez_esc'] > 5, 'rez_esc'] = 5


**Plot Two Categoricals**

We draw a value count plot for where these values missing

In [None]:
plot_value_counts(data[data['rez_esc-missing']==1],'Target')

In [None]:
plot_value_counts(data[(data['v2a1-missing'] == 1)], 
                  'Target')

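Thus we see a higher prevalence of Target 2 (moderate poverty) among the rows with a missing value, so a missing value may be an indicator of more poverty.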

In [None]:
import featuretools as ft

# creating and entity set 'es'
es = ft.EntitySet(id = 'Target')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'costa rica', dataframe = data, index = 'Id')

Our data has two levels- Household level and personal level. We use featuretools on idhogar

In [None]:
es.normalize_entity(base_entity_id='costa rica', new_entity_id='households', index = 'idhogar',
                    additional_variables = ['v2a1', 'hhsize'])


In [None]:
print(es)

Now we will use Deep Feature Synthesis to create new features automatically

In [None]:
'''feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'costa rica', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)'''

In [None]:
#feature_matrix.columns


In [None]:
#feature_matrix.head()


However, we will engineer the features manually, as the automated tool creates too many new features, which may lead to overfitting.

There are several different categories of variables:

***1 Individual Variables: these are characteristics of each individual rather than the household
**         
          
            Boolean: Yes or No (0 or 1)
            
            Ordered Discrete: Integers with an ordering
            
            
***2.Household variables
**            
        
            Boolean: Yes or No
            
            Ordered Discrete: Integers with an ordering
            
            Continuous numeric
            
            
***3,Squared Variables: derived from squaring variables in the data
**

***4.Id variables: identifies the data and should not be used as features**

In [None]:
id_ = ['Id', 'idhogar', 'Target']


In [None]:
ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone', 'rez_esc-missing']
ind_ordered = ['rez_esc', 'escolari', 'age']



In [None]:
hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2', 'v2a1-missing']

hh_ordered = [ 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1','r4m2','r4m3', 'r4t1',  'r4t2', 
              'r4t3', 'v18q1', 'tamhog','tamviv','hhsize','hogar_nin',
              'hogar_adul','hogar_mayor','hogar_total',  'bedrooms', 'qmobilephone']
hh_cont = ['v2a1', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding']

In [None]:
sqr_ = ['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 
        'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq']

Checking that we haven't repeated any variable and that we haven't missed any.

In [None]:
from collections import Counter

# Every column name that was sorted into one of the groups above
x = ind_bool + ind_ordered + id_ + hh_bool + hh_ordered + hh_cont + sqr_

# A name listed in two groups would show a count above 1
occurrences = np.array(list(Counter(x).values()))
no_repeats = np.all(occurrences == 1)
# Equal lengths mean every column of `data` was assigned to a group
all_covered = len(x) == data.shape[1]

print('There are no repeats: ', no_repeats)
print('We covered every variable: ', all_covered)

We now start by removing the squared variables: we will be using complex models rather than just linear ones, so such variables only add redundancy.

We plot SQBage vs age

In [None]:
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally
sns.lmplot(x='age', y='SQBage', data=data, fit_reg=False)
plt.title("Squared Age V/S Age")

These are highly correlated so we would drop square_Variables

In [None]:
data = data.drop(columns = sqr_)
data.shape

Now we deal with redundant variables

In [None]:
# Re-select the heads of household from the *cleaned* data. The earlier `heads`
# snapshot was taken before the NaN fills, the missing-value flags and the removal
# of the squared columns, so none of that cleaning is reflected in it.
heads = data.loc[data['parentesco1'] == 1].copy()

# Absolute correlations between the numeric/boolean columns only; the string id
# columns cannot be correlated and make DataFrame.corr fail on newer pandas
corr_matrix = heads.select_dtypes(include=['number', 'bool']).corr().abs()
corr_matrix.head()

In [None]:
# Keep only the strict upper triangle so each pair of columns is looked at once.
# The builtin `bool` replaces the `np.bool` alias, which recent NumPy releases removed.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Columns that correlate above 0.95 with at least one earlier column
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop

In [None]:
corr_matrix.loc[corr_matrix['tamhog'].abs() > 0.9, corr_matrix['tamhog'].abs() > 0.9]


In [None]:
sns.heatmap(corr_matrix.loc[corr_matrix['hhsize'].abs() > 0.9, corr_matrix['tamhog'].abs() > 0.9],
            annot=True, cmap = plt.cm.autumn_r, fmt='.3f');

In [None]:
heads = heads.drop(columns = ['tamhog', 'hogar_total', 'r4t3'])

tamviv is not necessarily the same as hhsize because there might be family members that are not living in the household. Let's visualize this difference in a scatterplot.

In [None]:
# Keyword arguments throughout: positional x/y/data are rejected by newer seaborn,
# and `height` is the current name of the former `size` argument
sns.lmplot(x='tamviv', y='hhsize', data=data, fit_reg=False, height=8);
plt.title('Household size vs num of persons in a household')

Thus there are a number of cases where hhsize != tamviv. This gives us a good idea for a new feature: the difference between these two measurements

In [None]:
heads['hhsize-diff']=heads['tamviv']-heads['hhsize']

Moving on to coopele (Redundant variable)

0: No electricity
1: Electricity from cooperative
2: Electricity from CNFL, ICA, ESPH/JASEC
3: Electricity from private plant


In [None]:
# Collapse the four electricity indicator columns into one coded variable.
# Conditions are checked in order and the first match wins (0: noelec, 1: coopele,
# 2: public, 3: planpri); rows matching none of them get NaN. This is the vectorized
# equivalent of walking the frame row by row.
elec_conditions = [
    heads['noelec'] == 1,
    heads['coopele'] == 1,
    heads['public'] == 1,
    heads['planpri'] == 1,
]
elec = np.select(elec_conditions, [0, 1, 2, 3], default=np.nan)

# Record the new variable and missing flag
heads['elec'] = elec
heads['elec-missing'] = heads['elec'].isnull()

In [None]:
heads = heads.drop(columns = ['noelec', 'coopele', 'public', 'planpri'])

Coming to area2 redundant variable, we drop it cause we have area 1

In [None]:
heads=heads.drop(columns=['area2'])
heads.groupby('area1')['Target'].value_counts(normalize=True)

In [None]:
heads['walls']=np.argmax(np.array(heads[['epared1', 'epared2', 'epared3']]),axis=1)

In [None]:
heads = heads.drop(columns = ['epared1', 'epared2', 'epared3'])

Similarly for roof and floors

In [None]:
# Roof and floor ordinal variables: for each group of three indicator columns,
# store the position (0, 1 or 2) of the largest entry, then drop the indicators.
ordinal_sources = [
    ('roof', ['etecho1', 'etecho2', 'etecho3']),
    ('floor', ['eviv1', 'eviv2', 'eviv3']),
]
for ordinal_name, indicator_cols in ordinal_sources:
    indicator_values = np.array(heads[indicator_cols])
    heads[ordinal_name] = np.argmax(indicator_values, axis = 1)
    heads = heads.drop(columns = indicator_cols)

Feature construction 

In [None]:
heads['walls+roof+floor'] = heads['walls'] + heads['roof'] + heads['floor']



In [None]:
# Total walls+roof+floor score of all heads of household at each poverty level
series = heads.groupby("Target")['walls+roof+floor'].sum()
print(series)

ax = series.plot.bar(color = 'purple', figsize = (8, 6), edgecolor = 'k', linewidth = 2)
ax.set_xlabel("Target")
ax.set_ylabel("Total walls+roof+floor")
ax.set_title("Target vs cost")

Thus we see that households with Target 4 (non vulnerable) have the highest walls+roof+floor total, and those with Target 1 the lowest.

The next variable is a warning about the house: -1 point for each problem such as no floor, no water, or no ceiling.

In [None]:
heads['warning'] = -1 * (heads['sanitario1'] + 
                         (heads['elec'] == 0) + 
                         heads['pisonotiene'] + 
                         heads['abastaguano'] + 
                         (heads['cielorazo'] == 0))

We draw violinplot which shows the distribution of a variable on the y axis with the width of each plot showing the number of observations in that category.

In [None]:
plt.figure(figsize = (10, 6))
sns.violinplot(x = 'warning', y = 'Target', data = heads);
plt.title('Target vs Warning Variable');

A lower (more negative) value of warning should correspond to a lower Target and a higher value to a higher Target; indeed, at warning = 0 we see more concentration at Target 4.

The final household feature we can make for now is a bonus where a family gets a point for having a refrigerator, computer, tablet, or television.

In [None]:
# One point each for owning a refrigerator, a computer, at least one tablet, and a television
heads['bonus'] = 1 * (heads['refrig'] + 
                      heads['computer'] + 
                      (heads['v18q1'] > 0) + 
                      heads['television'])

# The figure size belongs to the figure, not to violinplot, and x/y are passed by
# keyword because newer seaborn no longer accepts them positionally
plt.figure(figsize = (10, 6))
sns.violinplot(x = 'bonus', y = 'Target', data = heads);
plt.title('Target vs Bonus Variable');

More Features

In [None]:
# Household totals divided by the number of people in the household (tamviv)
per_capita_sources = [
    ('phones-per-capita', 'qmobilephone'),
    ('tablets-per-capita', 'v18q1'),
    ('rooms-per-capita', 'rooms'),
    ('rent-per-capita', 'v2a1'),
]
for per_capita_name, total_col in per_capita_sources:
    heads[per_capita_name] = heads[total_col] / heads['tamviv']

Measuring Relationships


1. **Pearson Correlation**: measures the linear relationship between two variables.


2. **Spearman Correlation**: measures the monotonic relationship between two variables.

In [None]:
def plot_corrs(x,y):
    """Scatter-plot y against x and report both correlation coefficients in the title.

    Args:
        x: sequence of numbers for the horizontal axis.
        y: sequence of numbers for the vertical axis, same length as x.

    The title shows the Spearman correlation (from spearmanr) and the Pearson
    correlation (from np.corrcoef), both rounded to two decimals.
    """
    spr=spearmanr(x,y).correlation
    pcr=np.corrcoef(x,y)[0,1]

    # Scatter plot of the raw points, without a fitted regression line
    points=pd.DataFrame({'x':x,'y':y})
    plt.figure(figsize=(6,4))
    # x/y are passed by keyword: newer seaborn no longer accepts them positionally
    sns.regplot(x='x',y='y',data=points,fit_reg=False)
    plt.title(f'Spearman:{round(spr,2)}; Pearson {round(pcr,2)}')

In [None]:
x = np.array(range(100))
y = x ** 2

plot_corrs(x, y)

In [None]:
# Use only the labelled (training) heads of household
train_heads = heads.loc[heads['Target'].notnull(), :].copy()

# Pearson correlation of every numeric/boolean column with Target, sorted ascending.
# String columns are excluded up front: they cannot be correlated and make
# DataFrame.corr fail on newer pandas.
pcorrs = (train_heads.select_dtypes(include=['number', 'bool'])
                     .corr()['Target']
                     .sort_values()
                     .rename('pcorr')
                     .rename_axis('feature')
                     .reset_index())

print('Most negatively correlated variables:')
print(pcorrs.head())

print('\nMost positively correlated variables:')
print(pcorrs.dropna().tail())

In [None]:
'''import warnings
warnings.filterwarnings('ignore', category = RuntimeWarning)

feats = []
scorr = []
pvalues = []

# Iterate through each column
for c in heads:
    #print(c)
    # Only valid for numbers
    if heads[c].dtype != 'object':
        feats.append(c)
        print(scorr)
        # Calculate spearman correlation
        scorr.append(spearmanr(train_heads[c], train_heads['Target']).correlation)
        pvalues.append(spearmanr(train_heads[c], train_heads['Target']).pvalue)

scorrs = pd.DataFrame({'feature': feats, 'scorr': scorr, 'pvalue': pvalues}).sort_values('scorr')


print('Most negative Spearman correlations:')
print(scorrs.head())
print('\nMost positive Spearman correlations:')
print(scorrs.dropna().tail())

For the most part, the two methods of calculating correlations are in agreement. Just out of curiousity, we can look for the values that are furthest apart.

corrs = pcorrs.merge(scorrs, on = 'feature')
corrs['diff'] = corrs['pcorr'] - corrs['scorr']

corrs.sort_values('diff').head()

corrs.sort_values('diff').dropna().tail()



'''

The largest discrepancy in the correlations is dependency. We can make a scatterplot of the Target versus the dependency to visualize the relationship. We'll add a little jitter to the plot because these are both discrete variables.


In [None]:
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally
sns.lmplot(x = 'dependency', y = 'Target', fit_reg = True, data = train_heads, x_jitter=0.05, y_jitter=0.05);
plt.title('Target vs Dependency');

In [None]:
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally
sns.lmplot(x = 'rooms-per-capita', y = 'Target', fit_reg = True, data = train_heads, x_jitter=0.05, y_jitter=0.05);
plt.title('Target vs Rooms Per Capita');

Correlation HeatMap

In [None]:
variables = ['Target', 'dependency', 'warning', 'walls+roof+floor', 'meaneduc',
             'floor', 'r4m1', 'overcrowding']

#Calculating correlations
corr_mat=train_heads[variables].corr().round(2)

# Draw a correlation heatmap
plt.rcParams['font.size'] = 18
plt.figure(figsize = (12, 12))

sns.heatmap(corr_mat, vmin = -0.5, vmax = 0.8, center = 0, 
            cmap = plt.cm.RdYlGn_r, annot = True);


 There are also high correlations between some variables (such as floor and walls+roof+floor) which could pose an issue because of collinearity.

Features Plot

This shows scatterplots on the upper triangle, kernel density estimate (kde) plots on the diagonal, and 2D KDE plots on the lower triangle.

In [None]:
import warnings
# Silences every warning from here on (the plotting calls below are noisy)
warnings.filterwarnings('ignore')

# Copy the data for plotting
plot_data = train_heads[['Target', 'dependency', 'walls+roof+floor',
                         'meaneduc', 'overcrowding']].copy()

# Create the pairgrid object; `height` is the current name of the former `size` argument
grid = sns.PairGrid(data = plot_data, height = 4, diag_sharey=False,
                    hue = 'Target', hue_order = [4, 3, 2, 1], 
                    vars = [x for x in list(plot_data.columns) if x != 'Target'])

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha = 0.8, s = 20)

# Diagonal: kernel density estimate of each variable
grid.map_diag(sns.kdeplot)

# Lower triangle: 2D density plots
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);
grid = grid.add_legend()
plt.suptitle('Feature Plots Colored By Target', size = 32, y = 1.05);


In [None]:
household_feats = list(heads.columns)


**Individual Level Variables**

There are two types of individual level variables: Boolean (1 or 0 for True or False) and ordinal (discrete values with a meaningful ordering).

In [None]:
# Individual-level columns only. The explicit copy makes `ind` an independent frame,
# so the new columns assigned to it in later cells do not write into a slice of `data`.
ind = data[id_ + ind_bool + ind_ordered].copy()
ind.shape

**Redundant Individual Variables**

We carry out same procedure as in household variables.
We'll focus on any variables that have an absolute magnitude of the correlation coefficient greater than 0.95.

In [None]:
# Correlations between the numeric/boolean individual columns (string ids excluded)
corr_matrix = ind.select_dtypes(include=['number', 'bool']).corr()

# Strict upper triangle (k=1): without the offset the diagonal is kept, and since
# every column correlates 1.0 with itself, every column would be flagged
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]
to_drop

In [None]:
ind = ind.drop(columns = 'male')#for male is just not female


In [None]:
ind[[c for c in ind if c.startswith('instl')]].head()


In [None]:
ind['inst'] = np.argmax(np.array(ind[[c for c in ind if c.startswith('instl')]]), axis = 1)
ls=[1,2,3,4]
color_maps={1:'r',2:'y',3:'b',4:'g'}
plt.figure(figsize = (10,10))
s=25
for i in ls:
    #print(ind.loc[ind['Target']==i]['inst'].value_counts())
    sns.scatterplot(data=ind.loc[ind['Target']==i]['inst'].value_counts(),color=color_maps[i],label=str(i),s=s*(i+4))
plt.xlabel("Years of education")
plt.ylabel("Count of people")
plt.title("years of education V/S count of people")

Thus we see poverty level 4 are most educated ,1 being the least also there are nominal poverty level 1 people who are educated over 4 years of age. Also as education years increase poverty level tends to be more towards 4 i.e increase.

In [None]:
plt.figure(figsize = (10, 8))
sns.violinplot(x = 'Target', y = 'inst', data = ind);
plt.title('Education Distribution by Target');

Feature construction

In [None]:
# Years of schooling relative to age
ind['escolari/age'] = ind['escolari'] / ind['age']

plt.figure(figsize = (10, 8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally
sns.violinplot(x = 'Target', y = 'escolari/age', data = ind);

We can also take our new variable, inst, and divide this by the age. The final variable we'll name tech: this represents the combination of tablet and mobile phones

In [None]:
ind['inst/age'] = ind['inst'] / ind['age']
ind['tech'] = ind['v18q'] + ind['mobilephone']
ind['tech'].describe()

In order to incorporate the individual data into the household data, we need to aggregate it for each household. The simplest way to do this is to groupby the family id idhogar and then agg the data. For the aggregations for ordered or continuous variables, we can use six, five of which are built in to pandas, and one of which we define ourselves range_. The boolean aggregations can be the same, but this will create many redundant columns which we will then need to drop. For this case, we'll use the same aggregations and then go back and drop the redundant columns.

In [None]:
# Custom aggregation; a named function gives the output column its label directly
def range_(x):
    """Spread of a group's values: largest minus smallest."""
    return x.max() - x.min()

# Group by household and aggregate every individual-level column.
# Target is the label and Id is a per-person string identifier; neither is a
# feature to aggregate, and the string column cannot take std or range_.
ind_agg = (ind.drop(columns = ['Target', 'Id'])
              .groupby('idhogar')
              .agg(['min', 'max', 'sum', 'count', 'std', range_]))
ind_agg.head().describe()

Now we have 180  features from 30. Renaming the columns

In [None]:
# Flatten the (variable, statistic) column pairs into 'variable-statistic' names.
# The names are built from the actual column tuples, so each label is guaranteed
# to sit on its own data; the product of `levels` only matches when the level
# order happens to equal the column order and no combination is missing.
# The guard keeps the cell safe to re-run once the columns are already flat.
if isinstance(ind_agg.columns, pd.MultiIndex):
    ind_agg.columns = [f'{c}-{stat}' for c, stat in ind_agg.columns]
ind_agg.head().describe()

In [None]:
ind_agg.iloc[:, [0, 1, 2, 3, 6, 7, 8, 9]].head()


**Feature Selection**

As a first round of selection we remove one out of every pair of variable with a correlation >0.95

In [None]:
# Create correlation matrix
corr_matrix = ind_agg.corr()

# Select the strict upper triangle of the correlation matrix; the builtin `bool`
# replaces the `np.bool` alias, which recent NumPy releases removed
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find the feature columns with an absolute correlation greater than 0.95
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

print(f'There are {len(to_drop)} correlated columns to remove.')

In [None]:
to_drop


We'll drop the columns and then merge with the heads data to create a final dataframe.

In [None]:
ind_agg = ind_agg.drop(columns = to_drop)
ind_feats = list(ind_agg.columns)


# Merge on the household id
final = heads.merge(ind_agg, on = 'idhogar', how = 'left')

print('Final features shape: ', final.shape)

In [None]:
final=final.drop(["v2a1","rooms",'v18q1','rez_esc','male',],axis=1)

In [None]:
final.head()

**Final Data Exploration**

In [None]:
# Correlation of every numeric/boolean feature with Target; the string id columns
# are excluded because they make DataFrame.corr fail on newer pandas
corrs = final.select_dtypes(include=['number', 'bool']).corr()['Target']
corrs.sort_values().head()

In [None]:
corrs.sort_values().dropna().tail()

In [None]:
plt.figure(figsize = (10, 6))
sns.violinplot(x = 'Target', y = 'escolari-max', data = final);
plt.title('Max Schooling by Target');

In [None]:
plt.figure(figsize = (10, 6))
sns.boxplot(x = 'Target', y = 'escolari-max', data = final);
plt.title('Max Schooling by Target');

In [None]:
plt.figure(figsize = (10, 6))
sns.boxplot(x = 'Target', y = 'meaneduc', data = final);
plt.xticks([0, 1, 2, 3], poverty_mapping.values())
plt.title('Average Schooling by Target');

In [None]:
plt.figure(figsize = (10, 6))
sns.boxplot(x = 'Target', y = 'overcrowding', data = final);
plt.xticks([0, 1, 2, 3], poverty_mapping.values())
plt.title('Overcrowding by Target');

One other feature that might be useful is the gender of the head of household. Since we aggregated the data, we'll have to go back to the individual level data and find the gender for the head of household.

In [None]:
# Gender of the head of household, one row per household.
# The column is renamed *before* merging: `final` already carries a `female` column
# from `heads`, so merging first produces suffixed duplicates and the rename
# afterwards finds nothing called `female`.
# The guard keeps the cell safe to re-run.
if 'female-head' not in final.columns:
    head_gender = (ind.loc[ind['parentesco1'] == 1, ['idhogar', 'female']]
                      .drop_duplicates('idhogar')
                      .rename(columns = {'female': 'female-head'}))
    final = final.merge(head_gender, on = 'idhogar', how = 'left')

In [None]:
# Same step as the previous cell. It only adds `female-head` when the column is not
# there yet, so running both cells no longer merges the head's gender in twice.
# The rename happens before the merge because `final` already has a `female` column.
if 'female-head' not in final.columns:
    head_gender = (ind.loc[ind['parentesco1'] == 1, ['idhogar', 'female']]
                      .drop_duplicates('idhogar')
                      .rename(columns = {'female': 'female-head'}))
    final = final.merge(head_gender, on = 'idhogar', how = 'left')

It looks like households where the head is female are slightly more likely to have a severe level of poverty.

In [None]:
sns.violinplot(x = 'female-head', y = 'Target', data = final);
plt.title('Target by Female Head of Household');

**Machine Learning modelling**

In [None]:
scorer=make_scorer(f1_score,greater_is_better=True,average='macro')

In [None]:
# Labels of the rows that have a Target, as a numpy array
# (this rebinds `train_labels`, which earlier held a DataFrame)
train_labels = np.array(list(final[final['Target'].notnull()]['Target'].astype(np.uint8)))

# Extract the training data: rows with a Target, without the id columns and the label
train_set = final[final['Target'].notnull()].drop(columns = ['Id', 'idhogar', 'Target'])
# Rows without a Target are the test households
test_set = final[final['Target'].isnull()].drop(columns = ['Id', 'idhogar', 'Target'])
print(test_set.shape)


# Id / household id of every test row, kept as a copy of the raw test frame
submission_base = test[['Id', 'idhogar']].copy()


Feature scaling and imputing pipeline.

In [None]:


# Median imputation followed by min-max scaling.
# NOTE(review): `Imputer` is imported from sklearn.preprocessing at the top of the
# notebook; newer scikit-learn releases provide sklearn.impute.SimpleImputer
# instead -- confirm against the installed version.
pipeline=Pipeline([('imputer',Imputer(strategy='median')),
                  ('scaler',MinMaxScaler())])
# Keep the column names: the transforms below return plain arrays
features=list(train_set.columns)
# Fit on the training rows only, then apply the same fitted transform to the test rows
train_set=pipeline.fit_transform(train_set)
test_set=pipeline.transform(test_set)

model RandomForestClassifier

In [None]:
model=RandomForestClassifier(n_estimators=100,random_state=10,n_jobs=-1)
cv_score=cross_val_score(model,train_set,train_labels,cv=10,scoring=scorer)
print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

**Feature Importances**


If we want to view the feature importances, we'll have to train a model on the whole training set. Cross validation does not return the feature importances.

In [None]:
model.fit(train_set,train_labels)

#Feature importances into a dataframe

feature_importances=pd.DataFrame({'feature':features,'importance':model.feature_importances_})
feature_importances.head()

Function to plot feature importances

In [None]:

def plot_feature_importances(df, n = 10, threshold = None):
    """Plots the n most important features. Also plots the cumulative importance if
    threshold is specified and prints the number of features needed to reach threshold cumulative importance.
    Intended for use with any tree-based feature importances. 
    
    Args:
        df (dataframe): Dataframe of feature importances. Columns must be "feature" and "importance".
    
        n (int): Number of most important features to plot. Default is 10.
    
        threshold (float): Threshold for cumulative importance plot. If not provided, no plot is made. Default is None.
        
    Returns:
        df (dataframe): Dataframe ordered by feature importances with a normalized column (sums to 1) 
                        and a cumulative importance column
    
    Note:
    
        * Normalization in this case means sums to 1. 
        * Cumulative importance is calculated by summing features from most to least important
        * A threshold of 0.9 will show the most important features needed to reach 90% of cumulative importance
        * If the cumulative importance never exceeds the threshold, the curve is still drawn
          and a message is printed instead of raising an error
    
    """
    plt.style.use('fivethirtyeight')
    
    # Sort features with most important at the head
    df = df.sort_values('importance', ascending = False).reset_index(drop = True)
    
    # Normalize the feature importances to add up to one and calculate cumulative importance
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])
    
    plt.rcParams['font.size'] = 12
    
    # head(n) selects exactly n rows; the label slice .loc[:n] is inclusive and
    # would plot n + 1 bars under a title that says n
    top_features = df.head(n)
    top_features.plot.barh(y = 'importance_normalized', 
                           x = 'feature', color = 'darkgreen', 
                           edgecolor = 'k', figsize = (12, 8),
                           legend = False, linewidth = 2)

    plt.xlabel('Normalized Importance', size = 18); plt.ylabel(''); 
    plt.title(f'{len(top_features)} Most Important Features', size = 18)
    plt.gca().invert_yaxis()
    
    if threshold:
        # Cumulative importance plot. The x axis is a feature COUNT, so it starts
        # at 1; this keeps the curve consistent with the vertical marker below.
        feature_counts = list(range(1, len(df) + 1))
        plt.figure(figsize = (8, 6))
        plt.plot(feature_counts, df['cumulative_importance'], 'b-')
        plt.xlabel('Number of Features', size = 16); plt.ylabel('Cumulative Importance', size = 16); 
        plt.title('Cumulative Feature Importance', size = 18);
        
        # Positions (0-based) where the cumulative importance exceeds the threshold
        above_threshold = np.where(df['cumulative_importance'].values > threshold)[0]
        
        if len(above_threshold) == 0:
            # e.g. threshold >= 1: nothing to mark
            plt.show();
            print('Cumulative importance never exceeds {:.0f}%.'.format(100 * threshold))
        else:
            # First position is an index; add 1 for the actual number of features
            importance_index = above_threshold[0]
            
            # Add vertical line to plot
            plt.vlines(importance_index + 1, ymin = 0, ymax = 1.05, linestyles = '--', colors = 'red')
            plt.show();
            
            print('{} features required for {:.0f}% of cumulative importance.'.format(importance_index + 1, 
                                                                                      100 * threshold))
    
    return df

In [None]:
# Plot the random forest importances and mark the 95% cumulative-importance point
norm_fi = plot_feature_importances(df = feature_importances, threshold = 0.95)


However, feature importances don't tell us which direction of a feature matters (for example, we can't use them to tell whether more or less education leads to more severe poverty); they only tell us which features the model considered relevant.

In [None]:
def kde_target(df, variable):
    """Plots the distribution of `variable` in `df` colored by the `Target` column.

    Args:
        df (dataframe): Data containing a `Target` column and the `variable` column.
                        Rows without a `Target` are ignored.
        variable (str): Name of the column whose distribution is plotted.
    """
    
    colors = {1: 'red', 2: 'orange', 3: 'blue', 4: 'green'}

    plt.figure(figsize = (12, 8))
    
    df = df[df['Target'].notnull()]
    
    for level in df['Target'].unique():
        subset = df[df['Target'] == level]
        # Look the color up from the scalar level; calling int() on the
        # one-element array returned by unique() is deprecated in NumPy
        sns.kdeplot(subset[variable].dropna(), 
                    label = f'Poverty Level: {level}', 
                    color = colors[int(level)])

    plt.xlabel(variable); plt.ylabel('Density');
    plt.title('{} Distribution'.format(variable.capitalize()));
    # Draw the legend explicitly so the labels show regardless of seaborn version
    plt.legend();

In [None]:
# Distribution of mean household education per poverty level
kde_target(df = final, variable = 'meaneduc')

In [None]:
# Distribution of the 'escolari/age-range_' feature per poverty level
kde_target(df = final, variable = 'escolari/age-range_')

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import warnings 
from sklearn.exceptions import ConvergenceWarning

# Silence the warning categories raised while fitting the models
for noisy_category in (ConvergenceWarning, DeprecationWarning, UserWarning):
    warnings.filterwarnings('ignore', category = noisy_category)

# Empty table that collects one row of cross-validation results per model
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

def cv_model(train, train_labels, model, name, model_results=None):
    """Perform 10 fold cross validation of a model.

    Args:
        train: Feature matrix passed to `cross_val_score`.
        train_labels: Labels aligned with `train`.
        model: Estimator to evaluate.
        name: Label stored in the 'model' column of the results row.
        model_results (dataframe): Optional results table to extend.

    Returns:
        The results table with one extra row ('model', 'cv_mean', 'cv_std')
        when `model_results` is given, otherwise None.
    """
    
    cv_scores = cross_val_score(model, train, train_labels, cv = 10, scoring=scorer, n_jobs = -1)
    print(f'10 Fold CV Score: {round(cv_scores.mean(), 5)} with std: {round(cv_scores.std(), 5)}')
    
    if model_results is None:
        return None

    new_row = pd.DataFrame({'model': name, 
                            'cv_mean': cv_scores.mean(), 
                            'cv_std': cv_scores.std()},
                           index = [0])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # call and is also available in older pandas releases
    return pd.concat([model_results, new_row], ignore_index = True)

In [None]:
# Linear support vector classifier
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = LinearSVC(),
                         name = 'LSVC',
                         model_results = model_results)

The cross-validation score is low, so we don't use this model.

In [None]:
# Gaussian naive Bayes
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = GaussianNB(),
                         name = 'GNB',
                         model_results = model_results)

A very low score again.

In [None]:
# Multi-layer perceptron with five hidden layers
mlp_layers = (32, 64, 128, 64, 32)
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = MLPClassifier(hidden_layer_sizes = mlp_layers),
                         name = 'MLP',
                         model_results = model_results)

The multi-layer perceptron gives a high score, and we could tune its hyperparameters further. However, the limited amount of data may be a problem, since neural networks typically need thousands of examples.

In [None]:
# Linear discriminant analysis
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = LinearDiscriminantAnalysis(),
                         name = 'LDA',
                         model_results = model_results)

If you run LinearDiscriminantAnalysis without filtering out the UserWarnings, you get many messages saying "Variables are collinear." This might give us a hint that we want to remove some collinear features! We might want to try this model again after removing the collinear variables because the score is comparable to the random forest.

In [None]:
# Ridge classifier with built-in cross validation
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = RidgeClassifierCV(),
                         name = 'RIDGE',
                         model_results = model_results)

In [None]:
# K-nearest neighbours for several neighbourhood sizes
for n in (5, 10, 20):
    print(f'\nKNN with {n} neighbors\n')
    model_results = cv_model(train = train_set,
                             train_labels = train_labels,
                             model = KNeighborsClassifier(n_neighbors = n),
                             name = f'knn-{n}',
                             model_results = model_results)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomized trees
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = ExtraTreesClassifier(n_estimators = 100, random_state = 10),
                         name = 'EXT',
                         model_results = model_results)

In [None]:
# Show the cross-validation results collected so far
model_results

In [None]:
# Random forest, recorded in the same results table as the other models
model_results = cv_model(train = train_set,
                         train_labels = train_labels,
                         model = RandomForestClassifier(n_estimators = 100, random_state = 10),
                         name = 'RF',
                         model_results = model_results)

In [None]:
# Inspect the row index of the results table (one row per evaluated model)
model_results.index

In [None]:
# Index by model name so each bar is labelled with its model
model_results = model_results.set_index('model')
model_results['cv_mean'].plot.bar(color='purple',figsize=(8,6),
                                 yerr=list(model_results['cv_std']),
                                 edgecolor='k',linewidth=2)
# One tick per row of the table: a hard-coded list of positions stops matching
# the labels as soon as the number of evaluated models changes
plt.xticks(range(len(model_results)), list(model_results.index), rotation=60)
plt.title("F1 scores models")
plt.ylabel('Mean F1 Score (with error bar)');
# Restore 'model' as a regular column for the cells that append more rows
model_results = model_results.reset_index()


In [None]:
# Household ids of the rows that have no Target (the test households)
is_test_row = final['Target'].isnull()
test_ids = final.loc[is_test_row, 'idhogar'].tolist()


The function below takes in a model, a training set, the training labels, and a testing set and performs the following operations:

Trains the model on the training data using fit
Makes predictions on the test data using predict
Creates a submission dataframe that can be saved and uploaded to the competition

In [None]:
def submit(model, train, train_labels, test, test_ids):
    """Fit `model`, predict the test households and build a submission frame.

    Args:
        model: Estimator exposing `fit` and `predict`.
        train: Training features.
        train_labels: Training labels.
        test: Test features, one row per entry in `test_ids`.
        test_ids: Household ids ('idhogar') matching the rows of `test`.

    Returns:
        dataframe: `submission_base` joined with the household predictions,
        with 'idhogar' dropped and missing targets filled with 4.
    """
    model.fit(train, train_labels)

    # One prediction per household id
    household_preds = pd.DataFrame({'idhogar': test_ids,
                                    'Target': model.predict(test)})

    # Spread the household prediction over every row of the submission base
    submission = submission_base.merge(household_preds, how = 'left', on = 'idhogar')
    submission = submission.drop(columns = ['idhogar'])

    # Rows whose household received no prediction default to class 4
    submission['Target'] = submission['Target'].fillna(4).astype(np.int8)

    return submission

In [None]:
# Train the random forest on all training data and predict the test households
rf_submission = submit(model = RandomForestClassifier(n_estimators = 100,
                                                      random_state = 10,
                                                      n_jobs = -1),
                       train = train_set,
                       train_labels = train_labels,
                       test = test_set,
                       test_ids = test_ids)
print(rf_submission)

# Save the submission file
rf_submission.to_csv('rf_submission.csv', index = False)

**Feature Selection**

We try to identify and keep only the essential features.


For feature selection in this notebook, we'll first remove any columns with greater than 0.95 correlation (we already did some of this during feature engineering) and then we'll apply recursive feature elimination with the Scikit-Learn library.

Starting with removing features with >0.95 correlation


In [None]:
# The preprocessing pipeline returned a bare array; put the column names back.
# Skipped when train_set is already a frame so that re-running this cell after
# columns were dropped does not re-add them as all-NaN columns.
if not isinstance(train_set, pd.DataFrame):
    train_set = pd.DataFrame(train_set, columns = features)
corr_matrix = train_set.corr()

# Keep only the strict upper triangle so each pair of features is seen once.
# The builtin bool replaces np.bool, which was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

# A column is dropped when it correlates above 0.95 with any earlier column
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

to_drop

In [None]:
# Remember the shape before dropping: a bare `train_set.shape` in the middle of
# a cell is never displayed
shape_before = train_set.shape

# errors='ignore' keeps the cell re-runnable once the columns are already gone
train_set = train_set.drop(columns = to_drop, errors = 'ignore')

# Shape before and after removing the highly correlated columns
shape_before, train_set.shape

In [None]:
# Re-check the correlations on the reduced frame. train_set is deliberately NOT
# rebuilt with columns=features here: that would re-add the columns that were
# just dropped as all-NaN columns and hide them from the correlation matrix.
corr_matrix = train_set.corr()

# Strict upper triangle; the builtin bool replaces np.bool, which was removed from NumPy
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

to_drop

All correlations above 0.95 have been removed.

In [None]:
# Label the test data with the feature names
test_set = pd.DataFrame(test_set, columns = features)
print(test_set.shape)

# Keep only the columns that both frames share
aligned_frames = train_set.align(test_set, join = 'inner', axis = 1)
train_set, test_set = aligned_frames

# Refresh the feature list to match the aligned columns
features = train_set.columns.tolist()

**Recursive Feature Elimination with Random Forest**


The RFECV in Sklearn stands for Recursive Feature Elimination with Cross Validation. The selector operates using a model with feature importances in an iterative manner. At each iteration, it removes either a fraction of features or a set number of features. The iterations continue until the cross validation score no longer improves.

To create the selector object, we pass in the model, the number of features to remove at each iteration, the cross validation folds, our custom scorer, and any other parameters to guide the selection.

In [None]:
from sklearn.feature_selection import RFECV

# Random forest whose importances drive the elimination
estimator = RandomForestClassifier(n_estimators = 100, random_state = 10, n_jobs = -1)

# Recursive feature elimination: one feature per step, 3-fold CV, macro F1 scorer
selector = RFECV(estimator = estimator,
                 step = 1,
                 cv = 3,
                 scoring = scorer,
                 n_jobs = -1)


Then we fit the selector on the training data as with any other sklearn model. This will continue the feature selection until the cross validation scores no longer improve.

In [None]:
# Columns with more than 2000 missing training values are removed from both sets
null_counts = train_set.isnull().sum()
new_drops = list(null_counts[null_counts > 2000].index)

train_set = train_set.drop(columns = new_drops)
test_set = test_set.drop(columns = new_drops)

# Total number of missing values left in the training data
train_set.isnull().sum().sum()

In [None]:
# Display both shapes together: only the last expression of a cell is shown,
# so a separate `test_set.shape` line would be silently discarded
test_set.shape, train_set.shape

Then we fit the selector on the training data as with any other sklearn model. This will continue the feature selection until the cross validation scores no longer improve.

In [None]:
# Run the recursive feature elimination on the training data
selector.fit(X = train_set, y = train_labels)

In [None]:
# Mean cross-validation score per candidate feature subset. Newer scikit-learn
# releases expose it through cv_results_; older ones only have grid_scores_.
if hasattr(selector, 'cv_results_'):
    mean_scores = np.asarray(selector.cv_results_['mean_test_score'])
else:
    mean_scores = np.asarray(selector.grid_scores_)

# Plot against a feature COUNT starting at 1 instead of the 0-based position.
# NOTE(review): assumes the selector removes one feature per step down to a
# single feature (it is created with step = 1) - confirm if that changes.
feature_counts = np.arange(1, len(mean_scores) + 1)
plt.plot(feature_counts, mean_scores);

plt.xlabel('Number of Features'); plt.ylabel('Macro F1 Score'); plt.title('Feature Selection Scores');
selector.n_features_

We can see that the score improves as we add features up until 97 features. According to the selector, this is the optimal number of features.

The rankings of each feature can be found by inspecting the trained object. These represent essentially the importance of features averaged over the iterations. Features can share the same ranking, and only features with a rank of 1 are retained.

In [None]:
# One row per feature with the rank assigned by the selector, best ranks first
rankings = pd.DataFrame({
    'feature': list(train_set.columns),
    'rank': list(selector.ranking_),
})
rankings = rankings.sort_values(by = 'rank')
rankings.head(10)

Finally, we select the features and then evaluate in cross validation.



In [None]:
# Reduce both data sets to the features kept by the selector
train_selected, test_selected = (selector.transform(split) for split in (train_set, test_set))

In [None]:
# Convert back to dataframe
selected_features = train_set.columns[np.where(selector.ranking_==1)]
train_selected = pd.DataFrame(train_selected, columns = selected_features)
test_selected = pd.DataFrame(test_selected, columns = selected_features)

In [None]:
# Cross-validate the random forest on the selected features only
model_results = cv_model(train = train_selected,
                         train_labels = train_labels,
                         model = model,
                         name = 'RF-SEL',
                         model_results = model_results)

In [None]:
# Use the model names as the bar labels
model_results = model_results.set_index('model')

# Mean CV score per model, with the standard deviation as the error bar
model_results['cv_mean'].plot.bar(yerr = model_results['cv_std'].tolist(),
                                  figsize = (8, 6), color = 'orange',
                                  linewidth = 2, edgecolor = 'k')
plt.title('Model F1 Score Results');
plt.ylabel('Mean F1 Score (with error bar)');

# Put 'model' back as a regular column
model_results = model_results.reset_index()

**Upgrading Our Model: Gradient Boosting Machine**


After using the Random Forest and getting decent scores, it's time to step up and use the gradient boosting machine. If you spend any time on Kaggle, you'll notice that the Gradient Boosting Machine (GBM) wins a high percentage of competitions where the data is structured (in tables) and the datasets are not that large (less than a million observations).

We will focus on the implementation. We'll use the GBM in LightGBM, although there are also options in Scikit-Learn, XGBOOST, and CatBoost. The first set of hyperparameters we'll use were based on those I've found have worked well for other problems.


In [None]:
def macro_f1_score(labels, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1.

    Args:
        labels: True class labels of the evaluated samples.
        predictions: Predicted class probabilities, either as a flat array
            holding all values of the first class, then the second, and so on,
            or as a 2-D array of shape (n_samples, n_classes).

    Returns:
        tuple: (metric name, metric value, is_higher_better).
    """
    predictions = np.asarray(predictions)

    if predictions.ndim == 2:
        # Already one row per sample: the class is the best column
        predicted_class = predictions.argmax(axis = 1)
    else:
        # Flat, class-major layout: reshape to (n_classes, n_samples)
        predicted_class = predictions.reshape(len(np.unique(labels)), -1).argmax(axis = 0)
    
    metric_value = f1_score(labels, predicted_class, average = 'macro')
    
    # Return is name, value, is_higher_better
    return 'macro_f1', metric_value, True

**Light Gradient Boosting Machine Implementation**



In [None]:
# Shape of the raw test data read from test.csv
print(test.shape)

In [None]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from IPython.display import display

def model_gbm(features, labels, test_features, test_ids, 
              nfolds = 5, return_preds = False, hyp = None):
    """Model using the GBM and cross validation.
       Trains with early stopping on each fold.
       Hyperparameters probably need to be tuned.

    Args:
        features (dataframe): Training features; the column names label the importances.
        labels: Training labels, one per row of `features`.
        test_features: Test features, one row per entry in `test_ids`.
        test_ids: Household ids ('idhogar') matching the rows of `test_features`.
        nfolds (int): Number of stratified folds. Default is 5.
        return_preds (bool): If True, return the per-fold predictions instead of
            a submission. Default is False.
        hyp (dict): Optional LightGBM hyperparameters. An 'n_estimators' entry is
            ignored because early stopping is used. The dict is not modified.

    Returns:
        If `return_preds` is True: (predictions, feature_importances), where
        predictions holds one row per test household and fold.
        Otherwise: (submission, feature_importances, valid_scores).
    """
    
    feature_names = list(features.columns)
    print(feature_names)

    # Option for user specified hyperparameters
    if hyp is not None:
        # Using early stopping so do not need number of estimators.
        # Build a filtered copy so the caller's dict is left untouched.
        params = {key: value for key, value in hyp.items() if key != 'n_estimators'}
    
    else:
        # Model hyperparameters
        params = {'boosting_type': 'dart', 
                  'colsample_bytree': 0.88, 
                  'learning_rate': 0.028, 
                   'min_child_samples': 10, 
                   'num_leaves': 36, 'reg_alpha': 0.76, 
                   'reg_lambda': 0.43, 
                   'subsample_for_bin': 40000, 
                   'subsample': 0.54, 
                   'class_weight': 'balanced'}
    
    # Build the model
    model = lgb.LGBMClassifier(**params, objective = 'multiclass', 
                               n_jobs = -1, n_estimators = 10000,
                               random_state = 10)
    
    # Using stratified kfold cross validation; seeded so the shuffled folds
    # (and therefore the scores) are the same on every run
    strkfold = StratifiedKFold(n_splits = nfolds, shuffle = True, random_state = 10)
    
    # Hold all the predictions from each fold; combined once after the loop
    fold_frames = []
    importances = np.zeros(len(feature_names))# to the size of features present
    
    # Convert to arrays for indexing
    features = np.array(features)
    print(features)
    test_features = np.array(test_features)
    labels = np.array(labels).reshape((-1 ))
    
    valid_scores = []

    # Iterate through the folds
    for i, (train_indices, valid_indices) in enumerate(strkfold.split(features, labels)):
        
        # Dataframe for fold predictions
        fold_predictions = pd.DataFrame()
        
        # Training and validation data
        X_train = features[train_indices]
        X_valid = features[valid_indices]
        y_train = labels[train_indices]
        y_valid = labels[valid_indices]

        # Train with early stopping
        model.fit(X_train, y_train, early_stopping_rounds = 100, 
                  eval_metric = macro_f1_score,
                  eval_set = [(X_train, y_train), (X_valid, y_valid)],
                  eval_names = ['train', 'valid'], # names for the two eval_set entries, in order
                  verbose = 200)
        display(model)
        display(model.best_score_)
        # Record the validation fold score
        valid_scores.append(model.best_score_['valid']['macro_f1'])
        
        # Make predictions from the fold as probabilities
        fold_probabilitites = model.predict_proba(test_features) #Returns prediction probabilities for each class of each output.
        display(fold_probabilitites)
        # Record each prediction for each class as a separate column
        for j in range(4):
            fold_predictions[(j + 1)] = fold_probabilitites[:, j]
        display(fold_predictions)    
        # Add needed information for predictions 
        fold_predictions['idhogar'] = test_ids
        fold_predictions['fold'] = (i+1)
        
        # Keep this fold's predictions
        fold_frames.append(fold_predictions)
        
        # Feature importances
        importances += model.feature_importances_ / nfolds   
        display(model.feature_importances_)
        display(importances)
        # Display fold information
        display(f'Fold {i + 1}, Validation Score: {round(valid_scores[i], 5)}, Estimators Trained: {model.best_iteration_}')

    # Stack the per-fold predictions as rows. DataFrame.append was removed in
    # pandas 2.0, and one concat is cheaper than growing a frame inside the loop.
    predictions = pd.concat(fold_frames)

    # Feature importances dataframe
    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': importances})
    display("feature_importances")
    display(feature_importances)
    valid_scores = np.array(valid_scores)
    display(f'{nfolds} cross validation score: {round(valid_scores.mean(), 5)} with std: {round(valid_scores.std(), 5)}.')
    display(valid_scores)
    # If we want to examine predictions don't average over folds
    if return_preds:
        predictions['Target'] = predictions[[1, 2, 3, 4]].idxmax(axis = 1)
        predictions['confidence'] = predictions[[1, 2, 3, 4]].max(axis = 1)
        return predictions, feature_importances
    
    # Average the predictions over folds
    predictions = predictions.groupby('idhogar', as_index = False).mean()
    
    # Find the class and associated probability
    predictions['Target'] = predictions[[1, 2, 3, 4]].idxmax(axis = 1)
    predictions['confidence'] = predictions[[1, 2, 3, 4]].max(axis = 1)
    predictions = predictions.drop(columns = ['fold'])
    display(predictions)
    # Merge with the base to have one prediction for each individual
    submission = submission_base.merge(predictions[['idhogar', 'Target']], on = 'idhogar', how = 'left').drop(columns = ['idhogar'])
        
    # Fill in the individuals that do not have a head of household with 4 since these will not be scored
    submission['Target'] = submission['Target'].fillna(4).astype(np.int8)
    display(submission)
    display("Model Name")
    display(model)
    # return the submission and feature importances along with validation scores
    return submission, feature_importances, valid_scores

In [None]:
%%capture --no-display
%%capture --no-display
predictions, gbm_fi = model_gbm(train_set, train_labels, test_set, test_ids, return_preds=True)

In [None]:
# Confirm what model_gbm returned for the predictions.
# (The commented-out snippet kept as a string literal was removed: as the last
# expression of the cell it was echoed back as output.)
print(type(predictions))

In [None]:
# Save the predictions frame returned by model_gbm with return_preds=True.
# NOTE(review): this frame holds per-fold class probabilities keyed by
# 'idhogar', not Id/Target rows - confirm it is not meant to be submitted.
predictions.to_csv('lgb1_submission.csv', index = False)

In [None]:
# Preview the first rows of the per-fold predictions
predictions.head()

For each fold, the 1, 2, 3, 4 columns represent the probability for each Target. The Target is the maximum of these with the confidence the probability. We have the predictions for all 5 folds, so we can plot the confidence in each Target for the different folds.



In [None]:
# Confidence of the predicted class, split by class and by fold
plt.figure(figsize = (24, 12))
sns.violinplot(data = predictions, hue = 'fold', y = 'confidence', x = 'Target');

In [None]:
# Average the class probabilities of each household over the folds
probability_cols = [1, 2, 3, 4]
predictions = predictions.groupby('idhogar', as_index = False).mean()

# Predicted class = most probable column; confidence = its probability
class_probabilities = predictions[probability_cols]
predictions['Target'] = class_probabilities.idxmax(axis = 1)
predictions['confidence'] = class_probabilities.max(axis = 1)
predictions = predictions.drop(columns = ['fold'])

# Box plot of the confidence per predicted class
plt.figure(figsize = (10, 6))
sns.boxplot(data = predictions, x = 'Target', y = 'confidence')
plt.title("Confidence vs Target")

# Same comparison as a violin plot
plt.figure(figsize = (10, 6))
sns.violinplot(data = predictions, x = 'Target', y = 'confidence')
plt.title("Confidence vs Target")


In [None]:
%%capture
submission, gbm_fi, valid_scores = model_gbm(train_set, train_labels, 
                                             test_set, test_ids, return_preds=False)

submission.to_csv('gbm_baseline.csv')

In [None]:
# Importances averaged over the GBM folds, with the 95% cumulative marker
_ = plot_feature_importances(df = gbm_fi, threshold = 0.95)


The next step with the LightGBM is to try the features that were selected through recursive feature elimination.

In [None]:
%%capture --no-display
submission, gbm_fi_selected, valid_scores_selected = model_gbm(train_selected, train_labels, 
                                                               test_selected, test_ids)


In [None]:
# The scores above come from model_gbm called with its default number of folds
# (5), so the rows are labelled as 5-fold results; the 10-fold runs further down
# add their own "GBM_10Fold" rows.
gbm_rows = pd.DataFrame({'model': ["GBM_5Fold", "GBM_5Fold_SEL"], 
                         'cv_mean': [valid_scores.mean(), valid_scores_selected.mean()],
                         'cv_std':  [valid_scores.std(), valid_scores_selected.std()]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
model_results = pd.concat([model_results, gbm_rows], sort = True)

In [None]:
# Use the model names as the bar labels
model_results = model_results.set_index('model')

# Mean CV score per model, with the standard deviation as the error bar
model_results['cv_mean'].plot.bar(yerr = model_results['cv_std'].tolist(),
                                  figsize = (8, 6), color = 'orange',
                                  linewidth = 2, edgecolor = 'k')
plt.title('Model F1 Score Results');
plt.ylabel('Mean F1 Score (with error bar)');

# Put 'model' back as a regular column
model_results = model_results.reset_index()


We try 10 fold with both sets and add them to plot.



In [None]:
%%capture
submission, gbm_fi, valid_scores = model_gbm(train_set, train_labels, test_set, test_ids, 
                                             nfolds=10, return_preds=False)

In [None]:
# Save the 10-fold GBM submission without the row index
submission.to_csv('gbm_10fold.csv', index = False)


In [None]:
%%capture
submission, gbm_fi_selected, valid_scores_selected = model_gbm(train_selected, train_labels, test_selected, test_ids,
                                                               nfolds=10)

In [None]:
# Save the 10-fold GBM submission for the selected features.
# File name corrected from 'gmb_...' to 'gbm_...' to match the other GBM outputs.
submission.to_csv('gbm_10fold_selected.csv', index = False)


In [None]:
# Results of the two 10-fold GBM runs
gbm_10fold_rows = pd.DataFrame({'model': ["GBM_10Fold", "GBM_10Fold_SEL"], 
                                'cv_mean': [valid_scores.mean(), valid_scores_selected.mean()],
                                'cv_std':  [valid_scores.std(), valid_scores_selected.std()]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
model_results = pd.concat([model_results, gbm_10fold_rows], sort = True)

In [None]:
# Use the model names as the bar labels
model_results = model_results.set_index('model')

# Mean CV score per model, with the standard deviation as the error bar
model_results['cv_mean'].plot.bar(yerr = model_results['cv_std'].tolist(),
                                  linewidth = 2, edgecolor = 'k',
                                  figsize = (8, 6), color = 'orange')
plt.title('Model F1 Score Results');
plt.ylabel('Mean F1 Score (with error bar)');

# Put 'model' back as a regular column
model_results = model_results.reset_index()