In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Income Qualification
### DESCRIPTION
Identify the level of income qualification needed for the families in Latin America.

Problem Statement Scenario:
Many social programs have a hard time ensuring that the right people are given enough aid. It’s tricky when a program focuses on the poorest segment of the population. This segment of the population can’t provide the necessary income and expense records to prove that they qualify.

In Latin America, a popular method called Proxy Means Test (PMT) uses an algorithm to verify income qualification. With PMT, agencies use a model that considers a family’s observable household attributes like the material of their walls and ceiling or the assets found in their homes to
classify them and predict their level of need.

While this is an improvement, accuracy remains a problem as the region’s population grows and poverty declines.

The Inter-American Development Bank (IDB) believes that new methods beyond traditional econometrics, based on a dataset of Costa Rican household characteristics, might help improve PMT’s performance.<br>

### <b>Following actions should be performed:</b>
<ol>
<li>Identify the output variable.</li>
<li>Understand the type of data.</li>
<li>Check if there are any biases in your dataset.</li>
<li>Check whether all members of the house have the same poverty level.</li>
<li>Check if there is a house without a family head.</li>
<li>Set poverty level of the members and the head of the house within a family.</li>
<li>Count how many null values are existing in columns.</li>
<li>Remove null value rows of the target variable.</li>
<li>Predict the accuracy using random forest classifier.</li>
<li>Check the accuracy using random forest with cross validation.</li>
</ol>

### Core Data fields
Id - a unique identifier for each row.<br>
Target - the target is an ordinal variable indicating groups of income levels.<br>
<ul>
    <li>1 = extreme poverty </li>
    <li>2 = moderate poverty </li>
    <li>3 = vulnerable households </li>
    <li>4 = non vulnerable households</li>
</ul><br>
idhogar - this is a unique identifier for each household. This can be used to create household-wide features, etc. All rows in a given household will have a matching value for this identifier.<br>
parentesco1 - indicates if this person is the head of the household.<br>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()


import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas/NumPy deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# <span style='background:blue;color:white'> Understand the Data </span>

In [None]:
# Read the training split and the test split from the Kaggle input directory.
train_csv = "../input/costa-rican-household-poverty-prediction/train.csv"
test_csv = "../input/costa-rican-household-poverty-prediction/test.csv"
df_income_train = pd.read_csv(train_csv)
df_income_test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training frame.
df_income_train.head()

In [None]:
# Print the summary (index range, dtype counts, memory usage) of the training frame.
df_income_train.info()

In [None]:
# Preview the first rows of the test frame.
df_income_test.head()

### NOTE
The important piece of information here is that we don’t have ‘Target’ feature in Test Dataset. There are 3 Types of the features:
    <ul>
    <li>5 object type</li>
    <li>130(Train set)/ 129 (test set) integer type</li>
    <li>8 float type </li>
    </ul>


Lets analyze features:

In [None]:
### List the columns for different datatypes:
# Dtypes are selected by name: the `np.object` alias was removed in NumPy 1.24
# and raises AttributeError there, while the string names work on every version.
print('Integer Type: ')
print(df_income_train.select_dtypes('int64').columns)
print('\n')
print('Float Type: ')
print(df_income_train.select_dtypes('float64').columns)
print('\n')
print('Object Type: ')
print(df_income_train.select_dtypes('object').columns)

In [None]:
# Preview the int64 columns of the training frame.
df_income_train.select_dtypes('int64').head()

In [None]:
# Count missing values per int64 column and keep only the columns that have any.
int_columns = df_income_train.select_dtypes('int64')
null_counts = int_columns.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Preview the float64 columns of the training frame.
df_income_train.select_dtypes('float64').head()

In [None]:
# Count missing values per float64 column and keep only the columns that have any.
float_columns = df_income_train.select_dtypes('float64')
null_counts = float_columns.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Preview the object-typed columns of the training frame.
df_income_train.select_dtypes('object').head()

In [None]:
# Count missing values per object column and keep only the columns that have any.
object_columns = df_income_train.select_dtypes('object')
null_counts = object_columns.isnull().sum()
null_counts[null_counts > 0]

### NOTE
Looking at the different types of data and null values for each feature. We found the following: 
1. No null values for Integer type features. 
2. No null values for object type features. 
3. For float64 types, the features below have null values:
   1. v2a1 6860 
   2. v18q1 7342 
   3. rez_esc 7928 
   4. meaneduc 5 
   5. SQBmeaned 5

We also noticed that object type features dependency, edjefe, edjefa have mixed values.<br>
Lets fix the data for features with null values and features with mixed values

# <span style='background:blue;color:white'> Data Cleaning </span>

Let's fix first the column with mixed value:

dependency: Dependency rate, calculated = 
(number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)

edjefe=
years of education of male head of household, based on the interaction of
escolari (years of education), head of household and gender, yes=1 and no=0

edjefa: years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0

<font color='red'>For these three variables, it seems “yes” = 1 and “no” = 0. We can correct the variables using a mapping and convert to floats.</font>


In [None]:
# Replace the 'yes'/'no' strings in the mixed-value columns with 1/0 and cast the
# columns to float64, in both the train and the test frame.
mapping = {'yes': 1, 'no': 0}
mixed_columns = ['dependency', 'edjefe', 'edjefa']

for df in [df_income_train, df_income_test]:
    for mixed_col in mixed_columns:
        df[mixed_col] = df[mixed_col].replace(mapping).astype(np.float64)

df_income_train[mixed_columns].describe()

### NOTE

Lets fix the column with null values<br>
According to the documentation for these columns:<br>

v2a1 (total nulls: 6860) : Monthly rent payment<br>
v18q1 (total nulls: 7342) : number of tablets household owns<br>
rez_esc (total nulls: 7928) : Years behind in school<br>
meaneduc (total nulls: 5) : average years of education for adults (18+)<br>
SQBmeaned (total nulls: 5) : square of the mean years of education of adults (>=18) in the household 142<br>

Lets look at v2a1 (total nulls: 6860) : Monthly rent payment 

why the null values, Lets look at few rows with nulls in v2a1:

1. Columns related to  Monthly rent payment
2. tipovivi1, =1 own and fully paid house
3. tipovivi2, "=1 own,  paying in installments"
4. tipovivi3, =1 rented
5. tipovivi4, =1 precarious 
6. tipovivi5, "=1 other(assigned,  borrowed)"

In [None]:
# Show the home-ownership flags for the first few rows whose rent payment (v2a1) is missing.
columns = ['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']
rent_missing = df_income_train['v2a1'].isnull()
data = df_income_train[rent_missing].head()
data[columns]

In [None]:
# Columns whose name starts with 'tipo' are the home-ownership indicators.
own_variables = [name for name in df_income_train if name.startswith('tipo')]

# Sum each ownership indicator over the rows with a missing rent payment and plot the totals.
missing_rent_rows = df_income_train['v2a1'].isnull()
ownership_totals = df_income_train.loc[missing_rent_rows, own_variables].sum()
ownership_totals.plot.bar(figsize=(10, 8), color='green', edgecolor='k', linewidth=2);

tick_labels = ['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other']
plt.xticks([0, 1, 2, 3, 4], tick_labels, rotation=20)
plt.title('Home Ownership Status for Households Missing Rent Payments', size=18);

In [None]:
# Looking at the above data it makes sense that when the house is fully paid, there will be
# no monthly rent payment, so the null values are replaced with 0.
# The filled column is assigned back instead of calling fillna(inplace=True) on df['v2a1']:
# that chained in-place form acts on an intermediate object, warns in pandas 2.x and has
# no effect on the frame under Copy-on-Write.
for df in [df_income_train, df_income_test]:
    df['v2a1'] = df['v2a1'].fillna(0)

df_income_train[['v2a1']].isnull().sum()

### NOTE
Lets look at v18q1 (total nulls: 7342) : number of tablets household owns<br>
why the null values, Lets look at few rows with nulls in v18q1<br>
Columns related to  number of tablets household owns <br>
v18q, owns a tablet<br>

Since this is a household variable, it only makes sense to look at it on a household level, so we'll only select the rows for the head of household.

In [None]:
# Keep one row per household: the rows flagged as head of household (parentesco1 == 1).
is_head = df_income_train['parentesco1'] == 1
heads = df_income_train.loc[is_head].copy()
# Number of missing v18q1 values for each value of the v18q flag.
heads['v18q1'].isnull().groupby(heads['v18q']).sum()

In [None]:
# Bar chart of how often each v18q1 value occurs (missing values are not counted).
col = 'v18q1'
plt.figure(figsize=(8, 6))
value_frequencies = df_income_train[col].value_counts().sort_index()
value_frequencies.plot.bar(color='blue', edgecolor='k', linewidth=2)
plt.xlabel(f'{col}')
plt.title(f'{col} Value Counts')
plt.ylabel('Count')
plt.show();

### NOTE
Looking at the above data, it makes sense that when the "owns a tablet" column (v18q) is 0, the number of tablets the household owns is missing. Let's fill all the null values with 0.

In [None]:
# Replace missing tablet counts with 0 in both frames.
# Assign the result back rather than using fillna(inplace=True) on df['v18q1']: the chained
# in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
for df in [df_income_train, df_income_test]:
    df['v18q1'] = df['v18q1'].fillna(0)

df_income_train[['v18q1']].isnull().sum()

### NOTE
Lets look at rez_esc    (total nulls: 7928) : Years behind in school <br> 
 why the null values, Lets look at few rows with nulls in rez_esc <br> 
 Columns related to Years behind in school  <br> 
 Age in years

In [None]:
# Age distribution of the rows where rez_esc has a value.
has_rez_esc = df_income_train['rez_esc'].notnull()
df_income_train.loc[has_rez_esc, 'age'].describe()

### NOTE
From the above, we see that the 'behind in school' column has a value only for ages from 7 (min) to 17 (max).<br>
Lets confirm

In [None]:
# Age distribution of the rows where rez_esc is missing.
lacks_rez_esc = df_income_train['rez_esc'].isnull()
df_income_train.loc[lacks_rez_esc, 'age'].describe()

In [None]:
# Rows that are missing 'behind in school' (rez_esc) although the age is between 7 and 17.
# Series.between is inclusive on both ends; the previous strict comparisons (> 7, < 17)
# left out the boundary ages 7 and 17 themselves.
school_age = df_income_train['age'].between(7, 17)
df_income_train.loc[df_income_train['rez_esc'].isnull() & school_age, 'age'].describe()
# NOTE(review): with the strict bounds the author observed exactly one such row; re-check the
# count in the output now that ages 7 and 17 are included.

In [None]:
# Look up the 10-year-old rows with a missing rez_esc, then one specific record by Id.
# NOTE: a cell only renders its last expression, so the result of the first lookup is not shown.
df_income_train[(df_income_train['age'] ==10) & df_income_train['rez_esc'].isnull()].head()
df_income_train[(df_income_train['Id'] =='ID_f012e4242')].head()
# Author's observation from the output: the 10-year-old with a missing 'behind in school' value
# is the only member of that household, which the author takes as the explanation for the gap.

In [None]:
# From above we see that the 'behind in school' column has null values; replace them with 0
# in both frames.
# Assign the result back rather than using fillna(inplace=True) on df['rez_esc']: the chained
# in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
for df in [df_income_train, df_income_test]:
    df['rez_esc'] = df['rez_esc'].fillna(0)
df_income_train[['rez_esc']].isnull().sum()

### NOTE
Lets look at meaneduc   (total nulls: 5) : average years of education for adults (18+)  <br>
why the null values, Lets look at few rows with nulls in meaneduc  <br>
Columns related to average years of education for adults (18+)    <br>
edjefe, years of education of male head of household, based on the interaction of escolari (years of education),  <br>
head of household and gender, yes=1 and no=0  <br>
edjefa, years of education of female head of household, based on the interaction of escolari (years of education),   <br>
head of household and gender, yes=1 and no=0   <br>
instlevel1, =1 no level of education  <br>
instlevel2, =1 incomplete primary   <br>

In [None]:
# For the first rows with a missing meaneduc, summarise the education columns of those
# rows that have instlevel1 > 0.
columns = ['edjefe', 'edjefa', 'instlevel1', 'instlevel2']
data = df_income_train[df_income_train['meaneduc'].isnull()].head()

education_view = data[columns]
education_view[education_view['instlevel1'] > 0].describe()

In [None]:
# From the above, we find that meaneduc is null when no level of education is 0.
# Replace the missing values with 0 in both frames.
# Assign the result back rather than using fillna(inplace=True) on df['meaneduc']: the chained
# in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
for df in [df_income_train, df_income_test]:
    df['meaneduc'] = df['meaneduc'].fillna(0)
df_income_train[['meaneduc']].isnull().sum()

### NOTE
Lets look at SQBmeaned  (total nulls: 5) : square of the mean years of education of adults (>=18) in the household 142  <br>
why the null values, Lets look at few rows with nulls in SQBmeaned<br>
Columns related to average years of education for adults (18+)  <br>
edjefe, years of education of male head of household, based on the interaction of escolari (years of education),<br>
head of household and gender, yes=1 and no=0<br>
edjefa, years of education of female head of household, based on the interaction of escolari (years of education), <br>
head of household and gender, yes=1 and no=0 <br>
instlevel1, =1 no level of education<br>
instlevel2, =1 incomplete primary <br>

In [None]:
# For the first rows with a missing SQBmeaned, summarise the education columns of those
# rows that have instlevel1 > 0.
columns = ['edjefe', 'edjefa', 'instlevel1', 'instlevel2']
data = df_income_train[df_income_train['SQBmeaned'].isnull()].head()

education_view = data[columns]
education_view[education_view['instlevel1'] > 0].describe()

In [None]:
# From the above, we find that SQBmeaned is null when no level of education is 0.
# Replace the missing values with 0 in both frames.
# Assign the result back rather than using fillna(inplace=True) on df['SQBmeaned']: the chained
# in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
for df in [df_income_train, df_income_test]:
    df['SQBmeaned'] = df['SQBmeaned'].fillna(0)
df_income_train[['SQBmeaned']].isnull().sum()

In [None]:
# Remaining missing values across every column of the training frame, largest first.
null_counts = df_income_train.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]
columns_with_nulls.sort_values(ascending=False)

In [None]:
# For every household, check whether all of its members carry a single Target value.
targets_per_household = df_income_train.groupby('idhogar')['Target'].nunique()
all_equal = targets_per_household == 1

# Keep the households where that is not the case.
not_equal = all_equal[~all_equal]
print('There are {} households where the family members do not all have the same target.'.format(len(not_equal)))

In [None]:
# Show the head flag and Target of every member of the first inconsistent household.
first_mixed_household = not_equal.index[0]
df_income_train.loc[df_income_train['idhogar'] == first_mixed_household, ['idhogar', 'parentesco1', 'Target']]

In [None]:
# The plan is to copy the head of household's Target to the other members, so first check
# that every household actually has a head.

# Number of rows flagged as head (parentesco1) in each household.
households_head = df_income_train.groupby('idhogar')['parentesco1'].sum()

# Rows belonging to households whose head count is zero.
headless_ids = households_head[households_head == 0].index
households_no_head = df_income_train.loc[df_income_train['idhogar'].isin(headless_ids), :]

print('There are {} households without a head.'.format(households_no_head['idhogar'].nunique()))

In [None]:
# Among the households without a head, count those whose members disagree on Target.
households_no_head_equal = households_no_head.groupby('idhogar')['Target'].nunique() == 1
mixed_headless_count = households_no_head_equal.eq(False).sum()
print('{} Households with no head have different Target value.'.format(mixed_headless_count))

In [None]:
# Fix the data: give every member of an inconsistent household the Target of its head.
for household in not_equal.index:
    in_household = df_income_train['idhogar'] == household
    is_head = df_income_train['parentesco1'] == 1

    # Target of the head of household. `.iloc[0]` replaces int(<Series>), which is deprecated
    # for single-element Series and raises TypeError when the selection is empty.
    head_target = df_income_train.loc[in_household & is_head, 'Target']
    if head_target.empty:
        # No head to take the label from: leave this household untouched.
        continue

    # Set the head's label for all members in the household.
    df_income_train.loc[in_household, 'Target'] = int(head_target.iloc[0])


# Re-run the consistency check: group by household and test for a single Target value.
all_equal = df_income_train.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)

# Households where targets are not all equal
not_equal = all_equal[all_equal != True]
print('There are {} households where the family members do not all have the same target.'.format(len(not_equal)))

### NOTE
Lets look at the dataset and plot head of household and Target

In [None]:
# 1 = extreme poverty 2 = moderate poverty 3 = vulnerable households 4 = non vulnerable households 
# Count heads of household per Target class, ordered by class label.
# NOTE(review): `heads` is the copy taken in the v18q1 cell, i.e. before the label fix; the fix
# only copies the head's own Target to the other members, so head labels should be unaffected.
target_counts = heads['Target'].value_counts().sort_index()
target_counts

In [None]:
# Bar chart of the per-class head-of-household counts computed in `target_counts`.
target_counts.plot.bar(figsize = (8, 6),linewidth = 2,edgecolor = 'k',title="Target vs Total_Count")

# Note
Extreme poverty has the smallest count in the train dataset, so the classes are imbalanced: the dataset is biased.

Lets look at the Squared Variables<br>
‘SQBescolari’<br>
‘SQBage’<br>
‘SQBhogar_total’<br>
‘SQBedjefe’<br>
‘SQBhogar_nin’<br>
‘SQBovercrowding’<br>
‘SQBdependency’<br>
‘SQBmeaned’<br>
‘agesq’<br>

In [None]:
# Remove the squared-variable columns from both frames, printing the train shape before and after.
cols = ['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe',
        'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq']

print(df_income_train.shape)
for frame in (df_income_train, df_income_test):
    frame.drop(columns=cols, inplace=True)
print(df_income_train.shape)

In [None]:
# Column groups used by the redundancy checks below.
# NOTE(review): the prefixes presumably mean ind_ = individual-level and hh_ = household-level,
# and the suffixes bool / ordered / cont the kind of value — confirm against the data dictionary.

# Row id, household id and the label.
id_ = ['Id', 'idhogar', 'Target']

ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone']

ind_ordered = ['rez_esc', 'escolari', 'age']

hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']

hh_ordered = [ 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1','r4m2','r4m3', 'r4t1',  'r4t2', 
              'r4t3', 'v18q1', 'tamhog','tamviv','hhsize','hogar_nin',
              'hogar_adul','hogar_mayor','hogar_total',  'bedrooms', 'qmobilephone']

hh_cont = ['v2a1', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding']

In [None]:
# Check for redundant household variables: keep the head-of-household rows and only the
# id and household-level columns.
household_columns = id_ + hh_bool + hh_cont + hh_ordered
heads = df_income_train.loc[df_income_train['parentesco1'] == 1, household_columns]
heads.shape

In [None]:
# Create the correlation matrix from the numeric columns only: `heads` still holds the string
# id columns, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_matrix = heads.select_dtypes('number').corr()

# Keep the strict upper triangle so each column pair is looked at once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

to_drop

In [None]:
# NOTE(review): looks like a pasted copy of the previous cell's `to_drop` result; the bare
# list is only displayed and assigns nothing.
['coopele', 'area2', 'tamhog', 'hhsize', 'hogar_total']

In [None]:
# Sub-matrix of the variables whose absolute correlation with tamhog is above 0.9.
strongly_tied_to_tamhog = corr_matrix['tamhog'].abs() > 0.9
corr_matrix.loc[strongly_tied_to_tamhog, strongly_tied_to_tamhog]

In [None]:
# Annotated heatmap of the variables whose absolute correlation with tamhog is above 0.9.
strongly_tied_to_tamhog = corr_matrix['tamhog'].abs() > 0.9
tamhog_block = corr_matrix.loc[strongly_tied_to_tamhog, strongly_tied_to_tamhog]
sns.heatmap(tamhog_block, annot=True, cmap=plt.cm.Accent_r, fmt='.3f');

# Note
There are several variables here having to do with the size of the house:<br>
 r4t3, Total persons in the household<br>
 tamhog, size of the household<br>
 tamviv, number of persons living in the household<br>
 hhsize, household size<br>
 hogar_total, # of total individuals in the household<br>
 These variables are all highly correlated with one another.<br>

In [None]:
# Drop three of the household-size columns from both frames.
cols = ['tamhog', 'hogar_total', 'r4t3']
for frame in (df_income_train, df_income_test):
    frame.drop(columns=cols, inplace=True)

df_income_train.shape

In [None]:
# Check for redundant individual variables: keep the id and individual-level columns.
individual_columns = id_ + ind_bool + ind_ordered
ind = df_income_train[individual_columns]
ind.shape

In [None]:
# Create the correlation matrix from the numeric columns only: `ind` still holds the string
# id columns, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_matrix = ind.select_dtypes('number').corr()

# Keep the strict upper triangle so each column pair is looked at once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

to_drop

In [None]:
# NOTE(review): 'female' and 'male' are presumably exact opposites, so one flag is enough;
# the 'male' column is the one removed here, from both frames.
for frame in (df_income_train, df_income_test):
    frame.drop(columns='male', inplace=True)

df_income_train.shape

In [None]:
# area1 and area2 as described in the data dictionary:
#   area1, =1 zona urbana
#   area2, =2 zona rural
# area2 is treated as redundant because area1 already says whether the house is in an urban
# zone, so it is removed from both frames.
for frame in (df_income_train, df_income_test):
    frame.drop(columns='area2', inplace=True)

df_income_train.shape

In [None]:
# Finally, remove the row and household identifiers from both frames.
cols = ['Id', 'idhogar']
for frame in (df_income_train, df_income_test):
    frame.drop(columns=cols, inplace=True)

df_income_train.shape

# <span style='background:blue;color:white'> Predict the accuracy using random forest classifier. </span>

In [None]:
# Display every column except the last one.
# NOTE(review): relies on Target being the last column — confirm against the column order.
df_income_train.iloc[:,0:-1]

In [None]:
# Display the last column.
# NOTE(review): expected to be Target — confirm against the column order.
df_income_train.iloc[:,-1]

In [None]:
# Separate the features from the label by column name. The previous positional slicing
# (iloc[:, 0:-1] / iloc[:, -1]) only works while Target happens to be the last column.
x_features = df_income_train.drop(columns='Target')  # features without target
y_features = df_income_train['Target']  # only target
print(x_features.shape)
print(y_features.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report

# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size=0.2, random_state=1)

# Seed the forest as well (same seed as the cross-validation cells): an unseeded
# RandomForestClassifier gives different trees, and different metrics, on every run.
rmclassifier = RandomForestClassifier(random_state=10)

<b>x_features, y_features:</b> The first parameter is the dataset you're selecting to use.<br>
<b>train_size</b>: This parameter sets the size of the training dataset. There are three options: None, which is the default, Int, which requires the exact number of samples, and float, which ranges from 0.1 to 1.0.<br>
<b>test_size</b>: This parameter specifies the size of the testing dataset. The default state suits the training size. It will be set to 0.25 if the training size is set to default.<br>
<b>random_state</b>: The default mode performs a random split using np.random. Alternatively, you can add an integer using an exact number.

In [None]:
# Fit the random forest on the training part of the hold-out split.
rmclassifier.fit(x_train,y_train)

In [None]:
# Predict the labels of the held-out rows.
y_predict = rmclassifier.predict(x_test)

In [None]:
# Print accuracy, confusion matrix and per-class report for the hold-out predictions.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_predict))

In [None]:
# Predict labels for the test frame with the fitted model.
# NOTE(review): assumes df_income_test has exactly the columns of x_features, in the same order.
y_predict_testdata = rmclassifier.predict(df_income_test)

In [None]:
# Display the predictions for the test frame.
y_predict_testdata

# <span style='background:blue;color:white'> Check the accuracy using random forest with cross validation. </span>

In [None]:
from sklearn.model_selection import KFold,cross_val_score

In [None]:
# 5-fold cross-validation with shuffled, seeded folds.
seed = 7
kfold = KFold(n_splits=5, random_state=seed, shuffle=True)

rmclassifier = RandomForestClassifier(random_state=10, n_jobs=-1)
# Run the cross-validation once and reuse the scores: the folds and the model are both seeded,
# so the second identical call only repeated the same computation.
results = cross_val_score(rmclassifier, x_features, y_features, cv=kfold, scoring='accuracy')
print(results)
print(results.mean()*100)

In [None]:
# Number of trees in the forest; previously defined but not passed to the classifier.
num_trees = 100

rmclassifier = RandomForestClassifier(n_estimators=num_trees, random_state=10, n_jobs=-1)
# Run the cross-validation once and reuse the scores instead of computing them twice.
results = cross_val_score(rmclassifier, x_features, y_features, cv=kfold, scoring='accuracy')
print(results)
print(results.mean()*100)

In [None]:
# Refit on all training rows and tabulate the importance of each feature,
# keeping only the features whose importance is above 0.015.
rmclassifier.fit(x_features, y_features)
labels = list(x_features)
importance_table = pd.DataFrame({'feature': labels, 'importance': rmclassifier.feature_importances_})
feature_importances = importance_table[importance_table['importance'] > 0.015]
feature_importances.head()

In [None]:
# Predict labels for the test frame with the model refitted on all training rows, then display them.
# NOTE(review): assumes df_income_test has exactly the columns of x_features, in the same order.
y_predict_testdata = rmclassifier.predict(df_income_test)
y_predict_testdata

In [None]:
# Order the retained features by importance and flag positive values for colouring.
# New frames are built instead of mutating in place: `feature_importances` is a filtered
# slice of another frame, and the in-place set_index made the cell fail with a KeyError
# when it was run a second time.
feature_importances = feature_importances.sort_values(by=['importance'], ascending=True)
feature_importances = feature_importances.assign(positive=feature_importances['importance'] > 0)
if 'feature' in feature_importances.columns:
    # On a re-run 'feature' is already the index, so this step is skipped.
    feature_importances = feature_importances.set_index('feature')

# Horizontal bar chart: blue for positive importances, red otherwise.
bar_colors = feature_importances['positive'].map({True: 'blue', False: 'red'})
feature_importances['importance'].plot(kind='barh', figsize=(11, 6), color=bar_colors)
plt.xlabel('Importance')

From the above figure, meaneduc, dependency and overcrowding have a significant influence on the model.
<br>----THE END ---