### Importing Libraries 

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
plt.style.use('ggplot')

import seaborn as sns

from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

# np.random.seed() returns None, so the integer is kept in its own variable
# (it is passed later as random_state) and NumPy's global RNG is seeded
# in a separate call.
seed = 21
np.random.seed(seed)

###  Loading Train and Test Data

In [None]:
# Read the training split, then show its shape and its first five rows.
data_train = pd.read_csv(
    "../input/costa-rican-household-poverty-prediction/train.csv"
)
display(data_train.shape, data_train.head(5))

In [None]:
# Read the test split, then show its shape and its first five rows.
data_test = pd.read_csv(
    "../input/costa-rican-household-poverty-prediction/test.csv"
)
display(data_test.shape, data_test.head(5))

### Inferences from the first view of data -
- There are 142 features and 1 Target field
- **Id** is the unique identifier for each datapoint
- **Target** is an ordinal variable indicating the income levels 
        - 1 : Extreme Poverty
        - 2 : Moderate Poverty
        - 3 : Vulnerable Households
        - 4 : Non-vulnerable Households
            
- **idhogar** is a unique identifier for each household. Hence, this feature should be used for household-level analysis.
- **parentesco1** indicates if the person is head of the family
- There are 9556 datapoints in Train Dataset and 23856 datapoints in Test Dataset



## Identify the output variable

The output variable is **Target** column from the dataset.

Test dataset does not have the **Target** column.

It has zero null values. Its datatype is int64.

Possible values of **Target** column -
1, 2, 3, 4






## Understand the type of data

In [None]:
# Per-column overview of the training frame: column name, number of nulls,
# dtype and number of distinct values.
# The table is built in one shot from column-wise aggregates instead of
# appending one row per loop iteration to an empty frame; this also keeps the
# count columns numeric rather than object-typed.
data_train_info = pd.DataFrame({
    'Name of Col': data_train.columns,
    'Num of Null': data_train.isnull().sum().values,
    'Dtype': data_train.dtypes.values,
    'N_Unique': data_train.nunique().values,
})
data_train_info

## Check if there are any biases in your dataset

In [None]:
# Number of rows for each distinct value of the Target column.
data_train["Target"].value_counts()

In [None]:
# Bar chart of the class distribution of Target.
# value_counts() counts rows, and every row is one person (households are
# identified by idhogar), so the y-axis is labelled as individuals.
# sort_index() orders the bars by Target level instead of by frequency.
target_counts = data_train['Target'].value_counts().sort_index()
target_counts.plot.bar(width = 0.4, color = 'c', edgecolor = 'k', linewidth = 1)
plt.xlabel("Target Values")
plt.ylabel("Count of Individuals")
plt.title("Target Column Spread")
plt.show()

From the above output, we infer that -

As the extreme poverty is the smallest count, hence the dataset is not biased.

##  Check whether all members of the house have the same poverty level.

In [None]:
# Boolean Series indexed by idhogar: True when every member of the household
# carries a single Target value.
poverty_equal = data_train.groupby('idhogar')['Target'].nunique().eq(1)
# Keep only the households whose members disagree on Target.
poverty_nequal = poverty_equal[~poverty_equal]
print("There are {} households where all the family members of the house do not have same poverty level.".format(len(poverty_nequal)))

## Check if there is a house without a family head.

In [None]:
# Sum of the parentesco1 flag per household (idhogar); a total of 0 means no
# row in that household is flagged as head.
household_head = data_train['parentesco1'].groupby(data_train['idhogar']).sum()

In [None]:
# All rows belonging to households whose head count is zero, followed by the
# number of such households.
household_without_head = data_train[
    data_train['idhogar'].isin(household_head.index[household_head == 0])
]
household_without_head["idhogar"].nunique()

There are 15 houses without a Family Head

## Set poverty level of the members and the head of the house within a family.

We found that there are 15 households without a head. 
First, let's check whether the members of these households all share the same Target (poverty level).

In [None]:
# Among the head-less households, flag those whose members all share one
# Target value, then report how many do not.
household_without_head_equal = (
    household_without_head.groupby('idhogar')['Target'].nunique().eq(1)
)
print("{} households without head have different PovertyLevel/Target Value.".format((~household_without_head_equal).sum()))

Hence, we need to focus on only the households with head for setting the poverty levels.

In [None]:
# Set every member's Target to the Target of the household head, for the
# households whose members disagree (poverty_nequal).
#
# The head's label is looked up once for all households. This replaces
# int(<Series>) inside the loop, which is deprecated for a Series and raises
# when a household has no row flagged as head.
head_target_by_household = (
    data_train.loc[data_train['parentesco1'] == 1, ['idhogar', 'Target']]
    .drop_duplicates(subset='idhogar')   # first head row wins if several exist
    .set_index('idhogar')['Target']
)

for household in poverty_nequal.index:
    if household not in head_target_by_household.index:
        # No head recorded, so there is no reference label: leave rows as-is.
        continue
    head_target = int(head_target_by_household.loc[household])
    # Overwrite the label of every member of this household.
    data_train.loc[data_train['idhogar'] == household, 'Target'] = head_target

##  Count how many null values are existing in columns.

In [None]:
# Rebuild the per-column overview (name, null count, dtype, distinct values)
# after the Target correction.
# Column-wise aggregates are assembled in a single DataFrame constructor call
# rather than appended row by row in a loop, which also keeps the count
# columns numeric.
data_train_info = pd.DataFrame({
    'Name of Col': data_train.columns,
    'Num of Null': data_train.isnull().sum().values,
    'Dtype': data_train.dtypes.values,
    'N_Unique': data_train.nunique().values,
})
data_train_info

In [None]:
# Only the columns that contain at least one null value.
data_train_info.loc[data_train_info["Num of Null"] > 0, :]

In [None]:
# Total number of null cells across all columns of the training frame.
data_train_info["Num of Null"].sum()

Looking at the different datatypes and null values, we infer that -

1. No Null values for integer datatype features.
2. No Null values for object datatype features.
3. Among the float64 features, the following have Null values -
   - v2a1 - 6860 values
   - v18q1 - 7342 values
   - rez_esc - 7928 values
   - meaneduc - 5 values
   - SQBmeaned - 5 values 

4. There are total 22140 Null values in the train dataset.


##  Remove null value rows of the target variable.

In [None]:
# Show the rows whose Target is missing, then drop them in place so that only
# labelled rows remain. The previous filter compared Target against 0, which is
# not a null check and removed nothing.
data_train[data_train["Target"].isnull()]
data_train.dropna(subset=["Target"], inplace=True)

Looking at the **Target** column, we observed that there are no null values in Target variable 

## Data Cleaning


### Treating Mixed Values 

In [None]:
# First five rows of the dependency, edjefe and edjefa columns.
data_train[["dependency", "edjefe", "edjefa"]].head()

Below listed features have Mixed values-

**dependency**, Dependency rate, calculated = (number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)

**edjefe**, years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0

**edjefa**, years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0

For these features, 
"yes" = 1
"no" = 0 

Let's correct these features using a map and then converting them to float

In [None]:
# Replace the textual 'yes'/'no' entries with 1/0 and cast the three
# mixed-value columns to float, in both the train and the test frame.
mapping = {'yes': 1, 'no': 0}

for frame in (data_train, data_test):
    for column in ('dependency', 'edjefe', 'edjefa'):
        frame[column] = frame[column].replace(mapping).astype(float)

# First rows of the treated columns
data_train[["dependency", "edjefe", "edjefa"]].head()

As identified in the null-value count above, the features listed below have NaN values - 

 - v2a1 - Monthly rent payment - 6860 values

 - v18q1 - number of tablets household owns - 7342 values

 - rez_esc -Years behind in school - 7928 values

 - meaneduc - average years of education for adults (18+) - 5 values

 - SQBmeaned - square of the mean years of education of adults (>=18) in the household - 5 values

**---------------------------------------------------------------------------------------------------------------------------**

Lets fix v2a1 feature first. Features related to v2a1 (monthly rent payment) -

tipovivi1, =1 own and fully paid house

tipovivi2, =1 own, paying in installments

tipovivi3, =1 rented

tipovivi4, =1 precarious

tipovivi5, =1 other(assigned, borrowed)



In [None]:
# First five rows where the rent payment (v2a1) is missing, shown next to the
# home-ownership flags.
data = data_train.loc[data_train['v2a1'].isnull()].head()
data[["v2a1", "tipovivi1", "tipovivi2", "tipovivi3", "tipovivi4", "tipovivi5"]]

In [None]:
# Columns whose name starts with 'tipo' (the home-ownership flags)
own_feature = [column for column in data_train.columns if column.startswith('tipo')]

# For the rows with a missing v2a1, total each ownership flag and plot the sums
rent_missing = data_train['v2a1'].isnull()
data_train.loc[rent_missing, own_feature].sum().plot.bar(figsize=(5, 3), color='c', edgecolor='black', linewidth=2)

plt.xticks([0, 1, 2, 3, 4],
           ['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other'],
           rotation=20, size=8)
plt.title('Home-ownership status for Households Missing Rent Payments', size=12)

From the above counts we infer that, if the house is owned and paid off, then the house rent should be 0. 
Lets add 0 for all the **NaN** values.

In [None]:
# Fill missing rent payments (v2a1) with 0 in both frames.
# The result is assigned back to the column: calling fillna(inplace=True) on
# frame['v2a1'] acts on an intermediate Series, which is deprecated and leaves
# the frame unchanged when pandas Copy-on-Write is active.
for data in [data_train, data_test]:
    data['v2a1'] = data['v2a1'].fillna(0)

# Remaining nulls in v2a1 (expected to be 0)
data_train[['v2a1']].isnull().sum()

Now, lets fix v18q1 (7342 NaN values) -  number of tablets household owns

Lets analyse few rows with v18q1 feature as NaN, to understand the related features. 
Since this is an household level feature,
therefore we consider the rows for head of the household.


In [None]:
# v18q and v18q1 for the rows flagged as household head (first five rows).
data_train.loc[data_train['parentesco1'].eq(1), ["v18q", "v18q1"]].head()


In [None]:
# Number of rows for each non-null value of v18q1.
data_train.groupby("v18q1")['v18q1'].count()

Looking at the above data, we infer that when the **owns a tablet** column is 0, the household owns no tablets, so the number of tablets is missing.

So, lets add 0 for all the NaN values.

In [None]:
# Fill the missing tablet counts (v18q1) with 0 in both frames.
# The result is assigned back to the column: fillna(inplace=True) on
# df['v18q1'] acts on an intermediate Series, which is deprecated and leaves
# the frame unchanged when pandas Copy-on-Write is active.
for df in [data_train, data_test]:
    df['v18q1'] = df['v18q1'].fillna(0)

# Remaining nulls in v18q1 (expected to be 0)
data_train['v18q1'].isnull().sum()

Now lets treat rez_esc(Years behind in school) feature - 7928 NaN values

In [None]:
# Summary statistics of age for the rows where rez_esc is present.
data_train.loc[data_train['rez_esc'].notnull(), 'age'].describe()

From the above, we infer that **Years behind in school** is only populated for ages between 7 and 17 years.
Let's check if there are any NaN values for ages 7 to 17.

In [None]:
# Rows with a missing rez_esc whose age lies in the 7-17 range.
# between() includes both endpoints; the previous strict comparisons
# (age > 7 and age < 17) skipped ages 7 and 17, which belong to the range
# being checked.
data_train[data_train['rez_esc'].isna() & data_train['age'].between(7, 17)]

There is only one value as NaN in age group 7 to 17 years.
Hence, now we can fill the NaN values with 0.

In [None]:
# Fill the missing rez_esc values with 0 in both frames.
# The result is assigned back to the column: fillna(inplace=True) on
# data['rez_esc'] acts on an intermediate Series, which is deprecated and
# leaves the frame unchanged when pandas Copy-on-Write is active.
for data in [data_train, data_test]:
    data['rez_esc'] = data['rez_esc'].fillna(0)

# Remaining nulls in rez_esc (expected to be 0)
data_train['rez_esc'].isnull().sum()

Lets analyze and treat **meaneduc** - average years of education for adults (18+) - 5 values

Few related features -
- edjefe, years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
- edjefa, years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
- instlevel1, =1 no level of education
- instlevel2, =1 incomplete primary


In [None]:
# Education-related columns for the rows where meaneduc is NaN
education_cols = ['age', 'meaneduc', 'edjefe', 'edjefa'] + ['instlevel{}'.format(level) for level in range(1, 10)]
data_train.loc[data_train['meaneduc'].isnull(), education_cols]

In [None]:
# The same education-related columns (with Id) for the first five rows overall
preview_cols = ['Id', 'meaneduc', 'edjefe', 'edjefa'] + ['instlevel{}'.format(level) for level in range(1, 10)]
data_train[preview_cols].head()

From above outputs we infer that -
There are five datapoints with **meaneduc** as NaN. And all have 18+ age.
The value of **meaneduc** feature is same as 'edjefe' if the person is male and 'edjefa' if the person is female for majority of datapoints.

Hence, we treat the 5 NaN values in similar way. 


In [None]:
# Fill the missing meaneduc values following the rule stated in the notes:
# use edjefe when the person is male and edjefa otherwise. The previous code
# always used edjefe.
# NOTE(review): assumes male == 1 marks a male person - confirm against the
# data dictionary.
# The result is assigned back to the column: fillna(inplace=True) on
# data['meaneduc'] acts on an intermediate Series, which is deprecated and
# leaves the frame unchanged when pandas Copy-on-Write is active.
for data in [data_train, data_test]:
    head_education = data['edjefe'].where(data['male'] == 1, data['edjefa'])
    data['meaneduc'] = data['meaneduc'].fillna(head_education)

# Remaining nulls in meaneduc
data_train['meaneduc'].isnull().sum()

Now, finally lets treat SQBmeaned - square of the mean years of education of adults (>=18) in the household - 5 values

First understand the dependent features to analyze why the 5 values are NaN -

In [None]:
# Rows where SQBmeaned is NaN, restricted to the related education columns
data_train.loc[
    data_train['SQBmeaned'].isnull(),
    ['SQBmeaned', 'meaneduc', 'edjefe', 'edjefa', 'instlevel1', 'instlevel2'],
]

In [None]:
# The same related columns for the first five rows overall
data_train[['SQBmeaned', 'meaneduc', 'edjefe', 'edjefa', 'instlevel1', 'instlevel2']].head()

As per **SQBmeaned** description and above outputs, it appears that SQBmeaned is square of the **meaneduc**. 
Hence, treating the NaN accordingly -

In [None]:
# Fill the missing SQBmeaned values with the square of meaneduc in both frames.
# The result is assigned back to the column: fillna(inplace=True) on
# data['SQBmeaned'] acts on an intermediate Series, which is deprecated and
# leaves the frame unchanged when pandas Copy-on-Write is active.
for data in [data_train, data_test]:
    data['SQBmeaned'] = data['SQBmeaned'].fillna(data['meaneduc'] ** 2)

# Remaining nulls in SQBmeaned
data_train['SQBmeaned'].isnull().sum()

There are some Squared Variables and we understand that these would not add any value to the classification model.
Hence dropping these features -
SQBescolari, SQBage, SQBhogar_total, SQBedjefe, SQBhogar_nin, SQBovercrowding, SQBdependency, SQBmeaned, agesq

In [None]:
# Remove the squared features from both frames (in place), then report shapes.
cols = [
    'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
    'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq',
]

for frame in (data_train, data_test):
    frame.drop(cols, axis=1, inplace=True)

print(data_train.shape, data_test.shape)

In [None]:
# Keep one row per household head (parentesco1 == 1) for the redundancy check.
heads = data_train[data_train['parentesco1'] == 1]
heads.shape

In [None]:
# Pairwise correlation between the numeric columns of the head-of-household rows.
# Non-numeric columns are excluded explicitly: from pandas 2.0 onwards
# DataFrame.corr() no longer drops them silently and raises on object columns.
corr_matrix = heads.select_dtypes(include='number').corr()
corr_matrix

In [None]:
# Keep only the strict upper triangle of corr_matrix (k=1 excludes the
# diagonal); every other cell becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

In [None]:
# Columns having at least one absolute correlation above 0.95 in the upper triangle
cols_to_drop = upper.columns[(upper.abs() > 0.95).any()].tolist()

cols_to_drop

In [None]:
# Drop, from both frames, the highly correlated columns found above, the
# 'male' column and the two identifier columns.
#
# errors='ignore' keeps the cell from failing when a column is already gone:
# 'male' may itself be listed in cols_to_drop, a column flagged on the training
# frame may be absent from the test frame, and the cell may be re-run.
# Target is never dropped, even if the correlation filter flagged it.
cols = ['Id', 'idhogar']
columns_to_remove = [c for c in cols_to_drop if c != 'Target'] + ['male'] + cols

for df in [data_train, data_test]:
    df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

print(data_train.shape, data_test.shape)

##  Predict the accuracy using random forest classifier.

In [None]:
# Target column is the label; every remaining column is a feature.
y = data_train['Target']
X = data_train.drop(columns=['Target'])

In [None]:
# Report the dimensions of the feature matrix and of the label vector.
shape_message = "Shape of X is {}, and shape of y is {}"
print(shape_message.format(X.shape, y.shape))

In [None]:
# Hold out 10% of the rows for evaluation.
# stratify=y keeps the Target class proportions the same on both sides of the
# split; random_state makes the split repeatable when seed is an integer.
# NOTE(review): rows of the same household can land on both sides because
# idhogar was dropped earlier - consider a group-aware split.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=seed
)

In [None]:
# Random forest with 150 trees. random_state ties the bootstrap sampling and
# feature sub-sampling to the notebook seed so the fitted model is repeatable
# when seed is an integer.
rfc_model = RandomForestClassifier(n_estimators=150, random_state=seed)

In [None]:
# Train the forest on the training split.
rfc_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted Target values for the held-out split.
y_pred_test = rfc_model.predict(X=X_test)

In [None]:
# Print accuracy, confusion matrix and classification report for the held-out
# split, in that order.
print("Accuracy score of RFC model on test dataset is : ")
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred_test))

 ##  Check the accuracy using random forest with cross validation.

In [None]:
# 4-fold shuffled cross-validation; print the accuracy obtained on each fold.
kfold = KFold(n_splits=4, shuffle=True, random_state=seed)
print(cross_val_score(estimator=rfc_model, X=X, y=y, scoring='accuracy', cv=kfold))

In [None]:
# Average accuracy over the folds (the cross-validation is run again here).
print(np.mean(cross_val_score(rfc_model, X, y, scoring='accuracy', cv=kfold)))

## Checking for the important and impactful features

In [None]:
# Pair each feature name with the importance reported by the fitted forest.
feature_labels = X.columns.tolist()
feature_importance = pd.DataFrame({
    'Feature': feature_labels,
    'Importance': rfc_model.feature_importances_,
})

In [None]:
# Features whose importance exceeds 0.025.
feature_importance.query("Importance > 0.025")

Note - From the above we infer that features **room, hogar_nin, dependency, edjefe, meaneduc, overcrowding and qmobilephone** 
play an important role deciding the Income Qualification Level.

## End of the Notebook