In [29]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import tqdm.notebook as tq
import scipy.cluster.hierarchy as spc
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.feature_selection import VarianceThreshold
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split as tts, cross_val_score as cv, RepeatedStratifiedKFold as rsk
from sklearn.ensemble import RandomForestClassifier as rf, ExtraTreesClassifier as et, BaggingClassifier as bc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.utils import class_weight
import lightgbm as lgb

In [30]:
test_df = pd.read_csv("test_df_renamed_new.csv")

In [31]:
column = list((test_df.columns.drop('=1 if household head')))
column.append('=1 if household head')

In [32]:
test_df = test_df[column]
test_df = test_df.drop(columns = ['Household level identifier','Id'])

In [33]:
test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level,=1 if household head
0,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,4,17,3,3,3,1,0
1,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,41,17,3,3,3,8,0
2,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,41,17,3,3,3,9,1
3,0.0,0,14,0,1,1.0,0,1,1,0,...,0,0,0,59,16,3,3,3,8,1
4,175000.0,0,4,0,1,1.0,0,0,0,0,...,0,0,0,18,11,2,3,3,5,1


In [34]:
#need to normalise some of the columns
#need to normalise some of the columns
def prepData(df,test_size = 0.3,normalize = True,include_eval = False):
    unnormal_cols = selectUnnormalised(df)
    x = df.drop("=1 if household head",axis = 1)
    y = df['=1 if household head'] -1
    xtrain, xtest, ytrain, ytest = tts(x,y,test_size = test_size, random_state = 42)
    if include_eval:
        xtest, x_eval, ytest, y_eval = tts(xtest,ytest,test_size = 0.1/0.3, random_state = 42)
        return xtest, xtest, x_eval, ytest, ytest, y_eval
    else:
        if normalize:
            xtrain, xtest = normalise(unnormal_cols, xtrain, xtest)
            ytrain, ytest = ytrain.values.ravel(), ytest.values.ravel()
            return  xtrain, xtest, ytrain, ytest
        return  xtrain, xtest, ytrain, ytest
    
#getting the columns with non-normalised values
def selectUnnormalised(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    #remove target because we shouldn't normalise that, but rather encode
    return unnormCols

#normalising data in testing set
def normalise(unnormCols, xtest, xTest):
    #normalise testing data
    toBeNorm_test = xtest[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm_test)
    xtest_norm = std_scale.transform(toBeNorm_test)
    
    #covert numpy array to df
    xtest_normCol = pd.DataFrame(xtest_norm, index = toBeNorm_test.index,
                                 columns = toBeNorm_test.columns)
    xtest.update(xtest_normCol)
    
    #normalise test data using mean and SD of testing set
    toBeNorm_test = xTest[[i for i in unnormCols]]
    xTest_norm = std_scale.transform(toBeNorm_test)
    xTest_normCol = pd.DataFrame(xTest_norm, index = toBeNorm_test.index,
                                columns = toBeNorm_test.columns)
    xTest.update(xTest_normCol)
    
    return xtest, xTest


In [35]:
xtrain, xtest, ytrain, ytest = prepData(test_df,test_size = 0.1)

In [36]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03,
                             random_state=None, silent=True, 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf.fit(xtrain.values, ytrain, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf.predict(xtest)



[100]	valid_0's binary_logloss: 0.0523509
[200]	valid_0's binary_logloss: 0.0231232
[300]	valid_0's binary_logloss: 0.0206546
[400]	valid_0's binary_logloss: 0.0224787
[500]	valid_0's binary_logloss: 0.0250928
[600]	valid_0's binary_logloss: 0.0277075


In [37]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n", cm)
# print("macro F1 score: \n", f1)
print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(ytest, y_pred, average = 'macro')))

confusion matrix: 
 [[1606   15]
 [   1  764]]
Classification accuracy: 0.99
F1 score : 0.99


# Machine Learning Testing

In [38]:
train_df = pd.read_csv("train_df_renamed_new.csv")
train_df = train_df.drop(columns = ['Household level identifier','Id'])
train_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level,Target
0,190000.0,0,3,0,1,0.0,0,1,1,0,...,0,0,0,43,10,2,1,1,4,4
1,135000.0,0,4,0,1,1.0,0,1,1,0,...,0,0,0,67,12,2,2,2,8,4
2,0.0,0,8,0,1,0.0,0,0,0,0,...,0,0,0,92,11,2,3,3,5,4
3,180000.0,0,5,0,1,1.0,0,2,2,1,...,0,0,0,17,11,3,3,3,4,4
4,180000.0,0,5,0,1,1.0,0,2,2,1,...,0,0,0,37,11,3,3,3,5,4


In [39]:
test_df = pd.read_csv("test_df_renamed_new.csv")
identity = test_df.iloc[:, 1:2]
test_df = test_df.drop(columns = ['Household level identifier','Id'])
test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Brunca,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level
0,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,4,17,3,3,3,1
1,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,41,17,3,3,3,8
2,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,41,17,3,3,3,9
3,0.0,0,14,0,1,1.0,0,1,1,0,...,0,0,0,0,59,16,3,3,3,8
4,175000.0,0,4,0,1,1.0,0,0,0,0,...,0,0,0,0,18,11,2,3,3,5


## Helper Functions

In [40]:
#need to normalise some of the columns
#need to normalise some of the columns
def prepData(df,test_size = 0.3,normalize = True,include_eval = False):
    unnormal_cols = selectUnnormalised(df)
    x = df.drop("Target",axis = 1)
    y = df['Target'] -1
    xtrain, xtest, ytrain, ytest = tts(x,y,test_size = test_size, random_state = 42)
    if include_eval:
        xtest, x_eval, ytest, y_eval = tts(xtest,ytest,test_size = 0.1/0.3, random_state = 42)
        return xtest, xtest, x_eval, ytest, ytest, y_eval
    else:
        if normalize:
            xtrain, xtest = normalise(unnormal_cols, xtrain, xtest)
            ytrain, ytest = ytrain.values.ravel(), ytest.values.ravel()
            return  xtrain, xtest, ytrain, ytest
        return  xtrain, xtest, ytrain, ytest
    
    
    

#getting the columns with non-normalised values
def selectUnnormalised(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    #remove target because we shouldn't normalise that, but rather encode
    unnormCols.remove('Target')
    return unnormCols

#normalising data in testing set
def normalise(unnormCols, xtest, xTest):
    #normalise testing data
    toBeNorm_test = xtest[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm_test)
    xtest_norm = std_scale.transform(toBeNorm_test)
    
    #covert numpy array to df
    xtest_normCol = pd.DataFrame(xtest_norm, index = toBeNorm_test.index,
                                 columns = toBeNorm_test.columns)
    xtest.update(xtest_normCol)
    
    #normalise test data using mean and SD of testing set
    toBeNorm_test = xTest[[i for i in unnormCols]]
    xTest_norm = std_scale.transform(toBeNorm_test)
    xTest_normCol = pd.DataFrame(xTest_norm, index = toBeNorm_test.index,
                                columns = toBeNorm_test.columns)
    xTest.update(xTest_normCol)
    
    return xtest, xTest



In [41]:
# xtest, xtest, ytest, ytest = prepData(test_df,normalize=False)
xtest, xtest, ytest, ytest = prepData(train_df,test_size = 0.1)

## LightGBM

In [42]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf.fit(xtest.values, ytest, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf.predict(xtest)



[100]	valid_0's multi_logloss: 0.81775
[200]	valid_0's multi_logloss: 0.654172
[300]	valid_0's multi_logloss: 0.532157
[400]	valid_0's multi_logloss: 0.437504
[500]	valid_0's multi_logloss: 0.363236
[600]	valid_0's multi_logloss: 0.30561
[700]	valid_0's multi_logloss: 0.256978
[800]	valid_0's multi_logloss: 0.215759
[900]	valid_0's multi_logloss: 0.181824
[1000]	valid_0's multi_logloss: 0.153171
[1100]	valid_0's multi_logloss: 0.129897
[1200]	valid_0's multi_logloss: 0.110145
[1300]	valid_0's multi_logloss: 0.0931238
[1400]	valid_0's multi_logloss: 0.0793482
[1500]	valid_0's multi_logloss: 0.0677099
[1600]	valid_0's multi_logloss: 0.0579041
[1700]	valid_0's multi_logloss: 0.0493417
[1800]	valid_0's multi_logloss: 0.0419994
[1900]	valid_0's multi_logloss: 0.0360109
[2000]	valid_0's multi_logloss: 0.0310027
[2100]	valid_0's multi_logloss: 0.0267503
[2200]	valid_0's multi_logloss: 0.0231668
[2300]	valid_0's multi_logloss: 0.020105
[2400]	valid_0's multi_logloss: 0.0173733
[2500]	valid_0's

In [43]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n", cm)
# print("macro F1 score: \n", f1)
print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(ytest, y_pred, average = 'macro')))

confusion matrix: 
 [[ 79   0   0   0]
 [  0 156   0   0]
 [  0   0 131   0]
 [  0   0   0 590]]
Classification accuracy: 1.00
F1 score : 1.00


In [44]:
final_df = copy.deepcopy(xtest)
final_df['Target'] = ytest
final_df['Pred'] = y_pred
final_df

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level,Target,Pred
8503,-0.421354,0,-0.652043,0,1,-0.467602,-0.571187,-1.506516,-1.642677,-0.573838,...,0,1,1.184136,-0.450839,-2.180114,-2.055057,-2.299565,-0.388781,1,1
7752,-0.421354,0,-0.652043,0,1,-0.467602,-0.571187,0.426580,0.045033,-0.573838,...,0,1,-0.570200,-1.133406,-2.180114,-0.644399,-0.811225,0.501977,0,0
9350,-0.421354,0,-0.652043,0,1,-0.467602,0.901028,0.426580,0.888888,-0.573838,...,1,1,-1.170368,-1.360929,-2.180114,-0.644399,-2.299565,-0.834159,0,0
8414,-0.421354,0,0.026590,0,1,-0.467602,0.901028,-0.539968,0.045033,-0.573838,...,0,1,0.445468,-0.450839,0.791073,0.766259,0.677116,-0.388781,2,2
7671,-0.421354,0,0.026590,0,1,-0.467602,0.901028,-0.539968,0.045033,3.765767,...,0,1,-1.124201,0.686773,-2.180114,-2.055057,-2.299565,-0.834159,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,-0.421354,0,1.383856,0,1,-0.467602,-0.571187,1.393128,0.888888,-0.573838,...,0,0,-0.200866,0.459251,0.791073,0.766259,0.677116,-0.388781,3,3
1982,2.215327,0,0.026590,0,1,3.823125,-0.571187,0.426580,0.045033,-0.573838,...,0,0,-0.893367,1.369341,0.791073,-0.644399,0.677116,0.947355,3,3
4387,0.285800,0,-1.330676,0,1,-0.467602,0.901028,-0.539968,0.045033,0.872697,...,0,0,-0.108533,0.004206,0.791073,0.766259,0.677116,0.056598,3,3
5489,-0.421354,0,0.026590,0,1,-0.467602,-0.571187,0.426580,0.045033,-0.573838,...,0,1,1.184136,-0.450839,0.791073,0.766259,0.677116,-0.388781,3,3


In [45]:
final_hh_df = final_df[final_df["=1 if household head"]==1]
# print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(final_hh_df['Target'].values, final_hh_df['Pred'].values, average = 'macro')))

F1 score : 1.00


### Just household heads

In [46]:
train_hh_df = train_df[train_df["=1 if household head"]==1]

In [47]:
train_hh_df = train_hh_df.drop("=1 if household head",axis =1)

In [48]:
xTrain, xTest, yTrain, yTest = prepData(train_hh_df,test_size = 0.3)

In [49]:
len(yTrain)

2081

In [50]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  1.0, min_child_samples = 20, num_leaves = 31, subsample = 1.0)

# fit data into the model and predict the test set
lgb_clf.fit(xTrain.values, yTrain, eval_set=[(xTest.values, yTest)], 
            early_stopping_rounds=400, verbose=100)
yPred = lgb_clf.predict(xTest.values)



[100]	valid_0's multi_logloss: 0.979315
[200]	valid_0's multi_logloss: 0.962947
[300]	valid_0's multi_logloss: 0.986862
[400]	valid_0's multi_logloss: 1.0429
[500]	valid_0's multi_logloss: 1.10682


In [51]:
cm = confusion_matrix(yTest, yPred)
f1 = f1_score(yTest, yPred, average='macro')
print("confusion matrix: \n", cm)
print("macro F1 score: \n", f1)
print('recall: ', recall_score(yTest, yPred, average='macro'))
print('precision: ', precision_score(yTest, yPred, average='macro'))
# print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
# print("F1 score : {}".format(f1_score(ytest.values, y_pred, average = 'macro')))

confusion matrix: 
 [[ 26  22  11   9]
 [ 24  58  24  34]
 [  8  26  27  47]
 [ 22  53  69 432]]
macro F1 score: 
 0.43803447151696717
recall:  0.44915966386554623
precision:  0.43086823783484896


In [52]:
list(test_df)

['Monthly rent payment',
 '=1 Overcrowding by bedrooms',
 ' number of all rooms in the house',
 '=1 Overcrowding by rooms',
 '=1 if the household has refrigerator',
 'number of tablets household owns',
 'Males younger than 12 years of age',
 'Males 12 years of age and older',
 'Total males in the household',
 'Females younger than 12 years of age',
 'Females 12 years of age and older',
 'Total females in the household',
 'persons younger than 12 years of age',
 'persons 12 years of age and older',
 'years of schooling',
 'Years behind in school',
 '=1 if predominant material on the outside wall is block or brick',
 '=1 if predominant material on the outside wall is socket (wood, zinc or absbesto',
 '=1 if predominant material on the outside wall is prefabricated or cement',
 '=1 if predominant material on the outside wall is waste material',
 '=1 if predominant material on the outside wall is wood ',
 '=1 if predominant material on the outside wall is zink',
 '=1 if predominant materia

# Submission