In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split as tts, cross_val_score as cv, RepeatedStratifiedKFold as rsk
from sklearn.ensemble import RandomForestClassifier as rf, ExtraTreesClassifier as et, BaggingClassifier as bc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.utils import class_weight
import lightgbm as lgb

In [None]:
# NOTE(review): `%cd` is invoked without a target directory; presumably this
# resets the working directory (to the home directory) - confirm the cell is still needed.
%cd 

In [None]:
# List the contents of the current working directory (exploration aid only).
%ls

In [None]:
# Read the renamed train / test CSV files from the Kaggle input directory.
train_csv_path = "/kaggle/input/halooo/train_df_renamed_1.csv"
test_csv_path = "/kaggle/input/halooo/test_df_renamed_1.csv"
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# For each household, flag whether all of its rows carry one single Target label.
targets_by_household = df_train.groupby('Household level identifier')['Target']
all_equal = targets_by_household.apply(lambda labels: labels.nunique() == 1)

# Keep only the households whose members disagree on the label.
not_equal = all_equal.loc[all_equal != True]
print('No of households where target values are not all the same: %s' % len(not_equal))

In [None]:
# Example: display the rows of the first household whose Target labels disagree.
# (Author's note kept from the original cell: COMMENT OUT)
example_household = not_equal.index[0]
example_columns = ['Household level identifier', '=1 if household head', 'Target']
df_train[df_train['Household level identifier'] == example_household][example_columns]

In [None]:
# Sum of the head flag per household; a sum of 0 means no row is marked as head.
households_with_head = df_train.groupby('Household level identifier')['=1 if household head'].sum()

# Collect every row that belongs to a household without a head.
headless_ids = households_with_head[households_with_head == 0].index
households_no_heads = df_train.loc[df_train['Household level identifier'].isin(headless_ids), :]
print('No. of households with no heads: %s' %households_no_heads['Household level identifier'].nunique())

In [None]:
# Among the head-less households, flag those whose rows all share one Target label.
households_nh_equal = (
    households_no_heads
    .groupby('Household level identifier')['Target']
    .apply(lambda labels: labels.nunique() == 1)
)

In [None]:
# Count the head-less households whose members carry more than one label.
n_headless_mixed = sum(households_nh_equal == False)
print('No. of households with no heads & have different labels: %s' % n_headless_mixed)

In [None]:
# (rows, columns) of the training frame as loaded, before the relabelling and column drops below.
df_train.shape

In [None]:
# Make the Target consistent inside each household: every member of a household
# with disagreeing labels receives the label of that household's head.
for h in not_equal.index:
    in_household = df_train['Household level identifier'] == h
    is_head = df_train['=1 if household head'] == 1
    head_labels = df_train.loc[in_household & is_head, 'Target']
    if head_labels.empty:
        # No head row to copy the label from: leave the household untouched
        # rather than crash on an empty selection.
        continue
    # Pull the scalar out explicitly. int() applied to a whole Series is
    # deprecated in pandas and fails when the selection is not exactly one row.
    label = int(head_labels.iloc[0])
    df_train.loc[in_household, 'Target'] = label

# Per-household consistency flags after the relabelling.
all_equal_1 = df_train.groupby('Household level identifier')['Target'].apply(lambda x:x.nunique()==1)

In [None]:
# Re-run the consistency check to see how many households still disagree after relabelling.
a = (
    df_train
    .groupby('Household level identifier')['Target']
    .apply(lambda labels: labels.nunique() == 1)
)
not_equal = a.loc[a != True]
print('No of households where target values are not all the same: %s' % len(not_equal))

In [None]:
# Class distribution: count and share of every label found in the last column.
target = df_train.values[:, -1]
counter = Counter(target)
n_rows = len(target)
for label, count in counter.items():
    print('Class = %s, Count = %d, Percentage = %.3f%%' %(label, count, count/n_rows*100))

So we see that the classes are not equally distributed. This indicates that we may need techniques for imbalanced multiclass classification, such as adding class weights.

In [None]:
# Drop the contiguous run of columns from 'escolari squared' through 'Age squared'.
squared_cols = df_train.loc[:, 'escolari squared':'Age squared'].columns
df_train.drop(columns = squared_cols, inplace = True)
df_train.head()

We do not need to include the household-level identifier and Id during training, as they are irrelevant to the prediction. Therefore, we drop them too.

In [None]:
# Remove the two identifier columns; they are not used as features.
id_cols = ['Household level identifier', 'Id']
df_train.drop(columns = id_cols, inplace = True)
df_train.head()

## Functions for preprocessing data

In [None]:
#need to normalise some of the columns
def prepData(df):
    """Split a labelled frame into normalised train / test arrays.

    The label is the 'Target' column and every other column is a feature.
    Rows are split 70/30 with a fixed seed (random_state = 42); the non-binary
    feature columns are then standardised by ``normalise``.

    NOTE(review): the split is a plain row-level split without stratification
    or grouping. With imbalanced classes, and several rows per household, a
    stratified / grouped split may give a more honest estimate - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and a 'Target' column.

    Returns
    -------
    tuple
        ``(xTrain, xTest, yTrain, yTest)``: two 2-D feature arrays followed by
        two 1-D label arrays.
    """
    unnormal_cols = selectUnnormalised(df)
    # Pick the label by name instead of by position, so the result stays
    # correct even when 'Target' is not the last column of the frame.
    x = df.drop(columns = 'Target')
    y = df['Target']
    xtrain, xtest, ytrain, ytest = tts(x, y, test_size = 0.3, random_state = 42)
    xTrain, xTest = normalise(unnormal_cols, xtrain, xtest)
    yTrain, yTest = ytrain.values.ravel(), ytest.values.ravel()
    return xTrain.values, xTest.values, yTrain, yTest

#getting the columns with non-normalised values
def selectUnnormalised(df):
    """Return the names of the feature columns that still need normalising.

    A column is treated as already normalised when every value in it is 0 or 1.
    'Target' is never returned: it is the label and must be encoded rather than
    scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Non-binary column names other than 'Target', in the order in which they
        appear in ``df``. The order is deterministic.
    """
    binary_cols = set(df.columns[df.isin([0,1]).all()]) #getting the binary columns

    # Filtering (instead of list.remove) also copes with a frame where 'Target'
    # is missing or happens to be binary itself.
    return [col for col in df.columns
            if col not in binary_cols and col != 'Target']

#normalising data in training set
def normalise(unnormCols, xTrain, xTest):
    """Standardise the selected columns of a train / test pair.

    The scaler is fitted on the training rows only; the test rows are
    transformed with the training mean and standard deviation.

    Parameters
    ----------
    unnormCols : list
        Names of the columns to standardise. May be empty.
    xTrain, xTest : pandas.DataFrame
        Feature frames. Neither is modified; scaled copies are returned.

    Returns
    -------
    tuple
        ``(xTrain, xTest)`` as new frames, with the selected columns replaced by
        their standardised (float) values and all other columns unchanged.
    """
    cols = list(unnormCols)

    # Work on copies: the inputs are usually slices produced by a split, and
    # writing into them in place mutates caller data and triggers pandas
    # chained-assignment warnings.
    xTrain, xTest = xTrain.copy(), xTest.copy()
    if not cols:
        # Nothing to scale (fitting a scaler on zero columns would fail).
        return xTrain, xTest

    #fit on the training data only
    ss = StandardScaler()
    std_scale = ss.fit(xTrain[cols])

    #normalise both frames using mean and SD of the training set
    for frame in (xTrain, xTest):
        scaled = pd.DataFrame(std_scale.transform(frame[cols]),
                              index = frame.index, columns = cols)
        # Replacing whole columns switches them to float cleanly instead of
        # writing float values into integer columns.
        frame[cols] = scaled

    return xTrain, xTest



## Functions for training models

In [None]:
#training models
# we start with bagging classifier
def trainBG(xtrain, xtest, ytrain,  ytest,
            num_estimators = (500,800,1000,1250,1650),
            max_feature = (20,50,75,100,129),
            max_sample = (100,200,225,275,300)):
    """Tune a bagging classifier one hyperparameter at a time.

    Three sweeps run in sequence: ``n_estimators`` (with the first entries of
    ``max_feature`` and ``max_sample`` held fixed), then ``max_features`` (with
    the best ``n_estimators``), then ``max_samples`` (with the best of both).
    "Best" means highest accuracy on ``xtest``/``ytest``; ties go to the
    earliest grid entry.

    NOTE(review): the winning values are chosen on the same split whose scores
    are reported afterwards, so those scores are probably optimistic. A
    separate validation split or cross-validation would avoid this.
    NOTE(review): every ``max_feature`` entry must not exceed the number of
    columns in ``xtrain``.

    Parameters
    ----------
    xtrain, xtest : array-like
        Feature matrices used for fitting and scoring.
    ytrain, ytest : array-like
        Matching label vectors.
    num_estimators, max_feature, max_sample : sequence of int, optional
        Candidate values for each hyperparameter. The defaults are the grids
        that were previously hard-coded.

    Returns
    -------
    tuple
        ``(accEst_train, accEst_test, accFeat_train, accFeat_test,
        accSam_train, accSam_test, best_est, best_numFeat, best_numSam)``:
        train / test accuracy lists for the three sweeps, followed by the
        selected value of each hyperparameter.

    Raises
    ------
    ValueError
        If any of the three grids is empty.
    """
    num_estimators = list(num_estimators)
    max_feature = list(max_feature)
    max_sample = list(max_sample)
    if not (num_estimators and max_feature and max_sample):
        raise ValueError("num_estimators, max_feature and max_sample must each contain at least one value")

    #train with varying num_estimators
    print("Training with varying num_estimators...")
    accEst_train, accEst_test, best_est = _sweepBagging(
        num_estimators,
        lambda n: bc(n_estimators = n, max_features = max_feature[0],
                     max_samples = max_sample[0], n_jobs = 5),
        xtrain, xtest, ytrain, ytest)

    #train with varying max_features
    print("\nTraining with varying max_features...")
    accFeat_train, accFeat_test, best_numFeat = _sweepBagging(
        max_feature,
        lambda f: bc(n_estimators = best_est, max_features = f,
                     max_samples = max_sample[0], n_jobs = 5),
        xtrain, xtest, ytrain, ytest)

    #train with varying max_sample
    print("\nTraining with varying max_sample...")
    accSam_train, accSam_test, best_numSam = _sweepBagging(
        max_sample,
        lambda s: bc(n_estimators = best_est, max_features = best_numFeat,
                     max_samples = s, n_jobs = 5),
        xtrain, xtest, ytrain, ytest)

    print("\n\nBest parameters for...\nnum_estimators: %d\tmax_features: %d\tmax_samples: %d" %(best_est, best_numFeat, best_numSam))

    return accEst_train, accEst_test, accFeat_train, accFeat_test, accSam_train, accSam_test, best_est, best_numFeat, best_numSam

def _sweepBagging(values, makeModel, xtrain, xtest, ytrain, ytest):
    """Fit one model per candidate value and return (train accs, test accs, best value)."""
    acc_train, acc_test = [], []
    print("Completed run: \t")
    for run, value in enumerate(values, start = 1):
        bc_clf = makeModel(value)
        bc_clf.fit(xtrain, ytrain)
        acc_train.append(bc_clf.score(xtrain, ytrain))
        acc_test.append(bc_clf.score(xtest, ytest))
        print(str(run), end = "\t")
    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    return acc_train, acc_test, values[np.argmax(acc_test)]

def trainRF_or_ET(modelName, xtrain, xtest, ytrain,  ytest,
                  num_estimators = (500,800,1000,1250,1650),
                  max_deep = (35,50,75,100,125)):
    """Tune a random forest or extra-trees classifier one hyperparameter at a time.

    ``n_estimators`` is swept first (with the first ``max_deep`` entry held
    fixed), then ``max_depth`` is swept using the best ``n_estimators``. All
    models use ``class_weight = 'balanced'``. "Best" means highest accuracy on
    ``xtest``/``ytest``; ties go to the earliest grid entry.

    NOTE(review): the winning values are chosen on the same split whose scores
    are reported afterwards, so those scores are probably optimistic.

    Parameters
    ----------
    modelName : str
        Either "random forest" or "extra tree".
    xtrain, xtest : array-like
        Feature matrices used for fitting and scoring.
    ytrain, ytest : array-like
        Matching label vectors.
    num_estimators, max_deep : sequence of int, optional
        Candidate values; the defaults are the grids that were previously
        hard-coded.

    Returns
    -------
    tuple
        ``(accEst_train, accEst_test, accDepth_train, accDepth_test, best_est,
        best_depth)``.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the two supported names, or a grid is
        empty.
    """
    # Validate up front. Previously an unknown name only surfaced as an
    # UnboundLocalError on `model` inside the first loop iteration.
    if modelName not in ("random forest", "extra tree"):
        raise ValueError('modelName must be "random forest" or "extra tree", got %r' % (modelName,))

    num_estimators = list(num_estimators)
    max_deep = list(max_deep)
    if not (num_estimators and max_deep):
        raise ValueError("num_estimators and max_deep must each contain at least one value")

    forest = rf if modelName == "random forest" else et

    #train for varying num_estimators
    print("Training with varying num_estimators...")
    accEst_train, accEst_test, best_est = _sweepForest(
        num_estimators,
        lambda n: forest(n_estimators = n, max_depth = max_deep[0],
                         class_weight = 'balanced'),
        xtrain, xtest, ytrain, ytest)

    #train for varying max_depth
    print("\nTraining with varying max_depth...")
    accDepth_train, accDepth_test, best_depth = _sweepForest(
        max_deep,
        lambda depth: forest(n_estimators = best_est, max_depth = depth,
                             class_weight = 'balanced'),
        xtrain, xtest, ytrain, ytest)

    print("\n\nBest parameters for...\nnum_estimators: %d\tmax_depth: %d" %(best_est, best_depth))

    return accEst_train, accEst_test, accDepth_train, accDepth_test, best_est, best_depth

def _sweepForest(values, makeModel, xtrain, xtest, ytrain, ytest):
    """Fit one model per candidate value and return (train accs, test accs, best value)."""
    acc_train, acc_test = [], []
    print("Completed run: \t")
    for run, value in enumerate(values, start = 1):
        model = makeModel(value)
        model.fit(xtrain, ytrain)
        acc_train.append(model.score(xtrain, ytrain))
        acc_test.append(model.score(xtest, ytest))
        print(str(run), end = "\t")
    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    return acc_train, acc_test, values[np.argmax(acc_test)]

In [None]:
# Build the normalised train / test arrays from the full training frame (all individuals).
xTrain, xTest, yTrain, yTest = prepData(df_train)

In [None]:
# Sweep the bagging classifier; keep the accuracy curves and the selected values.
(estTrain_bg, estTest_bg,
 featureTrain_bg, featureTest_bg,
 sampleTrain_bg, sampleTest_bg,
 num_est_bg, max_feat_bg, max_samp_bg) = trainBG(xTrain, xTest, yTrain, yTest)

In [None]:
# Sweep the random forest; keep the accuracy curves and the selected values.
(estTrain_rf, estTest_rf,
 depthTrain_rf, depthTest_rf,
 num_est_rf, max_deep_rf) = trainRF_or_ET("random forest", xTrain, xTest, yTrain, yTest)

In [None]:
# Sweep the extra-trees classifier; keep the accuracy curves and the selected values.
(estTrain_et, estTest_et,
 depthTrain_et, depthTest_et,
 num_est_et, max_deep_et) = trainRF_or_ET("extra tree", xTrain, xTest, yTrain, yTest)

## Plot graphs

In [None]:
#plot graph for bagging classifier: one panel per tuned hyperparameter
title = ["Varying n_estimators", "Varying max_features with best n_estimators", "Varying max_samples with best n_estimators and max_features"]
fig, ax = plt.subplots(3,1, figsize = (10,15))

# x-axis values; these repeat the default grids used by trainBG
num_estimators = [500,800,1000,1250,1650]
max_feature = [20,50,75,100,129]
max_sample = [100,200,225,275,300]

# (x values, train accuracies, test accuracies, x-axis label) for each panel
panels = [
    (num_estimators, estTrain_bg, estTest_bg, "Numbers of num_estimators"),
    (max_feature, featureTrain_bg, featureTest_bg, "Numbers of max_features"),
    (max_sample, sampleTrain_bg, sampleTest_bg, "Numbers of max_samples"),
]
for axis, heading, (grid, acc_train, acc_test, xlabel) in zip(ax, title, panels):
    axis.plot(grid, acc_train, "-b", label = "Train")
    axis.plot(grid, acc_test, "-r", label = "Test")
    axis.legend(loc = "upper right")
    axis.set_title(heading)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Accuracy")

plt.show()

From the graphs above, as we continue using the best parameters that give us the highest accuracy for test set, it is clear that the accuracy of the model rises. 

The optimal parameters will be set as follows:
- n_estimators = 500
- max_features = 129
- max_samples = 300

We shall take a look at the classification accuracy of the bagging classifier with the optimal parameters.

In [None]:
# Refit a bagging classifier with the selected values and score it on the held-out split.
bag = bc(n_estimators = num_est_bg,
         max_features = max_feat_bg,
         max_samples = max_samp_bg)
bag.fit(xTrain, yTrain)
y_pred = bag.predict(xTest)

bag_accuracy = bag.score(xTest, yTest)
bag_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Classification accuracy: {:.2f}".format(bag_accuracy))
print("F1 score: {:.2f}".format(bag_f1))

In [None]:
#plot graph for random forest: one panel per tuned hyperparameter
fig, ax = plt.subplots(2,1, figsize = (10,10))

title = ["Varying n_estimators", "Varying max_depth with best n_estimators"]

# x-axis values; these repeat the default grids used by trainRF_or_ET
num_estimators = [500,800,1000,1250,1650]
max_deep = [35,50,75,100,125]

# (x values, train accuracies, test accuracies, x-axis label) for each panel
panels = [
    (num_estimators, estTrain_rf, estTest_rf, "Number of n_estimators"),
    (max_deep, depthTrain_rf, depthTest_rf, "Number of max_depth"),
]
for axis, heading, (grid, acc_train, acc_test, xlabel) in zip(ax, title, panels):
    axis.plot(grid, acc_train, "-b", label = "Train")
    axis.plot(grid, acc_test, "-r", label = "Test")
    axis.legend(loc = "upper right")
    axis.set_title(heading)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Accuracy")

plt.show()

Random forest classifier seems to perform much more accurately and the test accuracy is relatively consistent. Therefore, we shall follow the indicated optimal parameters and create a model using them. 

Optimal parameters:
- n_estimators = 500
- max_depth = 100
- class_weight = 'balanced'

Next, we shall assess the classification accuracy of the random forest classifier.

In [None]:
# Refit a random forest with the selected values and score it on the held-out split.
rff = rf(n_estimators = num_est_rf,
         max_depth = max_deep_rf,
         class_weight = 'balanced')
rff.fit(xTrain, yTrain)
y_pred = rff.predict(xTest)

rff_accuracy = rff.score(xTest, yTest)
rff_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Classification accuracy: {:.2f}".format(rff_accuracy))
print("F1 score: {:.2f}".format(rff_f1))

In [None]:
#plotting graph for extra trees: one panel per tuned hyperparameter
fig, ax = plt.subplots(2,1, figsize = (10,10))

title = ["Varying n_estimators", "Varying max_depth with best n_estimators"]

# x-axis values; these repeat the default grids used by trainRF_or_ET
num_estimators = [500,800,1000,1250,1650]
max_deep = [35,50,75,100,125]

# (x values, train accuracies, test accuracies, x-axis label) for each panel
panels = [
    (num_estimators, estTrain_et, estTest_et, "Number of n_estimators"),
    (max_deep, depthTrain_et, depthTest_et, "Number of max_depth"),
]
for axis, heading, (grid, acc_train, acc_test, xlabel) in zip(ax, title, panels):
    axis.plot(grid, acc_train, "-b", label = "Train")
    axis.plot(grid, acc_test, "-r", label = "Test")
    axis.legend(loc = "upper right")
    axis.set_title(heading)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Accuracy")

plt.show()

It seems like extra trees is performing slightly better than random forest. As there are no signs of extreme overfitting and the test accuracy is fairly consistent, we shall use the recommended optimal parameters and assess the accuracy score of the extra trees classifier model. 

Optimal parameters:
- n_estimators = 500
- max_depth = 35
- class_weight = 'balanced'

In [None]:
# Refit an extra-trees classifier with the selected values and score it on the held-out split.
ett = et(n_estimators = num_est_et,
         max_depth = max_deep_et,
         class_weight = 'balanced')
ett.fit(xTrain, yTrain)
y_pred = ett.predict(xTest)

ett_accuracy = ett.score(xTest, yTest)
ett_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Classification accuracy: {:.2f}".format(ett_accuracy))
print("F1 score: {:.2f}".format(ett_f1))

## Using LightGBM classifier

In [None]:
# build the model
# `verbose=-1` takes the place of the old `silent=True` flag, which recent
# LightGBM releases no longer accept as a dedicated argument.
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, verbose=-1, metric='multi_logloss',
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
# Early stopping (400 rounds) and logging (every 100 rounds) are passed as
# callbacks: the `early_stopping_rounds` / `verbose` keyword arguments of fit()
# were removed in LightGBM 4.x and raise a TypeError there.
# NOTE(review): the held-out split doubles as the early-stopping set, so the
# scores reported on it afterwards are probably optimistic.
lgb_clf.fit(xTrain, yTrain, eval_set=[(xTest, yTest)],
            callbacks=[lgb.early_stopping(stopping_rounds=400),
                       lgb.log_evaluation(period=100)])
y_pred = lgb_clf.predict(xTest)

In [None]:
# Accuracy and macro-averaged F1 of the LightGBM model on the held-out split.
lgb_accuracy = lgb_clf.score(xTest, yTest)
lgb_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Classification accuracy: {:.2f}".format(lgb_accuracy))
print("F1 score : {:.2f}".format(lgb_f1))

We conclude that **LightGBM** classifier model is the most suitable for classification of the poverty classes.

## Using only household heads

In [None]:
# Keep only the rows flagged as household head.
is_head = df_train['=1 if household head'] == 1
df_headsOnly = df_train[is_head]
df_headsOnly.head()

In [None]:
# Rebuild the train / test arrays from household heads only.
# NOTE: this overwrites xTrain / xTest / yTrain / yTest from the full-dataset section above.
xTrain, xTest, yTrain, yTest = prepData(df_headsOnly)

In [None]:
#Bagging classifier (household heads only); overwrites the full-dataset results
(estTrain_bg, estTest_bg,
 featureTrain_bg, featureTest_bg,
 sampleTrain_bg, sampleTest_bg,
 num_est_bg, max_feat_bg, max_samp_bg) = trainBG(xTrain, xTest, yTrain, yTest)

In [None]:
#Random forest classifier (household heads only); overwrites the full-dataset results
(estTrain_rf, estTest_rf,
 depthTrain_rf, depthTest_rf,
 num_est_rf, max_deep_rf) = trainRF_or_ET("random forest", xTrain, xTest, yTrain, yTest)

In [None]:
#Extra trees classifier (household heads only); overwrites the full-dataset results
(estTrain_et, estTest_et,
 depthTrain_et, depthTest_et,
 num_est_et, max_deep_et) = trainRF_or_ET("extra tree", xTrain, xTest, yTrain, yTest)

Now we assess the models' performance...

In [None]:
# Heads-only bagging classifier: refit with the selected values and report scores.
bag_hh = bc(n_estimators = num_est_bg,
            max_features = max_feat_bg,
            max_samples = max_samp_bg)
bag_hh.fit(xTrain, yTrain)
y_pred = bag_hh.predict(xTest)

bag_hh_accuracy = bag_hh.score(xTest, yTest)
bag_hh_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Bagging")
print("Classification accuracy: {:.2f}".format(bag_hh_accuracy))
print("F1 score: {:.2f}".format(bag_hh_f1))

In [None]:
# Heads-only random forest: refit with the selected values and report scores.
rff_hh = rf(n_estimators = num_est_rf,
            max_depth = max_deep_rf,
            class_weight = 'balanced')
rff_hh.fit(xTrain, yTrain)
y_pred = rff_hh.predict(xTest)

rff_hh_accuracy = rff_hh.score(xTest, yTest)
rff_hh_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Random Forest:")
print("Classification accuracy: {:.2f}".format(rff_hh_accuracy))
print("F1 score: {:.2f}".format(rff_hh_f1))

In [None]:
# Heads-only extra trees: refit with the selected values and report scores.
ett_hh = et(n_estimators = num_est_et,
            max_depth = max_deep_et,
            class_weight = 'balanced')
ett_hh.fit(xTrain, yTrain)
y_pred = ett_hh.predict(xTest)

ett_hh_accuracy = ett_hh.score(xTest, yTest)
ett_hh_f1 = f1_score(yTest, y_pred, average = 'macro')
print("Extra Tree:")
print("Classification accuracy: {:.2f}".format(ett_hh_accuracy))
print("F1 score: {:.2f}".format(ett_hh_f1))

In [None]:
# build the model (household heads only)
# `verbose=-1` takes the place of the old `silent=True` flag, which recent
# LightGBM releases no longer accept as a dedicated argument.
lgb_hh = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, verbose=-1, metric='multi_logloss',
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
# Early stopping (400 rounds) and logging (every 100 rounds) are passed as
# callbacks: the `early_stopping_rounds` / `verbose` keyword arguments of fit()
# were removed in LightGBM 4.x and raise a TypeError there.
lgb_hh.fit(xTrain, yTrain, eval_set=[(xTest, yTest)],
            callbacks=[lgb.early_stopping(stopping_rounds=400),
                       lgb.log_evaluation(period=100)])
y_pred = lgb_hh.predict(xTest)

In [None]:
# Report the heads-only LightGBM model (lgb_hh). The accuracy was previously
# taken from lgb_clf, the model trained on the full dataset, so it did not match
# the F1 score printed below (which comes from lgb_hh's predictions).
print("Classification accuracy: {:.2f}".format(lgb_hh.score(xTest, yTest)))
print("F1 score : {:.2f}".format(f1_score(yTest, y_pred, average = 'macro')))

Restricting the dataset to household heads only seems to result in a much lower accuracy score for the first 3 models. For LGBM, the accuracy was still high at 93%. However, the F1 scores across all models were very poor. Hence, we shall train and test on the entire dataset instead.

## Predicting the test set

We will pass the test dataset into our best model and write a new CSV file which will consist of the following columns: <br>
1. Household id
2. Individual's ID
3. Predicted target/class

In [None]:
# Feature columns (positions 2..130) and the identifier column (position 1) of the test frame.
# .copy() detaches both selections from df_test: they are modified later
# (normalisation, adding the prediction column), and modifying a slice can write
# through to df_test or trigger pandas' SettingWithCopyWarning.
toTest_data = df_test.iloc[:, 2:131].copy()
identity = df_test.iloc[:, 1:2].copy()

In [None]:
#need to prepare special function to get normalised test set
#need to normalise some of the columns
def prepData2(df, scaler = None, unnormal_cols = None):
    """Return a copy of an unlabelled feature frame with its non-binary columns standardised.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame without a 'Target' column. It is not modified.
    scaler : object with a ``transform`` method, optional
        Already-fitted scaler to apply. When omitted, a new StandardScaler is
        fitted on ``df`` itself.
    unnormal_cols : list, optional
        Columns to standardise. When omitted, all non-binary columns of ``df``
        are used. When passing ``scaler``, pass the columns it was fitted on,
        in the same order.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns replaced by their scaled values.

    NOTE(review): with the defaults, the frame is scaled with its own mean and
    standard deviation rather than those of the training data, so the features
    may not be on the scale the fitted models saw. Passing the training scaler
    and column list avoids this.
    """
    if unnormal_cols is None:
        unnormal_cols = selectUnnormalised2(df)
    x_df = normalise2(unnormal_cols, df, scaler = scaler)
    return x_df

#getting the columns with non-normalised values
def selectUnnormalised2(df):
    """Return the names of the non-binary columns of ``df``, in frame order."""
    binary_cols = set(df.columns[df.isin([0,1]).all()]) #getting the binary columns

    # A comprehension keeps the original column order; a set difference would
    # return the names in arbitrary order.
    return [col for col in df.columns if col not in binary_cols]

#normalising the selected columns of an unlabelled frame
def normalise2(unnormCols, df, scaler = None):
    """Standardise ``unnormCols`` of ``df`` and return the result as a new frame.

    ``df`` is left untouched. If ``scaler`` is None, a StandardScaler is fitted
    on the selected columns of ``df``; otherwise the given scaler's
    ``transform`` is applied as-is.
    """
    cols = list(unnormCols)
    # Copy first, so the caller's frame (often a slice of a larger one) is not mutated.
    out = df.copy()
    if not cols:
        # Nothing to scale (fitting a scaler on zero columns would fail).
        return out

    if scaler is None:
        scaler = StandardScaler().fit(out[cols])
    x_norm = scaler.transform(out[cols])

    #convert the scaled array back to a frame and swap the columns in
    out[cols] = pd.DataFrame(np.asarray(x_norm), index = out.index,
                             columns = cols)

    return out

In [None]:
# Normalise the competition test features, then preview the first rows.
xTest_true = prepData2(toTest_data) #generate normalised actual test set
xTest_true.head()

In [None]:
# Predict the competition test set with the extra-trees model `ett_hh`.
# NOTE(review): `ett_hh` was fitted in the household-heads-only section, while
# the narrative above concludes that training on the entire dataset (and
# LightGBM in particular) works best - confirm which model is meant to
# produce the submission.
# NOTE(review): `ett_hh` was fitted on plain arrays, whereas `xTest_true` is a
# DataFrame; verify its columns match the training features in number and order.
# ett_hh = et(n_estimators =num_est_et, max_depth = max_deep_et, class_weight = 'balanced')
# ett_hh.fit(xTrain, yTrain)
xTest_truePred = ett_hh.predict(xTest_true)


# Alternative kept by the author: predict with the full-data LightGBM model instead.
# xTest_truePred = lgb_clf.predict(xTest_true)

In [None]:
# Attach the predicted class to the identifier column, then preview the result.
identity['Target'] = xTest_truePred
identity.head()

In [None]:
#for submission: write identifier + predicted Target without the index column
submission_path = '/kaggle/working/submission.csv'
identity.to_csv(submission_path, index = False)