In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split as tts, cross_val_score as cv, RepeatedStratifiedKFold as rsk
from sklearn.ensemble import RandomForestClassifier as rf, ExtraTreesClassifier as et, BaggingClassifier as bc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.utils import class_weight
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the training data from the Kaggle input directory (path is relative to the notebook).
train_set = pd.read_csv("../input/halooo/train_df_renamed_1.csv")

In [None]:
# Load the test data that the final predictions are produced for.
test_set = pd.read_csv("../input/halooo/test_df_renamed_1.csv")

In [None]:
# Load the codebook; its 'Variable name' / 'Variable description' columns are
# used below to translate column headers in both directions.
cb = pd.read_csv("../input/costa-rican-household-poverty-prediction/codebook.csv")

In [None]:
# Translate column headers from codebook descriptions to the short variable
# names that the feature-engineering code below refers to.
# (Presumably the input CSVs carry the long descriptions as headers -- verify.)
name_by_description = cb.set_index('Variable description')['Variable name']
for frame in (train_set, test_set):
    frame.rename(columns=name_by_description, inplace=True)

In [None]:
def add_household_features(df):
    """Append the engineered household features to ``df`` in place.

    The same columns are created, in the same order, for whichever frame is
    passed in, so the train and test frames cannot drift apart.

    Ratios whose denominator is zero come out as inf (or NaN for 0/0); only
    'dependency' is cleaned up afterwards, in a later cell.
    Column names (including the 'edler_per_bedroom' spelling) are kept
    exactly as they were so the feature set is unchanged.
    """
    def add_ratios(specs):
        # Each spec is (new column, numerator column, denominator column).
        for new_col, numerator, denominator in specs:
            df[new_col] = df[numerator] / df[denominator]

    # Household composition counts and shares.
    df['adult'] = df['hogar_adul'] - df['hogar_mayor']
    df['dependency_count'] = df['hogar_nin'] + df['hogar_mayor']
    add_ratios([
        ('dependency', 'dependency_count', 'adult'),
        ('child_percent', 'hogar_nin', 'hogar_total'),
        ('elder_percent', 'hogar_mayor', 'hogar_total'),
        ('adult_percent', 'hogar_adul', 'hogar_total'),
        # Rent burden.
        ('rent_per_adult', 'v2a1', 'hogar_adul'),
        ('rent_per_person', 'v2a1', 'hhsize'),
    ])

    # Average of the two overcrowding indicators, and a count of owned appliances.
    df['overcrowding_room_and_bedroom'] = (df['hacdor'] + df['hacapo']) / 2
    df['no_appliances'] = df['refrig'] + df['computer'] + df['television']

    add_ratios([
        # Shares of the r4*1 counts within each sex and within the household.
        ('r4h1_percent_in_male', 'r4h1', 'r4h3'),
        ('r4m1_percent_in_female', 'r4m1', 'r4m3'),
        ('r4h1_percent_in_total', 'r4h1', 'hhsize'),
        ('r4m1_percent_in_total', 'r4m1', 'hhsize'),
        ('r4t1_percent_in_total', 'r4t1', 'hhsize'),
        # Per-room densities.
        ('rent_per_room', 'v2a1', 'rooms'),
        ('bedroom_per_room', 'bedrooms', 'rooms'),
        ('elder_per_room', 'hogar_mayor', 'rooms'),
        ('adults_per_room', 'adult', 'rooms'),
        ('child_per_room', 'hogar_nin', 'rooms'),
        ('male_per_room', 'r4h3', 'rooms'),
        ('female_per_room', 'r4m3', 'rooms'),
        # NOTE: despite the name this is hhsize / rooms, i.e. people per room.
        ('room_per_person_household', 'hhsize', 'rooms'),
        # Per-bedroom densities.
        ('rent_per_bedroom', 'v2a1', 'bedrooms'),
        ('edler_per_bedroom', 'hogar_mayor', 'bedrooms'),
        ('adults_per_bedroom', 'adult', 'bedrooms'),
        ('child_per_bedroom', 'hogar_nin', 'bedrooms'),
        ('male_per_bedroom', 'r4h3', 'bedrooms'),
        ('female_per_bedroom', 'r4m3', 'bedrooms'),
        # NOTE: despite the name this is hhsize / bedrooms, i.e. people per bedroom.
        ('bedrooms_per_person_household', 'hhsize', 'bedrooms'),
        # Devices per household member.
        ('tablet_per_person_household', 'v18q1', 'hhsize'),
        ('phone_per_person_household', 'qmobilephone', 'hhsize'),
    ])

    df['age_12_19'] = df['hogar_nin'] - df['r4t1']

    # Years of schooling relative to age.
    add_ratios([('escolari_age', 'escolari', 'age')])


for frame in (train_set, test_set):
    add_household_features(frame)

In [None]:
# 'dependency' divides by the 'adult' count, so a zero count yields +inf;
# map those to 0. (NaN from 0/0 and -inf are left untouched here.)
for frame in (train_set, test_set):
    frame['dependency'] = frame['dependency'].replace({np.inf: 0})

In [None]:
# Individual-level indicator columns that are averaged per household.
aggr_mean_list = [ 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 'parentesco2',
             'parentesco3', 'parentesco4', 'parentesco5', 'parentesco6', 'parentesco7', 'parentesco8', 'parentesco9', 'parentesco10', 'parentesco11', 'parentesco12',
             'instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9',]

# Columns that get the full set of summary statistics per household.
other_list = ['escolari', 'age', 'escolari_age']


def aggregate_by_household(frame):
    """Return one row per 'idhogar' with the household-level aggregates.

    Columns are '<col>_aggr_mean' for every entry of ``aggr_mean_list``
    followed by '<col>_<stat>' (mean, std, min, max, sum) for every entry of
    ``other_list``. The result is indexed by 'idhogar'.
    """
    by_household = frame.groupby('idhogar')
    indicator_means = by_household[aggr_mean_list].mean().add_suffix('_aggr_mean')
    summaries = by_household[other_list].agg(['mean', 'std', 'min', 'max', 'sum'])
    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>'.
    summaries.columns = [col + '_' + stat for col, stat in summaries.columns]
    return indicator_means.join(summaries)


df_train = aggregate_by_household(train_set)
df_test = aggregate_by_household(test_set)

print(f'new aggregate train set has {df_train.shape[0]} rows, and {df_train.shape[1]} features')
print(f'new aggregate test set has {df_test.shape[0]} rows, and {df_test.shape[1]} features')

In [None]:
# Turn the 'idhogar' index of the aggregate frames back into a regular column
# so it can serve as the join key.
df_test = df_test.reset_index()
df_train = df_train.reset_index()

# Attach the household aggregates to every individual row (inner join on the
# household id) and replace every remaining missing value with 0.
train_agg = train_set.merge(df_train, on='idhogar').fillna(value=0)
test = test_set.merge(df_test, on='idhogar').fillna(value=0)

print(f'new train set has {train_agg.shape[0]} rows, and {train_agg.shape[1]} features')
print(f'new test set has {test.shape[0]} rows, and {test.shape[1]} features')

In [None]:
# Keep only the rows flagged parentesco1 == 1.
# Take an explicit copy: the next cell modifies `train`, and mutating a
# filtered slice of `train_agg` triggers pandas' SettingWithCopyWarning.
train = train_agg.query('parentesco1==1').copy()

In [None]:
# Identifier columns plus derived columns that are dropped before modelling.
# The list is defined once so train and test always lose the same columns.
redundant_columns = ['idhogar','Id', 'tamhog', 'agesq', 'hogar_adul', 'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned']

# Reassign instead of dropping in place: `train` comes from a filter on
# `train_agg`, and an in-place drop on it raises SettingWithCopyWarning.
train = train.drop(columns=redundant_columns)
test = test.drop(columns=redundant_columns)

In [None]:
# Translate the codebook variable names back to their long descriptions.
# Engineered columns are not in the codebook and therefore keep their names.
description_by_name = cb.set_index('Variable name')['Variable description']
for frame in (train_set, test_set):
    frame.rename(columns=description_by_name, inplace=True)

In [None]:
# From here on the model works on the individual-level frames. These are
# aliases, not copies: in-place changes to df_train / df_test also change
# train_set / test_set. This also replaces the aggregate frames that
# previously held these names.
df_train, df_test = train_set, test_set

In [None]:
# Keep the row identifiers for the submission file. Copy them so that adding
# the 'Target' column later does not write into a slice of df_test
# (SettingWithCopyWarning) and so the ids survive the later drop of 'Id'.
submit = df_test[['Id']].copy()

In [None]:
# Preview the first rows of the training frame after renaming.
df_train.head()

In [None]:
# !pip install lightgbm
# df_ori = pd.read_csv("../input/contains-agg/test_df_renamed_1.csv")

In [None]:
# For every household, check whether all of its rows carry one and the same
# Target value, then count the households where they do not.
targets_per_household = df_train.groupby('Household level identifier')['Target'].nunique()
all_equal = targets_per_household.eq(1)
not_equal = all_equal[~all_equal]
print('No of households where target values are not all the same: %s'%(len(not_equal)))

In [None]:
# Display the label column.
df_train['Target']

In [None]:
# Class balance: absolute count and percentage of rows for every Target value.
target = df_train['Target']
counter = Counter(target)
n_rows = len(target)
for label, count in counter.items():
    print('Class = %s, Count = %d, Percentage = %.3f%%' %(label, count, count/n_rows*100))

In [None]:
# df_train = df_train.drop(['escolari_age'],axis =1)
# df_train['escolari_age']

In [None]:
# df_train.drop([
# #     'Unnamed: 0', 
#     'Household level identifier','Id'], axis = 1,
#              inplace = True)

# df_train.drop(df_train.loc[:, 'escolari squared':'Age squared'].columns,
#                             axis = 1, inplace = True)
# Preview the training frame (the column drops above are commented out, so it is unchanged).
df_train.head()

In [None]:
#need to normalise some of the columns
def prepData(df):
    """Split ``df`` into scaled train/hold-out arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a 'Target' label column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xTrain, xTest, yTrain, yTest)``: a 70/30 split (random_state=42)
        with the non-binary feature columns standardised using statistics of
        the training part only. The label arrays are 1-D.

    Raises
    ------
    KeyError
        If ``df`` has no 'Target' column.
    """
    unnormal_cols = selectUnnormalised(df)
    # Select the label by name rather than by position: engineered columns are
    # appended after 'Target', so "last column" is not a reliable label, and
    # selectUnnormalised() already identifies the label by name.
    x = df.drop(columns=['Target'])
    y = df['Target']
    xtrain, xtest, ytrain, ytest = tts(x, y, test_size = 0.3, random_state = 42)
    xTrain, xTest = normalise(unnormal_cols, xtrain, xtest)
    yTrain, yTest = ytrain.values.ravel(), ytest.values.ravel()
    return xTrain.values, xTest.values, yTrain, yTest

#getting the columns with non-normalised values
def selectUnnormalised(df):
    """Return the names of the feature columns that need scaling.

    A column needs scaling when it holds any value other than 0 or 1.
    'Target' is never returned, because the label must not be normalised.

    The names come back in the frame's own column order (the previous
    set-difference produced an arbitrary order), and a frame whose 'Target'
    column is missing or binary no longer raises ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column labels of the non-binary feature columns.
    """
    is_binary = df.isin([0, 1]).all()
    non_binary = df.columns[~is_binary.to_numpy()]
    return [col for col in non_binary if col != 'Target']

#normalising data in training set
def normalise(unnormCols, xTrain, xTest):
    """Standardise the given columns using statistics of the training set.

    The scaler is fitted on ``xTrain`` only and then applied to both frames,
    so no information from ``xTest`` leaks into the scaling.

    Parameters
    ----------
    unnormCols : iterable
        Names of the columns to scale; every name must exist in both frames.
    xTrain, xTest : pandas.DataFrame
        Feature frames. They are not modified; scaled copies are returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(xTrain, xTest)`` with the selected columns replaced by their
        standardised values and all other columns unchanged.
    """
    cols = list(unnormCols)

    # Work on copies: the inputs are usually slices produced by a split, and
    # writing into them in place raises SettingWithCopyWarning.
    xTrain, xTest = xTrain.copy(), xTest.copy()
    if not cols:
        # Nothing to scale (StandardScaler cannot be fitted on zero columns).
        return xTrain, xTest

    scaler = StandardScaler().fit(xTrain[cols])

    # Assign whole columns instead of DataFrame.update() so integer columns
    # are cleanly replaced by their float-valued scaled versions.
    xTrain[cols] = scaler.transform(xTrain[cols])
    #normalise test data using mean and SD of training set
    xTest[cols] = scaler.transform(xTest[cols])

    return xTrain, xTest



In [None]:
# Pull out the label before 'Target' is dropped from the feature frame in the next cell.
y = df_train['Target']

In [None]:
# Remove the label and the identifier columns so only features remain.
# The drop is in place, so it also affects train_set (df_train is an alias).
train_non_feature_columns = ['Target', 'Id', 'Household level identifier']
df_train.drop(columns=train_non_feature_columns, inplace=True)


In [None]:
# Remove the identifier columns from the test features (in place).
test_non_feature_columns = ['Id', 'Household level identifier']
df_test.drop(columns=test_non_feature_columns, inplace=True)

In [None]:
import re


def _sanitise_column_name(name):
    """Strip every character other than letters, digits and underscores."""
    return re.sub('[^A-Za-z0-9_]+', '', name)


# Presumably needed because LightGBM rejects feature names that contain
# special characters -- confirm.
# Apply the same renaming to the test frame as well: previously only df_train
# was sanitised, leaving the model to be trained and used for prediction on
# frames with different feature names.
df_train = df_train.rename(columns=_sanitise_column_name)
df_test = df_test.rename(columns=_sanitise_column_name)

In [None]:
# Hyper-parameters of the multiclass LightGBM model, kept in one mapping so
# they are easy to read and tweak.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here -- confirm whether row subsampling was intended.
lgb_params = dict(
    objective='multiclass',
    metric='multi_logloss',
    class_weight='balanced',
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=14,
    min_child_samples=95,
    colsample_bytree=0.93,
    subsample=0.96,
    random_state=None,
    n_jobs=4,
    silent=True,
)
clf_gax = lgb.LGBMClassifier(**lgb_params)

In [None]:
kfold = 5
# Fix the shuffle seed: without random_state the folds (and therefore the
# early-stopping points and the final submission) change on every run.
kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

# One array of test-set class predictions per fold.
predicts_result = []
for train_index, test_index in kf.split(df_train, y):
    print("###")
    X_train, X_val = df_train.iloc[train_index], df_train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # The held-out fold is only used as the early-stopping evaluation set.
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
    # removed in LightGBM 4.0 in favour of callbacks -- confirm the installed
    # version before upgrading.
    clf_gax.fit(X_train, y_train, eval_set=[(X_val, y_val)],
            early_stopping_rounds=400, verbose=100)
    predicts_result.append(clf_gax.predict(df_test))

In [None]:
def _majority_label(votes):
    """Return the most frequent label in ``votes`` (smallest label on a tie)."""
    labels, counts = np.unique(votes, return_counts=True)
    return labels[counts.argmax()]


# Combine the per-fold predictions by majority vote. Averaging class labels
# and rounding (the previous approach) can yield a class that no fold
# predicted, e.g. votes [1, 1, 4, 4, 4] average to 2.8 -> 3.
fold_votes = np.asarray(predicts_result)  # one row of test-set predictions per fold
majority_vote = np.apply_along_axis(_majority_label, 0, fold_votes).astype(int)

# assign() returns a new frame, so no column is written into a slice of df_test.
submit = submit.assign(Target=majority_vote)

In [None]:
# Display the submission frame (Id plus predicted Target).
submit

In [None]:
# Write the submission file to the working directory, without the pandas index column.
submit.to_csv('submission.csv',index = False)

In [None]:
# ett = et(n_estimators =800, max_depth = 125, class_weight = 'balanced')
# ett.fit(xTrain, yTrain)
# y_pred = ett.predict(xTest)
# print("Classification accuracy: {:.2f}".format(ett.score(xTest, yTest)))
# print("F1 score: {:.2f}".format(f1_score(yTest, y_pred, average = 'macro')))

In [None]:
# # build the model
# lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
#                              random_state=None, silent=True, metric='multi_logloss', 
#                              n_jobs=4, n_estimators=5000, class_weight='balanced',
#                              colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# clf = lgb.LGBMClassifier(max_depth=9, learning_rate=0.01, objective='multiclass',
#                              random_state=None, silent=True, metric='multi_logloss', 
#                              n_jobs=4, n_estimators=2500, class_weight='balanced',
#                              colsample_bytree =  0.93, min_child_samples = 20, num_leaves = 21, subsample = 0.96)

# clf.fit(xTrain, yTrain, eval_set=[(xTest, yTest)], 
#             early_stopping_rounds=50, verbose=100)
# # fit data into the model and predict the test set
# # lgb_clf.fit(xTrain, yTrain, eval_set=[(xTest, yTest)], 
# #             early_stopping_rounds=400, verbose=100)
# # y_pred = lgb_clf.predict(xTest)
# y_pred = clf.predict(xTest)

In [None]:
# print("Classification accuracy: {:.2f}".format(clf.score(xTest, yTest)))
# print("F1 score : {:.2f}".format(f1_score(yTest, y_pred, average = 'macro')))

In [None]:
# def prepData2(df):
#     unnormal_cols = selectUnnormalised2(df)
#     x_df = normalise2(unnormal_cols, df)
#     return x_df

# #getting the columns with non-normalised values
# def selectUnnormalised2(df):
#     normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
#     dfCols_list = list(df.columns)
#     unnormCols = list(set(dfCols_list)-set(normCol))
    
#     return unnormCols

# #normalising data in training set
# def normalise2(unnormCols, df):
#     #normalise training data
#     toBeNorm = df[[i for i in unnormCols]]
#     ss = StandardScaler()
#     std_scale = ss.fit(toBeNorm)
#     x_norm = std_scale.transform(toBeNorm)
    
#     #covert numpy array to df
#     x_normCols = pd.DataFrame(x_norm, index = toBeNorm.index,
#                                  columns = toBeNorm.columns)
#     df.update(x_normCols)
    
#     return df

In [None]:
# df_test
# xTest_true.head()

In [None]:
# toTest_data = df_test.iloc[:, 2:]
# identity = df_test.iloc[:, 1:2]
# xTest_true = prepData2(toTest_data)
# xTest_true.shape

In [None]:
# df_train['Target'].unique()

In [None]:
# final = lgb_clf.predict(xTest_true)

In [None]:
# df_test['electricity-missing'].unique()

In [None]:
# hhi = df_test['Household level identifier'].tolist()

In [None]:
# predictions = pd.DataFrame({'Target':final,'Household level identifier':hhi})
# predictions.head()

In [None]:
# df_ori['Household level identifier'].nunique()

In [None]:
# df_test['Household level identifier'].nunique()

In [None]:
# final_sub = df_ori[['Id','Household level identifier']]
# # final_sub[Target]
# final_sub.head()


In [None]:
# print(len(final_sub))

In [None]:
# final_sub['Household level identifier'].nunique()

In [None]:
# predictions['Household level identifier'].nunique()

In [None]:
# df3 = pd.merge(predictions,final_sub, on = "Household level identifier", how = "left")

In [None]:
# print(len(df3))

In [None]:
# df3['Target'].unique()

In [None]:
# df4 = df3[['Id','Target']]
# df4["Target"].unique()

In [None]:
# print(len(df4))

In [None]:
# df4.to_csv('/kaggle/working/submission.csv',index = False)