In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import auc,roc_curve,accuracy_score, f1_score, roc_auc_score, log_loss, classification_report, plot_confusion_matrix, recall_score, precision_score, matthews_corrcoef,make_scorer
from sklearn.utils.class_weight import compute_class_weight

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from statistics import mean
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
plt.style.use('seaborn-whitegrid')

In [None]:
# Raise pandas' display limits so wide/long frames are shown with up to 150 rows and columns.
pd.options.display.max_rows = 150
pd.set_option('display.max_columns', 150)

# Read the competition's train and test tables.
train_csv = '/kaggle/input/costa-rican-household-poverty-prediction/train.csv'
test_csv = '/kaggle/input/costa-rican-household-poverty-prediction/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Preview the first ten training rows.
train.head(10)

In [None]:
# Print the pandas .info() summary of each split under its own heading.
sections = [("Train set info:", train), ("Test set info:", test)]
for position, (heading, frame) in enumerate(sections):
    if position:
        # Blank separator between the two summaries.
        print('\n')
    print(heading)
    frame.info()

In [None]:
# Build a one-row table holding the number of distinct values of every training column.
col = list(train.columns)
unique_count = [train[name].nunique() for name in col]

row = ['Unique Values']
pd.DataFrame(data=[unique_count], index=row, columns=col)

In [None]:
# Class balance of the target: this is an imbalanced multiclass problem.
target_counts = train['Target'].value_counts()
target_counts.plot(kind="bar", color='#4399c6')

# Number of rows per poverty level (labels 1..4) and their total.
c1, c2, c3, c4 = (target_counts[label] for label in (1, 2, 3, 4))
total = c1 + c2 + c3 + c4

# One "count percentage" line per class.
summary_lines = (
    ('Extreme poverty(1):  %d %.2f%%', c1),
    ('Moderate poverty(2):  %d %.2f%%', c2),
    ('Vulnerable households(3):  %d %.2f%%', c3),
    ('Non vulnerable households(4):  %d %.2f%%', c4),
)
for template, count in summary_lines:
    print(template % (count, count / total * 100))

In [None]:
# Preview the object-typed (string) columns. Among them, dependency, edjefe and
# edjefa can be turned into numerical features.
object_columns = train.select_dtypes(include=['object'])
object_columns.head()

In [None]:
# dependency, edjefe and edjefa contain "yes"/"no" flags next to numeric values.
# Encode yes -> 1 and no -> 0, then cast each column to float64, in both train and test.
mapping = {"yes": 1, "no": 0}
mixed_columns = ['dependency', 'edjefe', 'edjefa']

for frame in (train, test):
    for column in mixed_columns:
        frame[column] = frame[column].replace(mapping).astype(np.float64)

In [None]:
# Group the training column names by dtype.
train_float = train.select_dtypes(include=['float64']).columns.tolist()
train_str = train.select_dtypes(include=['object']).columns.tolist()
train_int = train.select_dtypes(include=['int64']).columns.tolist()

# The label is not a feature, so take it out of the integer list.
train_int.remove("Target")

In [None]:
# Members of one household (same idhogar) sometimes carry different Target labels.
# Treat the label of the head of household (parentesco1 == 1) as the correct one
# and copy it to every member of that household.

# True for households whose members all share a single label.
target_bool = train.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)
# Ids of the households with conflicting labels.
household_false = target_bool[~target_bool].index.values

# Computed once: the head-of-household mask does not change inside the loop.
is_head = train['parentesco1'] == 1

for household in household_false:
    in_household = train['idhogar'] == household
    head_targets = train.loc[in_household & is_head, 'Target']
    if head_targets.empty:
        # No head row recorded for this household: there is no authoritative label
        # to copy, so leave the rows untouched instead of failing with an IndexError.
        continue
    train.loc[in_household, 'Target'] = head_targets.iloc[0]

In [None]:
# Sanity check: count households by whether all their members share one Target value.
labels_per_household = train.groupby('idhogar')['Target'].nunique()
(labels_per_household == 1).value_counts()

In [None]:
# Missing-value overview: per-column NaN counts and shares for train, test and both combined.
row = list(train.columns)
col = ['Train_total']
missing_value = train.isnull().sum().values

# Work on a copy of test with a dummy Target column so that its per-column counts
# can be assigned positionally next to train's
# (assumes Target is the last column of train - TODO confirm).
test_dup = test.copy(deep=True)
test_dup['Target'] = 0

n_train, n_test = len(train), len(test_dup)

missing = pd.DataFrame(data=missing_value, index=row, columns=col)
missing['Train_percent'] = (missing['Train_total'] / n_train).round(4)
missing['Test_total'] = test_dup.isnull().sum().values
missing['Test_percent'] = (missing['Test_total'] / n_test).round(4)
missing['Total'] = missing['Train_total'] + missing['Test_total']
missing['Percent'] = (missing['Total'] / (n_train + n_test)).round(4)

# Ten columns with the highest overall share of missing values.
missing.sort_values('Percent', ascending=False).head(10)

In [None]:
# Fill only the missing values whose meaning is known. The remaining NaNs are left
# for the imputation step, where different imputation strategies can be compared.
#
# Each rule is applied per frame, so every mask is built from the frame it is
# applied to. Previously the test rez_esc mask was built from train['age'].
for frame in (train, test):
    # v18q1 (number of tablets) is missing for families that own no tablet
    # (v18q == 0), so a missing value means 0.
    frame['v18q1'] = frame['v18q1'].fillna(0)

    # v2a1 (rent) is missing for families that own their house (tipovivi1 == 1):
    # set those NaNs to 0. Only NaNs are replaced, so a recorded rent is never
    # overwritten.
    owns_house = frame['tipovivi1'] == 1
    frame.loc[owns_house & frame['v2a1'].isnull(), 'v2a1'] = 0

    # rez_esc is, as defined by the competition, only available for ages 7 to 19,
    # so NaNs outside that age range are set to 0.
    outside_school_age = (frame['age'] > 19) | (frame['age'] < 7)
    frame.loc[outside_school_age & frame['rez_esc'].isnull(), 'rez_esc'] = 0

In [None]:
# Recheck the missing values: same overview as before, recomputed on the current frames.
row = list(train.columns)
col = ['Train_total']
missing_value = train.isnull().sum().values

# Copy of test with a dummy Target column so that its per-column counts can be
# assigned positionally next to train's
# (assumes Target is the last column of train - TODO confirm).
test_dup = test.copy(deep=True)
test_dup['Target'] = 0

n_train, n_test = len(train), len(test_dup)

missing = pd.DataFrame(data=missing_value, index=row, columns=col)
missing['Train_percent'] = (missing['Train_total'] / n_train).round(4)
missing['Test_total'] = test_dup.isnull().sum().values
missing['Test_percent'] = (missing['Test_total'] / n_test).round(4)
missing['Total'] = missing['Train_total'] + missing['Test_total']
missing['Percent'] = (missing['Total'] / (n_train + n_test)).round(4)

# Five columns with the highest remaining share of missing values.
missing.sort_values('Percent', ascending=False).head(5)

In [None]:
# Heat map of the pairwise Pearson correlations between the non-object training columns.
object_columns = train.select_dtypes(include=['object']).columns
train_heat = train.drop(columns=object_columns)

print("Correlation Matrix","\n")
correlation = train_heat.corr(method="pearson")

fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(correlation, vmax=1, square=True, annot=False, ax=ax)
plt.show()

In [None]:
def correl(df,thresh):
    """Print a summary of the pairwise correlations between the columns of ``df``.

    Two things are printed:

    1. the largest and the smallest correlation between two distinct columns;
    2. up to 100 column pairs whose absolute correlation exceeds ``thresh``,
       strongest first.

    Correlations come from ``DataFrame.corr`` (Pearson by default), which ignores
    missing values pair-wise, so columns containing NaN do not turn the summary
    into ``nan``. Each unordered pair is reported once, and perfectly
    (anti-)correlated pairs (|r| == 1) are included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared; the columns must be numeric.
    thresh : float
        Only pairs with an absolute correlation strictly greater than this
        value are listed.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    cor = df.corr()

    # Keep the strict upper triangle only. Self-correlations and mirrored duplicates
    # are thereby excluded by position rather than by comparing values with 1,
    # which would also discard genuinely perfect correlations.
    upper = np.triu(np.ones(cor.shape, dtype=bool), k=1)
    # unstack() gives a Series indexed by (column, row); dropna() removes the masked
    # cells as well as undefined correlations (e.g. those of constant columns).
    pairs = cor.where(upper).unstack().dropna()

    print("max corr:",pairs.max(), ", min corr: ", pairs.min())

    high_cor = pairs.abs().sort_values(ascending=False)
    print(high_cor[high_cor>thresh].head(100))
# Print the correlation summary of the non-object training columns, listing the
# column pairs whose absolute correlation is above 0.8.
correl(train_heat,0.8)

In [None]:
# This step involves no leakage from the CV validation folds or from the test set:
# - the squared features are always highly correlated with their base features;
# - male/female and area1/area2 are either-or pairs, i.e. completely correlated,
#   so one column of each pair is enough.
redundant_feature = [
    'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe',
    'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
    'agesq', 'female', 'area2',
]

# male  -> sex  (1 is male, 0 is female)
# area1 -> area (1 is urban, 0 is rural)
renamed_columns = {"male": "sex", "area1": "area"}

train_drop = train.drop(columns=redundant_feature).rename(columns=renamed_columns)
test_drop = test.drop(columns=redundant_feature).rename(columns=renamed_columns)

print(train_drop.shape,test_drop.shape)

In [None]:
# Feature engineering: replace the household id (idhogar) with aggregated statistics
# of the individuals living in the same household, and attach those household-level
# statistics to every individual row.

# Individual-level features (rez_esc is left out because it has too many missing
# values) plus idhogar, which serves as the grouping key only.
individual_feature = ['v18q', 'dis', 'sex', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone', 'escolari', 'age','idhogar']

agg_stats = ['min', 'max', 'sum', 'mean', 'count']

train_ind = train_drop[individual_feature]
test_ind = test_drop[individual_feature]

train_agg = train_ind.groupby('idhogar').agg(agg_stats)
test_agg = test_ind.groupby('idhogar').agg(agg_stats)

# Flatten the (feature, statistic) column MultiIndex into 'feature-statistic' names.
# The names are derived from the column tuples themselves, in their actual order.
# Building them from columns.levels is unsafe: the levels only list the unique
# values and are not guaranteed to follow the column order, so the generated
# names could be attached to the wrong columns.
col_rename = [f'{feature}-{stat}' for feature, stat in train_agg.columns]
train_agg.columns = col_rename
test_agg.columns = [f'{feature}-{stat}' for feature, stat in test_agg.columns]
assert list(train_agg.columns) == list(test_agg.columns), \
    "train and test household aggregates must expose the same columns"

# Merge the household statistics back onto the individual rows, then drop the
# identifiers (and the label, which is kept separately in y).
y = train_drop['Target']
train_merge = train_drop.merge(train_agg, on = 'idhogar', how = 'left').drop(columns = ['Id','idhogar','Target'])
test_merge = test_drop.merge(test_agg, on = 'idhogar', how = 'left').drop(columns = ['Id','idhogar'])

In [None]:
# Final model: soft-voting ensemble of XGBoost and LightGBM.

# np.random.seed() returns None, so keep the seed value in its own variable and
# pass that to the estimators.
seed = 2021
np.random.seed(seed)


train_x = train_merge.copy(deep=True)
train_y = y.copy(deep=True)
test_x = test_merge.copy(deep=True)


# Imputation: pick one of the candidate strategies (index 1 = median). The imputer
# is fitted on the training column only and then applied to the test column.
strategy = ['mean','median','most_frequent']
imp = SimpleImputer(strategy = strategy[1])
na_feature = ['rez_esc','v2a1','meaneduc']
for feature in na_feature:
    train_x[feature] = imp.fit_transform(train_x[feature].values.reshape(-1,1))[:,0]
    test_x[feature] = imp.transform(test_x[feature].values.reshape(-1,1))[:,0]

# Feature selection 1: remove features with low variance (fitted on train only).
sel = VarianceThreshold(threshold=0.01)
sel.fit(train_x)
train_x_lv = train_x[train_x.columns[sel.get_support(indices=True)]]
test_x_lv = test_x[train_x_lv.columns]

# Feature selection 2: for every pair of features with |correlation| > 0.8 drop one.
# Only the strict upper triangle is inspected so each pair is considered once.
# (The builtin bool replaces np.bool, which was removed from NumPy.)
correlation_train = train_x_lv.corr()
upper_mask = np.triu(np.ones(correlation_train.shape, dtype=bool), k=1)
upper = correlation_train.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column].abs() > 0.8).any()]
train_x_cf = train_x_lv.drop(columns = to_drop)
test_x_cf = test_x_lv.drop(columns = to_drop)

xgb_params = {'learning_rate': 0.2, 'n_estimators': 400, 'min_child_weight': 3, 
               'subsample':1, 'colsample_bytree': 1, 'gamma': 0.1, 'reg_alpha': 0, 'reg_lambda': 1,
               'objective':'multi:softprob','num_class':4,'verbosity':0,
               'random_state': seed}
# Named xgb_clf so that the imported xgboost module alias `xgb` is not shadowed.
xgb_clf = XGBClassifier(**xgb_params)

lgbm_params = {
    'objective': 'multiclass',  
    'nthread': -1, #use all
    'verbose': -1,
    'num_iterations' : 200, 
    'learning_rate':0.1,
    'class_weight':'balanced',
    'random_state': seed,
    'boosting_type':'gbdt', # Tuned Parameter
    'max_depth':8,
    'num_leaves':100,
    'min_child_samples':10,
    'min_child_weight':0.001,
    'feature_fraction':0.9,
    'bagging_fraction':0.9,
    'bagging_freq':2
}
lgbm = lgb.LGBMClassifier(**lgbm_params)


# Soft voting averages the class probabilities predicted by both models.
voting = VotingClassifier(estimators=[("XGBoost", xgb_clf),
                                     ("LightGBM",lgbm)], voting='soft',n_jobs=-1)

voting.fit(train_x_cf,train_y)

# One predicted Target per test row, keyed by the row's Id.
submission = pd.DataFrame()
submission['Id'] = test['Id']
submission['Target'] = voting.predict(test_x_cf)
print(submission)

In [None]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame index column.
submission.to_csv('submission.csv', index = False)