In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)
import shap
import matplotlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb
from xgboost import XGBClassifier

# Display settings: show up to 150 columns and never truncate rows.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', None)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition train and test tables, then preview the training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/costa-rican-household-poverty-prediction/train.csv',
        '../input/costa-rican-household-poverty-prediction/test.csv',
    )
)
train.head()

In [None]:
# DataFrame.info() prints its report and returns None, so the two calls are
# issued as plain statements; wrapping them in a tuple only made the cell
# display a meaningless `(None, None)` result.
train.info()
test.info()

In [None]:
# Give the test rows an empty Target so both tables share the same columns,
# then stack train on top of test into one frame with a fresh 0..n-1 index.
test['Target'] = np.nan
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# These three columns mix numbers with the strings "yes"/"no"; translate the
# strings to 1/0 and store every value as float64.
mapping = {"yes": 1, "no": 0}
yes_no_cols = ['dependency', 'edjefa', 'edjefe']

for yes_no_col in yes_no_cols:
    df[yes_no_col] = df[yes_no_col].replace(mapping).astype(np.float64)

df[yes_no_cols].describe()

In [None]:
# Label distribution, counted once per row that has a Target and parentesco1 == 1.
is_labelled_head = df['Target'].notnull() & (df['parentesco1'] == 1)
labels = df.loc[is_labelled_head, ['Target', 'idhogar']]
label_counts = labels['Target'].value_counts().sort_index()
label_counts

In [None]:
# Flag each household (idhogar) by whether all of its rows carry a single
# distinct Target value, then count the households where they do not.
all_equal = train.groupby('idhogar')['Target'].nunique().eq(1)
not_equal = all_equal.loc[~all_equal]
len(not_equal)

In [None]:
# Sum the parentesco1 flag per household; a sum of 0 means no row in that
# household is marked with parentesco1 == 1.
households_leader = train.groupby('idhogar')['parentesco1'].sum()

headless_ids = households_leader.loc[households_leader.eq(0)].index
households_no_head = train.loc[train['idhogar'].isin(headless_ids), :]

households_no_head['idhogar'].nunique()

In [None]:
# Among the households without a head, count those whose rows disagree on Target.
households_no_head_equal = households_no_head.groupby('idhogar')['Target'].nunique().eq(1)
(~households_no_head_equal).sum()

In [None]:
# For every household whose rows disagree on Target, overwrite the label of
# all its rows with the label of the row flagged parentesco1 == 1.
for household in not_equal.index:
    is_member = train['idhogar'] == household
    head_target = train.loc[is_member & (train['parentesco1'] == 1.0), 'Target']
    # Take the value with .iloc[0]: calling int() directly on a one-element
    # Series is deprecated in recent pandas releases.
    true_target = int(head_target.iloc[0])

    train.loc[is_member, 'Target'] = true_target
    # `df` was concatenated from `train` before this cell runs, so the
    # correction is mirrored there as well; only rows that already carry a
    # label are touched, which leaves the unlabelled test rows untouched.
    df.loc[(df['idhogar'] == household) & df['Target'].notnull(), 'Target'] = true_target

# Re-run the consistency check; no household should remain inconsistent.
all_equal = train.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)

not_equal_new = all_equal[all_equal != True]
len(not_equal_new)

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

In [None]:
# Data dictionary for the columns used below:
#   v2a1       Monthly rent payment
#   tipovivi1  =1 own and fully paid house
#   tipovivi2  =1 own, paying in installments
#   tipovivi3  =1 rented
#   tipovivi4  =1 precarious
#   tipovivi5  =1 other (assigned, borrowed)
# For the rows with no rent recorded, total each ownership flag.
tipovivi_cols = [name for name in df.columns if name.startswith('tipovivi')]
df.loc[df['v2a1'].isnull(), tipovivi_cols].sum()

In [None]:
# own homes
df.loc[(df['tipovivi1'] == 1), 'v2a1'] = 0
# fill unsure and other
# Both comparisons are parenthesised: `&` binds tighter than `!=`, so the
# unparenthesised form was evaluated as `tipovivi1 != (1 & isnull)` and also
# selected the owned homes that had just been set to 0, replacing them with
# the mean.
df.loc[(df['tipovivi1'] != 1) & (df['v2a1'].isnull()), 'v2a1'] = df['v2a1'].mean()
# Re-check: no row should be left with a missing rent.
df.loc[df['v2a1'].isnull(), [col for col in df if col.startswith('tipovivi')]].sum()

In [None]:
# v18q:  owns a tablet
# v18q1: number of tablets household owns
# Among rows with parentesco1 == 1, count missing v18q1 for each value of v18q.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('v18q')['v18q1'].apply(lambda tablets: tablets.isnull().sum())

In [None]:
# A missing tablet count is recorded as zero tablets.
df.loc[df['v18q1'].isnull(), 'v18q1'] = 0

In [None]:
# Age distribution of the rows where rez_esc is missing.
df.loc[df['rez_esc'].isnull(), 'age'].describe()


In [None]:
# rez_esc is only filled between ages 7 and 19: outside that range a missing
# value is set to 0.
outside_school_age = (df['age'] < 7) | (df['age'] > 19)
df.loc[outside_school_age & df['rez_esc'].isnull(), 'rez_esc'] = 0
# The variable's maximum is 5, so larger values are capped at 5.
df['rez_esc'] = df['rez_esc'].clip(upper=5)
# Fill what is still missing: median for the integer-valued rez_esc, mean for
# the float-valued meaneduc.
df['rez_esc'] = df['rez_esc'].fillna(df['rez_esc'].median())
df['meaneduc'] = df['meaneduc'].fillna(df['meaneduc'].mean())

In [None]:
# Column-name groups used to slice `df` in the cells below.
# The order of names inside each list is preserved when the lists are used to
# select columns, so do not reorder them casually.

# Identifier columns plus the label; prepended to the individual-level
# columns when the per-household aggregation frame is built.
ids = ['Id', 'idhogar', 'Target']

# Individual-level columns aggregated per household together with
# `ind_ordered`. Presumably 0/1 flags, judging by the `_bool` suffix —
# TODO confirm against the data dictionary.
ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone']

# Individual-level ordered (numeric) columns, aggregated per household
# together with `ind_bool`.
ind_ordered = ['rez_esc', 'escolari', 'age']

# NOTE(review): hh_bool, hh_ordered and hh_cont are not referenced by any
# other cell visible in this notebook; presumably household-level boolean /
# ordered / continuous columns kept for reference — confirm before removing.
hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']

hh_ordered = [ 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1','r4m2','r4m3', 'r4t1',  'r4t2', 
              'r4t3', 'v18q1', 'tamhog','tamviv','hhsize','hogar_nin',
              'hogar_adul','hogar_mayor','hogar_total',  'bedrooms', 'qmobilephone']

hh_cont = ['v2a1', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding']

# Columns removed from `df` in the next cell. Presumably squared versions of
# other variables, judging by the SQB*/agesq names — TODO confirm.
sqrs = ['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 
        'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq']

In [None]:
# Remove every column listed in `sqrs` from the working frame.
df = df.drop(labels=sqrs, axis='columns')

In [None]:
# aggregate individual level features
ind = df[ids + ind_bool + ind_ordered]
# Besides Target, the row identifier 'Id' is dropped before aggregating: it is
# not a feature, and sending an identifier column through numeric reductions
# such as mean/std is an error on recent pandas versions rather than a
# silently skipped column (NOTE(review): assumes 'Id' holds non-numeric
# keys — confirm).
ind_agg = (
    ind.drop(columns=['Id', 'Target'])
       .groupby('idhogar')
       .agg(['min', 'max', 'sum', 'count', 'mean', 'std'])
)
ind_agg.head()

In [None]:
# Flatten the two-level (column, statistic) header into '<column>-<statistic>'
# names. The names are derived from the column tuples themselves so each
# label is guaranteed to describe the data underneath it; building them from
# the product of `columns.levels` only works when the level order happens to
# match the actual column order.
new_col = [f'{c}-{stat}' for c, stat in ind_agg.columns]

ind_agg.columns = new_col
ind_agg.head()

In [None]:
# Pairwise correlations between the aggregated columns.
corr_matrix = ind_agg.corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is examined once. The builtin `bool` is used for the mask dtype because the
# `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its absolute correlation with any earlier column
# exceeds 0.95.
to_drop = [col for col in upper.columns if any(abs(upper[col]) > 0.95)]

len(to_drop)

In [None]:
# Drop the highly correlated aggregates and attach the rest to every row of
# `df` by household id.
reduced_agg = ind_agg.drop(columns = to_drop)
merged_df = df.merge(reduced_agg, on='idhogar', how='left')
# Build the mask from merged_df itself: a mask taken from `df` only selects
# the right rows for as long as the two frames happen to share the same index.
head_df = merged_df.loc[merged_df['parentesco1'] == 1]
merged_df.shape, head_df.shape

In [None]:
# Training data: rows of head_df that have a label.
# Test data: every row of merged_df without a label.
labelled_heads = head_df.loc[head_df['Target'].notnull()]
unlabelled_rows = merged_df.loc[merged_df['Target'].isnull()]
non_feature_cols = ['Id', 'idhogar', 'Target']

X_train = labelled_heads.drop(columns=non_feature_cols)
X_test = unlabelled_rows.drop(columns=non_feature_cols)
Y_train = labelled_heads['Target']
X_train.shape, X_test.shape, Y_train.shape

In [None]:
# Preprocessing: median-impute missing values, then min-max scale each feature.
# The pipeline is fitted on the training features and reused for the test set.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = MinMaxScaler()
pipeline = Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])

x_train = pipeline.fit_transform(X_train)
x_test = pipeline.transform(X_test)

In [None]:
# Macro-averaged F1 is the competition metric, so every model is scored with it.
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Baseline: random forest evaluated with 10-fold cross-validation.
model_0 = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
cv_score = cross_val_score(estimator=model_0, X=x_train, y=Y_train, scoring=scorer, cv=10)

np.mean(cv_score), np.std(cv_score)

In [None]:
# Fit the baseline forest on all training rows, then compute SHAP values for
# those same rows.
model_0.fit(x_train, Y_train)
explainer = shap.TreeExplainer(model_0)
shap_values = explainer.shap_values(X=pd.DataFrame(x_train))

In [None]:
# Overall SHAP summary plot for the training rows.
shap.summary_plot(shap_values, features=x_train)

In [None]:
# One SHAP summary plot for each of the first four entries of shap_values,
# labelled with the original column names.
feature_names = X_train.columns
for class_idx in range(4):
    shap.summary_plot(shap_values[class_idx], x_train, feature_names=feature_names)

In [None]:
# Union of the 50 highest-ranked features from each entry of shap_values,
# ranked by mean absolute SHAP value.
top_features = set()

for class_shap in shap_values:
    mean_abs_shap = np.abs(class_shap).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, mean_abs_shap)),
        columns=['col_name', 'feature_importance_vals'],
    ).sort_values(by=['feature_importance_vals'], ascending=False)
    top_features.update(feature_importance.head(50)['col_name'])

In [None]:
# Number of distinct features kept after taking the union of the per-entry top 50.
len(top_features)

In [None]:
# Keep only the SHAP-selected features; this re-fits the preprocessing
# pipeline on the reduced training frame.
remove_cols = [name for name in X_train.columns if name not in top_features]
X_train_reduced = X_train.drop(remove_cols, axis=1)
x_train_reduced = pipeline.fit_transform(X_train_reduced)

# Same metric (macro F1, the competition criterion) and same model settings as
# the baseline, so the two cross-validation scores are directly comparable.
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')
model_0 = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)

cv_score = cross_val_score(estimator=model_0, X=x_train_reduced, y=Y_train, scoring=scorer, cv=10)

np.mean(cv_score), np.std(cv_score)

In [None]:
def f1_macro(pred, d_train):
    """Custom evaluation metric: macro-averaged F1.

    Parameters
    ----------
    pred : array whose argmax along axis 1 is taken as the predicted class
        of each row.
    d_train : object exposing ``get_label()``, which supplies the true labels.

    Returns
    -------
    tuple
        ``('f1_macro', score)`` — the metric name followed by its value.
    """
    true_labels = d_train.get_label()
    predicted_labels = pred.argmax(axis=1)
    return 'f1_macro', f1_score(true_labels, predicted_labels, average='macro')

In [None]:
# Hyper-parameters shared by the cross-validation and the classifier
# configuration; the two dictionaries differ only in the key that carries the
# custom metric.
# NOTE(review): 'n_estimators', 'early_stopping_rounds' and 'feval' look like
# function/estimator arguments rather than booster parameters, so xgb.cv may
# ignore them when they are passed inside `params` — confirm.
shared_params = {
    'n_estimators': 300, 'learning_rate': 0.15, 'max_depth': 35, 'eta': 0.15,
    'objective': 'multi:softmax', 'min_child_weight': 2, 'num_class': 5, 'gamma': 2.5,
    'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35,
    'early_stopping_rounds': 500,
}
cv_params = {**shared_params, 'feval': f1_macro}
fit_params = {**shared_params, 'eval_metric': f1_macro}

# Unscaled training features, once with every column and once with the
# SHAP-selected subset.
d_train = xgb.DMatrix(data=X_train, label=Y_train)
d_train_reduced = xgb.DMatrix(data=X_train_reduced, label=Y_train)

# Cross-validate on both matrices with the custom macro-F1 metric.
xgb_cv, xgb_cv_reduced = (
    xgb.cv(params=cv_params, dtrain=matrix, feval=f1_macro)
    for matrix in (d_train, d_train_reduced)
)

In [None]:
# Distinct label values seen in training.
# NOTE(review): presumably shown to justify 'num_class': 5 in the xgboost
# parameters — confirm against the displayed output.
Y_train.unique()

In [None]:
# Average test macro-F1 across boosting rounds, with its fold-to-fold standard
# deviation, for the full and the reduced feature sets. Each template has two
# placeholders: with a single `{}` the std argument was silently discarded.
[
    'full: {} (std {})'.format(
        xgb_cv['test-f1_macro-mean'].mean(), xgb_cv['test-f1_macro-std'].mean()),
    'reduced: {} (std {})'.format(
        xgb_cv_reduced['test-f1_macro-mean'].mean(), xgb_cv_reduced['test-f1_macro-std'].mean()),
]

In [None]:
# The hyper-parameters must be unpacked into keyword arguments: passed as a
# single `params=` argument they are not applied, and the model trains with
# library defaults.
# Keys left out, and why:
#   early_stopping_rounds - needs a validation set, and fit() below gets none
#   eval_metric           - f1_macro takes (pred, DMatrix), the signature used
#                           with xgb.cv, not the estimator's metric signature
#   eta                   - duplicate of learning_rate (same value)
#   num_class             - NOTE(review): the scikit-learn wrapper is expected
#                           to infer this from the labels — confirm
_not_for_estimator = {'early_stopping_rounds', 'eval_metric', 'eta', 'num_class'}
clf_params = {key: value for key, value in fit_params.items() if key not in _not_for_estimator}

xgb_clf = XGBClassifier(**clf_params)
xgb_clf.fit(X_train_reduced, Y_train)

In [None]:
# Apply the same column selection as on the training frame.
X_test_reduced = X_test.drop(columns = remove_cols)
# transform(), not fit_transform(): the pipeline was last fitted on
# X_train_reduced, and re-fitting it here would learn the imputation medians
# and scaling ranges from the test data.
x_test_reduced = pipeline.transform(X_test_reduced)
# xgb_clf was trained on the unscaled X_train_reduced frame, so it must
# predict on the unscaled test frame; the scaled array lives in a different
# feature space than the one the trees were built on.
y_pred = xgb_clf.predict(X_test_reduced)

In [None]:
# Store the predictions as small integers in the Target column and keep only
# the two columns that go into the submission.
test = test.assign(Target=y_pred.astype(np.int8))
submission = test.loc[:, ['Id', 'Target']]

In [None]:
# Sanity check: any missing values per column, plus a preview of the first rows.
submission.isna().any(), submission.head(5)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

Future iterations (not implemented due to time constraints):
- feature selection / reduction
    - by discarding highly correlated (redundant) features, using correlation matrix
    - more rigorous feature reduction by comparing aggregate SHAP values from the individual folds in cross-validation, or by other methods such as PCA
- model training/selection
    - experiment with a wider range of algorithms
    - experiment with hyperparameters
- other ideas
    - separately train models for head and non-head rows, and feed the aggregate Target values of the non-head predictions as a new feature into the head predictions