## Split the data

We split the data by household to avoid leakage, since rows belonging to the same household usually share the same target. Because we filter the data to include only heads of household, this isn't strictly necessary, but it provides an easy way to use the entire training data set if we want to.

Note that after splitting the data, we overwrite the train data with the entire data set so that we can train on all of the data. The `split_data` function performs the same split without overwriting the data, and it is used within the training loop to (hopefully) approximate a K-Fold split.

In [None]:
import numpy as np # linear algebra
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw train / test tables from the Kaggle input directory.
train = pd.read_csv("../input/costa-rican-household-poverty-prediction/train.csv")
test = pd.read_csv("../input/costa-rican-household-poverty-prediction/test.csv")

# Keep the test 'Id' values for the submission file before the column is removed.
test_ids = test["Id"]

# Remove the 'Id' column from both tables.
train = train.drop(columns=["Id"])
test = test.drop(columns=["Id"])

# Row counts of the two tables.
train_length = len(train)
test_length = len(test)

In [None]:
# Problem:
# The whole pipeline is balanced on a knife-edge: almost any structural or logical change causes the results to change (drop).

# Current situation: encode train's and test's idhogar separately, then process the data.
    
# If we don't encode, or if we encode them together, the train/test split differs from the original and the result drops.

# If we encode separately but process the data together, train and test will share some idhogar values, which messes up features like num_over_18.

In [None]:
from sklearn.preprocessing import LabelEncoder

# this only transforms the idhogar field, the other things this function used to do are done elsewhere
def encode_data(df):
    """Replace the 'idhogar' column of ``df`` with integer labels, in place.

    A fresh LabelEncoder is fitted on the column at every call, so the labels
    are only meaningful within the frame that was passed in. Returns None.
    """
    household_encoder = LabelEncoder()
    encoded_ids = household_encoder.fit_transform(df['idhogar'])
    df['idhogar'] = encoded_ids
    
def do_features(df):
    """Add ratio/difference features and adult household aggregates.

    The 'fe_*' columns are written onto ``df`` itself. The returned frame is
    ``df`` left-joined on 'idhogar' with 'agg18_*' columns, which are
    statistics computed per household over the rows with age >= 18.
    """
    # (new feature name, numerator column, denominator column)
    ratio_specs = [
        ('children_fraction', 'r4t1', 'r4t3'),
        ('working_man_fraction', 'r4h2', 'r4t3'),
        ('all_man_fraction', 'r4h3', 'r4t3'),
        ('human_density', 'tamviv', 'rooms'),
        ('human_bed_density', 'tamviv', 'bedrooms'),
        ('rent_per_person', 'v2a1', 'r4t3'),
        ('rent_per_room', 'v2a1', 'rooms'),
        ('mobile_density', 'qmobilephone', 'r4t3'),
        ('tablet_density', 'v18q1', 'r4t3'),
        ('mobile_adult_density', 'qmobilephone', 'r4t2'),
        ('tablet_adult_density', 'v18q1', 'r4t2'),
    ]
    # (new feature name, minuend column, subtrahend column)
    diff_specs = [
        ('people_not_living', 'tamhog', 'tamviv'),
        ('people_weird_stat', 'tamhog', 'r4t3'),
    ]

    for new_name, numer, denom in ratio_specs:
        ratio = df[numer] / df[denom]
        df['fe_' + new_name] = ratio.astype(np.float32)
    for new_name, left, right in diff_specs:
        difference = df[left] - df[right]
        df['fe_' + new_name] = difference.astype(np.float32)

    # statistics to compute per household, keyed by source column;
    # the insertion order here fixes the order of the output columns
    household_aggs = {
        'age': ['min', 'max', 'mean'],
        'escolari': ['min', 'max', 'mean'],
        'dis': ['mean'],
    }
    for prefix in ('estadocivil', 'parentesco', 'instlevel'):
        for col in df.columns:
            if col.startswith(prefix):
                household_aggs[col] = ['mean', 'count']

    # aggregate over the rows with age >= 18 only
    adults = df.query('age >= 18')
    adult_stats = adults.groupby('idhogar').agg(household_aggs).astype(np.float32)
    adult_stats.columns = pd.Index(
        ['agg18_' + source + '_' + stat.upper()
         for source, stat in adult_stats.columns.tolist()]
    )

    return df.join(adult_stats, how='left', on='idhogar')

# convert one hot encoded fields to label encoding
def convert_OHE2LE(df):
    """Collapse each one-hot column group into a single '<group>_LE' column.

    Works on a deep copy of ``df`` and returns it. For every group the member
    columns are label-encoded by the name of the column holding the row
    maximum, and the member columns are then dropped ('parentesco1' is kept).
    If some rows have no member set, a '<group>_dummy' column marking those
    rows is added to the group before encoding.
    """
    # groups whose member columns are listed by hand instead of found by prefix
    manual_groups = {'manual_elec': ['public', 'planpri', 'noelec', 'coopele']}
    group_names = ['pared', 'piso', 'techo', 'abastagua', 'sanitario',
                   'energcocinar', 'elimbasu', 'epared', 'etecho', 'eviv',
                   'estadocivil', 'parentesco', 'instlevel', 'lugar',
                   'tipovivi', 'manual_elec']

    encoded = df.copy(deep=True)
    for group in group_names:
        if group in manual_groups:
            members = list(manual_groups[group])
        else:
            # member columns are looked up on the untouched input frame
            members = [col for col in df.columns if col.startswith(group)]

        row_totals = encoded[members].sum(axis=1)
        if (row_totals == 0).any():
            # some rows have no category set: add an explicit filler category
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(group))
            filler = group + '_dummy'
            encoded[filler] = (row_totals == 0).astype(np.int8)
            members.append(filler)
            # sanity check that every row now has a category
            if (encoded[members].sum(axis=1) == 0).any():
                print("The category completion did not work")

        winners = encoded[members].idxmax(axis=1)
        encoded[group + '_LE'] = LabelEncoder().fit_transform(winners).astype(np.int16)

        # 'parentesco1' stays in the frame; every other member column is dropped
        to_drop = [col for col in members if col != 'parentesco1']
        encoded.drop(to_drop, axis=1, inplace=True)
    return encoded

def process_df(df_):
    """Label-encode 'idhogar' on ``df_`` in place, then return ``do_features(df_)``."""
    # the encoding has to happen first: do_features groups by 'idhogar'
    encode_data(df_)
    featured = do_features(df_)
    return featured

# Encode idhogar and add the engineered / aggregate features, separately for train and test.
train = process_df(train)
test = process_df(test)

# some dependencies are Na, fill those with the square root of the square
# (the whole 'dependency' column is replaced by sqrt(SQBdependency), not only the missing entries)
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# fill "no"s for education with 0s
train.loc[train['edjefa'] == "no", "edjefa"] = 0
train.loc[train['edjefe'] == "no", "edjefe"] = 0
test.loc[test['edjefa'] == "no", "edjefa"] = 0
test.loc[test['edjefe'] == "no", "edjefe"] = 0

# if education is "yes" and person is head of household, fill with escolari
train.loc[(train['edjefa'] == "yes") & (train['parentesco1'] == 1), "edjefa"] = train.loc[(train['edjefa'] == "yes") & (train['parentesco1'] == 1), "escolari"]
train.loc[(train['edjefe'] == "yes") & (train['parentesco1'] == 1), "edjefe"] = train.loc[(train['edjefe'] == "yes") & (train['parentesco1'] == 1), "escolari"]

test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "edjefa"] = test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "escolari"]
test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "edjefe"] = test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "escolari"]

# this field is supposed to be interaction between gender and escolari, but it isn't clear what "yes" means, let's fill it with 4
# (only the "yes" values that were not replaced by escolari above are still present at this point)
train.loc[train['edjefa'] == "yes", "edjefa"] = 4
train.loc[train['edjefe'] == "yes", "edjefe"] = 4

test.loc[test['edjefa'] == "yes", "edjefa"] = 4
test.loc[test['edjefe'] == "yes", "edjefe"] = 4

# convert to int for our models
train['edjefe'] = train['edjefe'].astype("int")
train['edjefa'] = train['edjefa'].astype("int")
test['edjefe'] = test['edjefe'].astype("int")
test['edjefa'] = test['edjefa'].astype("int")

# create feature with max education of either head of household
train['edjef'] = np.max(train[['edjefa','edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa','edjefe']], axis=1)

# fill some nas
train['v2a1']=train['v2a1'].fillna(0)
test['v2a1']=test['v2a1'].fillna(0)

test['v18q1']=test['v18q1'].fillna(0)
train['v18q1']=train['v18q1'].fillna(0)

train['rez_esc']=train['rez_esc'].fillna(0)
test['rez_esc']=test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), "meaneduc"] = 0
train.loc[train.SQBmeaned.isnull(), "SQBmeaned"] = 0

test.loc[test.meaneduc.isnull(), "meaneduc"] = 0
test.loc[test.SQBmeaned.isnull(), "SQBmeaned"] = 0

# fix some inconsistencies in the data - some rows indicate both that the household does and does not have a toilet, 
# if there is no water we'll assume they do not
# NOTE(review): the first statement of each pair sets v14a to 0 on the matching rows, so the
# identical mask on the following line (which still requires v14a == 1) matches nothing and
# sanitario1 is left unchanged. Confirm whether sanitario1 was meant to be reset as well.
# NOTE(review): the mask tests abastaguano == 0; confirm that this is the intended "no water" condition.
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "v14a"] = 0
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "sanitario1"] = 0

test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "v14a"] = 0
test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "sanitario1"] = 0

def train_test_apply_func(train_, test_, func_):
    """Apply ``func_`` to train and test stacked together, then split them again.

    Parameters
    ----------
    train_ : DataFrame that contains a 'Target' column.
    test_ : DataFrame with the remaining columns of ``train_``; it is NOT
        modified (a placeholder 'Target' of 0 is added to a copy only).
    func_ : callable taking the stacked frame and returning a frame with the
        same number of rows in the same order.

    Returns
    -------
    (train, test) : the transformed rows of each input, as independent
        frames; 'Target' is removed from the test part again.
    """
    n_train = train_.shape[0]

    # assign() works on a copy, so the caller's test frame does not silently
    # gain a 'Target' column
    stacked = pd.concat([train_, test_.assign(Target=0)])
    transformed = func_(stacked)

    # split by position; .copy() detaches the train part from the temporary
    # stacked frame so that later in-place edits act on a real frame
    train_out = transformed.iloc[:n_train, :].copy()
    test_out = transformed.iloc[n_train:, :].drop('Target', axis=1)
    return train_out, test_out

# convert the one hot fields into label encoded
train, test = train_test_apply_func(train, test, convert_OHE2LE)

# label-encoded columns that convert_geo2aggs expands back into dummies
cols_2_ohe = [
    'eviv_LE',
    'etecho_LE',
    'epared_LE',
    'elimbasu_LE',
    'energcocinar_LE',
    'sanitario_LE',
    'manual_elec_LE',
    'pared_LE',
]

# numeric columns that convert_geo2aggs averages
cols_nums = [
    'age',
    'meaneduc',
    'dependency',
    'hogar_nin',
    'hogar_adul',
    'hogar_mayor',
    'hogar_total',
    'bedrooms',
    'overcrowding',
]

def convert_geo2aggs(df_):
    """Return ``df_`` left-joined with 'geo_*' averages per 'lugar_LE' value.

    The columns in ``cols_nums`` and the dummies of ``cols_2_ohe`` are first
    averaged per ('lugar_LE', 'idhogar') pair and then averaged again per
    'lugar_LE', so every household carries the same weight.
    """
    numeric_part = df_[['lugar_LE', 'idhogar'] + cols_nums]
    dummy_part = pd.get_dummies(df_[cols_2_ohe], columns=cols_2_ohe)
    stacked = pd.concat([numeric_part, dummy_part], axis=1)

    per_household = stacked.groupby(['lugar_LE', 'idhogar']).mean()
    per_lugar = per_household.groupby('lugar_LE').mean().astype(np.float32)
    per_lugar.columns = pd.Index(['geo_' + name for name in per_lugar.columns.tolist()])

    return df_.join(per_lugar, how='left', on='lugar_LE')

# add some aggregates by geography
# (train_test_apply_func stacks train and test, so convert_geo2aggs averages over both)
train, test = train_test_apply_func(train, test, convert_geo2aggs)

# add some extra features, these were taken from another kernel
def extract_features(df):
    """Add household-size, room and rent ratio features to ``df`` in place.

    ``df`` must contain the columns 'idhogar', 'age', 'bedrooms', 'rooms',
    'v2a1', 'tamhog', 'r4t3', 'r4t1' and 'hhsize'. The new columns are written
    onto ``df`` itself and nothing is returned.
    """
    # Number of rows with age >= 18 in each household, broadcast to every row
    # of that household (0 when there are none). Counting one named column
    # yields a Series; assigning the whole transformed DataFrame to a single
    # column is rejected by current pandas.
    adults = df[df.age >= 18]
    df['num_over_18'] = adults.groupby('idhogar')['age'].transform('count')
    df['num_over_18'] = df.groupby('idhogar')['num_over_18'].transform('max')
    df['num_over_18'] = df['num_over_18'].fillna(0)

    df['bedrooms_to_rooms'] = df['bedrooms'] / df['rooms']
    df['rent_to_rooms'] = df['v2a1'] / df['rooms']
    df['tamhog_to_rooms'] = df['tamhog'] / df['rooms']  # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog']  # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3'] / df['rooms']  # r4t3 - Total persons in the household
    # Rent divided by (r4t3 - r4t1). The earlier plain v2a1 / r4t3 assignment
    # to this same column was immediately overwritten, so only this value ever
    # reached the models.
    # NOTE(review): r4t1 presumably counts the younger children, which would
    # make this rent per older household member - confirm against the data dictionary.
    df['v2a1_to_r4t3'] = df['v2a1'] / (df['r4t3'] - df['r4t1'])
    df['hhsize_to_rooms'] = df['hhsize'] / df['rooms']  # household size per room
    df['rent_to_hhsize'] = df['v2a1'] / df['hhsize']  # rent to household size
    df['rent_to_over_18'] = df['v2a1'] / df['num_over_18']
    # households without anyone aged 18+ would divide by zero: use the plain rent
    no_adults = df.num_over_18 == 0
    df.loc[no_adults, "rent_to_over_18"] = df[no_adults].v2a1
    
# extract_features writes its new columns onto the frame it is given
extract_features(train)
extract_features(test)

# drop duplicated columns
instlevel_cols = [col for col in train.columns if 'instlevel' in col]
needless_cols = [
    'r4t3',
    'tamhog',
    'tamviv',
    'hhsize',
    'v18q',
    'v14a',
    'agesq',
    'mobilephone',
    'female',
] + instlevel_cols

print(needless_cols)

train = train.drop(columns=needless_cols)
test = test.drop(columns=needless_cols)

# keep a copy of the processed training table on disk
train.to_csv("train_processed.csv", index=False)

In [None]:
np.random.seed(None)

# We are only interested in heads of household
# There are 2973 heads of household in the train set
X = train.loc[train.parentesco1 == 1]

# Extract their poverty level, shifted to start at 0, and set it aside as y
y = X['Target'] - 1
X = X.drop(columns=['Target'])

X_ids = X['idhogar']

# Since our training data is so imbalanced, we shall not allocate the same weight to all target classes
# Instead, target classes that are more "rare" shall get higher weight
y_weights = class_weight.compute_sample_weight('balanced', y, indices=None)

# Let's take a look: one row per class with its weight
# (the Target column printed here is y, i.e. the original Target minus 1)
print(
    pd.DataFrame({'Target': y, 'Weight': y_weights})
    .drop_duplicates()
    .sort_values(by=["Target"])
    .reset_index(drop=True)
)

In [None]:
def split_data(train, y, sample_weight=None, test_percentage=0.20):
    """Randomly split ``train`` / ``y`` into a training part and a held-out part.

    ``int(len(train) * test_percentage)`` index labels are drawn without
    replacement using NumPy's global random state. Returns
    ``(X_train, y_train, X_test, y_test)``; when ``sample_weight`` is given,
    the weights of the training rows are appended as a fifth element.
    """
    # draw the index labels of the held-out rows
    n_holdout = int(train.shape[0] * test_percentage)
    test_idx = np.random.choice(train.index, size=n_holdout, replace=False)

    # True for every row that stays in the training part
    keep_mask = ~train.index.isin(test_idx)

    X_test, y_test = train.loc[test_idx], y.loc[test_idx]
    X_train, y_train = train.loc[keep_mask], y.loc[keep_mask]

    if sample_weight is None:
        return X_train, y_train, X_test, y_test
    return X_train, y_train, X_test, y_test, sample_weight[keep_mask]

In [None]:
# Hold out 30% of the head-of-household rows as a validation set;
# train_y_weights holds the sample weights of the rows that remain for training.
train_X, train_y, val_X, val_y, train_y_weights = split_data(X, y, y_weights, test_percentage = 0.30)

In [None]:
def evaluate_macroF1_lgb(predictions, truth):
    """Custom evaluation metric returning ``('macroF1', 1 - macro F1)``.

    ``predictions`` holds one row of class scores per sample; ``truth`` must
    provide ``get_label()``. The score is inverted, so lower is better.
    """
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    true_labels = truth.get_label()
    predicted_labels = predictions.argmax(axis=1)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return ('macroF1', 1 - macro_f1)

# Arguments shared by the fit() calls; the validation part is the early-stopping set.
fit_params = {
    "early_stopping_rounds": 500,
    "eval_metric": evaluate_macroF1_lgb,
    'verbose': False,
    "eval_set": [(val_X.drop(columns=["idhogar", "parentesco1"]), val_y)],
}

# Fit one default XGBoost model to find the features it never uses.
xgb_ = xgb.XGBClassifier(random_state=217, n_jobs=-1)
model = xgb_.fit(
    train_X.drop(columns=["idhogar", "parentesco1"]),
    train_y,
    sample_weight=train_y_weights,
    **fit_params
)

# Features with zero importance in that model.
useless_xgb_cols = train_X.drop(columns=["idhogar", "parentesco1"]).columns[model.feature_importances_ == 0]

# Useless rf columns: every agg18_* household aggregate plus a few other
# columns. The list is generated in exactly the order it used to be written out by hand.
rf_drop_columns = (
    ['agg18_age_' + stat for stat in ('MAX', 'MEAN', 'MIN')]
    + ['agg18_dis_MEAN']
    + ['agg18_escolari_' + stat for stat in ('MAX', 'MEAN', 'MIN')]
    + ['agg18_estadocivil{}_{}'.format(num, stat)
       for num in range(1, 8)
       for stat in ('COUNT', 'MEAN')]
    # parentesco10-12 sort before parentesco1-9 in the original list
    + ['agg18_parentesco{}_{}'.format(num, stat)
       for num in (10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9)
       for stat in ('COUNT', 'MEAN')]
    + ['parentesco_LE', 'rez_esc', "idhogar", "parentesco1",
       'fe_rent_per_person', 'fe_rent_per_room', 'fe_tablet_adult_density',
       'fe_tablet_density']
)

# Fit one default random forest on the reduced feature set (fit() returns the estimator itself).
rf = RandomForestClassifier(n_jobs=-1, random_state=217).fit(
    train_X.drop(rf_drop_columns, axis=1),
    train_y,
)


In [None]:
# Features to which the screening random forest assigned zero importance.
# NOTE(review): useless_rf_cols is not referenced again in the visible cells - confirm it is still needed.
useless_rf_cols = train_X.drop(columns = rf_drop_columns).columns[rf.feature_importances_ == 0]

In [None]:
# Columns withheld from the XGBoost ensemble: the zero-importance features of the
# screening model plus the idhogar and parentesco1 columns.
xgb_drop_columns = useless_xgb_cols.tolist() + ["idhogar", "parentesco1"]

# XGBoost + Random Forests

The parameters are optimised with a random search in this kernel: https://www.kaggle.com/mlisovyi/lighgbm-hyperoptimisation-with-f1-macro

In [None]:
def evaluate_macroF1_lgb(predictions, truth):
    """Evaluation callback: report one minus the macro-averaged F1 score.

    The predicted class of each sample is the argmax over its row of
    ``predictions``; the true classes come from ``truth.get_label()``.
    Returns the pair ``('macroF1', 1 - f1)``, so smaller values are better.
    """
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    expected = truth.get_label()
    chosen = predictions.argmax(axis=1)
    score = f1_score(expected, chosen, average='macro')
    return ('macroF1', 1 - score)

# Start again from a fresh dict: it has no "eval_set" entry until the training loop adds one.
fit_params = dict(
    early_stopping_rounds=500,
    eval_metric=evaluate_macroF1_lgb,
    verbose=False,
)

In [None]:
# Fix NumPy's global seed: split_data draws its hold-out rows with np.random.choice,
# so this makes the splits used in the training loop below repeatable.
np.random.seed(100)

In [None]:
xgbs = []

# Tuned XGBoost parameters. An older hand-written parameter dict used to be
# assigned to this name first and was overwritten before use; that dead
# assignment has been removed.
xgb_opt_parameters = {'colsample_bytree': 0.9396309324969985, 'gamma': 0.0019562316326860586, 'learning_rate': 0.013457753609263417, 'max_depth': 26, 'max_leaves': 191, 'min_child_weight': 7, 'subsample': 0.8053153785018299}

# The feature matrix is identical for every ensemble member, so build it once
# rather than on every iteration.
xgb_train_features = train_X.drop(columns = xgb_drop_columns)

for i in range(15):
    # each member gets its own seed and its own random split (split_data's default hold-out is 20%)
    xgb_ = xgb.XGBClassifier(random_state=217+i, n_jobs=-1, **xgb_opt_parameters)
    train_X_train, train_y_train, train_X_val, train_y_val, train_y_train_weights = split_data(xgb_train_features, train_y, sample_weight=train_y_weights)
    # the held-out part of this split is the early-stopping set
    fit_params["eval_set"] = [(train_X_val,train_y_val)]
    xgb_.fit(train_X_train, train_y_train, sample_weight = train_y_train_weights, **fit_params)
    xgbs.append(xgb_)

In [None]:
rfs = []

rf_opt_parameters = dict(
    max_depth=None,
    n_estimators=500,
    min_impurity_decrease=1e-3,
    min_samples_leaf=2,
    class_weight="balanced",
)

# Ten forests that differ only in their random seed, all fitted on the full training part.
for i in range(10):
    rf = RandomForestClassifier(random_state=217 + i, n_jobs=-1, **rf_opt_parameters)
    # fit() returns the estimator itself, so the fitted forest is what gets stored
    rfs.append(rf.fit(train_X.drop(columns=rf_drop_columns), train_y))

In [None]:
def combine_voters(data, weights=(0.5, 0.5)):
    """Blend the XGBoost and random-forest ensembles and return predicted classes.

    Parameters
    ----------
    data : DataFrame holding every feature column; each ensemble's drop list
        (``xgb_drop_columns`` / ``rf_drop_columns``) is removed before predicting.
    weights : pair ``(xgb_weight, rf_weight)`` applied to the averaged class
        probabilities of ``xgbs`` and ``rfs``. The default is an immutable
        tuple rather than a list, so it cannot be altered between calls.

    Returns
    -------
    Array with the index of the highest weighted probability for each row.
    """
    vc1_probs = predict_proba(xgbs, data.drop(xgb_drop_columns, axis=1))
    vc2_probs = predict_proba(rfs, data.drop(rf_drop_columns, axis=1))

    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    return np.argmax(final_vote, axis=1)

def predict_proba(estimators, data):
    """Return the class probabilities of ``data`` averaged over ``estimators``.

    Parameters
    ----------
    estimators : non-empty sequence of fitted models exposing
        ``predict_proba(data)``; all must return arrays of the same shape.
    data : feature matrix accepted by every estimator.

    Returns
    -------
    Float array shaped like a single estimator's output. The number of classes
    is taken from the first estimator rather than fixed at 4.

    Raises
    ------
    ValueError : if ``estimators`` is empty (this used to yield an array of NaN).
    """
    estimators = list(estimators)
    if not estimators:
        raise ValueError("predict_proba needs at least one estimator")

    first = np.asarray(estimators[0].predict_proba(data))
    # accumulate in float64 in the given order, so the sums match the
    # previous fixed-width implementation exactly
    pred = np.zeros(first.shape)
    pred += first
    for est in estimators[1:]:
        pred += est.predict_proba(data)

    return pred / len(estimators)

In [None]:
# vc1_probs = predict_proba(xgbs, train.drop(xgb_drop_columns + ['Target'], axis=1))
# vc2_probs = predict_proba(rfs, train.drop(rf_drop_columns + ['Target'], axis=1))
    
# combined_probs = pd.concat([pd.DataFrame(vc1_probs), pd.DataFrame(vc2_probs)], axis = 1)

# combined_probs = combined_probs[train.parentesco1 == 1]
# probs_sum = vc1_probs + vc2_probs
# prediction = np.argmax(probs_sum, axis = 1)
# prediction = prediction[train.parentesco1 == 1]

# combined_votes = combine_voters(train.drop(columns = ["Target"]), weights = [0.48, 0.52])[train.parentesco1 == 1]
# combined_probs["Prediction"] = combined_votes + 1
# combined_probs["Actual"] = y + 1
# # We will create another column called Manual Prediction to test our thresholding
# combined_probs["Manual_Prediction"] = combined_votes + 1

# # Before manual tuning: f1 score = 0.8345588156251473
# print(f1_score(combined_probs["Actual"], combined_probs["Prediction"], average = "macro"))

# combined_probs["prob_2"] = 0.48* combined_probs.iloc[:, 1] + 0.52* combined_probs.iloc[:, 5]
# combined_probs["prob_3"] = combined_probs.iloc[:, 2] + combined_probs.iloc[:, 6]

# # Manual tuning:
# # Rule 1: If predict 2 but combined confidence < 0.8, kick to 4
# combined_probs.loc[(combined_probs.Prediction == 2) & (combined_probs.prob_2 < 0.55), "Manual_Prediction"] = 4
# # Rule 2: If predict 3 but combined confidence < 0.8, kick to 4
# # combined_probs.loc[(combined_probs.Prediction == 3) & (combined_probs.prob_3 < 0.575), "Manual_Prediction"] = 4

# # After manual tuningL f1 score
# print(f1_score(combined_probs["Actual"], combined_probs["Manual_Prediction"], average = "macro"))

In [None]:
# Averaged class probabilities of each ensemble on the test rows.
vc1_probs = predict_proba(xgbs, test.drop(xgb_drop_columns, axis=1))
vc2_probs = predict_proba(rfs, test.drop(rf_drop_columns, axis=1))

# Side by side: the first block of columns is XGBoost, the second block is random forest.
# Both blocks carry the same integer column labels, hence the positional .iloc access below.
combined_probs = pd.concat([pd.DataFrame(vc1_probs), pd.DataFrame(vc2_probs)], axis = 1)

# NOTE(review): probs_sum and prediction are not used again in the visible cells.
probs_sum = vc1_probs + vc2_probs
prediction = np.argmax(probs_sum, axis = 1)

# Weighted vote (0.48 XGBoost / 0.52 random forest), shifted by 1 onto the label range of the Target column.
# NOTE(review): combine_voters calls predict_proba on both ensembles again, repeating the two calls above.
combined_votes = combine_voters(test, weights = [0.48, 0.52]) + 1

combined_probs["Prediction"] = combined_votes

# Test
# We will create another column called Manual Prediction to test our thresholding
combined_probs["Manual_Prediction"] = combined_votes

# Per-class sum of the two ensembles' probabilities (positions 0-3 plus positions 4-7).
# NOTE(review): these sums are unweighted, whereas Prediction comes from the 0.48 / 0.52
# weighted vote - confirm the thresholds below are meant for this scale.
combined_probs["prob_1"] = combined_probs.iloc[:, 0] + combined_probs.iloc[:, 4]
combined_probs["prob_2"] = combined_probs.iloc[:, 1] + combined_probs.iloc[:, 5]
combined_probs["prob_3"] = combined_probs.iloc[:, 2] + combined_probs.iloc[:, 6]
combined_probs["prob_4"] = combined_probs.iloc[:, 3] + combined_probs.iloc[:, 7]


# Low-confidence predictions are moved to a neighbouring class. Every rule tests the
# original Prediction column, so the rules do not cascade into each other.
combined_probs.loc[(combined_probs.Prediction == 1) & (combined_probs.prob_1 < 0.54), "Manual_Prediction"] = 2
# The next line is useless, since for those rows with prediction == 2, none of the rows is smaller than 0.52
# combined_probs.loc[(combined_probs.Prediction == 2) & (combined_probs.prob_2 < 0.52), "Manual_Prediction"] = 3
combined_probs.loc[(combined_probs.Prediction == 3) & (combined_probs.prob_3 < 0.54), "Manual_Prediction"] = 4
combined_probs.loc[(combined_probs.Prediction == 4) & (combined_probs.prob_4 < 0.52), "Manual_Prediction"] = 3

In [None]:
# How many heads of households have been reclassified?
# NOTE(review): the mask combines a column of test with columns of combined_probs, so it
# assumes both frames carry the same index labels - confirm.
combined_probs[(test.parentesco1 == 1) & (combined_probs.Prediction != combined_probs.Manual_Prediction)].shape[0]

# Prepare submission

In [None]:
# One row per test Id; Target is the thresholded prediction, aligned to the Ids by index label.
submission = pd.DataFrame({'Id': test_ids}).assign(
    Target=combined_probs["Manual_Prediction"]
)
submission.to_csv("submission.csv", index=False)