In [1]:
# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import make_pipeline as imb_pipe
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import mutual_info_classif

import joblib


# Set random seed 
RSEED = 42
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('data/Train.csv')  # Training dataset
test = pd.read_csv('data/Test.csv')  # Test dataset (no labels)
ss = pd.read_csv('data/SampleSubmission.csv')  # Sample submission format
variables = pd.read_csv('data/VariableDefinitions.csv')  # Data dictionary

### Train test split

In [3]:
X = train.drop('bank_account', axis = 1)
y = train[['bank_account']]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RSEED)
# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 17643 samples.
Testing set has 5881 samples.


### Cleaning

In [5]:
y_train['bank_account'] = y_train['bank_account'].map({'Yes': 1, 'No': 0})
y_test['bank_account'] = y_test['bank_account'].map({'Yes': 1, 'No': 0})

In [6]:
categorical_variables = ['country', 'year', 'location_type',
       'cellphone_access','gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type']
numerical_variables = ['household_size', 'age_of_respondent']

In [7]:
def clean(df_in):
    df = df_in.copy(deep=True)
    df = df.drop('uniqueid', axis = 1)
    df[numerical_variables] = df[numerical_variables].astype(float)
    return df

In [8]:
X_train = clean(X_train)
X_test = clean(X_test)

### Pipeline for data cleaning and feature engineering

In [9]:
categorical_steps = [('ohe', OneHotEncoder(drop = 'first',sparse_output=False))]
categorical_pipeline = Pipeline(steps=categorical_steps)
numerical_steps = [('scaler', RobustScaler())]
numerical_pipeline = Pipeline(steps=numerical_steps)
transformers = [('categorical_pipeline', categorical_pipeline, categorical_variables),
                  ('numerical_pipeline', numerical_pipeline, numerical_variables)]

preprocessor = ColumnTransformer(transformers=transformers,
                                         remainder = 'drop').set_output(transform='pandas')

In [10]:
preprocessor.fit(X_train)

## Modelling

### Random forest with random search

In [11]:
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", RandomOverSampler(random_state=RSEED)),
    ("classifier", RandomForestClassifier(
        n_jobs=-1,
        random_state=RSEED
    ))
])

In [12]:
param_grid = [
    {
        "preprocessor__numerical_pipeline__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "sampler": [
            RandomOverSampler(random_state=RSEED, sampling_strategy={1: 15_000}),
            SMOTE(sampling_strategy={1:15_000}, random_state=RSEED),
            ImbPipeline([
                ("smote", SMOTE(sampling_strategy={1:15_000}, random_state=RSEED)),
                ("nearmiss", NearMiss())
            ])
        ],
        "classifier__n_estimators": [200, 300, 500],
        "classifier__max_depth": [None, 10, 20, 30],
        "classifier__min_samples_split": [2, 5, 10],
        "classifier__min_samples_leaf": [1, 2, 4],
        "classifier__max_features": ["sqrt", "log2"]
    }
]

In [13]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=RSEED,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train.iloc[:, 0])

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [14]:
rf_best_model = random_search.best_estimator_
print(rf_best_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_pipeline',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['country', 'year',
                                                   'location_type',
                                                   'cellphone_access',
                                                   'gender_of_respondent',
                                                   'relationship_with_head',
                                                   'marital_status',
                                                   'education_level',
                                                   'job_type']),
                                                 ('nu

In [15]:
y_test_pred_rf_rnd_hyp = rf_best_model.predict(X_test)
y_test_pred_rf_rnd_hyp_probs = rf_best_model.predict_proba(X_test)[:, 1]

In [16]:
print(f"Random forest with random tuning: MAE = {mean_absolute_error(y_test, y_test_pred_rf_rnd_hyp)}")
print(f"Random forest with random tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_rf_rnd_hyp_probs)}")

Random forest with random tuning: MAE = 0.1435130079918381
Random forest with random tuning: roc curve score = 0.8482788958776104


### Random forest with grid search

In [17]:
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", SMOTE(sampling_strategy={1:15_000}, random_state=RSEED)),
    ("classifier", RandomForestClassifier(
        n_jobs=-1,
        random_state=RSEED
    ))
])

In [18]:
param_grid = {
    "preprocessor__numerical_pipeline__scaler": [StandardScaler()],
    "classifier__n_estimators": [200, 300, 400],
    "classifier__max_depth": [None, 20, 30],
    "classifier__max_features": ["log2"],
    "classifier__min_samples_split": [5, 10, 15],
    "classifier__min_samples_leaf": [1, 2]
}

In [19]:
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error", 
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train.iloc[:, 0])

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [20]:
rf_best_model2 = grid_search.best_estimator_
print(rf_best_model2)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_pipeline',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['country', 'year',
                                                   'location_type',
                                                   'cellphone_access',
                                                   'gender_of_respondent',
                                                   'relationship_with_head',
                                                   'marital_status',
                                                   'education_level',
                                                   'job_type']),
                                                 ('nu

In [21]:
y_test_pred_rf_grd_hyp = rf_best_model2.predict(X_test)
y_test_pred_rf_grd_hyp_probs = rf_best_model2.predict_proba(X_test)[:, 1]

In [22]:
print(f"Random forest with grid tuning: MAE = {mean_absolute_error(y_test, y_test_pred_rf_grd_hyp)}")
print(f"Random forest with grid tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_rf_grd_hyp_probs)}")

Random forest with grid tuning: MAE = 0.14402312531882333
Random forest with grid tuning: roc curve score = 0.8410260657322239


In [23]:
joblib.dump(rf_best_model2, "rf_model.joblib")

['rf_model.joblib']

### Adaboost without hyperparameter tuning

In [24]:
pipeline_ad = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1,
    class_weight='balanced'),
    random_state=RSEED))
])

In [25]:
pipeline_ad.fit(X_train, y_train)

In [26]:
y_test_pred_ad = pipeline_ad.predict(X_test)
y_test_pred_ad_probs = pipeline_ad.predict_proba(X_test)[:, 1]

In [27]:
print(f"Adaboost MAE = {mean_absolute_error(y_test, y_test_pred_ad)}")
print(f"Adaboost roc curve score = {roc_auc_score(y_test, y_test_pred_ad_probs)}")

Adaboost MAE = 0.22309131100153035
Adaboost roc curve score = 0.8607698014572105


### Adaboost with hyperparameter tuning and oversampling

In [28]:
pipeline_ad = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", AdaBoostClassifier(estimator=DecisionTreeClassifier(),
    random_state=RSEED))
])

In [29]:
y_train.value_counts()

bank_account
0               15159
1                2484
Name: count, dtype: int64

In [30]:
ros = RandomOverSampler(random_state=RSEED, sampling_strategy={1: 15_000}) 

# pipeline
ros_pipeline = imb_pipe(ros, pipeline_ad)

In [31]:
param_grid = {
    'pipeline__preprocessor__numerical_pipeline__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pipeline__classifier__n_estimators': [100, 200, 300],
    'pipeline__classifier__learning_rate': [0.01, 0.1, 0.5, 1.0],
    'pipeline__classifier__estimator__min_samples_split' : np.arange(2, 8, 2),  
    'pipeline__classifier__estimator__max_depth': [1, 2]
}

In [32]:
random_search = RandomizedSearchCV(
    estimator=ros_pipeline,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',  # or recall, f1, roc_auc
    cv=5,
    random_state=RSEED,
    n_jobs=-1,
    verbose=1
)

In [33]:
random_search.fit(X_train, y_train.iloc[:, 0])

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [34]:
best_model = random_search.best_estimator_
print(best_model)

Pipeline(steps=[('randomoversampler',
                 RandomOverSampler(random_state=42,
                                   sampling_strategy={1: 15000})),
                ('pipeline',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('categorical_pipeline',
                                                                   Pipeline(steps=[('ohe',
                                                                                    OneHotEncoder(drop='first',
                                                                                                  sparse_output=False))]),
                                                                   ['country',
                                                                    'year',
                                                                    'location_type',
                                                                    'cellphone_access',
                   

In [35]:
y_test_pred_ad_rnd_hyp = best_model.predict(X_test)
y_test_pred_ad_rnd_hyp_probs = best_model.predict_proba(X_test)[:, 1]

In [36]:
print(f"Adaboost Random Oversample with tuning: MAE = {mean_absolute_error(y_test, y_test_pred_ad_rnd_hyp)}")
print(f"Adaboost Random Oversample with tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_ad_rnd_hyp_probs)}")

Adaboost Random Oversample with tuning: MAE = 0.20625743921101852
Adaboost Random Oversample with tuning: roc curve score = 0.868739549184442


In [37]:
#test_copy = test.copy(deep=True)
#test_copy = clean(test_copy)
#y_final_test = best_model.predict(test_copy)
# Create submission DataFrame
#submission = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"],
#                           "bank_account": y_final_test})
#submission.to_csv('first_submission1.csv', index = False)

### Adaboost with hyperparameter tuning with oversampling as hyperparameter

In [38]:
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", RandomOverSampler()),
    ("classifier", AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=2,min_samples_split=6),
        random_state=RSEED
    ))
])

In [39]:
param_grid = [
    {
        'preprocessor__numerical_pipeline__scaler': [MinMaxScaler()],
        "sampler": [
            RandomOverSampler(random_state=RSEED, sampling_strategy={1: 15_000}),
            SMOTE(sampling_strategy={1:15_000}, random_state=RSEED),
            ImbPipeline([
                ("smote", SMOTE(sampling_strategy={1:15_000}, random_state=RSEED)),
                ("nearmiss", NearMiss())
            ])
        ],
        "classifier__n_estimators": [300, 400, 500],
        "classifier__learning_rate": [0.4, 0.5, 0.6]
    }
]

In [40]:
gs = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # or f1, balanced_accuracy, etc.
    cv=5,
    n_jobs=-1,
    verbose=2
)

gs.fit(X_train, y_train.iloc[:, 0])

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [41]:
best_model2 = gs.best_estimator_
print(best_model2)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_pipeline',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['country', 'year',
                                                   'location_type',
                                                   'cellphone_access',
                                                   'gender_of_respondent',
                                                   'relationship_with_head',
                                                   'marital_status',
                                                   'education_level',
                                                   'job_type']),
                                                 ('nu

In [42]:
y_test_pred_ad_rnd3_hyp = best_model2.predict(X_test)
y_test_pred_ad_rnd3_hyp_probs = best_model2.predict_proba(X_test)[:, 1]

In [43]:
print(f"Adaboost SMOTE with tuning: MAE = {mean_absolute_error(y_test, y_test_pred_ad_rnd3_hyp)}")
print(f"Adaboost SMOTE with tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_ad_rnd3_hyp_probs)}")

Adaboost SMOTE with tuning: MAE = 0.11647678966162217
Adaboost SMOTE with tuning: roc curve score = 0.8621573877287229


In [44]:
joblib.dump(best_model2, "ad_model.joblib")

['ad_model.joblib']

In [45]:
#test_copy = test.copy(deep=True)
#test_copy = clean(test_copy)
#y_final_test = best_model.predict(test_copy)
# Create submission DataFrame
#submission = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"],
#                           "bank_account": y_final_test})
#submission.to_csv('first_submission.csv', index = False)

### XGBoost with Randomized search

In [46]:
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", SMOTE(sampling_strategy={1:15_000}, random_state=RSEED)),
    ("classifier", XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=RSEED
    ))
])

In [47]:
from scipy.stats import uniform, randint

param_dist = {
    "preprocessor__numerical_pipeline__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
    "classifier__n_estimators": randint(100, 600),
    "classifier__max_depth": randint(3, 10),
    "classifier__learning_rate": uniform(0.01, 0.2),
    "classifier__subsample": uniform(0.6, 0.4),
    "classifier__colsample_bytree": uniform(0.6, 0.4),
    "classifier__min_child_weight": randint(1, 10),
    "classifier__gamma": uniform(0, 0.5)
}

In [48]:
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring="neg_mean_absolute_error",
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=RSEED
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END classifier__max_depth=None, classifier__max_features=log2, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200, preprocessor__numerical_pipeline__scaler=StandardScaler(); total time=   9.4s
[CV] END classifier__max_depth=None, classifier__max_features=log2, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=400, preprocessor__numerical_pipeline__scaler=StandardScaler(); total time=  18.6s
[CV] END classifier__max_depth=None, classifier__max_features=log2, classifier__min_samples_leaf=1, classifier__min_samples_split=10, classifier__n_estimators=300, preprocessor__numerical_pipeline__scaler=StandardScaler(); total time=  13.3s
[CV] END classifier__max_depth=None, classifier__max_features=log2, classifier__min_samples_leaf=1, classifier__min_samples_split=10, classifier__n_estimators=400, preprocessor__numerical_pipeline__scaler=StandardS

In [49]:
best_model3 = search.best_estimator_

In [50]:
print(best_model3)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_pipeline',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['country', 'year',
                                                   'location_type',
                                                   'cellphone_access',
                                                   'gender_of_respondent',
                                                   'relationship_with_head',
                                                   'marital_status',
                                                   'education_level',
                                                   'job_type']),
                                                 ('nu

In [51]:
y_test_pred_xg_rnd_hyp = best_model3.predict(X_test)
y_test_pred_xg_rnd_hyp_probs = best_model3.predict_proba(X_test)[:, 1]

In [52]:
print(f"XGBoost with randomized tuning: MAE = {mean_absolute_error(y_test, y_test_pred_xg_rnd_hyp)}")
print(f"XGBoost with randomized tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_xg_rnd_hyp_probs)}")

XGBoost with randomized tuning: MAE = 0.1190273762965482
XGBoost with randomized tuning: roc curve score = 0.8658795272526676


### XGBoost with grid search

In [53]:
param_grid = {
    "classifier__learning_rate": [0.11, 0.13],
    "classifier__max_depth": [4, 5],
    "classifier__min_child_weight": [6, 7],
    "classifier__n_estimators": [500, 550],
    "classifier__colsample_bytree": [0.83, 0.87],
    "classifier__gamma": [0.30, 0.40],
    "classifier__subsample": [0.80, 0.85]
}

In [59]:
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=0
)

grid_search.fit(X_train, y_train)

In [60]:
best_model4 = grid_search.best_estimator_

In [61]:
y_test_pred_xg_grd_hyp = best_model4.predict(X_test)
y_test_pred_xg_grd_hyp_probs = best_model4.predict_proba(X_test)[:, 1]

In [63]:
print(f"XGBoost with grid tuning: MAE = {mean_absolute_error(y_test, y_test_pred_xg_grd_hyp)}")
print(f"XGBoost with grid tuning: roc curve score = {roc_auc_score(y_test, y_test_pred_xg_grd_hyp_probs)}")

XGBoost with grid tuning: MAE = 0.11953749362353341
XGBoost with grid tuning: roc curve score = 0.8663522937060396


In [64]:
joblib.dump(best_model4, "xg_model.joblib")

['xg_model.joblib']

## Combining 3 models

In [66]:
y_test_pred_mode = (
    y_test_pred_rf_grd_hyp
    + y_test_pred_ad_rnd3_hyp
    + y_test_pred_xg_grd_hyp
    >= 2
).astype(int)

In [67]:
print(f"Mode: MAE = {mean_absolute_error(y_test, y_test_pred_mode)}")

Mode: MAE = 0.1168168678796123
[CV] END classifier__colsample_bytree=0.9746919954946938, classifier__gamma=0.06876047207299663, classifier__learning_rate=0.0782132702100517, classifier__max_depth=3, classifier__min_child_weight=2, classifier__n_estimators=250, classifier__subsample=0.8165791895310264, preprocessor__numerical_pipeline__scaler=MinMaxScaler(); total time=  15.0s
[CV] END classifier__colsample_bytree=0.7031766510860622, classifier__gamma=0.3299920230170895, classifier__learning_rate=0.17344444004024318, classifier__max_depth=3, classifier__min_child_weight=4, classifier__n_estimators=159, classifier__subsample=0.7043316699321636, preprocessor__numerical_pipeline__scaler=StandardScaler(); total time=   7.7s
[CV] END classifier__colsample_bytree=0.8903822715480958, classifier__gamma=0.44855512997628855, classifier__learning_rate=0.18741728485302347, classifier__max_depth=3, classifier__min_child_weight=8, classifier__n_estimators=215, classifier__subsample=0.6336559859980195