In [1]:
# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import make_pipeline as imb_pipe
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import mutual_info_classif


# Notebook-wide configuration.
# RSEED is the one seed handed to every `random_state` argument used in this notebook.
RSEED = 42
# Silence all warnings for the rest of the session (same filter as filterwarnings("ignore")).
warnings.simplefilter("ignore")

In [2]:
# Read the four CSV files in one pass; the order of names matches the order of paths.
train, test, ss, variables = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Train.csv',                # Training dataset
        'data/Test.csv',                 # Test dataset (no labels)
        'data/SampleSubmission.csv',     # Sample submission format
        'data/VariableDefinitions.csv',  # Data dictionary
    )
)

### Train test split

In [3]:
# Separate the label from the features.
# y is kept as a one-column DataFrame (double brackets); X is every other column.
y = train[['bank_account']]
X = train.drop(columns='bank_account')

In [4]:
# Stratified hold-out split; the test fraction is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RSEED,
    stratify=y,
)
# Show the results of the split
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

Training set has 17643 samples.
Testing set has 5881 samples.


### Cleaning

In [5]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# - `replace` leaves values that are already 0/1 untouched, so re-running this cell is a no-op
#   (a second `.map` pass would turn every label into NaN).
# - `astype('int64')` raises on any label outside the mapping instead of silently producing NaN.
# - `assign` builds a new frame, so nothing is written into the objects returned by the split.
label_codes = {'Yes': 1, 'No': 0}
y_train = y_train.assign(
    bank_account=y_train['bank_account'].replace(label_codes).astype('int64')
)
y_test = y_test.assign(
    bank_account=y_test['bank_account'].replace(label_codes).astype('int64')
)

In [6]:
# Columns routed to the one-hot encoding branch of the preprocessor.
categorical_variables = [
    'country',
    'year',
    'location_type',
    'cellphone_access',
    'gender_of_respondent',
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type',
]
# Columns routed to the scaling branch of the preprocessor.
numerical_variables = [
    'household_size',
    'age_of_respondent',
]

In [7]:
def clean(df_in, numeric_cols=None, id_col='uniqueid'):
    """Return a cleaned copy of a feature frame; the input is left untouched.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to clean.
    numeric_cols : list of str, optional
        Columns to cast to float. Defaults to the notebook-level
        ``numerical_variables`` list.
    id_col : str, default 'uniqueid'
        Identifier column to remove. It is skipped if absent, so applying
        ``clean`` to an already-cleaned frame (e.g. when a cell is re-run)
        gives the same result instead of raising KeyError.

    Returns
    -------
    pandas.DataFrame
        New frame without ``id_col`` and with ``numeric_cols`` as float.

    Raises
    ------
    KeyError
        If one of ``numeric_cols`` is not a column of the frame.
    """
    if numeric_cols is None:
        numeric_cols = numerical_variables
    df = df_in.copy(deep=True)
    # errors='ignore' is what makes a second application a no-op.
    df = df.drop(columns=id_col, errors='ignore')
    df = df.astype({col: float for col in numeric_cols})
    return df

In [8]:
# Apply the same cleaning to both sides of the hold-out split.
X_train, X_test = (clean(split_frame) for split_frame in (X_train, X_test))

### Pipeline for data cleaning and feature engineering

In [9]:
# Categorical branch: one-hot encode, dropping the first level of every feature.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising at transform time. Without it, a rare level missing from a CV training
# fold, or a level that only occurs in Test.csv, aborts the fit/predict call.
categorical_steps = [('ohe', OneHotEncoder(drop = 'first',
                                           sparse_output=False,
                                           handle_unknown='ignore'))]
categorical_pipeline = Pipeline(steps=categorical_steps)
# Numerical branch: the step is named 'scaler' so the search grids can swap the scaler in and out.
numerical_steps = [('scaler', RobustScaler())]
numerical_pipeline = Pipeline(steps=numerical_steps)
transformers = [('categorical_pipeline', categorical_pipeline, categorical_variables),
                  ('numerical_pipeline', numerical_pipeline, numerical_variables)]

# Columns not listed in either branch are dropped; the transformed output is a pandas DataFrame.
preprocessor = ColumnTransformer(transformers=transformers,
                                         remainder = 'drop').set_output(transform='pandas')

In [10]:
# Fit the preprocessor on the training features only.
# This same object is also used as the first step of the pipelines defined below.
preprocessor.fit(X_train)

## Modelling

### AdaBoost without hyperparameter tuning

In [11]:
# Baseline: AdaBoost over depth-1, class-weighted trees, using the estimator's other defaults.
baseline_stump = DecisionTreeClassifier(max_depth=1, class_weight='balanced')
baseline_booster = AdaBoostClassifier(estimator=baseline_stump, random_state=RSEED)
pipeline_ad = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", baseline_booster),
])

In [12]:
# Pass the target as a 1-D Series, as the two search cells further down do, rather than as a
# one-column DataFrame that scikit-learn would have to flatten itself.
pipeline_ad.fit(X_train, y_train.iloc[:, 0])

In [13]:
# Hold-out predictions of the baseline model:
# column 1 of predict_proba (the score used for ROC AUC) and the hard labels (used for MAE).
y_test_pred_ad_probs = pipeline_ad.predict_proba(X_test)[:, 1]
y_test_pred_ad = pipeline_ad.predict(X_test)

In [14]:
# Score the baseline on the hold-out set, then report both metrics.
baseline_mae = mean_absolute_error(y_test, y_test_pred_ad)
baseline_auc = roc_auc_score(y_test, y_test_pred_ad_probs)
print(f"Adaboost MAE = {baseline_mae}")
print(f"Adaboost roc curve score = {baseline_auc}")

Adaboost MAE = 0.22309131100153035
Adaboost roc curve score = 0.8607698014572105


### AdaBoost with hyperparameter tuning and random oversampling

In [15]:
# Tunable variant: the base tree is left at its defaults because the search below
# sets its depth and min_samples_split. Note that this rebinds `pipeline_ad`.
tunable_booster = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=RSEED,
)
pipeline_ad = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", tunable_booster),
])

In [16]:
# Count the rows per class in the training labels.
y_train.value_counts()

bank_account
0               15159
1                2484
Name: count, dtype: int64

In [17]:
# Over-sample label 1 until that class has 15,000 rows.
ros = RandomOverSampler(sampling_strategy={1: 15_000}, random_state=RSEED)

# pipeline
# Resample first, then run preprocessing + AdaBoost. The step names are spelled out exactly as
# make_pipeline would generate them, because the search grid addresses parameters through the
# 'pipeline__' prefix.
ros_pipeline = ImbPipeline(steps=[
    ('randomoversampler', ros),
    ('pipeline', pipeline_ad),
])

In [18]:
# Candidate values for the randomized search.
# Each key is the '__'-separated path from the outer pipeline down to the parameter.
param_grid = {
    # AdaBoost itself
    'pipeline__classifier__n_estimators': [100, 200, 300],
    'pipeline__classifier__learning_rate': [0.01, 0.1, 0.5, 1.0],
    # Base decision tree
    'pipeline__classifier__estimator__max_depth': [1, 2],
    'pipeline__classifier__estimator__min_samples_split' : np.arange(2, 8, 2),
    # Scaler used for the numerical columns
    'pipeline__preprocessor__numerical_pipeline__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
}

In [19]:
# Randomized search: 50 sampled parameter combinations, each scored with 5-fold CV.
random_search = RandomizedSearchCV(
    estimator=ros_pipeline,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',  # or recall, f1, roc_auc
    random_state=RSEED,
    verbose=1,
    n_jobs=-1,
)

In [20]:
# Run the search; the target is passed as a 1-D Series (the only column of y_train).
random_search.fit(X_train, y_train['bank_account'])

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [21]:
# Take the search's best_estimator_ and print its configuration.
best_model = random_search.best_estimator_
print(best_model)

Pipeline(steps=[('randomoversampler',
                 RandomOverSampler(random_state=42,
                                   sampling_strategy={1: 15000})),
                ('pipeline',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('categorical_pipeline',
                                                                   Pipeline(steps=[('ohe',
                                                                                    OneHotEncoder(drop='first',
                                                                                                  sparse_output=False))]),
                                                                   ['country',
                                                                    'year',
                                                                    'location_type',
                                                                    'cellphone_access',
                   

In [22]:
# Hold-out predictions of the tuned, randomly over-sampled model:
# column 1 of predict_proba (for ROC AUC) and the hard labels (for MAE).
y_test_pred_ad_rnd_hyp_probs = best_model.predict_proba(X_test)[:, 1]
y_test_pred_ad_rnd_hyp = best_model.predict(X_test)

In [23]:
# Score the tuned model on the hold-out set, then report both metrics.
ros_tuned_mae = mean_absolute_error(y_test, y_test_pred_ad_rnd_hyp)
ros_tuned_auc = roc_auc_score(y_test, y_test_pred_ad_rnd_hyp_probs)
print(f"Adaboost Random Oversample with tuning: MAE = {ros_tuned_mae}")
print(f"Adaboost Random Oversample with tuning: roc curve score = {ros_tuned_auc}")

Adaboost Random Oversample with tuning: MAE = 0.20625743921101852
Adaboost Random Oversample with tuning: roc curve score = 0.868739549184442


In [24]:
#test_copy = test.copy(deep=True)
#test_copy = clean(test_copy)
#y_final_test = best_model.predict(test_copy)
# Create submission DataFrame
#submission = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"],
#                           "bank_account": y_final_test})
#submission.to_csv('first_submission1.csv', index = False)

### AdaBoost with hyperparameter tuning, treating the resampling strategy as a hyperparameter

In [25]:
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", RandomOverSampler()),
    ("classifier", AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=2,min_samples_split=6),
        random_state=RSEED
    ))
])

In [26]:
param_grid = [
    {
        'preprocessor__numerical_pipeline__scaler': [MinMaxScaler()],
        "sampler": [
            RandomOverSampler(random_state=RSEED, sampling_strategy={1: 15_000}),
            SMOTE(sampling_strategy={1:15_000}, random_state=RSEED),
            ImbPipeline([
                ("smote", SMOTE(sampling_strategy={1:15_000}, random_state=RSEED)),
                ("nearmiss", NearMiss())
            ])
        ],
        "classifier__n_estimators": [300, 400, 500],
        "classifier__learning_rate": [0.4, 0.5, 0.6]
    }
]

In [27]:
# Exhaustive search over param_grid, each candidate scored with 5-fold CV.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',  # or f1, balanced_accuracy, etc.
    verbose=2,
    n_jobs=-1,
)

# The target is passed as a 1-D Series (the only column of y_train).
gs.fit(X_train, y_train['bank_account'])

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [28]:
# Take the grid search's best_estimator_ (this rebinds `best_model`) and print its configuration.
best_model = gs.best_estimator_
print(best_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_pipeline',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False))]),
                                                  ['country', 'year',
                                                   'location_type',
                                                   'cellphone_access',
                                                   'gender_of_respondent',
                                                   'relationship_with_head',
                                                   'marital_status',
                                                   'education_level',
                                                   'job_type']),
                                                 ('nu

In [29]:
# Hold-out predictions of the grid-searched model:
# column 1 of predict_proba (for ROC AUC) and the hard labels (for MAE).
y_test_pred_ad_rnd3_hyp_probs = best_model.predict_proba(X_test)[:, 1]
y_test_pred_ad_rnd3_hyp = best_model.predict(X_test)

In [30]:
# The resampler is itself chosen by the grid search, so the label is derived from the fitted
# pipeline rather than hard-coded: every step exposing `fit_resample` is a resampler.
selected_samplers = [
    type(step).__name__
    for _, step in best_model.steps
    if hasattr(step, "fit_resample")
]
sampler_label = " + ".join(selected_samplers) or "no resampling"
grid_tuned_mae = mean_absolute_error(y_test, y_test_pred_ad_rnd3_hyp)
grid_tuned_auc = roc_auc_score(y_test, y_test_pred_ad_rnd3_hyp_probs)
print(f"Adaboost {sampler_label} with tuning: MAE = {grid_tuned_mae}")
print(f"Adaboost {sampler_label} with tuning: roc curve score = {grid_tuned_auc}")

Adaboost SMOTE with tuning: MAE = 0.11647678966162217
Adaboost SMOTE with tuning: roc curve score = 0.8621573877287229


In [32]:
# Clean a copy of the unlabeled test set with the same function used for the training features.
test_copy = clean(test.copy(deep=True))
y_final_test = best_model.predict(test_copy)
# Create submission DataFrame
# Identifier format: "<uniqueid> x <country>", built from the untouched `test` frame.
submission_ids = test["uniqueid"] + " x " + test["country"]
submission = pd.DataFrame({
    "uniqueid": submission_ids,
    "bank_account": y_final_test,
})
submission.to_csv('first_submission.csv', index = False)