<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/subscriber_prediction_hackathon/notebooks/05_eda_baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# Standard library
import gc
import os
import time
import warnings
gc.enable()
# Silence every warning to keep cell output clean (this also hides deprecation notices).
warnings.filterwarnings(action='ignore')

# Data handling
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # never truncate columns when displaying wide frames
pd.set_option('display.precision', 4)  # show floats with 4 decimal places

# Modelling and evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss, roc_auc_score

# Single seed shared by the CV splitters, the models and NumPy's global RNG.
SEED = 2311
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
# Root of the competition's data folder in the GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/subscriber_prediction_hackathon/data'

# Read the three raw CSVs (training data, test data, sample submission) in one pass.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_URL}/raw/{stem}.csv')
    for stem in ('train', 'test', 'submission')
)

# Preprocessing

In [9]:
# Column dtypes and non-null counts of the raw training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        21000 non-null  int64 
 1   job        21000 non-null  object
 2   marital    21000 non-null  object
 3   education  21000 non-null  object
 4   default    21000 non-null  object
 5   balance    21000 non-null  int64 
 6   housing    21000 non-null  object
 7   loan       21000 non-null  object
 8   contact    21000 non-null  object
 9   day        21000 non-null  int64 
 10  month      21000 non-null  object
 11  duration   21000 non-null  int64 
 12  campaign   21000 non-null  int64 
 13  pdays      21000 non-null  int64 
 14  previous   21000 non-null  int64 
 15  poutcome   21000 non-null  object
 16  y_bool     21000 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 2.7+ MB


In [14]:
# Name of the label column in the training data.
TARGET = 'y_bool'

In [13]:
# One-hot encode the categorical (object) columns of each split.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# get_dummies only creates columns for categories present in the frame it is given,
# so encoding the splits independently can produce different column sets. Force the
# test frame onto the training feature columns (everything except the label):
# dummies missing from test are added as all-zero columns and dummies never seen in
# training are dropped, so `train[features]` and `test[features]` always line up.
test = test.reindex(
    columns=train.columns.drop(TARGET, errors='ignore'),
    fill_value=0,
)

In [18]:
# Model inputs: every column of the encoded test frame.
features = test.columns.tolist()
# The original integer-valued columns (the other features are one-hot dummies).
num_features = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]

# Adversarial validation

In [15]:
# Tag each row with its origin (0 = train, 1 = test); this is the adversarial label.
train = train.assign(set=0)
test = test.assign(set=1)
# Stack both splits into a single frame with a fresh 0..n-1 index.
composite = pd.concat((train, test), ignore_index=True)

In [16]:
# The adversarial task must not see the real label, so remove it from the stacked frame.
composite = composite.drop(columns=TARGET)
# Strip the temporary origin marker from the individual splits again.
train, test = (frame.drop(columns='set') for frame in (train, test))

In [20]:
%%time
X, y = composite[features], composite['set']
scores = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = ExtraTreesClassifier(
        n_estimators=150,  
        max_depth=7,
        n_jobs=-1, 
        random_state=SEED
    )

    model.fit(X_train, y_train)
    val_probs = model.predict_proba(X_val)[:, 1]

    score = roc_auc_score(y_val, val_probs)
    scores.append(score)
    print(f'Fold #{fold}: {score:.5f}')
    _ = gc.collect()

print(f'Avg AUC = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

Fold #0: 0.50383
Fold #1: 0.49668
Fold #2: 0.49250
Fold #3: 0.50228
Fold #4: 0.49848
Avg AUC = 0.49875 +/- 0.00404

CPU times: user 14.5 s, sys: 247 ms, total: 14.8 s
Wall time: 8.51 s


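An average AUC of about 0.5 means the classifier cannot tell train rows from test rows, i.e. the two sets cannot be differentiated based on features.  
Thus, we can trust the CV score as an estimate of performance on the test set.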

# Baseline #2

In [25]:
%%time
X, y = train[features], train[TARGET]
X_test = test[features]
scores = []
test_preds = {}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = ExtraTreesClassifier(
        n_estimators=250, 
        max_depth=6,
        n_jobs=-1, 
        random_state=SEED
    )

    model.fit(X_train, y_train)
    val_probs = model.predict_proba(X_val)[:, 1]
    test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

    score = log_loss(y_val, val_probs)
    scores.append(score)
    print(f'Fold #{fold}: Logloss = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

test_preds = pd.DataFrame.from_dict(test_preds)
test_preds['mean'] = test_preds.mean(axis=1)

Fold #0: Logloss = 0.58184
Fold #1: Logloss = 0.58170
Fold #2: Logloss = 0.58100
Fold #3: Logloss = 0.58208
Fold #4: Logloss = 0.58188

Avg Logloss = 0.58170 +/- 0.00037

CPU times: user 17.5 s, sys: 471 ms, total: 18 s
Wall time: 10.8 s


# Submission files

In [26]:
# Colab-only: mount Google Drive so the submission files can be written to it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [27]:
# Notebook id; gives this notebook its own sub-folder under the submissions directory.
NOTEBOOK = '05'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/subscriber_prediction_hackathon/submissions/nb_{NOTEBOOK}'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [28]:
def create_submission_files(test_preds, feature_set):
    """Write one submission CSV per column of ``test_preds``.

    Each file is a copy of the module-level ``sample_sub`` frame whose ``TARGET``
    column is replaced by one column of predictions, saved as
    ``{SUBMISSION_PATH}/{feature_set}_{column}.csv`` without the index.

    Parameters
    ----------
    test_preds : pd.DataFrame
        One column of test-set predictions per submission to create. Rows must be
        in the same order as the rows of ``sample_sub``.
    feature_set : str
        Prefix used in the output file names.

    Raises
    ------
    ValueError
        If a prediction column does not have as many rows as ``sample_sub``.
    """
    for col in test_preds.columns:
        sub = sample_sub.copy()
        # Assign by position rather than by index label: a Series would be aligned on
        # its index and silently produce NaNs wherever the labels do not match.
        sub[TARGET] = test_preds[col].to_numpy()
        sub.to_csv(f'{SUBMISSION_PATH}/{feature_set}_{col}.csv', index=False)

In [29]:
# Write one CSV per fold plus the fold average, with file names prefixed by 'dummies'.
create_submission_files(test_preds, 'dummies')