# Neo Bank Churn Prediction

[IMPORTANT: This notebook was an attempt to use Copilot to build a full training notebook for Kaggle; the attempt did not work well.]

This notebook synthesizes all the steps from the week_1 notebooks to build a complete Kaggle notebook for submission to the competition.

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss
from venn_abers import VennAbersCalibrator
import joblib
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

## Load and Merge Data

Load all the raw Parquet data: concatenate the training files into a single train dataset and load the test dataset.

In [None]:
# Raw data location. The training set is split across three Parquet files
# that are stacked into a single frame; the test set is a single file.
RAW_DATA_DIR = './week_1/data/raw'
train_files = [f'{RAW_DATA_DIR}/train_{part}.parquet' for part in (1, 2, 3)]
train_df = pd.concat([pd.read_parquet(file) for file in train_files], ignore_index=True)
test_df = pd.read_parquet(f'{RAW_DATA_DIR}/test.parquet')

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() added a stray "None" line.
print('Train Data Info:')
train_df.info()
print('Train Data Memory Usage:', train_df.memory_usage(deep=True).sum() / 1024**2, 'MB')
print('Train Data Date Range:', train_df['date'].min(), 'to', train_df['date'].max())

print('Test Data Info:')
test_df.info()
print('Test Data Date Range:', test_df['date'].min(), 'to', test_df['date'].max())

## Feature Engineering

Perform feature engineering on the full loaded data (train + test) and define the target variable (using the 420-day definition).

In [None]:
# Stack train and test so every feature is derived the same way for both.
# ignore_index=True gives a unique RangeIndex, which the index-aligned
# assignments below rely on.
full_df = pd.concat([train_df, test_df], ignore_index=True)
full_df['date_of_birth'] = pd.to_datetime(full_df['date_of_birth'])
full_df['date'] = pd.to_datetime(full_df['date'])

# Chronological per-customer view of the dates. diff()/shift() work on row
# order, so without this sort the gaps depend on how the files happened to be
# ordered and can come out negative. Results are assigned back by index, so
# the row order of full_df itself is left untouched.
customer_dates = (
    full_df.sort_values(['customer_id', 'date'], kind='stable')
    .groupby('customer_id')['date']
)

# Feature engineering steps
# Days since the customer's previous record (0 for their first record).
full_df['days_between'] = customer_dates.diff().dt.days.fillna(0)
full_df['customer_age'] = (full_df['date'] - full_df['date_of_birth']).dt.days / 365.25
full_df['from_competitor'] = full_df['from_competitor'].astype(int)
full_df['churn_due_to_fraud'] = full_df['churn_due_to_fraud'].astype(int)
full_df['atm_transfer_in'] = np.log1p(full_df['atm_transfer_in'])
full_df['atm_transfer_out'] = np.log1p(full_df['atm_transfer_out'])

# Define target
# The previous label, `date < '2022-10-01'`, is the same expression that
# separates train from test, so it was constant inside each split and gave the
# model nothing to learn. It is replaced by an inactivity-gap label.
# NOTE(review): assumes the "420 days definition" means "no further record for
# this customer within 420 days" -- TODO confirm against the week_1 notebooks.
# NOTE(review): a customer's last record is labelled churned even when fewer
# than 420 days of data follow it (censored rows) -- confirm this is intended.
CHURN_GAP_DAYS = 420
full_df['next_date'] = customer_dates.shift(-1)
days_to_next = (full_df['next_date'] - full_df['date']).dt.days
full_df['churn'] = (days_to_next.isna() | (days_to_next > CHURN_GAP_DAYS)).astype(int)

# Aggregations
# NOTE(review): these statistics are computed over each customer's complete
# history in full_df (train and test rows together), so earlier rows carry
# information from later activity -- confirm this leakage is acceptable.
agg_funcs = {
    'days_between': ['count', 'mean', 'max'],
    'bank_transfer_in': 'mean',
    'bank_transfer_out': 'mean',
    'crypto_in': 'mean',
    'crypto_out': 'mean',
    'bank_transfer_in_volume': 'mean',
    'bank_transfer_out_volume': 'mean',
    'crypto_in_volume': 'mean',
    'crypto_out_volume': 'mean'
}

agg_df = full_df.groupby('customer_id').agg(agg_funcs)
# Flatten the (column, statistic) MultiIndex into names such as 'days_between_mean'.
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
full_df = full_df.merge(agg_df, on='customer_id', how='left')

## Split Data

Split the data into `train_df` (to perform training, tuning, and calibration) and `test_df` (to predict and save the prediction for submission to the competition).

In [None]:
# Rows before the split date are used for training; rows on or after it are
# the ones predicted for the submission.
SPLIT_DATE = '2022-10-01'

# Identifier, free-text, raw-date and label columns that must not be fed to
# the model.
NON_FEATURE_COLUMNS = [
    'churn', 'customer_id', 'date_of_birth', 'date', 'name', 'address',
    'touchpoints', 'csat_scores', 'Usage', 'next_date', 'split',
]

# .copy() detaches the two frames from full_df so that later column
# assignments do not trigger SettingWithCopyWarning.
train_df = full_df[full_df['date'] < SPLIT_DATE].copy()
test_df = full_df[full_df['date'] >= SPLIT_DATE].copy()

# errors='ignore': some of the listed columns (e.g. 'Usage', 'split') are not
# created anywhere in this notebook, and drop() raises KeyError for missing
# labels by default.
X_train = train_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = train_df['churn']

X_test = test_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = test_df['churn']

## Tune CatBoost Model

Tune the CatBoost model using all the training data with expanding window cross-validation.

In [None]:
SEED = 42
N_CV_FOLDS = 3

# Expanding-window folds inside the training period: each fold fits on every
# row dated up to a cutoff and validates on the rows between that cutoff and
# the next one. The test-period rows (X_test) are the ones that get submitted,
# so they must not be used to score or early-stop the trials.
# train_df shares its index with X_train / y_train, so the boolean masks align.
train_dates = train_df['date']
fold_cutoffs = train_dates.quantile(np.linspace(0.5, 1.0, N_CV_FOLDS + 1)).tolist()
cv_folds = [
    (train_dates <= start, (train_dates > start) & (train_dates <= end))
    for start, end in zip(fold_cutoffs[:-1], fold_cutoffs[1:])
]
# Heavily tied dates can leave a validation window empty; skip such folds.
cv_folds = [(fit_mask, val_mask) for fit_mask, val_mask in cv_folds if fit_mask.any() and val_mask.any()]


def objective(trial):
    """Return the mean validation log loss of one CatBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Log loss averaged over the expanding-window folds (lower is better).
    """
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-4, 1e-1, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-4, 1e-1, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 1e-4, 1e-1, log=True),
        'cat_features': ['country', 'broad_job_category'],
        'verbose': 0
    }
    fold_losses = []
    for fit_mask, val_mask in cv_folds:
        X_fit, y_fit = X_train.loc[fit_mask], y_train.loc[fit_mask]
        X_val, y_val = X_train.loc[val_mask], y_train.loc[val_mask]
        model = CatBoostClassifier(**params)
        model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=50)
        val_prob = model.predict_proba(X_val)[:, 1]
        # labels=[0, 1] keeps log_loss defined when a fold holds a single class.
        fold_losses.append(log_loss(y_val, val_prob, labels=[0, 1]))
    return float(np.mean(fold_losses))


# A seeded sampler makes the search reproducible across kernel restarts.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=50)

# best_params holds only the values sampled through `trial.suggest_*`; the fixed
# entries ('cat_features', 'verbose') are not part of it.
best_params = study.best_params
print(f'Best parameters: {best_params}')

## Train Final Model with Venn Abers Calibration

Train a final CatBoost model with the tuned parameters, calibrated with Venn-Abers CVAP (Cross Venn-Abers Predictor).

In [None]:
# study.best_params contains only the sampled hyper-parameters, so the fixed
# settings used during tuning have to be added back; without 'cat_features'
# CatBoost cannot consume the string-valued columns.
final_params = {**best_params, 'cat_features': ['country', 'broad_job_category'], 'verbose': 0}

# Evaluate on a time-ordered holdout taken from the training period (latest
# ~20% of rows by date). The test-period rows are the ones being submitted, so
# scoring against y_test does not measure real performance.
# train_df shares its index with X_train / y_train, so the masks align.
holdout_cutoff = train_df['date'].quantile(0.8)
fit_mask = train_df['date'] <= holdout_cutoff
val_mask = ~fit_mask

va_eval = VennAbersCalibrator(estimator=CatBoostClassifier(**final_params), inductive=False, n_splits=2)
va_eval.fit(X_train.loc[fit_mask], y_train.loc[fit_mask])
val_prob = va_eval.predict_proba(X_train.loc[val_mask])[:, 1]
y_val = y_train.loc[val_mask]

logloss_best = log_loss(y_val, val_prob, labels=[0, 1])
roc_auc_best = roc_auc_score(y_val, val_prob)
avg_precision_best = average_precision_score(y_val, val_prob)
brier_best = brier_score_loss(y_val, val_prob)

print(f'Log Loss: {logloss_best}')
print(f'ROC AUC: {roc_auc_best}')
print(f'Average Precision: {avg_precision_best}')
print(f'Brier Score: {brier_best}')

# Refit on every labelled training row for the submission. The estimator is
# handed over unfitted, as in the venn-abers README examples; the calibrator
# fits it itself during cross Venn-Abers (inductive=False), so the earlier
# stand-alone fit against the test set was redundant.
model_best = CatBoostClassifier(**final_params)
va = VennAbersCalibrator(estimator=model_best, inductive=False, n_splits=2)
va.fit(X_train, y_train)
va_cv_prob = va.predict_proba(X_test)

## Save the Prediction

Save the prediction from the test dataset for submission to the leaderboard.

In [None]:
# Pair every test row's customer id with column 1 of `va_cv_prob` and write
# the two-column table to disk without the index.
submission = test_df[['customer_id']].assign(churn_probability=va_cv_prob[:, 1])
submission.to_csv('submission.csv', index=False)

print('Submission saved to submission.csv')