In [1]:
%load_ext autoreload
%autoreload 2
import os
import gc

os.chdir('../../')

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from global_code.util import reduce_mem_usage, clf_metric_report, compute_and_plot_permutation_importance, plot_pr_calib_curve, plot_dis_probs, plot_shap_values
import optuna
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib

# Apply the seaborn 'whitegrid' theme to all plots in this notebook
# (`sns.set` is an alias of `sns.set_theme`).
sns.set_theme(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


### Read the parquet file located at `./week_1/data/processed/feature_engineering_dataset_v2.parquet`

In [5]:
# Columns removed right after loading; they are not kept in the modelling frame.
non_features_list = [
    'customer_id', 'name', 'date_of_birth', 'address', 'touchpoints',
    'csat_scores', 'Usage', 'churn', 'next_date', 'days_diff', 'job',
]

# Load the engineered dataset and discard the non-feature columns in one step.
df = pd.read_parquet(
    './week_1/data/processed/feature_engineering_dataset_v2.parquet'
).drop(columns=non_features_list)

# Summary of the remaining columns, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5286530 entries, 0 to 3668009
Columns: 190 entries, Id to churn_18_months
dtypes: bool(1), category(2), datetime64[ns](2), float16(112), float32(49), int16(5), int32(3), int8(15), object(1)
memory usage: 2.4+ GB


### Train, Validation, Test Split

In [6]:
# --- Split configuration ------------------------------------------------------
# Active configuration: target - Inactivity >= 420 days
train_max_date = '2021-10-01'
validation_max_date = '2022-10-01'
test_start_date = '2024-01-01'
target = 'churn_420'

# Alternative configurations, kept for reference (uncomment one block to switch):
#
# target - Inactivity 365 days
#train_max_date = '2022-01-01'
#validation_max_date = '2023-01-01'
#test_start_date = '2024-01-01'
#target = 'churn_365'
#
# target - no activity after 2022-06-01
#train_max_date = '2019-06-01'
#validation_max_date = '2022-06-01'
#test_start_date = '2024-01-01'
#target = 'churn_18_months'

# --- Feature list -------------------------------------------------------------
# The feature names are read from features_list.json.
with open('./week_1/data/processed/features_list.json', 'r') as features_file:
    features = json.load(features_file)

# --- Date-based windows -------------------------------------------------------
# Train: rows dated before train_max_date.
# Validation: rows dated in [train_max_date, validation_max_date).
model_columns = features + [target]
in_train_window = df['date'] < train_max_date
in_validation_window = (df['date'] >= train_max_date) & (df['date'] < validation_max_date)

train_df = df.loc[in_train_window, model_columns]
validation_df = df.loc[in_validation_window, model_columns]

# --- Subsampling --------------------------------------------------------------
# Keep only a 10% stratified sample of the training window to speed up the
# process (the second frame returned by train_test_split is the `test_size`
# share). The validation window is not subsampled here.
_, train_df = train_test_split(
    train_df, test_size=0.10, random_state=42, stratify=train_df[target]
)

# Split the original validation window into two shuffled, stratified datasets:
# 80% used for validation only and 20% held out for calibration.
validation_df, calibration_df = train_test_split(
    validation_df, test_size=0.20, random_state=42, stratify=validation_df[target]
)

# Report the resulting shapes.
for label, frame in (
    ('Train Shape: ', train_df),
    ('Validation shape: ', validation_df),
    ('Calibration shape: ', calibration_df),
):
    print(label, frame.shape)

Train Shape:  (306649, 184)
Validation shape:  (283510, 184)
Calibration shape:  (70878, 184)


In [7]:
# Persist the full-size train / validation / calibration splits.
train_df.to_parquet('./week_1/data/processed/train_df_v2.parquet')
validation_df.to_parquet('./week_1/data/processed/validation_df_v2.parquet')
calibration_df.to_parquet('./week_1/data/processed/calibration_df_v2.parquet')

# Save a smaller, stratified version of each dataset for faster hyperparameter
# optimization.
# NOTE(review): with `_, x = train_test_split(x, test_size=0.60)` the frame that
# is kept is the 60% share, not 40% as an earlier comment stated - confirm which
# fraction is intended.
# NOTE: this cell rebinds train_df / validation_df / calibration_df to the
# subsamples, so it is not idempotent: re-running it without first re-running
# the split cell would overwrite the *_v2 files with subsampled data.
small_fraction = 0.60

_, train_df = train_test_split(
    train_df, test_size=small_fraction, random_state=42, stratify=train_df[target]
)
train_df.to_parquet('./week_1/data/processed/train_df_small.parquet')

_, validation_df = train_test_split(
    validation_df, test_size=small_fraction, random_state=42, stratify=validation_df[target]
)
validation_df.to_parquet('./week_1/data/processed/validation_df_small.parquet')

_, calibration_df = train_test_split(
    calibration_df, test_size=small_fraction, random_state=42, stratify=calibration_df[target]
)
calibration_df.to_parquet('./week_1/data/processed/calibration_df_small.parquet')