### Import libraries

In [1]:
import joblib
import pandas as pd
from numpy import arange
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer, recall_score
from constants import target_col as target_name, output_dir, model_fname, report_cmap, seed, read_config
from utilities import do_modeling, get_evaluation_report

### Configure defaults

In [2]:
# Column holding model predictions, and the target name wrapped in a list
# so it can be passed directly to DataFrame.drop(columns=...)
target_col = [target_name]
prediction_column = 'prediction'

# Standardized splits: train / validation / test
train_std_path, val_std_path, test_std_path = (
    './outputs/std/train.csv',
    './outputs/std/validation.csv',
    './outputs/std/test.csv',
)

# Min-max scaled splits: train / validation / test
train_mm_path, val_mm_path, test_mm_path = (
    './outputs/min_max/train.csv',
    './outputs/min_max/validation.csv',
    './outputs/min_max/test.csv',
)

# Global matplotlib style
plt.style.use('fivethirtyeight')

### Read data

In [3]:
# Load the three standardized splits, then the three min-max scaled splits,
# every file with the shared reader options imported as read_config
train_std, val_std, test_std = (
    pd.read_csv(csv_path, **read_config)
    for csv_path in (train_std_path, val_std_path, test_std_path)
)

train_mm, val_mm, test_mm = (
    pd.read_csv(csv_path, **read_config)
    for csv_path in (train_mm_path, val_mm_path, test_mm_path)
)

### Data check

In [4]:
# Preview the first rows of the standardized training split
train_std.head()

Unnamed: 0_level_0,attrition_flag,gender_F,gender_M,education_level_College,education_level_Doctorate,education_level_Graduate,education_level_High School,education_level_Post-Graduate,education_level_Uneducated,education_level_Unknown,...,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
clientnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
779040408,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.53119,-1.329577,-0.410992,0.396539,0.670714,0.339143,-0.855777,-1.343908,-0.882476,-0.48335
714212358,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-1.82318,0.674102,-0.410992,-0.369006,0.67563,-1.81847,-0.661828,-0.328752,-0.146956,0.188227
714383583,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.760799,0.674102,0.494007,1.701759,0.663338,0.35277,1.237945,1.363175,0.79752,-0.735191
779536233,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.82318,-1.329577,-1.315992,-0.791731,-0.682639,1.093172,0.076291,0.771001,0.074536,0.531316
720512433,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.406794,1.675941,0.494007,0.732578,1.382422,0.330059,0.939718,0.55951,0.998116,-0.446851


In [5]:
# Preview the first rows of the min-max scaled training split
train_mm.head()

Unnamed: 0_level_0,attrition_flag,gender_F,gender_M,education_level_College,education_level_Doctorate,education_level_Graduate,education_level_High School,education_level_Post-Graduate,education_level_Uneducated,education_level_Unknown,...,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
clientnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
779040408,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.4,0.166667,0.333333,0.324772,0.676202,0.245511,0.052802,0.178295,0.140017,0.140704
714212358,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.5,0.333333,0.115537,0.677791,0.105681,0.089785,0.364341,0.189303,0.325628
714383583,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.8,0.5,0.5,0.681507,0.673818,0.246394,0.452044,0.674419,0.25259,0.071357
779536233,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.166667,0.0,0.238776,0.294377,0.230534,0.565891,0.204144,0.420101
720512433,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.666667,0.5,0.416616,0.906238,0.244922,0.395177,0.527132,0.266032,0.150754


In [6]:
# One output line per scaling variant, listing the shape of its three splits
shape_rows = (
    ('Standardized train set shape:', train_std, val_std, test_std),
    ('MinMax scaled train set shape:', train_mm, val_mm, test_mm),
)
for heading, train_part, val_part, test_part in shape_rows:
    print(
        heading, train_part.shape,
        'validation set shape:', val_part.shape,
        'test set shape:', test_part.shape
    )

Standardized train set shape: (7315, 23) validation set shape: (1292, 23) test set shape: (1520, 23)
MinMax scaled train set shape: (7315, 23) validation set shape: (1292, 23) test set shape: (1520, 23)


In [7]:
print('NA count by column and dataset')

# Rows are dataset columns, columns are the six splits; each cell is the
# number of missing values found in that column of that split
NA_df = pd.DataFrame({
    'train_standardized': train_std.isna().sum(),
    'validation_standardized': val_std.isna().sum(),
    'test_standardized': test_std.isna().sum(),
    'train_minmax': train_mm.isna().sum(),
    'validation_minmax': val_mm.isna().sum(),
    'test_minmax': test_mm.isna().sum(),
})
NA_df

NA count by column and dataset


Unnamed: 0,train_standardized,validation_standardized,test_standardized,train_minmax,validation_minmax,test_minmax
attrition_flag,0,0,0,0,0,0
gender_F,0,0,0,0,0,0
gender_M,0,0,0,0,0,0
education_level_College,0,0,0,0,0,0
education_level_Doctorate,0,0,0,0,0,0
education_level_Graduate,0,0,0,0,0,0
education_level_High School,0,0,0,0,0,0
education_level_Post-Graduate,0,0,0,0,0,0
education_level_Uneducated,0,0,0,0,0,0
education_level_Unknown,0,0,0,0,0,0


**Observation**
- Each dataset has `23` columns (target included), and the data was already divided into a train set with `7315` rows, a validation set with `1292` rows and a test set with `1520` rows `(0.7225 : 0.1275 : 0.15)`
- The original data had `0` missing values, and the check above confirms that no missing values were introduced into any of the three splits by a bug during preprocessing

### Separate target

In [8]:
def _split_features_target(frame):
    """Return (features, target): the frame without the target column, and the target column itself."""
    return frame.drop(columns=target_col), frame[target_name]


# Standardized splits
X_train_std, y_train_std = _split_features_target(train_std)
X_val_std, y_val_std = _split_features_target(val_std)
X_test_std, y_test_std = _split_features_target(test_std)

# Min-max scaled splits
X_train_mm, y_train_mm = _split_features_target(train_mm)
X_val_mm, y_val_mm = _split_features_target(val_mm)
X_test_mm, y_test_mm = _split_features_target(test_mm)

### Modeling

In [9]:
# Build the candidate models once per scaling variant of the training data.
# do_modeling is a project helper (utilities module); it appears to return a
# dict mapping algorithm name -> estimator — TODO confirm the estimators are fitted.
std_models = do_modeling(X_train_std, y_train_std)
mm_models = do_modeling(X_train_mm, y_train_mm)

In [10]:
# Display both model collections: standardized inputs first, min-max inputs second
std_models, mm_models

({'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
  'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
  'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
  'Support Vector Machine': SVC(class_weight='balanced', random_state=42)},
 {'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
  'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
  'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
  'Support Vector Machine': SVC(class_weight='balanced', random_state=42)})

### Model selection

In [11]:
# Evaluate each model collection on its matching validation split.
# get_evaluation_report is a project helper (utilities module); presumably it
# returns a per-algorithm, per-class precision/recall/f1/support table — verify against its source.
std_eval_report = get_evaluation_report(std_models, X_val_std, y_val_std)
mm_eval_report = get_evaluation_report(mm_models, X_val_mm, y_val_mm)

In [12]:
# Validation report for the standardized-input models, shaded with the shared report colormap
std_eval_report.style.background_gradient(cmap=report_cmap)

Unnamed: 0,algorithm,class,precision,recall,f1,support
0,Decision Tree,0,0.954421,0.965867,0.96011,1084
1,Decision Tree,1,0.810256,0.759615,0.784119,208
2,Random Forest,0,0.957066,0.987085,0.971844,1084
3,Random Forest,1,0.91954,0.769231,0.837696,208
4,Logistic Regression,0,0.955301,0.847786,0.898338,1084
5,Logistic Regression,1,0.5,0.793269,0.613383,208
6,Support Vector Machine,0,0.978109,0.906827,0.94112,1084
7,Support Vector Machine,1,0.648084,0.894231,0.751515,208


In [13]:
# Validation report for the min-max-input models, shaded with the shared report colormap
mm_eval_report.style.background_gradient(cmap=report_cmap)

Unnamed: 0,algorithm,class,precision,recall,f1,support
0,Decision Tree,0,0.954463,0.96679,0.960587,1084
1,Decision Tree,1,0.814433,0.759615,0.78607,208
2,Random Forest,0,0.957105,0.988007,0.97231,1084
3,Random Forest,1,0.924855,0.769231,0.839895,208
4,Logistic Regression,0,0.957292,0.847786,0.899217,1084
5,Logistic Regression,1,0.503012,0.802885,0.618519,208
6,Support Vector Machine,0,0.968687,0.884686,0.924783,1084
7,Support Vector Machine,1,0.586093,0.850962,0.694118,208


**Observation**
- For churn prediction, recall is the primary metric for model evaluation, so that the selected model has the greatest ability to detect churning customers
- The support vector machine (SVM) algorithm with standardized inputs outperformed the other models on the churn prediction task for this dataset (highest class `1` recall on the validation set, `0.894`). Hence, the SVM algorithm will be further tuned

### Tuning

In [14]:
# Candidate regularisation strengths: 10**-4 up to 10**2 in half-decade
# steps, i.e. 13 values in total
params = dict(
    C=10 ** arange(-4, 2.5, 0.5)
)

# Optimise recall of the positive class (label 1 with recall_score's default
# pos_label) — presumably the churn class; confirm against the target encoding
scorer = make_scorer(recall_score, average='binary')

clf = RandomizedSearchCV(
    SVC(class_weight='balanced', random_state=seed),
    params,
    random_state=seed,
    cv=5,
    n_jobs=-1,
    scoring=scorer,
    # The candidates form a finite list, which is sampled without replacement.
    # Requesting more iterations than there are candidates (previously a fixed
    # 15 against 13 values) only triggers a warning, so cap at the grid size.
    n_iter=min(15, len(params['C']))
)

In [15]:
# The search cross-validates internally, so the train and validation rows are
# stacked into one fitting set instead of keeping a separate hold-out split
X_train_val_std = pd.concat([X_train_std, X_val_std], axis=0)
y_train_val_std = pd.concat([y_train_std, y_val_std], axis=0)

fitted = clf.fit(X=X_train_val_std, y=y_train_val_std)



In [16]:
# Estimator refit by the search using the best-scoring parameter candidate
fitted.best_estimator_

SVC(C=0.31622776601683794, class_weight='balanced', random_state=42)

In [17]:
# Mean cross-validated score of the best candidate, measured with the search's scorer
fitted.best_score_

0.8994872599801182

### Evaluation

In [18]:
# Score the tuned model on the held-out test split.
# The feature columns are selected explicitly because a later cell writes a
# prediction column into X_test_std; without the selection, re-running this
# cell afterwards would pass that non-feature column to the model.
test_features = X_test_std[X_train_val_std.columns]
preds = fitted.best_estimator_.predict(test_features)
print(classification_report(y_test_std, preds))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93      1276
           1       0.62      0.90      0.73       244

    accuracy                           0.90      1520
   macro avg       0.80      0.90      0.83      1520
weighted avg       0.92      0.90      0.90      1520



In [19]:
# Same report on the data the search was fitted on (train + validation rows),
# as a reference point for the test-set report
train_preds = fitted.best_estimator_.predict(X_train_val_std)
train_report = classification_report(y_train_val_std, train_preds)
print(train_report)

              precision    recall  f1-score   support

           0       0.99      0.91      0.95      7224
           1       0.67      0.94      0.79      1383

    accuracy                           0.92      8607
   macro avg       0.83      0.93      0.87      8607
weighted avg       0.94      0.92      0.92      8607



**Observation**
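- Parameter tuning yielded a model whose mean cross-validated recall (about `0.899`) is slightly higher than the recall of the default-settings model on the validation set (`0.894`); the two figures come from different evaluation protocols, so the improvement is indicative only
- Model performance on the test set (first report, class `1` recall `0.90`) is not far from performance on the combined train + validation set used for fitting (last report, class `1` recall `0.94`). Hence, it can be said that the model is not memorizing the training data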

### Prediction

In [20]:
# Store the test-set predictions next to the features.
# NOTE(review): this writes into the feature frame itself, so from here on
# X_test_std carries a non-feature column; any later code that uses
# X_test_std as model input must select the feature columns explicitly.
X_test_std[prediction_column] = preds

In [21]:
# Show 15 random test rows with prediction and true label side by side;
# the sample is seeded so the displayed rows are the same on every run
pd.concat([X_test_std[prediction_column], y_test_std], axis=1).sample(15, random_state=seed)

Unnamed: 0_level_0,prediction,attrition_flag
clientnum,Unnamed: 1_level_1,Unnamed: 2_level_1
794664183,0,0
712665633,1,0
712390683,0,0
713497983,1,0
708860358,0,0
719625933,0,0
813182658,0,0
708811233,0,0
715957233,0,0
719944458,0,0


### Serialization

In [22]:
# Persist the tuned estimator; joblib.dump returns the list of written file
# names, which becomes the cell output
model_path = f"{output_dir}/{model_fname}"
joblib.dump(fitted.best_estimator_, model_path)

['outputs/model.joblib']