# Objective
* To develop a model that identifies all actual positive cases, i.e. a 100% recall rate is the ideal
* False positives are acceptable
* False negatives are NOT acceptable

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import sklearn.model_selection as ms
import sklearn.metrics as m
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.svm as svm
import sklearn.linear_model as lm
import sklearn.preprocessing as pp
import sklearn.compose as compose
import sklearn.pipeline as pipe

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Columns removed right after loading: the two columns whose names start with
# "Naive_Bayes_Classifier_" and the CLIENTNUM column.
columns_to_drop = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM',
]

# Load the CSV and keep everything except the columns listed above.
data = pd.read_csv('../input/credit-card-customers/BankChurners.csv').drop(columns=columns_to_drop)

# Basic EDA

In [None]:
# Print the DataFrame summary: column names, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first 5 rows of the first 11 columns (the frame is shown in two halves).
data.iloc[:, :11].head(5)

In [None]:
# Preview the first 5 rows of the remaining columns (position 11 onwards).
data.iloc[:, 11:].head(5)

In [None]:
# Share of each Attrition_Flag value, expressed as a percentage of all rows.
data['Attrition_Flag'].value_counts(normalize=True) * 100

In [None]:
# Number of missing (null) values in each column.
data.isnull().sum()

In [None]:
# Show the distinct values of every column: column name, its unique values, then a blank line.
for col, column_values in data.items():
    print(col, column_values.unique(), '', sep='\n')

In [None]:
# Column labels of the working DataFrame.
data.columns

In [None]:
# Single-row look at the first 11 columns.
data.iloc[:, :11].head(1)

In [None]:
# Single-row look at the columns from position 11 onwards.
data.iloc[:, 11:].head(1)

# Data Processing

In [None]:
# Encode the two binary columns as integers (two-valued columns do not need one-hot encoding):
#   Attrition_Flag: 'Attrited Customer' -> 1 (positive class), 'Existing Customer' -> 0
#   Gender:         'M' -> 1, 'F' -> 0
#
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`. That form mutates an intermediate Series:
# pandas 2.x emits a warning for it, and with Copy-on-Write enabled it leaves `data`
# unchanged, so the labels would silently stay as strings.
#
# `.astype(int)` makes the integer dtype explicit rather than relying on `replace`
# downcasting an object column. It raises if a label outside the mapping is still present.
# Re-running the cell is safe: already-encoded values pass through untouched.
data['Attrition_Flag'] = (
    data['Attrition_Flag']
    .replace({'Attrited Customer': 1, 'Existing Customer': 0})
    .astype(int)
)
data['Gender'] = data['Gender'].replace({'M': 1, 'F': 0}).astype(int)

In [None]:
# Categorical columns that will be one-hot encoded.
cat_attribute_2b_encoded = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

# Every other column is numeric and will be scaled, except the target
# (Attrition_Flag) and the already binary-encoded Gender column.
_not_scaled = {'Attrition_Flag', 'Gender', *cat_attribute_2b_encoded}
num_attribute_2b_transform = [column for column in data.columns if column not in _not_scaled]

In [None]:
# Report how many columns go through each branch of the preprocessing.
print(
    f'No of numercial attribute: {len(num_attribute_2b_transform)}',
    f'No of categorical attribute: {len(cat_attribute_2b_encoded)}',
    sep='\n',
)

In [None]:
# Separate the feature matrix from the target column.
# `drop` returns a new frame, so `data` itself is left untouched.
X = data.drop(columns='Attrition_Flag')
y = data['Attrition_Flag'].copy()

# Transformation Pipeline

In [None]:
# Numeric branch: standardise each numeric column.
num_pipeline = pipe.Pipeline(steps=[('scaler', pp.StandardScaler())])

# Full preprocessing:
#   'num' -> scaled numeric columns
#   'cat' -> one-hot encoded categorical columns
# Columns not listed in either branch are passed through unchanged, and
# sparse_threshold=0 makes the transformer always return a dense array.
full_pipeline = compose.ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribute_2b_transform),
        ('cat', pp.OneHotEncoder(), cat_attribute_2b_encoded),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Fit the preprocessing on the full feature frame and transform it in one step.
# NOTE(review): this fit happens before the train/validation/test split below,
# so the scaler statistics are computed on all rows.
X_transform = full_pipeline.fit_transform(X)

# Show the shape of the transformed matrix and the fitted transformers.
print(X_transform.shape, full_pipeline.named_transformers_, sep='\n')

In [None]:
# Names of the columns produced by the fitted one-hot encoder.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# `get_feature_names_out()` is used when available. The legacy call is kept only as
# a fallback for old scikit-learn versions.
# NOTE: the exact name prefixes can differ between the two APIs (e.g. 'x0_...' vs
# '<column>_...'); the number of names is the same.
cat_encoder = full_pipeline.named_transformers_['cat']
if hasattr(cat_encoder, 'get_feature_names_out'):
    cat_transform_feature_names = list(cat_encoder.get_feature_names_out())
else:
    cat_transform_feature_names = list(cat_encoder.get_feature_names())

# Number of one-hot columns generated from the categorical attributes.
len(cat_transform_feature_names)

# Splitting Data

In [None]:
# Two successive stratified 75/25 splits:
#   1) all data         -> (train + validation) / test
#   2) train+validation -> train / validation
# Both splits share the same size, seed and shuffling options.
_split_options = dict(train_size=0.75, random_state=42, shuffle=True)

X_train_val, X_test, y_train_val, y_test = ms.train_test_split(
    X_transform, y, stratify=y, **_split_options
)
X_train, X_validation, y_train, y_validation = ms.train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, **_split_options
)

In [None]:
# Report the number of rows in the full dataset and in each split.
print(
    f'Total number of Instances: {X_transform.shape[0]}',
    f'Size of Training Dataset: {X_train.shape[0]}',
    f'Size of Validation Dataset: {X_validation.shape[0]}',
    f'Size of Testing Dataset: {X_test.shape[0]}',
    sep='\n',
)

# Candidates Models & Ensemble (BASELINE)

In [None]:
# Stand-alone baseline candidates, all with default hyperparameters and a fixed seed.
rf_clf = ensemble.RandomForestClassifier(random_state=42)
dt_clf = tree.DecisionTreeClassifier(random_state=42)
ext_clf = ensemble.ExtraTreesClassifier(random_state=42)
gb_clf = ensemble.GradientBoostingClassifier(random_state=42)

# Hard-voting (majority vote) ensemble built from fresh instances of the same four
# model types, so it does not share state with the stand-alone models above.
voting_members = [
    ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
    ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
    ('ext_clf', ensemble.ExtraTreesClassifier(random_state=42)),
    ('gb_clf', ensemble.GradientBoostingClassifier(random_state=42)),
]
voting_classifier = ensemble.VotingClassifier(estimators=voting_members, voting='hard')

# Everything that gets evaluated in the following cells.
estimators = [rf_clf, dt_clf, ext_clf, gb_clf, voting_classifier]

# Performance on Training Data

In [None]:
# 5-fold cross-validation repeated 3 times (15 fits per estimator) on the training split.
# NOTE(review): RepeatedKFold does not stratify; for an imbalanced target
# RepeatedStratifiedKFold may be preferable. It is left unchanged here so the
# reported numbers stay comparable.
cv = ms.RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# All metrics are computed in ONE cross-validation run per estimator. The original
# called cross_val_score once per metric, refitting every fold three times. With the
# fixed random_state the folds are identical, so the scores are the same.
cv_metrics = ['accuracy', 'recall', 'f1']

for estimator in estimators:
    # Fit on the whole training split; the fitted estimator is reused later
    # for the validation-set evaluation.
    estimator.fit(X_train, y_train)

    cv_results = ms.cross_validate(estimator, X_train, y_train, cv=cv, n_jobs=-1, scoring=cv_metrics)
    cv_accuracy = cv_results['test_accuracy']
    cv_recall = cv_results['test_recall']
    cv_f1 = cv_results['test_f1']

    print(estimator.__class__.__name__)
    print(f'Avg Accuracy: {round(np.mean(cv_accuracy) * 100,2)}')
    print(f'Std Accuracy: {round(np.std(cv_accuracy) * 100,2)}')

    print(f'Avg Recall: {round(np.mean(cv_recall) * 100,2)}')
    print(f'Std Rcall: {round(np.std(cv_recall) * 100,2)}')

    print(f'Avg F1: {round(np.mean(cv_f1) * 100,2)}')
    print(f'Std F1: {round(np.std(cv_f1) * 100,2)}')
    print()

# Performance on Validation Data

In [None]:
# Score every (already fitted) estimator on the held-out validation split.
for estimator in estimators:
    prediction = estimator.predict(X_validation)

    # One report per estimator: class name, four metrics in percent, then a blank line.
    print(
        estimator.__class__.__name__,
        f'Accuracy score: {round(m.accuracy_score(y_validation, prediction) * 100,2)}',
        f'Precision score: {round(m.precision_score(y_validation, prediction) * 100,2)}',
        f'Recall score: {round(m.recall_score(y_validation, prediction) * 100,2)}',
        f'F1 score: {round(m.f1_score(y_validation, prediction) * 100,2)}',
        '',
        sep='\n',
    )

# New Voting Model and Performance on Training and Validation Data

In [None]:
# From a recall perspective, ExtraTreesClassifier does not seem to perform as well as the other models.
# Remove it from the VotingClassifier ensemble and check whether the ensemble's performance improves.

new_voting_classifier = ensemble.VotingClassifier([
                    ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
                    ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
                    ('gb_clf', ensemble.GradientBoostingClassifier(random_state=42))
                    ], voting='hard')

new_voting_classifier_list = [new_voting_classifier]

# Same CV scheme as the baseline evaluation: 5 folds repeated 3 times, fixed seed.
cv = ms.RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
# All metrics come from a single cross-validation run. The original ran
# cross_val_score once per metric, refitting every fold three times; the folds
# are fixed by random_state, so the scores are unchanged.
cv_metrics = ['accuracy', 'recall', 'f1']

print('Performance on Training Data')
print()
for estimator in new_voting_classifier_list:
    # Fit on the whole training split so the estimator can be scored on the validation split below.
    estimator.fit(X_train, y_train)

    cv_results = ms.cross_validate(estimator, X_train, y_train, cv=cv, n_jobs=-1, scoring=cv_metrics)
    cv_accuracy = cv_results['test_accuracy']
    cv_recall = cv_results['test_recall']
    cv_f1 = cv_results['test_f1']

    print(estimator.__class__.__name__)
    print(f'Avg Accuracy: {round(np.mean(cv_accuracy) * 100,2)}')
    print(f'Std Accuracy: {round(np.std(cv_accuracy) * 100,2)}')
    print(f'Avg Recall: {round(np.mean(cv_recall) * 100,2)}')
    print(f'Std Rcall: {round(np.std(cv_recall) * 100,2)}')
    print(f'Avg F1: {round(np.mean(cv_f1) * 100,2)}')
    print(f'Std F1: {round(np.std(cv_f1) * 100,2)}')
    print()

    # Hold-out evaluation on the validation split.
    print('Performance on Validation Dataset')
    prediction = estimator.predict(X_validation)
    print(f'Accuracy score: {round(m.accuracy_score(y_validation, prediction) * 100,2)}')
    print(f'Precision score: {round(m.precision_score(y_validation, prediction) * 100,2)}')
    print(f'Recall score: {round(m.recall_score(y_validation, prediction) * 100,2)}')
    print(f'F1 score: {round(m.f1_score(y_validation, prediction) * 100,2)}')

# Result Commentary on Models from Training and Validation Dataset
* While the recall of the new voting classifier on the validation dataset improves from 77.1% to 85.9%, it is still lower than that of the GB Classifier (86.9%)


* Based on the validation dataset, the GB Classifier seems to be the best-performing estimator
    * It has the highest recall score: 86.9% on the validation dataset and 83.8% on the training dataset


* Let's fine-tune the GB Classifier model

                                                             

| Models              	| Recall Score on Training Set 	| Recall Score on Validation Set 	|
|---------------------	|------------------------------	|--------------------------------	|
| RandomForest        	| 75.11                        	| 80.66                          	|
| DecisionTree        	| 79.12                        	| 81.31                          	|
| ExtraTrees          	| 58.07                        	| 63.28                          	|
| GradientBoosting    	| 83.79                        	| 86.89                          	|
| Old Voting Ensemble 	| 70.13                        	| 77.05                          	|
| New Voting Ensemble 	| 81.97                        	| 85.9                           	|

# Hyperparameter Tuning

In [None]:
# Show the current (default) hyperparameters of the gradient boosting model,
# as a reference for choosing which ones to tune.
gb_clf.get_params()

In [None]:
# The search space is a list of independent sub-grids. Each sub-grid is searched
# on its own, so parameters from different sub-grids are never combined.

# Boosting size and shrinkage.
_boosting_grid = dict(
    n_estimators=[50, 75, 100, 125, 150],
    learning_rate=np.arange(0.1, 1.0, 0.1),
)

# Cost-complexity pruning strength.
_pruning_grid = dict(ccp_alpha=np.arange(0.1, 1.0, 0.1))

# Leaf and split size constraints of the individual trees.
_leaf_grid = dict(
    max_leaf_nodes=[25, 50, 75, 100],
    min_samples_split=[25, 50, 75, 100],
    min_samples_leaf=[25, 50, 75, 100],
)

# Tree depth and number of features considered per split.
# NOTE(review): 36 presumably equals the total number of transformed features
# (same effect as None) - confirm against X_transform.shape.
_depth_grid = dict(
    max_depth=[3, 5, 10, 20, 25, 50, 75, 100],
    max_features=[None, 5, 10, 15, 20, 25, 36],
)

parameter_grid = [_boosting_grid, _pruning_grid, _leaf_grid, _depth_grid]

In [None]:
# Exhaustive search over the parameter grid, ranked by recall (the metric named in
# the objective), using 3-fold cross-validation on the training split.
# Train-fold scores are recorded as well (return_train_score=True).
gb_clf_grid_search_cv = ms.GridSearchCV(
    gb_clf,
    parameter_grid,
    scoring="recall",
    cv=3,
    return_train_score=True,
)
gb_clf_grid_search_cv.fit(X_train, y_train)

In [None]:
# Estimator configured with the best-scoring parameter combination found by the search.
gb_clf_grid_search_cv.best_estimator_

In [None]:
# Mean cross-validated recall (the search's scoring metric) of the best parameter combination.
gb_clf_grid_search_cv.best_score_

# Best Estimator Training and Performance on Test Data
* Best Estimator will be trained on the full training set
* We then proceed to test it on the testing data

In [None]:
# Take the winning configuration and refit it on train + validation combined.
# `fit` returns the estimator itself, so `best_gb_clf` is the same object as
# `gb_clf_grid_search_cv.best_estimator_`, now refitted in place.
best_gb_clf = gb_clf_grid_search_cv.best_estimator_.fit(X_train_val, y_train_val)
best_gb_clf

In [None]:
# Performance on testing data

training_prediction = best_gb_clf.predict(X_test)

print(m.accuracy_score(y_test, training_prediction) * 100)
print(m.precision_score(y_test, training_prediction) * 100)
print(m.recall_score(y_test, training_prediction) * 100)
print(m.f1_score(y_test, training_prediction) * 100)