### Our aim is to build a predictive model for the churn rate in a bank
Let's get started!

In [None]:
# import important libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os

In [None]:
# import our dataset for that bank for our research
directory_src = r'~/Desktop/AI-ML Requirments/Professional Track_ITIDA_Scholarship/Project_DSC_churn_prediction'
directory_dist = 'datasets/churnd'
file = 'churn.csv'

path = os.path.join(directory_src, file)
path_dist = os.path.join(directory_dist, file)
def load_data(path=path):
    """Read the churn CSV at ``path`` and keep a working copy under ``directory_dist``.

    Parameters
    ----------
    path : str
        Location of the source CSV; defaults to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        The data exactly as read from ``path``.
    """
    # exist_ok=True creates the folder when missing and is a no-op otherwise
    os.makedirs(directory_dist, exist_ok=True)
    df = pd.read_csv(path)
    # index=False keeps the pandas row index out of the saved copy, so reloading it
    # does not produce an extra unnamed column (same convention as the other exports
    # in this notebook)
    df.to_csv(path_dist, index=False)
    return df

In [None]:
df = load_data() # loading and saving are done!

In [None]:
# data investigations for gaining insights
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
# let's investigate the data numerical attributes
df.describe() # ~20.37% of the customers in our data exited.

In [None]:
df[df.Exited==1] # 20.37% = 2037 rows

### let's do some visualizations including: histograms, scatterplots for gaining insights.

In [None]:
df.hist(bins=25, figsize=(20, 15))
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

scatter_matrix(df[['Age', 'Balance',  'CreditScore', 'EstimatedSalary', 'Exited']], figsize=(15, 15))
plt.show()

### In my opinion, we are still not able to grasp the insights and correlations with the `Exited` attribute, so let's try another technique.

In [None]:
# let's split aour dataframe into numerical and categorical attributes for going in investigations in depth
df_num = df[list(df.describe())]
df_cat = df[['Surname', 'Geography', 'Gender']]

In [None]:
df_num.head()

In [None]:
df_cat.head()

In [None]:
# let's take a copy of our numerical attributes dataframe for some playing with it/feature engineering processes for investigations.
df_num_ = df_num.copy()

In [None]:
# let's visualize our linear correlation matrix for gaining insights
corr_mat = df_num_.corr()
sns.heatmap(corr_mat, linewidths=1.5)
corr_mat['Exited'].sort_values(ascending=False)

In [None]:
# we will focus on the Age, Balance attributes for gaining insights

### We replace the `Age` attribute with the `AgeBucket` instead to help our model detect the impact of each Age stratum on the churn rate

In [None]:
# we can see that the rate of churn for each startum of the age: has a very different impact on the churn rate.
df_num_['AgeBucket'] = df_num_.Age//20*(20)
print(df_num_[['AgeBucket', 'Exited']].groupby(['AgeBucket']).mean())
df_num_[['AgeBucket', 'Exited']].groupby(['AgeBucket']).mean().plot(kind='bar')
plt.grid()
plt.title('churn rate per age strata')
plt.ylabel('churn rate')
plt.xlabel('AgeBucket strata')
plt.legend(['Churn rate per age stratum'])
plt.show()

### We replace the `Balance` attribute with the `BalanceBucket` instead to help our model detect the impact of each Balance stratum on the churn rate¶

In [None]:
# we can see that the rate of churn for each startum of the balance: has a very different impact on the churn rate.
df_num_['BalanceBucket'] = df_num_.Balance//50000*(50000)
print(df_num_[['BalanceBucket', 'Exited']].groupby(['BalanceBucket']).mean())
df_num_[['BalanceBucket', 'Exited']].groupby(['BalanceBucket']).mean().plot(kind='bar')
plt.grid()
plt.title('churn rate per balance strata')
plt.ylabel('churn rate')
plt.xlabel('BalanceBucket strata')
plt.legend(['Churn rate per balance stratum'])
plt.show()

### We replace the `CreditScore` attribute with the `CrScoreBucket` instead to help our model detect the impact of each credit score stratum on the churn rate

In [None]:
df_num_.CreditScore.hist()

In [None]:
# we can see that the rate of churn for each startum of the credit score: has a very different impact on the churn rate.
df_num_['CrScoreBucket'] = df_num_.CreditScore//125*(125)
print(df_num_[['CrScoreBucket', 'Exited']].groupby(['CrScoreBucket']).mean())
df_num_[['CrScoreBucket', 'Exited']].groupby(['CrScoreBucket']).mean().plot(kind='bar')
plt.grid()
plt.title('churn rate per credit score strata')
plt.ylabel('churn rate')
plt.xlabel('Credit scor strata')
plt.legend(['Churn rate per Credit score stratum'])
plt.show()

### All customers in the bank with a credit score `less than 406` exited (23 of the 2037 churned customers)!

In [None]:
df[df.CreditScore<406]

In [None]:
df[df.CreditScore<405].Exited.value_counts().index.values, df[df.CreditScore<406].Exited.value_counts().index.values

### We replace the `EstimatedSalary` attribute with the `SalaryBucket` instead to help our model detect the impact of each Salary stratum on the churn rate¶

In [None]:
df.EstimatedSalary.hist()

In [None]:
df_num_['SalaryBucket'] = df_num_.EstimatedSalary//35000*(35000)
print(df_num_[['SalaryBucket', 'Exited']].groupby(['SalaryBucket']).mean())
df_num_[['SalaryBucket', 'Exited']].groupby(['SalaryBucket']).mean().plot(kind='bar')
plt.grid()
plt.title('churn rate per salary strata')
plt.ylabel('churn rate')
plt.xlabel('salary strata')
plt.legend(['Churn rate per salary stratum'])
plt.show()

In [None]:
df[df.EstimatedSalary<250000]['Exited'].value_counts()

#### With respect to the numerical continous attributes, we deduced that we should replace them with their corresponding categorical versions to help our model detect the pattern in our data very well

#### With respect to the numerical categorical attributes, they will be with no changes as they are well prepared with no missing values

### let's try combining some attributes to check their impact on the target attribute `Exited`

In [None]:
# Balance/Tenure
df_num_['BalancePerTenure'] = df_num_.Balance/df_num_.Tenure

In [None]:
# Tenure/Age: both operands come from the exploration frame df_num_
# (df_ is only created later, after the train/test split)
df_num_['TenurePerAge'] = df_num_.Tenure/df_num_.Age

In [None]:
df_num_[['BalancePerTenure', 'Exited']].groupby(['BalancePerTenure']).mean()

### Great Job! We could detect a strong variability in the `Exited` i.e. churn rate for each strata of the Balance Per Tenure attribute

In [None]:
# Bucket the Balance/Tenure ratio before grouping; the bucket column has to be
# created here because nothing earlier in the notebook defines it.
BAL_PER_TEN_STEP = 25000  # bucket width, an exploratory choice
# Tenure == 0 produces inf (or NaN for a zero balance); treat both as missing so
# those rows are left out of the grouping instead of forming a bogus bucket.
bal_per_tenure = df_num_.BalancePerTenure.replace([np.inf, -np.inf], np.nan)
df_num_['BalPerTenBucket'] = bal_per_tenure//BAL_PER_TEN_STEP*(BAL_PER_TEN_STEP)

bal_per_ten_rates = df_num_[['BalPerTenBucket', 'Exited']].groupby(['BalPerTenBucket']).mean()
print(bal_per_ten_rates)
bal_per_ten_rates.plot(kind='bar')
plt.grid()
plt.title('churn rate per Balance per tenure strata')
plt.ylabel('churn rate')
plt.xlabel('Balance per tenure strata')
plt.legend(['Churn rate per Balance per tenure stratum'])
plt.show()

In [None]:
# EstimatedSalary/Tenure
df_num_['SalaryPerTenure'] = df_num_.EstimatedSalary/df_num_.Tenure
df_num_['SalPerTenBucket'] = df_num_.SalaryPerTenure//35450*(35450)
print(df_num_[['SalPerTenBucket', 'Exited']].groupby(['SalPerTenBucket']).mean())
df_num_[['SalPerTenBucket', 'Exited']].groupby(['SalPerTenBucket']).mean().plot(kind='bar')
plt.grid()
plt.title('churn rate per Salary per tenure strata')
plt.ylabel('churn rate')
plt.xlabel('Salary per tenure strata')
plt.legend(['Churn rate per Salary per tenure stratum'])
plt.show()

In [None]:
# let's try the balance/salary attribute
df_num_['BalancePerSalary'] = df_num_.Balance/df_num_.EstimatedSalary

In [None]:
df_num_.head()

### let's try the combination of the credit/numOfproduts attribute

In [None]:
df_num_['CreditPerProducts'] = df_num_.CreditScore/df_num_.NumOfProducts

In [None]:
list(df_num)

In [None]:
list(df_cat)

In [None]:
df_cat.Geography.value_counts()

In [None]:
df_cat.Gender.value_counts()

In [None]:
df[df.Exited==1]['Surname'].value_counts()[:10]

### Since the `Age` attribute has the highest impact on the `Exited` (target) attribute, I am going to split the dataset into train and test sets stratified on this attribute, so that every stratum of this attribute is represented in the test set as it is in the train set, for a better evaluation of the model

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
train_idx, test_idx = next(splitter.split(df, df_num_.AgeBucket))
# split() yields positional indices, so select with .iloc (label-based .loc only
# matches while df keeps its default 0..n-1 index). .copy() gives independent
# frames that can receive new columns without chained-assignment warnings.
train_df = df.iloc[train_idx].copy()
test_df = df.iloc[test_idx].copy()
train_df.shape, test_df.shape

I need to check that our test dataset has the same distribution of `AgeBucket` values as the training dataset, so that the evaluation does not suffer from `sampling bias`.

I am going to visualize the histogram for both the AgeBucket in both the train, test datasets

In [None]:
# create the AgeBucket attribute in both the train, test datasets
train_df['AgeBucket'] = train_df.Age//15*(15)
test_df['AgeBucket'] = test_df.Age//15*(15)

In [None]:
train_df.AgeBucket.hist()

In [None]:
test_df.AgeBucket.hist() # identical histograms

### It's the time to build our transformers to preprocess our training dataset for the machine learning algorithm as we mentioned in the above cell

In [None]:
df_ = train_df.copy()

### The basic steps for preprocessing implemented using a pipeline for automation:
**the following is with respect to numerical (continous and discrete) attributes.**
1. replace `Age` with `AgeBucket`
2. replace `Balance` with `BalanceBucket`
3. replace `CreditScore` with `CrScoreBucket`
4. create `TenurePerAge` attribute
5. create `BalancePerSalary` attribute
6. create `CreditPerProducts` attribute
7. drop `RowNumber`, `CustomerId`  attributes

**the following is with respect to categorical attributes.**
1. create `ExitedNameRatio` attribute instead of the `Surname` attribute
2. encode `Gender`, `Geography` using the OneHotEncoder

In [None]:
df_.Tenure/df_.Age

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Column positions inside the frame handed to create_buckets (order of attr_list).
idx_crscore, idx_age, idx_tenure, idx_balance, idx_products, idx_estsalary = 0, 1, 2, 3, 4, 7

class create_buckets(BaseEstimator, TransformerMixin):
      """Bucket Age/Balance/CreditScore in place and append three ratio features.

      The input columns are addressed by the module-level ``idx_*`` positions.
      The output keeps every input column except the last one and appends
      Tenure/Age, Balance/EstimatedSalary and CreditScore/NumOfProducts.
      """
      def fit(self, X, y=None):
          # stateless transformer: nothing to learn
          return self
      def transform(self, X, y=None):
          # Work on a private float copy: X.values may be a view of the caller's
          # frame, and the bucket assignments below would then overwrite it.
          # This also lets a plain 2d array be passed instead of a DataFrame.
          data_array = np.array(X, dtype=float)

          # bucketed versions of the continuous attributes
          ageBucket = data_array[:, idx_age]//15*(15)
          balanceBucket = data_array[:, idx_balance]//50000*(50000)
          crscoreBucket = data_array[:, idx_crscore]//125*(125)
          # ratio features are computed from the raw values, before replacement
          tenureperage =  data_array[:, idx_tenure]/data_array[:, idx_age]
          balpersal = data_array[:, idx_balance]/data_array[:, idx_estsalary]
          creditperproduct = data_array[:, idx_crscore]/data_array[:, idx_products]

          # replace the raw attributes with their buckets
          data_array[:, idx_age] = ageBucket
          data_array[:, idx_balance] = balanceBucket
          data_array[:, idx_crscore] = crscoreBucket
          # [:, :-1] drops the last input column before the new features are appended
          return np.c_[data_array[:, :-1], tenureperage, balpersal, creditperproduct]

In [None]:
obj = create_buckets()
z = obj.fit_transform(df_)

In [None]:
z[10:20]

In [None]:
z.shape #Done!

In [None]:
'Akubundu' in df['Surname'].values

In [None]:
df_.Surname

In [None]:
idx_surname = 0

def get_ratio(series_surname, series_exited, series_not_exited):
    """Return, for every surname, the share of its occurrences that exited.

    Parameters
    ----------
    series_surname : pandas.Series
        One surname per row.
    series_exited, series_not_exited : pandas.Series
        Counts indexed by surname (e.g. ``value_counts()`` output) for rows
        that exited / did not exit. A surname missing from one of them counts
        as zero there.

    Returns
    -------
    list of float
        ``exited / (exited + not_exited)`` per row of ``series_surname``, in
        order. A surname present in neither count series yields ``nan``
        instead of raising ZeroDivisionError.
    """
    # vectorised lookups; names absent from a count series map to NaN -> 0
    exited = series_surname.map(series_exited).fillna(0).astype(float)
    not_exited = series_surname.map(series_not_exited).fillna(0).astype(float)
    total = exited + not_exited
    # keep the ratio only where the name was seen at least once
    ratio = exited.div(total).where(total > 0)
    return ratio.tolist()

class count_exited(BaseEstimator, TransformerMixin):
      """Replace the ``Surname`` column with the exit ratio learned for that surname.

      ``fit`` learns one ratio per surname from a frame holding ``Surname`` and
      ``Exited``; ``transform`` looks the ratio up by surname, so it can be
      applied to a frame other than the one used for fitting. Surnames not seen
      during ``fit`` receive the overall exit rate of the fit data.

      NOTE(review): on the rows used for fitting, each ratio still includes that
      row's own ``Exited`` label, so the feature carries target information.
      """
      def fit(self, X, y=None):
          # share of exited customers per surname = exited / (exited + not exited)
          self.ratio_by_surname_ = X.groupby('Surname')['Exited'].mean()
          # fallback for surnames that were not present at fit time
          self.default_ratio_ = float(X['Exited'].mean())
          # per-row ratios of the fit data, kept for code that reads this attribute
          self.ratio_exited = X['Surname'].map(self.ratio_by_surname_).tolist()
          return self
      def transform(self, X, y=None):
          data_ = X.values.copy()
          ratios_ = (
              X['Surname']
              .map(self.ratio_by_surname_)
              .fillna(self.default_ratio_)
              .to_numpy(dtype=float)
          )
          # we replace the surname attribute with the ratio of exited for that surname;
          # the remaining columns (the target) are passed through unchanged
          data_[:, X.columns.get_loc('Surname')] = ratios_

          return data_

In [None]:
list(df_)

In [None]:
obj = count_exited()
z = obj.fit_transform(df_)

In [None]:
z.shape

In [None]:
df_.Surname.value_counts()['Andrews']

In [None]:
# build the mask from df_ itself; a mask taken from the full frame df has a
# different index and only works through implicit re-alignment
temp = df_[df_.Exited==1]

In [None]:
temp.Surname.value_counts()['Fanucci']

In [None]:
temp2 = df_[df_.Exited==0]

In [None]:
temp2.Surname.value_counts()['Fanucci']

In [None]:
z[21:30]

In [None]:
df_[df_.Exited==0]['Surname'].value_counts()[20:]

In [None]:
x = np.array([90, 80, 56, 78, 90, 80, 56, 78, 90, 80, 56, 78]).reshape(3, 4)

In [None]:
x

In [None]:
x[:, 2:]

### Combine the output of the numerical and categorical attributes
Using the column transformer library.

idx_crscore, idx_age, idx_tenure, idx_balance, idx_products, idx_estsalary = 0, 1, 2, 3, 4, 7
idx_surname = 0

In [None]:
from sklearn.compose import ColumnTransformer
# OneHotEncoder is used below and was not imported anywhere in the notebook
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# order matters: create_buckets addresses these columns by position
attr_list = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# each entry is (step name, transformer, columns it receives); the outputs are
# concatenated column-wise in this order
full_pipe = ColumnTransformer([
    ('count_excited', count_exited(), ['Surname', 'Exited']),
    ('num_pipe', create_buckets(), attr_list),
    ('scaler', StandardScaler(), ['EstimatedSalary']),
    ('encode_geo_gender', OneHotEncoder(), ['Geography', 'Gender'])
])

In [None]:
df_ = train_df.copy()

In [None]:
# extracting the training labels
y_train = df_.Exited.copy()

In [None]:
final_data = full_pipe.fit_transform(df_)

### Great Job ! We could transform our training dataframe `sucessfully` ;).

In [None]:
final_data.shape

In [None]:
final_data[8000:8020] #1, 7

In [None]:
df_.iloc[8999]

### Great Transformation Job is done here... !

#### We aim to create our transformed dataframe on the training dataset and to save it for furture researches

In [None]:
list_transformed_attrs = ['SurnamePercent', 'Exited', 'CreditCardBucket', 'AgeBucket', 'Tenure', 'BalanceBucket', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'TenurePerAge', 'BalancePerSalary', 'CreditPerProduct', 'EstSalaryScaled', 'Geo.France', 'Geo.Germany', 'Geo.Spain', 'Gen.Female', 'Gen.Male']

len(list_transformed_attrs)

In [None]:
train_df_transformed = pd.DataFrame(final_data, columns=list_transformed_attrs)

In [None]:
# let's see it's structure!
train_df_transformed.head()

In [None]:
train_df_transformed.tail()

**Great!**

In [None]:
# save this dataframe
train_df_transformed.to_csv('datasets/churnd/churn_train_transformed.csv', index=False)

### Let's start to train our model
I am going to try/fine-tune the following models for our Churn Rate Problem: `Binary Classification 1: Churned, 0: Didn't Churn`
1. `Stochastic Gradient Descent Classifier`
2. `Support Vector Machine Classifier`
3. `Decision Tree Classifier`
4. `Random Forest Classifier`

Let's Get Started!

In [None]:
# We aim here to remove the target labels away from our final training 2d-numpy array for training.
X_train = np.c_[final_data[:, 0], final_data[:, 2:]]

In [None]:
X_train.shape, final_data.shape

In [None]:
from sklearn.linear_model import SGDClassifier

model_SGD = SGDClassifier()

In [None]:
model_SGD.fit(X_train, y_train)

In [None]:
# evaluate model prediction on the training data
SGD_train_pred = model_SGD.predict(X_train)

In [None]:
y_train[:10].values

In [None]:
SGD_train_pred[:10]

**It looks that the stochastic gradient descent classifier is `underfitting` the training dataset with overall accuracy = 63.62%**
We will need to use a more powerful model for a better prediction.

In [None]:
# let's evaluate the overall model accuracy on the training dataset
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, SGD_train_pred)
train_accuracy*100

In [None]:
# let's evaluate the model accuracy on multiple training rounds
from sklearn.model_selection import cross_val_score

cross_SGD_accuracy = cross_val_score(model_SGD, X_train, y_train, cv=10, scoring='accuracy')
(cross_SGD_accuracy.mean())*100

In [None]:
# let's investigate the model confusion matrix on training dataset
from sklearn.metrics import confusion_matrix
conf_SGD = confusion_matrix(y_train, SGD_train_pred)
conf_SGD

In [None]:
from sklearn.model_selection import cross_val_predict

cross_SGD_pred = cross_val_predict(model_SGD, X_train, y_train, cv=3)

In [None]:
# let's investigate the model confusion matrix on multiple rounds of training
from sklearn.metrics import confusion_matrix
conf_SGD = confusion_matrix(y_train, cross_SGD_pred)
conf_SGD

In [None]:
# let's see the model precision/recall/f1 metrics
from sklearn.metrics import precision_score, recall_score, f1_score

precision_score(y_train, cross_SGD_pred), recall_score(y_train, cross_SGD_pred), f1_score(y_train, cross_SGD_pred)

In [None]:
# optimal classifier is:
conf_SGD_ = confusion_matrix(y_train, y_train)
conf_SGD_

In [None]:
# let's investigate the SGradientDescent classifier Precision/Recall plot, ROC curve
from sklearn.metrics import precision_recall_curve

cross_SGD_score = cross_val_predict(model_SGD, X_train, y_train, cv=3, method='decision_function')
prec_sgd, recall_sgd, thre_sgd = precision_recall_curve(y_train, cross_SGD_score)

In [None]:
def precision_recall_curve_plot(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    Parameters
    ----------
    precisions, recalls : array-like
        Curves to draw; their last element is dropped so that they line up
        with ``thresholds``.
    thresholds : array-like
        Threshold values used as the x axis.
    """
    # The figure has to be created before drawing: calling plt.figure() after the
    # plot calls opens a second, empty figure and leaves the curves at default size.
    plt.figure(figsize=(25, 10))
    # plot() is used for continous values
    plt.plot(thresholds, precisions[:-1], 'r--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'b-', label='recall')
    plt.xlabel('Threshold')
    plt.legend()
    plt.grid()
    plt.title('Precision - Recall Curve vs Threshold')
    plt.show()

In [None]:
precision_recall_curve_plot(prec_sgd, recall_sgd, thre_sgd)

In [None]:
def precision_recall_direct_plot(recalls, precisions):
    """Plot precision (y axis) against recall (x axis).

    Note the parameter order: ``recalls`` comes first, ``precisions`` second.
    The last element of each array is dropped before plotting.
    """
    plt.plot(recalls[:-1], precisions[:-1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid()
    plt.legend(['Recall-Precision'])
    # title typo fixed ('Precisin' -> 'Precision')
    plt.title('Precision - Recall Curve Directly')
    plt.show()

In [None]:
# the function is declared as (recalls, precisions): pass recall first so the
# axes match their labels
precision_recall_direct_plot(recall_sgd, prec_sgd)

In [None]:
# let's investigate the model roc curve
from sklearn.metrics import roc_curve
fpr_sgd, tpr_sgd, thre_sgd_roc = roc_curve(y_train, cross_SGD_score)

In [None]:
def roc_curve_plot(FPR, TPR):
    """Draw the ROC curve (TPR against FPR) together with the chance diagonal."""
    # the classifier's curve
    plt.plot(FPR, TPR, 'r-', label='FPR : TPR')
    # reference line from (0, 0) to (1, 1)
    diagonal_x, diagonal_y = [0, 1], [0, 1]
    plt.plot(diagonal_x, diagonal_y, 'b--', label='diagonal')
    plt.legend()
    plt.grid()
    plt.title('ROC curve of TPRate, FPRate')
    plt.xlabel('False Postive Rate')
    plt.ylabel('True Postive Rate')
    plt.show()

In [None]:
roc_curve_plot(fpr_sgd, tpr_sgd)

the stochastic gradient descent classifier is very poor on the training dataset and is considered as a very weak model

In [None]:
# area under the roc curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train, cross_SGD_score)

In [None]:
# let's try to train our support vector machine model
from sklearn.svm import SVC

model_SVC = SVC()

In [None]:
model_SVC.fit(X_train, y_train)

Evaluate the SVC model on the training dataset

In [None]:
SVC_train_pred = model_SVC.predict(X_train)
# display the SVC predictions (the SGD predictions were shown here by mistake)
SVC_train_pred[:10]

In [None]:
y_train[:10].values

In [None]:
# evaluate the model accuracy on training dataset
accuracy_score(y_train, SVC_train_pred)*100

In [None]:
# evaluate the confusion matrx on trainig dataset
confusion_matrix(y_train, SVC_train_pred)

### Much better results

In [None]:
# evaluate on multiple rounds of trainig
cross_SVC_score = cross_val_score(SVC(), X_train, y_train, cv=10, scoring='accuracy')
(cross_SVC_score.mean())*100

In [None]:
# evaluate the accuracy on multiple trainig rounds
cross_SVC_pred = cross_val_predict(SVC(), X_train, y_train, cv=10)
accuracy_score(y_train, cross_SVC_pred)*100

In [None]:
confusion_matrix(y_train, cross_SVC_pred)

In [None]:
# out-of-fold decision scores for the SVC, used as ranking scores for the curves
cross_SVC_score = cross_val_predict(SVC(), X_train, y_train, cv=10, method='decision_function')
prec_svc, recall_svc, thre_svc = precision_recall_curve(y_train, cross_SVC_score)
precision_recall_curve_plot(prec_svc, recall_svc, thre_svc)
# precision_recall_direct_plot is declared as (recalls, precisions)
precision_recall_direct_plot(recall_svc, prec_svc)

**We got an overall poor performance as well with the support vector machine classifier**

In [None]:
# evaluate the roc curve
fpr_svc, tpr_svc, thre_svc_roc = roc_curve(y_train, cross_SVC_score)
roc_curve_plot(fpr_svc, tpr_svc)
roc_auc_score(y_train, cross_SVC_score)

In [None]:
# let's evaluate the decision tree classifier
from sklearn.tree import DecisionTreeClassifier

model_DTC = DecisionTreeClassifier()

In [None]:
model_DTC.fit(X_train, y_train)

In [None]:
DTC_train_pred = model_DTC.predict(X_train)

In [None]:
y_train.values[:10]

In [None]:
DTC_train_pred[:10]

### For sure, our decision tree classifier reached accuracy = 100% on training dataset indicating **Overfitting** problem

In [None]:
accuracy_score(y_train, DTC_train_pred)*100

In [None]:
cross_DTC_train = cross_val_score(model_DTC, X_train, y_train, cv=10, scoring='accuracy')
(cross_DTC_train.mean())*100

In [None]:
cross_DTC_pred = cross_val_predict(model_DTC, X_train, y_train, cv=10)
cross_DTC_pred

In [None]:
# evaluate the confusion matrix of multiple training rounds
confusion_matrix(y_train, cross_DTC_pred)

In [None]:
precision_score(y_train, cross_DTC_pred), recall_score(y_train, cross_DTC_pred), f1_score(y_train, cross_DTC_pred)

In [None]:
model_DTC_ = DecisionTreeClassifier(max_depth=7)
cross_DTC_pred = cross_val_predict(model_DTC_, X_train, y_train, cv=10, method='predict_proba')
cross_DTC_pred[:10]

In [None]:
cross_DTC_score = cross_DTC_pred[:, 1] # labels of the +ve class only.`
prec_dtc, recall_dtc, thre_dtc = precision_recall_curve(y_train, cross_DTC_score)

In [None]:
precision_recall_curve_plot(prec_dtc, recall_dtc, thre_dtc)
# precision_recall_direct_plot is declared as (recalls, precisions)
precision_recall_direct_plot(recall_dtc, prec_dtc)

In [None]:
fpr_dtc, tpr_dtc, thre_dtc_roc = roc_curve(y_train, cross_DTC_score)
roc_curve_plot(fpr_dtc, tpr_dtc)
roc_auc_score(y_train, cross_DTC_score)

In [None]:
# let's try the random forest classifier
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()

In [None]:
model_RFC.fit(X_train, y_train)

In [None]:
RFC_train_pred = model_RFC.predict(X_train)
RFC_train_pred[:10]

In [None]:
y_train.values[:10]

In [None]:
# accuracy on multiple training processes
cross_RFC_train_score = cross_val_score(model_RFC, X_train, y_train, cv=10, scoring='accuracy')
(cross_RFC_train_score.mean())*100

In [None]:
# confusion matrix for Random forest classifier
cross_RFC_pred = cross_val_predict(model_RFC, X_train, y_train, cv=5)
confusion_matrix(y_train, cross_RFC_pred)

In [None]:
cross_RFC_pred

In [None]:
precision_score(y_train, cross_RFC_pred), recall_score(y_train, cross_RFC_pred), f1_score(y_train, cross_RFC_pred)

In [None]:
cross_RFC_score = cross_val_predict(model_RFC, X_train, y_train, cv=10, method='predict_proba')
cross_RFC_score # we got the probability for each customer to be within which class either: churned or didn't churn

In [None]:
# let's evaluate the model performance with the precision/recall metrics
#  get the +ve class probability only as the score
# column 1 holds the probability of the positive (churned) class
cross_RFC_score = cross_RFC_score[:, 1]
prec_rfc, recall_rfc, thre_rfc = precision_recall_curve(y_train, cross_RFC_score)
precision_recall_curve_plot(prec_rfc, recall_rfc, thre_rfc)
# precision_recall_direct_plot is declared as (recalls, precisions)
precision_recall_direct_plot(recall_rfc, prec_rfc)

In [None]:
fpr_rfc, tpr_rfc, thre_rfc_roc = roc_curve(y_train, cross_RFC_score)
roc_curve_plot(fpr_rfc, tpr_rfc)
roc_auc_score(y_train, cross_RFC_score)

We concluded that the **`random forest classifier`** is the most promising model to be used.
With **`AUC` = 92.37%**

Let's `fine-tune` our model

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = [
    {'n_estimators': randint(low=1, high=200), 'max_features':randint(low=1, high=17), 'max_depth': randint(low=1, high=50)}
]

model_RFC_randcv = RandomizedSearchCV(model_RFC, param_dist, cv=3, random_state=42, return_train_score=True, verbose=2)

In [None]:
model_RFC_randcv.fit(X_train, y_train)

In [None]:
# best model, its mean cross-validated score, and the sampled hyper-parameters
# that produced it (the original line ended in a dangling attribute access)
model_RFC_randcv.best_estimator_, model_RFC_randcv.best_score_, model_RFC_randcv.best_params_

In [None]:
cross_RFC_pred = cross_val_predict(model_RFC_randcv.best_estimator_, X_train, y_train, cv=10)

In [None]:
confusion_matrix(y_train, cross_RFC_pred)

In [None]:
cross_RFC_score = cross_val_predict(model_RFC_randcv.best_estimator_, X_train, y_train, cv=10, method='predict_proba')
cross_RFC_score = cross_RFC_score[:, 1] # the proba of the +ve class to be used as a score

In [None]:
# let's evaluate the model performance with the precision/recall metrics
#  get the +ve class probability only as the score
prec_rfc, recall_rfc, thre_rfc = precision_recall_curve(y_train, cross_RFC_score)
precision_recall_curve_plot(prec_rfc, recall_rfc, thre_rfc)
# precision_recall_direct_plot is declared as (recalls, precisions)
precision_recall_direct_plot(recall_rfc, prec_rfc)

In [None]:
fpr_rfc, tpr_rfc, thre_rfc_roc = roc_curve(y_train, cross_RFC_score)
roc_curve_plot(fpr_rfc, tpr_rfc)
roc_auc_score(y_train, cross_RFC_score)

In [None]:
model_RFC_randcv.best_estimator_ # Done!

### Let's evaluate the model performance on the testing dataset


In [None]:
# NOTE(review): fit_transform re-fits every step of full_pipe on the test rows
# (surname ratios, scaler statistics, one-hot categories), so the test features
# are derived from the test data itself, including its `Exited` labels.
# Fitting on the training frame and calling only transform() here would avoid
# that, but count_exited.transform as written reuses the per-row ratios stored
# during fit, so it cannot be applied to a frame of a different length until
# that transformer is changed.
final_test_data = full_pipe.fit_transform(test_df)

In [None]:
final_test_data.shape

In [None]:
final_test_data[-10:]

In [None]:
# let's drop the target labels
X_test = np.c_[final_test_data[:, 0], final_test_data[:, 2:]]
y_test = test_df.Exited.copy()

In [None]:
rfc_final_pred = model_RFC_randcv.best_estimator_.predict(X_test)
confusion_matrix(y_test, rfc_final_pred)

In [None]:
precision_score(y_test, rfc_final_pred), recall_score(y_test, rfc_final_pred), f1_score(y_test, rfc_final_pred)

In [None]:
rfc_final_score = model_RFC_randcv.best_estimator_.predict_proba(X_test)

In [None]:
rfc_final_score = rfc_final_score[:, 1] # we take the +ve class probability as the score

In [None]:
prec_rfc, recall_rfc, thre_rfc = precision_recall_curve(y_test, rfc_final_score)
precision_recall_curve_plot(prec_rfc, recall_rfc, thre_rfc)
# precision_recall_direct_plot is declared as (recalls, precisions)
precision_recall_direct_plot(recall_rfc, prec_rfc)

In [None]:
fpr_rfc, tpr_rfc, thre_rfc_roc = roc_curve(y_test, rfc_final_score)
roc_curve_plot(fpr_rfc, tpr_rfc)
roc_auc_score(y_test, rfc_final_score)

In [None]:
accuracy_score(y_test, rfc_final_pred)*100

### This is the end of our classification Journey  ;)

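Final results on the testing dataset after fine-tuning: <br>
1. `Precision: 93.23%`<br>
2. `Recall:    86.06%`<br>
3. `accuracy:  95.8%`<br>

**Caveat:** these figures are optimistic. The preprocessing pipeline was re-fitted on the test set (`full_pipe.fit_transform(test_df)`), and the surname-ratio feature is computed from the `Exited` labels of the very rows being scored, so the model receives information derived from the target.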

### Saving our final model

In [None]:
import joblib
joblib.dump(model_RFC_randcv.best_estimator_, 'final_RFC_fine_tuned.pkl')

### Saving our final transformed testing dataset

In [None]:
test_df_transformed = pd.DataFrame(final_test_data, columns=list_transformed_attrs)
test_df_transformed.head()

In [None]:
test_df_transformed.to_csv('datasets/churnd/test_df_transformed.csv', index=False)