### Introduction

This IBM Sample Dataset has information about Telco customers and whether they left the company within the last month (churn). Each row represents a unique customer, while the columns contain information about the customer’s services, account and demographic data. We will be using Python and the Seaborn library to plot and analyze the data.

The dataset has 7043 rows and 21 columns, 19 of which are features.

#### Basic Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance

#### Churn Data

In [None]:
df =pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
df.info()

In [None]:
df.shape

In [None]:
X = df.drop(['customerID','Churn'],axis=1)
Y = df['Churn']

In [None]:
plt.figure(figsize=(8,8))
df.Churn.value_counts().plot(kind='pie',autopct='%1.1f%%',explode=(0.05,0.05))
plt.title('Target Variable "Churn"')

#### Missing Values

In [None]:
df.isnull().sum()

### Feature Engineering

##### Categorical Features

In [None]:
fig, (ax) = plt.subplots(8, 2, figsize=(15,60))

feat = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod']]

# Panel title for every categorical column, listed in plotting order
# (the 8x2 grid is filled row by row).
pie_titles = {
    'gender': 'Gender',
    'SeniorCitizen': 'Senior Citizen',
    'Partner': 'Partner',
    'Dependents': 'Dependent',
    'PhoneService': 'PhoneService',
    'MultipleLines': 'MultipleLines',
    'InternetService': 'InternetServices',
    'OnlineSecurity': 'OnlineSecurity',
    'OnlineBackup': 'OnlineBackup',
    'DeviceProtection': 'DeviceProtection',
    'TechSupport': 'TechSupport',
    'StreamingTV': 'StreamingTV',
    'StreamingMovies': 'StremingMovie',
    'Contract': 'Contract',
    'PaperlessBilling': 'PaperBilling',
    'PaymentMethod': 'PaymentMethod',
}

for panel, (column, title) in zip(ax.flat, pie_titles.items()):
    counts = feat[column].value_counts()
    # One explode offset per wedge, sized from the data so the call keeps
    # working whatever the number of categories in the column.
    counts.plot(kind='pie', autopct='%1.1f%%',
                explode=(0.05,) * len(counts), ax=panel)
    panel.set_title(title)

plt.suptitle(
    "Categorical Features", fontweight="bold")
plt.show()

In [None]:
feat = feat.astype('category')

In [None]:
categorical_features = pd.get_dummies(feat)

##### Numerical features

In [None]:
X.tenure = df.tenure.astype(int)

In [None]:
X.TotalCharges = pd.to_numeric(X.TotalCharges, errors='coerce')

In [None]:
numerical_features = X[['tenure','MonthlyCharges','TotalCharges']]

In [None]:
## Tenure
sns.violinplot(x=X.tenure, y=Y)
plt.title('Tenure Distribution')

In [None]:
## Monthly Charges
sns.violinplot(x=X.MonthlyCharges, y=Y)
plt.title('Monthly Charges Distribution')

In [None]:
## Total Charges
sns.violinplot(x=X.TotalCharges, y=Y)
plt.title('Total Charges Distribution')

### ReCreating Data

In [None]:
# Combine the one-hot encoded categorical columns with the numeric columns.
# pd.concat builds a new frame, so `categorical_features` is not modified as a
# side effect (a plain `dat = categorical_features` would only alias it).
dat = pd.concat([categorical_features, numerical_features], axis=1)

In [None]:
dat.shape

##### Correlation

In [None]:
labelencoder = LabelEncoder()
# `daf` is a second name for `dat`, not a copy: the 'Y' column added below is
# therefore also visible on `dat`, which the "Sorting Dataset" cells depend on
# (they call dat.drop(['Y'], axis=1) and read dat.Y).
daf = dat
daf['Y'] = Y
# Replace the Churn labels with the integer codes produced by LabelEncoder.
daf.Y = labelencoder.fit_transform(daf.Y)

In [None]:
colormap = plt.cm.RdBu
plt.figure(figsize=(40,35))
plt.title('Correlation', y=1.025, size=30)
sns.heatmap(daf.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
colormap = plt.cm.BrBG
plt.figure(figsize=(14,12))
plt.title('Correlation', y=1.025, size=20)
sns.heatmap(daf[['OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes','Y']].astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

##### Outlier Detection

In [None]:
# Monthly Charges
sns.boxplot(x=X.MonthlyCharges, y=Y)
plt.title('Monthly Charges Distribution')

In [None]:
#Tenure
sns.boxplot(x=X.tenure, y=Y)
plt.title('Tenure Distribution')

In [None]:
dat.tenure[dat.tenure > 70].value_counts()

In [None]:
# Total Charges
sns.boxplot(x=X.TotalCharges, y=Y)
plt.title('Total Charges Distribution')

#### Sorting Dataset

In [None]:
data_X = dat.drop(['Y'],axis=1) 
data_X.shape

In [None]:
data_Y = dat.Y

##### Missing Values

In [None]:
total = data_X.isnull().sum().sort_values(ascending=False) # missing values analysis
percent = (data_X.isnull().sum()/data_X.isnull().count()).sort_values(ascending=False) # percentage of missing values 
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(5)

In [None]:
# Fill the TotalCharges values that could not be parsed as numbers with the
# column median. Assign the result back to the column: an in-place fillna on
# `data_X.TotalCharges` acts on an intermediate Series and is not guaranteed to
# update `data_X` (it has no effect under pandas Copy-on-Write).
data_X['TotalCharges'] = data_X['TotalCharges'].fillna(data_X['TotalCharges'].median())

In [None]:
data_X.TotalCharges.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size = 0.2) 

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X__train = scaler.fit_transform(X_train)
X__test = scaler.transform(X_test)

##### PFI (Permutation Feature Importance)

##### KNN total feature PFI

In [None]:
plt.figure(figsize=(12,8))
model = KNeighborsClassifier()
model.fit(X__train, Y_train)
results = permutation_importance(model, X__train, Y_train, scoring='accuracy')
importance = results.importances_mean

for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

plt.bar([x for x in range(len(importance))], importance)
plt.show()

##### KNN Numerical Feature PFI

In [None]:
model = KNeighborsClassifier()
model.fit(data_X[['tenure','MonthlyCharges','TotalCharges',]], data_Y)
results = permutation_importance(model, data_X[['tenure','MonthlyCharges','TotalCharges']], data_Y, scoring='accuracy')
importance = results.importances_mean

for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

plt.bar([x for x in range(len(importance))], importance)
plt.show()

##### KNN Label Encoded Feature PFI

In [None]:
# Label-encode every categorical column into a separate frame. Working on a
# copy keeps `feat` unchanged, so re-running this cell always iterates over the
# original categorical columns only.
data_le = feat.copy()
for cols in feat:
    data_le[cols] = labelencoder.fit_transform(feat[cols])

# Append the numeric columns taken from data_X.
for num_col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    data_le[num_col] = data_X[num_col]

In [None]:
colormap = plt.cm.RdBu
plt.figure(figsize=(18,16))
plt.title('Correlation', y=1.025, size=30)
sns.heatmap(data_le.astype(float).corr(),linewidths=0.1,vmax=0.5, 
            square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
X_tr, X_t, Y_tr, Y_t = train_test_split(data_le, data_Y, test_size = 0.2)

X__tr = scaler.fit_transform(X_tr)
X__t = scaler.transform(X_t)

In [None]:
plt.figure(figsize=(12,8))
model = KNeighborsClassifier()
model.fit(X__tr, Y_tr)
results = permutation_importance(model, X__tr, Y_tr, scoring='accuracy')
importance = results.importances_mean

for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

plt.bar([x for x in range(len(importance))], importance)
plt.show()

##### GradientBoost Label Encoded Feature PFI

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier

classifier = GradientBoostingClassifier()
classifier.fit(X__tr, Y_tr)

In [None]:
classifier.feature_importances_

##### Multi Layer Perceptron Feature Importance

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

MLP = MLPClassifier(hidden_layer_sizes=(20,6))
MLP.fit(X__tr, Y_tr)

In [None]:
plt.figure(figsize=(12,8))
results = permutation_importance(MLP, X__tr, Y_tr, scoring='accuracy')
importance = results.importances_mean

for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
'''
lsst = []
for i in range(20):    
    minimum,maximum = data_le[:,i].min(), data_le[:,i].max()  # minimum and maximum value of tenure
    arr = np.arange(minimum, maximum, (maximum - minimum)/100) # randomized array between min and max of 100 values

    lst = []
    for variation in arr:
        val = list(data_le.median().values) # saving all rows median values in a list
        val[i] = variation
        lst.append(val)  
    lst = np.array(lst)
    lst = scaler.transform(lst)
    MLP.predict(lst)
'''

### SHAP Explainer

###### Variable Importance Plot — Global Interpretability

In [None]:
import shap
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(max_depth=8, random_state=19)
RFC.fit(X__tr, Y_tr)

In [None]:
shap_values = shap.TreeExplainer(RFC).shap_values(X__tr)
shap.summary_plot(shap_values, X__tr)

##### XGBoost SHAP

In [None]:
import xgboost

# train XGBoost model
 
model_ = xgboost.XGBClassifier().fit(X__tr, Y_tr)

# compute SHAP values
shap_values = shap.TreeExplainer(model_).shap_values(X__tr)


In [None]:
shap.summary_plot(shap_values, X__tr)

##### SHAP Analysis

In [None]:
data_for_prediction = pd.DataFrame(X__tr).iloc[35]  # use 1 row of data here. Could use multiple rows if desired
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)


print(model_.predict_proba(data_for_prediction_array))

In [None]:
explainer = shap.TreeExplainer(model_)
shap_values = explainer.shap_values(X__tr)

In [None]:
shap.initjs()
# shap_values was computed from X__tr, so shap_values[0] explains row 0 of X__tr.
# Show it with that row's feature values; data_for_prediction_array holds row 35
# and would label the plot with another customer's features.
shap.force_plot(explainer.expected_value, shap_values[0], X__tr[0])

In [None]:
# Pair the SHAP values of row 1 with the features of row 1 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[1], X__tr[1])

In [None]:
# Pair the SHAP values of row 2 with the features of row 2 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[2], X__tr[2])

In [None]:
# Pair the SHAP values of row 3 with the features of row 3 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[3], X__tr[3])

In [None]:
# Pair the SHAP values of row 4 with the features of row 4 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[4], X__tr[4])

In [None]:
# Pair the SHAP values of row 5 with the features of row 5 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[5], X__tr[5])

In [None]:
# Pair the SHAP values of row 6 with the features of row 6 of X__tr
# (data_for_prediction is row 35).
shap.force_plot(explainer.expected_value, shap_values[6], X__tr[6])

#####  Visualize the training set predictions

In [None]:
shap.force_plot(explainer.expected_value, shap_values, X__tr)

##### Kernel Explainer

In [None]:
# KernelExplainer evaluates the model against every background row for each
# perturbed sample, so passing the full training matrix is extremely slow.
# A random subset of 100 rows is used as the background distribution instead.
background = shap.sample(X__tr, 100)
k_explainer = shap.KernelExplainer(model_.predict_proba, background)
k_shap_values = k_explainer.shap_values(data_for_prediction)
# Index 1 selects the second column of predict_proba (the positive class).
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)

##### SHAP Feature Dependence plot 

In [None]:
for cols in X_tr[['MonthlyCharges',"TotalCharges","tenure"]]:
    shap.dependence_plot(cols, shap_values, X_tr)

##### SHAP Interaction Value Summary Plot

In [None]:
# model_ was fitted on the standardised matrix X__tr, so the interaction values
# must be computed on those scaled rows; feeding the unscaled X_tr sends every
# sample down the wrong tree branches. Rows of X__tr match rows of X_tr, so the
# unscaled frame can still be used for display in the plots.
shap_interaction_values = shap.TreeExplainer(model_).shap_interaction_values(X__tr[:2000])

In [None]:
shap.summary_plot(shap_interaction_values, pd.DataFrame(X_tr).iloc[:2000,:])

### Feature Hidden Patterns Analysis

###### tenure

In [None]:
# 2x2 grid of SHAP interaction dependence plots: tenure against four partner
# features, drawn over the first 2000 training rows.
fig, ax = plt.subplots(2, 2, figsize=(16,12))
tenure_partners = ('SeniorCitizen', 'Partner', 'Contract', 'TotalCharges')

for panel, partner in zip(ax.flat, tenure_partners):
    shap.dependence_plot(
        ('tenure', partner),
        shap_interaction_values,
        pd.DataFrame(X_tr).iloc[:2000,:],
        display_features=X_tr.iloc[:2000,:],
        show=False,
        ax=panel,
    )

plt.show()

##### Monthly Charges

In [None]:
# 2x2 grid of SHAP interaction dependence plots: MonthlyCharges against four
# partner features, drawn over the first 2000 training rows.
fig, ax = plt.subplots(2, 2, figsize=(16,12))
monthly_partners = ('Contract', 'StreamingTV', 'Dependents', 'Partner')

for panel, partner in zip(ax.flat, monthly_partners):
    shap.dependence_plot(
        ('MonthlyCharges', partner),
        shap_interaction_values,
        pd.DataFrame(X_tr).iloc[:2000,:],
        display_features=X_tr.iloc[:2000,:],
        show=False,
        ax=panel,
    )

plt.show()

##### Total Charges

In [None]:
# 2x2 grid of SHAP interaction dependence plots: TotalCharges against four
# partner features, drawn over the first 2000 training rows.
fig, ax = plt.subplots(2, 2, figsize=(16,12))
total_partners = ("DeviceProtection", "OnlineBackup", "Contract", "MultipleLines")

for panel, partner in zip(ax.flat, total_partners):
    shap.dependence_plot(
        ('TotalCharges', partner),
        shap_interaction_values,
        pd.DataFrame(X_tr).iloc[:2000,:],
        display_features=X_tr.iloc[:2000,:],
        show=False,
        ax=panel,
    )

plt.show()

##### Feature Decision Explainer first 100 entries

In [None]:
explainer = shap.TreeExplainer(model_)
expected_value = explainer.expected_value

shap.decision_plot(expected_value, shap_values[0:100,], pd.DataFrame(X_tr).iloc[0:100,])

#### Feature Decision Explainer first 1999 entries

In [None]:
shap.decision_plot(expected_value, shap_values[0:1999,], pd.DataFrame(X_tr).iloc[0:1999,])

##### Feature Decision Explainer 2000-2500 entries

In [None]:
shap.decision_plot(expected_value, shap_values[2000:2500,], pd.DataFrame(X_tr).iloc[2000:2500,])

In [None]:
shap.decision_plot(expected_value, shap_values[0:1999,], pd.DataFrame(X_tr).iloc[0:1999,],  feature_order='hclust', return_objects=True)

In [None]:
shap.decision_plot(expected_value, shap_values[2000:3999,], pd.DataFrame(X_tr).iloc[2000:3999,],  feature_order='hclust', return_objects=True)

In [None]:
shap.decision_plot(expected_value, shap_values[4000:5999,], pd.DataFrame(X_tr).iloc[4000:5999,],  feature_order='hclust', return_objects=True)

In [None]:
df_s = data_le
df_y = labelencoder.fit_transform(df['Churn'])

##### Skewness

###### Cloned/Taken from https://github.com/datamadness/Automatic-skewness-transformation-for-Pandas-DataFrame/blob/master/TEST_skew_autotransform.py

In [None]:
import seaborn as sns
import numpy as np
import math
import scipy.stats as ss
import matplotlib.pyplot as plt

def skew_autotransform(DF, include = None, exclude = None, plot = False, threshold = 1, exp = False):
    """Reduce the skewness of selected DataFrame columns.

    Columns are overwritten in place on ``DF`` (pass a copy to keep the
    original data) and the same ``DF`` object is returned.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame holding the numeric columns to process.
    include : list, optional
        Column names to process. Takes precedence over ``exclude``.
    exclude : list, optional
        Column names to skip; every other column of ``DF`` is processed.
        Names that are not columns of ``DF`` have no effect.
    plot : bool
        When True, draw the distribution of each column before and after the
        transformation and print the skewness values.
    threshold : float
        A column is transformed only when ``abs(skewness) > threshold``.
    exp : bool
        When True, positively skewed columns get a natural-log transform and
        negatively skewed columns are raised to the 10th power. When False,
        a Box-Cox transform is used in both cases.

    Returns
    -------
    pandas.DataFrame
        ``DF`` itself, with the transformed columns.
    """

    #Get list of column names that should be processed based on input parameters
    if include is not None:
        colnames = include
    elif exclude is not None:
        colnames = [item for item in DF.columns.values if item not in exclude]
    else:
        colnames = DF.columns.values

    #Helper that shifts a series so that all of its values are strictly positive
    def make_positive(series):
        minimum = np.amin(series)
        #log, power and Box-Cox all need positive input: offset by a constant when needed
        if minimum <= 0:
            series = series + abs(minimum) + 0.01
        return series

    #Go through desired columns in DataFrame
    for col in colnames:
        #Get column skewness
        skew = DF[col].skew()

        #Decide whether, and in which direction, the column is skewed beyond the threshold
        #(a NaN skewness fails both comparisons and leaves the column untouched)
        exceeds = abs(skew) > threshold
        if exceeds and skew > 0:
            skewType = 'positive'
        elif exceeds and skew < 0:
            skewType = 'negative'
        else:
            skewType = None
        transformed = skewType is not None

        if plot:
            #Prep the plot of original data (must happen before the column is overwritten)
            # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
            sns.set_style("darkgrid")
            sns.set_palette("Blues_r")
            fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            ax1 = sns.distplot(DF[col], ax=axes[0])
            ax1.set(xlabel='Original ' + col)

        if transformed:
            #Make sure all values are positive
            DF[col] = make_positive(DF[col])

            if not exp:
                #Apply boxcox transformation (same for both skew directions)
                DF[col] = ss.boxcox(DF[col])[0]
            elif skewType == 'positive':
                #Apply log transformation (vectorised over the whole column)
                DF[col] = np.log(DF[col])
            else:
                #Apply power transformation (x ** 10)
                DF[col] = DF[col].pow(10)
            skew_new = DF[col].skew()
        else:
            #No transformation was performed
            skew_new = skew

        #Compare before and after if plot is True
        if plot:
            print('\n ------------------------------------------------------')
            if transformed:
                print('\n %r had %r skewness of %2.2f' %(col, skewType, skew))
                print('\n Transformation yielded skewness of %2.2f' %(skew_new))
                sns.set_palette("Paired")
                ax2 = sns.distplot(DF[col], ax=axes[1], color = 'r')
                ax2.set(xlabel='Transformed ' + col)
            else:
                print('\n NO TRANSFORMATION APPLIED FOR %r . Skewness = %2.2f' %(col, skew))
                ax2 = sns.distplot(DF[col], ax=axes[1])
                ax2.set(xlabel='NO TRANSFORM ' + col)
            plt.show()

    return DF


In [None]:
# Run the skewness auto-transform on each numeric column separately and write
# the result back. transformedDF is the same object as df_s.
transformedDF = df_s
for numeric_col in ('MonthlyCharges', 'TotalCharges', 'tenure'):
    single_column = df_s[[numeric_col]].copy(deep=True)
    transformedDF[numeric_col] = skew_autotransform(
        single_column, plot = True, exp = True, threshold = 0.7,
        exclude = ['B','LSTAT'])

mean_abs_skew = np.mean(abs(transformedDF.skew()))
print('Average skewness after transformation is %2.2f' %(mean_abs_skew))

### Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# transform the dataset Oversampling to be exact
# NOTE(review): this resamples the full dataset. If x_ / y_ are split into
# train and test sets afterwards, synthetic rows end up in the test set;
# resampling only the training portion avoids that.
oversample = SMOTE()
x_, y_ = oversample.fit_resample(transformedDF, df_y)

In [None]:
# Split the original (non-resampled) data first so the test set contains only
# real customers, then oversample the training portion alone. Splitting the
# SMOTE output (x_, y_) instead would put synthetic rows in the test set and
# let neighbours of test rows into training, inflating the reported accuracy.
x_train, x_test, y_train, y_test = train_test_split(
    transformedDF, df_y, test_size = 0.2, stratify = df_y)
x_train, y_train = SMOTE().fit_resample(x_train, y_train)

# Fit the scaler on the training data only and reuse it for the test data.
x__train = scaler.fit_transform(x_train)
x__test = scaler.transform(x_test)

In [None]:
LR = LogisticRegression()
SVD = SVC()
KNC = KNeighborsClassifier()
P = Perceptron()
DTC = DecisionTreeClassifier()
XGB = xgboost.XGBClassifier()
RF = RandomForestClassifier()
ML = MLPClassifier()
GBC = GradientBoostingClassifier()

##### Logistic Regression

In [None]:
param_LR = {'C':[0.001,0.01,0.1,1,10,100]}

clf1= GridSearchCV(LR, param_LR, cv=5)
clf1.fit(x__train, y_train)
pred=clf1.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)
Acc_LR = acc

##### Support Vector Machine

In [None]:
param_SVD =  {
    'C' : [1, 2, 3, 4],
    'kernel' : ['linear', 'rbf', 'sigmoid']    
}

# Search the SVM grid defined above (the logistic-regression grid param_LR was
# passed here by mistake, so the kernels were never explored).
clf2= GridSearchCV(SVD, param_SVD, cv=5)
clf2.fit(x__train, y_train)
pred=clf2.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)

Acc_SVD = acc

##### K Neighbour Classifier

In [None]:
param_KNC = {
    'n_neighbors':np.arange(1,50),
    'leaf_size' : [30,20,40] 
}

clf3= GridSearchCV(KNC, param_KNC, cv=5, n_jobs= -1)
clf3.fit(x__train, y_train)
pred=clf3.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)
Acc_KNC = acc

##### Perceptron

In [None]:
perceptron = Perceptron()
perceptron.fit(x__train, y_train)
pred = perceptron.predict(x__test)
acc = accuracy_score(y_test,pred)
print('accuracy_score',acc)
Acc_P = acc

##### Decision Tree Classifier

In [None]:
param_DTC = {
    'criterion' : ['gini' ,'entropy'],
    'max_depth' : [ 3, 5, 7, 9, 11, 13, 15, 17],
    "min_samples_leaf": [ 1, 3, 5, 7, 9, 13, 11]
}

clf4= GridSearchCV(DTC, param_DTC, cv=5, n_jobs= -1)
clf4.fit(x__train, y_train)
pred=clf4.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)

Acc_DTC = acc



##### XGBoost Classifier

In [None]:
# Each hyper-parameter appears once. The grid previously listed 'max_depth' and
# 'min_child_weight' twice; a dict keeps only the last value for a repeated key,
# so the range(...) entries were never used and are removed.
param_XGB = {
     'max_depth':[4,5,6,7,8],
     'min_child_weight':[4,5,6,7,8],
     'learning_rate' : [0.1, 0.001],
     'gamma':[i/10.0 for i in range(0,5)]
}

clf5= GridSearchCV(XGB, param_XGB, cv=5, n_jobs= -1)
clf5.fit(x__train, y_train)
pred=clf5.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)

Acc_XGB = acc

##### Random Forest Classifier

In [None]:
param_RFC = {
    'max_depth' : [4, 6, 8, 10, 12],
    'max_leaf_nodes' :  [3, 6, 9, 12, 14],
    'min_samples_leaf': [1, 3, 5, 7],
    'n_estimators' : [80, 100, 120, 140]
}
clf6= GridSearchCV(RF, param_RFC, cv=5, n_jobs= -1)
clf6.fit(x__train, y_train)
pred=clf6.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)

Acc_RFC = acc

##### MLP Classifier

In [None]:
param_ML = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

clf7= GridSearchCV(ML, param_ML, cv=3, n_jobs=-1)
clf7.fit(x__train, y_train)
pred=clf7.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)
Acc_ML = acc

##### Gradient Boosting Classifier

In [None]:
param_GBC = {
    'n_estimators':range(20,81,10),
    'max_depth':range(5,16,2), 
    'min_samples_split':range(200,1001,200),
    'min_samples_leaf':range(30,71,10)
}

clf8= GridSearchCV(GBC, param_GBC, cv=5, n_jobs= -1)
clf8.fit(x__train, y_train)
pred=clf8.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)
Acc_GBC = acc

##### Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
param_AC = {
    "base_estimator__criterion" : ["gini", "entropy"],
    "base_estimator__splitter" :   ["best", "random"],
    "algorithm" : ["SAMME","SAMME.R"],
    "n_estimators" :[10, 100, 200, 250],
    "learning_rate":  [0.05, 0.5, 1.5, 2.5]
}

AC = AdaBoostClassifier(
    DecisionTreeClassifier())

clf9= GridSearchCV(AC, param_AC, cv=3, n_jobs= -1)
clf9.fit(x__train, y_train)
pred=clf9.predict(x__test)
acc = accuracy_score(y_test,pred)

print('accuracy_score',acc)
Acc_AC = acc

##### Models Accuracy Analysis

In [None]:
models = pd.DataFrame( {
'Models' : ['Logistic Regression', 'Support Vector Machine', 
         'KNeighbours Classfier', 'Perceptron',
         'Decision Tree Classifier', 'XGBoost Classifier',
         'Random Forest Classifier', 'Multi-Layer Perceptron',
         'Gradient Boosting Classifier', 'Ada Boost Classifier'],
'Score' : [Acc_LR*100, Acc_SVD*100, Acc_KNC*100,
        Acc_P*100, Acc_DTC*100, Acc_XGB*100,
        Acc_RFC*100, Acc_ML*100, Acc_GBC*100, Acc_AC*100]
})

models.sort_values(by='Score', ascending=False)

In [None]:
# models.plot(kind='line',x='Score',y='Models',figsize=(14,12))
plt.figure(figsize=(14,6))
sns.lineplot(data=models, x="Models", y="Score",ci=None, marker='o')
plt.xticks(rotation = 45)
plt.show()

##### Best Model Accuracy In-Depth Analysis (ADA Boost Classifier)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# NOTE(review): clf5 is the XGBoost grid search fitted earlier, while the section
# heading names the AdaBoost classifier (clf9) — confirm which model is intended.
y_train_pred = clf5.predict(x__train)
y_test_pred = clf5.predict(x__test)

rule = "========================================================="
# (banner, true labels, predictions, print a closing rule afterwards?)
evaluations = (
    ("TRAINIG RESULTS: ", y_train, y_train_pred, True),
    ("TESTING RESULTS: ", y_test, y_test_pred, False),
)

for banner, truth, guess, closing_rule in evaluations:
    print(banner + "\n" + rule)
    clf_report = pd.DataFrame(classification_report(truth, guess, output_dict=True))
    print("CONFUSION MATRIX:\n" + str(confusion_matrix(truth, guess)))
    print("\n")
    print("CLASSIFICATION REPORT:\n" + str(clf_report))
    if closing_rule:
        print("\n" + rule)

In [None]:
from sklearn.metrics import precision_recall_curve, roc_curve

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as dashed lines against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting so both series are drawn against ``thresholds``. Draws on the
    current matplotlib axes.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g--", "Recall"),
    )
    for values, line_style, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.title("Precision/Recall Tradeoff")
    

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve with a dashed diagonal on the current matplotlib axes.

    ``fpr`` and ``tpr`` are plotted as x and y; ``label`` is passed to the
    curve as its legend label. Both axes are fixed to the [0, 1] range.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, "k--")
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    
    
# Build the curves from predicted probabilities of the positive class. Hard 0/1
# predictions from predict() give only a couple of distinct thresholds, which
# collapses both curves to a few points.
churn_scores = clf5.predict_proba(x__test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, churn_scores)
plt.figure(figsize=(14, 25))
plt.subplot(4, 2, 1)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.subplot(4, 2, 2)
plt.plot(precisions, recalls)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("PR Curve: precisions/recalls tradeoff");

plt.subplot(4, 2, 3)
fpr, tpr, thresholds = roc_curve(y_test, churn_scores)
plot_roc_curve(fpr, tpr)

##### Area Under Curve

In [None]:
plt.figure(figsize=(12,6))
y_pred_proba = clf5.predict_proba(x__test)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="Area Under Curve="+str(auc))
plt.legend(loc=4)
plt.show()