In [None]:
from IPython.display import Image
Image('churn.png')

# 1. Data preprocessing and exploration

## 1.1. Importing libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
dataset = pd.read_csv('Churn_Modelling.csv')

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.isnull().sum()

## 1.2. Statistical description

In [None]:
dataset.info()

In [None]:
dataset.describe()

In [None]:
dataset.describe(include=['object'])

In [None]:
dataset['Geography'].unique()

In [None]:
dataset['Geography'].value_counts()

## 1.3. Boxplot

In [None]:
# Switch seaborn to the white-grid theme, then draw a horizontal box plot
# of the CreditScore column. The returned Axes is kept in `ax`.
sns.set(style='whitegrid')
ax = sns.boxplot(data = dataset['CreditScore'], orient = 'h')

In [None]:
# Interquartile range of CreditScore: 75th percentile minus 25th percentile.
IQR = dataset['CreditScore'].quantile(0.75)-dataset['CreditScore'].quantile(0.25)

In [None]:
IQR

In [None]:
# Lower fence for CreditScore outliers: 25th percentile minus 1.5 * IQR.
Inf = dataset['CreditScore'].quantile(0.25)-1.5*IQR

In [None]:
Inf

In [None]:
# Keep only the rows whose CreditScore is below the lower fence and show,
# for each column, how many non-null values those rows contain.
dataset[dataset['CreditScore']<Inf].count()

In [None]:
# Upper fence for CreditScore outliers: 75th percentile plus 1.5 * IQR.
Sup = dataset['CreditScore'].quantile(0.75)+1.5*IQR

In [None]:
Sup

In [None]:
my_list = ['CreditScore', 'Age']

In [None]:
my_list

In [None]:
dataset[my_list].head()

In [None]:
ax = sns.boxplot(data = dataset[my_list], orient = 'h')

## 1.4. Scatter Plot

In [None]:
# Scatter plot: Age on the horizontal axis, CreditScore on the vertical axis.
plt.scatter(dataset['Age'], dataset['CreditScore'])
plt.xlabel('Age')
plt.ylabel('CreditScore')

In [None]:
# Switch seaborn to the 'ticks' theme and draw the pairwise plot grid for
# the whole `dataset` frame (no column subset is selected here).
sns.set(style='ticks')
sns.pairplot(dataset)

## 1.5. Input variables and target

In [None]:
# Select the model inputs and the target by column position:
#   X -> columns 3 to 12 inclusive (10 columns)
#   y -> column 13
# NOTE(review): positional selection assumes the CSV column order is fixed;
# confirm against dataset.columns if the file ever changes.
X = dataset.iloc[:,3:13]
y = dataset.iloc[:,13]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
type(X)

**Convert DataFrame to Numpy Array**

In [None]:
# Convert the DataFrame / Series to NumPy arrays.
# np.asarray is used instead of `.values` so that the cell can be re-run:
# it hands back the same array when X / y are already ndarrays, whereas
# `.values` raises AttributeError on a second execution.
X = np.asarray(X)
y = np.asarray(y)

In [None]:
type(X)

In [None]:
X[0:5,:]

## 1.6. Encoding categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [None]:
# Replace the values of columns 1 and 2 of X with integer codes, in place.
# The same encoder object is refit for each column, so once this cell has
# run `label` only holds the classes learned from column 2.
# NOTE(review): column 1 is the one later one-hot encoded under the name
# 'Geography'; column 2 is presumably Gender — confirm against X.head().
X[:,1] = label.fit_transform(X[:,1])
X[:,2] = label.fit_transform(X[:,2])

In [None]:
X[0:10,:]

**Dummy encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
X.shape

In [None]:
# One-hot encode column 1 (named 'Geography' here); all remaining columns
# are passed through untouched.
ct = ColumnTransformer(
    transformers=[('Geography', OneHotEncoder(), [1])],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on X and rebind X to the encoded matrix.
# NOTE: not idempotent — running this cell again would encode the already
# encoded X a second time.
X = ct.fit_transform(X)

In [None]:
X.shape

In [None]:
X[0:10,:]

In [None]:
# Drop the first column of X and keep the rest.
# NOTE(review): the first column is presumably the first one-hot Geography
# indicator (the feature list defined afterwards starts at 'Geography2').
# NOTE: not idempotent — every re-run of this cell removes one more
# leading column.
X = X[:,1:]

In [None]:
X.shape

In [None]:
X[0:5,:]

In [None]:
dataset.columns

In [None]:
# Human-readable names for the columns of the encoded X: the two remaining
# Geography indicator columns first, then the original input columns.
features = ['Geography2', 'Geography3'] + [
    'CreditScore',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

In [None]:
features

## 1.7. Splitting dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y.mean()

In [None]:
print(y_train.mean())
print(y_test.mean())

## 1.8. Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training set only and transform it, then apply the
# already-fitted scaler to the test set (the test set is never used to fit).
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
#from sklearn.preprocessing import MinMaxScaler

In [None]:
X_train[0,:]

In [None]:
X_train_sc[0,:]

In [None]:
# Manual check of the standardization for the first training row:
# (value - column mean) / column std for column 2, to be compared with
# X_train_sc[0, 2]. The value is read from X_train rather than typed in by
# hand, so the check stays valid if the data or the split changes.
(X_train[0, 2] - X_train[:, 2].mean()) / X_train[:, 2].std()

In [None]:
# Map the first scaled training row back to the original units.
# inverse_transform is documented for 2-D input of shape
# (n_samples, n_features), so the row is passed as a 1 x n_features slice
# instead of a 1-D vector; the result is a 1-row 2-D array.
scaler.inverse_transform(X_train_sc[0:1, :])

# 2. k Nearest Neighbors

In [None]:
Image('knn.png')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5)

In [None]:
knn.fit(X_train_sc,y_train)

In [None]:
y_pred_knn = knn.predict(X_test_sc)

In [None]:
# Side-by-side view of the first ten true labels and kNN predictions.
for actual, predicted in zip(y_test[:10], y_pred_knn[:10]):
    print(actual, predicted)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
print('Confusion matrix knn \n', confusion_matrix(y_test,y_pred_knn))
print('Accuracy knn', accuracy_score(y_test,y_pred_knn))

In [None]:
print(classification_report(y_test,y_pred_knn))

In [None]:
Image('metrics.png')

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
print('Recall knn', recall_score(y_test,y_pred_knn))
print('Precision knn', precision_score(y_test,y_pred_knn))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,y_pred_knn))

**Grid search**

In [None]:
Image('grid.png')

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 13.
parameters = {'n_neighbors': list(range(1, 14, 2))}
model = KNeighborsClassifier()

# 5-fold cross-validated search scored on accuracy, run on the scaled
# training data. fit() returns the fitted search object, kept as `grille`.
clf = GridSearchCV(estimator=model, param_grid=parameters, scoring='accuracy', cv=5)
grille = clf.fit(X_train_sc, y_train)

# Best setting found and its mean cross-validated score.
print(grille.best_params_)
print(grille.best_score_)

In [None]:
y_pred_knn_o = grille.predict(X_test_sc)

In [None]:
print('Confusion matrix knn op \n', confusion_matrix(y_test,y_pred_knn_o))
print('Accuracy knn op', accuracy_score(y_test,y_pred_knn_o))
print('Recall knn op', recall_score(y_test,y_pred_knn_o))
print('Precision knn op', precision_score(y_test,y_pred_knn_o))

# 3. Decision Tree

In [None]:
Image('decision.png')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fully grown decision tree (no depth limit).
# random_state fixes the randomness used while growing the tree, so the
# metrics printed for this model are reproducible from run to run; the seed
# matches the one used for the train/test split.
dt = DecisionTreeClassifier(random_state=0)

In [None]:
dt.fit(X_train,y_train)

In [None]:
y_pred_dt = dt.predict(X_test)

In [None]:
# Side-by-side view of the first ten true labels and decision-tree predictions.
for actual, predicted in zip(y_test[:10], y_pred_dt[:10]):
    print(actual, predicted)

In [None]:
print('Confusion matrix dt \n', confusion_matrix(y_test,y_pred_dt))
print('Accuracy dt', accuracy_score(y_test,y_pred_dt))
print('Recall dt', recall_score(y_test,y_pred_dt))
print('Precision dt', precision_score(y_test,y_pred_dt))

In [None]:
print(classification_report(y_test,y_pred_dt))

**Visualization**

In [None]:
!pip install graphviz

In [None]:
import graphviz
from sklearn import tree
from sklearn.tree import export_graphviz

# Shallow tree (depth capped at 5) used for the tree drawing. It is fitted
# on the full X, y rather than on the training split.
# random_state keeps the drawn tree identical from run to run.
# NOTE: this rebinds the global `model`, which held the kNN estimator
# passed to the grid search earlier in the notebook.
model = DecisionTreeClassifier(max_depth = 5, random_state = 0)
model.fit(X,y)

In [None]:
import shutil

# Write the fitted tree to a Graphviz .dot file, with every node labelled,
# colour-filled and drawn with rounded corners.
tree.export_graphviz(
    model,
    feature_names=features,
    out_file='dt_cdoss.dot',
    label='all',
    filled=True,
    rounded=True,
)

# Render the .dot file to dt_cdoss.png so that the image displayed afterwards
# is produced by the notebook itself instead of by a manual command-line step.
# Rendering needs the Graphviz `dot` executable; when it is not installed the
# step is skipped and any existing dt_cdoss.png is left untouched.
if shutil.which('dot') is not None:
    graphviz.Source.from_file('dt_cdoss.dot', format='png').render(
        filename='dt_cdoss', cleanup=True
    )

In [None]:
Image('dt_cdoss.png')

# 4. Random Forest

In [None]:
Image('random_forest.png')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 trees.
# random_state fixes the bootstrap sampling and per-split feature sampling,
# so the metrics and feature importances derived from this model are
# reproducible; the seed matches the one used for the train/test split.
rf = RandomForestClassifier(n_estimators=500, random_state=0)

In [None]:
rf.fit(X_train,y_train)

In [None]:
y_pred_rf = rf.predict(X_test)

In [None]:
print('Confusion matrix rf \n', confusion_matrix(y_test,y_pred_rf))
print('Accuracy rf', accuracy_score(y_test,y_pred_rf))
print('Recall rf', recall_score(y_test,y_pred_rf))
print('Precision rf', precision_score(y_test,y_pred_rf))

In [None]:
print(classification_report(y_test,y_pred_rf))

# 5. Feature selection

In [None]:
importances = rf.feature_importances_

In [None]:
importances

In [None]:
sorted_idx = np.argsort(importances)

In [None]:
features_arr = np.asarray(features)

In [None]:
# Horizontal bar chart of the random-forest importances, sorted ascending
# so the most important feature ends up at the top.
# Bar positions are derived from the number of importances being drawn
# (integer positions 0..n-1) instead of from the size of the scaled training
# matrix, so they always match the plotted data.
padding = np.arange(len(importances))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, features_arr[sorted_idx])
plt.xlabel('Relative importance')
plt.title('Variable importance')

# 6. Support Vector Machine

In [None]:
Image('linear_svm.png')

In [None]:
Image('kernel_svm.png')

In [None]:
from sklearn.svm import SVC
linear_SVM = SVC(kernel='linear')
linear_SVM.fit(X_train_sc,y_train)

In [None]:
y_predictSVM_l = linear_SVM.predict(X_test_sc)
print(confusion_matrix(y_test,y_predictSVM_l))
print('Accuracy linear SVM {0:.3f}'.format(accuracy_score(y_test,y_predictSVM_l)))
print('Precision linear SVM {0:.3f}'.format(precision_score(y_test,y_predictSVM_l)))
print('Recall linear SVM {0:.3f}'.format(recall_score(y_test,y_predictSVM_l)))

In [None]:
print(classification_report(y_test,y_predictSVM_l))

In [None]:
kernel_SVM = SVC(kernel='rbf')
kernel_SVM.fit(X_train_sc,y_train)

In [None]:
y_predictSVM_k = kernel_SVM.predict(X_test_sc)
print(confusion_matrix(y_test,y_predictSVM_k))
print('Accuracy rbf SVM {0:.3f}'.format(accuracy_score(y_test,y_predictSVM_k)))
print('Precision rbf SVM {0:.3f}'.format(precision_score(y_test,y_predictSVM_k)))
print('Recall rbf SVM {0:.3f}'.format(recall_score(y_test,y_predictSVM_k)))

In [None]:
print(classification_report(y_test,y_predictSVM_k))

# 7. Logistic Regression

In [None]:
Image(filename='logistic.png')

In [None]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train_sc,y_train)

In [None]:
y_predictLR = LR.predict(X_test_sc)
print(confusion_matrix(y_test,y_predictLR))
print('Accuracy Logistic Regression {0:.3f}'.format(accuracy_score(y_test,y_predictLR)))
print('Precision Logistic Regression {0:.3f}'.format(precision_score(y_test,y_predictLR)))
print('Recall Logistic Regression {0:.3f}'.format(recall_score(y_test,y_predictLR)))

# 8. Summary

| Algorithm       | Accuracy     |Recall | Precision |
| :------------- | -----------|-----------|-----------:|
|  Optimized kNN  | 0.834    | 0.377 | 0.659 |
| Decision Tree   | 0.804  | 0.572 | 0.514 |
| Random Forest | 0.865 | 0.523 | 0.736 |
| linear SVM   | 0.797  | 0.000 | 0.000 |
| kernel SVM   | 0.864  | 0.444 | 0.789 |
| Logistic Regression | 0.811 | 0.237 | 0.582 |

# 9. Receiver Operating Characteristic (ROC) Curve

In [None]:
Image(filename='roc.png')

In [None]:
from sklearn import metrics

# A ROC curve is traced by sweeping a threshold over a continuous score.
# Feeding roc_curve the hard 0/1 labels from predict() leaves a single
# operating point per model, and the resulting "AUC" is not the area under
# the model's actual ROC curve. Each model therefore contributes a score:
#   - predict_proba(...)[:, 1]: probability of the second class, assumed to
#     be the positive label 1 (verify with <estimator>.classes_);
#   - decision_function(...) for the two SVMs, which were created without
#     probability=True and so expose no predict_proba.
# Tree-based models are scored on the unscaled test set, the others on the
# scaled one, matching the data each model was fitted on.
roc_inputs = [
    ('KNN_O', grille.predict_proba(X_test_sc)[:, 1]),
    ('LR', LR.predict_proba(X_test_sc)[:, 1]),
    ('DT', dt.predict_proba(X_test)[:, 1]),
    ('SVM_l', linear_SVM.decision_function(X_test_sc)),
    ('SVM_k', kernel_SVM.decision_function(X_test_sc)),
    ('RF', rf.predict_proba(X_test)[:, 1]),
]

for model_name, scores in roc_inputs:
    fpr, tpr, thr = metrics.roc_curve(y_test, scores)
    # Legend entry carries the model name and its AUC only.
    plt.plot(fpr, tpr, '-', lw=2,
             label='AUC %s=%.2f' % (model_name, metrics.auc(fpr, tpr)))

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN, LR, DT, SVM & RF ROC curves')
plt.legend(loc='lower right')
plt.show()

**Random Forest is the best algorithm**