# Churn Model Prediction

## Import the libraries needed

In [None]:
# ignore local warnings
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# Model packages
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Switch the process working directory to the Kaggle input folder so that
# the relative CSV path used when loading the dataset resolves.
import os
os.chdir("/kaggle/input/churn-modelling")

## Import the dataset

In [None]:
# Read the churn dataset into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
df.head(n=5)

## Exploratory Data Analysis (EDA)

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Print a per-column summary (dtype and non-null count) of the DataFrame.
df.info()

We can see that there is no missing data in the dataset. We will proceed by exploring each of the features and trying to find the best way to handle each one.

We won't need to include the first three columns in our model, since "RowNumber", "CustomerId" and "Surname" do not make any logical contribution to our prediction.

## Data Pre-processing

In [None]:
# Feature matrix: every column from the fourth up to (but excluding) the last.
X = df.iloc[:, 3:-1].to_numpy()
# Target vector: the last column.
y = df.iloc[:, -1].to_numpy()

#### Categorical Variable Encoding

We will encode two categorical variables: "Geography" and "Gender".
"Geography" has to be encoded differently from "Gender". "Gender" has only two possible values (Female or Male), so we can simply encode one of them as "1" and the other as "0". "Geography", however, has more than two possible values. If we encoded it the same way as "Gender", the resulting integers would imply a numerical order among the categories, which could mislead our model (we do not want it to learn a relationship between that artificial order and our target variable).

In [None]:
# Gender
# Gender: learn the label -> integer mapping from column 2, then overwrite
# that column with the integer codes.
le = LabelEncoder().fit(X[:, 2])
X[:, 2] = le.transform(X[:, 2])

In [None]:
# Geography
# Geography: one-hot encode column 1; every other column passes through
# unchanged. sparse_threshold=0 forces a dense result, so the conversion
# below never receives a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder="passthrough",
    sparse_threshold=0,
)
# X was built from mixed string/numeric columns, so the passthrough part is
# object-dtype and the stacked output would be too. Every column is numeric
# by this point (Gender has already been label-encoded), so cast to float64
# to hand the estimators and the Keras network a proper numeric matrix.
X = np.array(ct.fit_transform(X), dtype=np.float64)

#### Split data

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

#### Feature Scaling

We need to scale the numerical features. We do this because we do not want one feature to outweigh the others merely because its values span a larger range. We will use standardization to scale these features.

Here's the formula for standardization:

#### $ x_{standardized} = \frac{x - \mu(X)}{\sigma(X)} $

In [None]:
# Standardize only the listed column positions. The scaler is fitted on the
# training rows and then applied, unchanged, to the test rows.
scaler = StandardScaler()
scaled_cols = [3, 5, 6, 7, 8, 11]
X_train[:, scaled_cols] = scaler.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = scaler.transform(X_test[:, scaled_cols])

## Applying different classification models
### 1) XGBoost Classifier

In [None]:
# XGBoost classifier with the library's default hyperparameters,
# trained on the training split.
classifier_1 = XGBClassifier()
classifier_1.fit(X=X_train, y=y_train)

#### Confusion Matrix

In [None]:
# Predict on the held-out test set and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_1 = classifier_1.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_1))

#### K-fold Cross Validation

In [None]:
# 10-fold cross-validation on the training data; report the mean and the
# standard deviation of the fold scores as percentages.
acc_1 = cross_val_score(classifier_1, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * acc_1.mean()))
print("Standard Deviation: {:.2f} %".format(100 * acc_1.std()))

### 2) Random Forest Classifier

In [None]:
# Random forest of 100 trees with a fixed seed, trained on the training split.
classifier_2 = RandomForestClassifier(n_estimators=100, random_state=0)
classifier_2.fit(X=X_train, y=y_train)

#### Confusion Matrix

In [None]:
# Predict on the held-out test set and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_2 = classifier_2.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_2))

#### K-fold Cross Validation

In [None]:
# 10-fold cross-validation on the training data; report the mean and the
# standard deviation of the fold scores as percentages.
acc_2 = cross_val_score(classifier_2, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * acc_2.mean()))
print("Standard Deviation: {:.2f} %".format(100 * acc_2.std()))

### 3) Naive Bayes Classifier

In [None]:
# Gaussian Naive Bayes with default settings, trained on the training split.
classifier_3 = GaussianNB()
classifier_3.fit(X=X_train, y=y_train)

#### Confusion Matrix

In [None]:
# Predict on the held-out test set and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_3 = classifier_3.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_3))

#### K-fold Cross Validation

In [None]:
# 10-fold cross-validation on the training data; report the mean and the
# standard deviation of the fold scores as percentages.
acc_3 = cross_val_score(classifier_3, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * acc_3.mean()))
print("Standard Deviation: {:.2f} %".format(100 * acc_3.std()))

### 4) Logistic Regression

In [None]:
# Logistic regression with a fixed seed, trained on the training split.
classifier_4 = LogisticRegression(random_state=0)
classifier_4.fit(X=X_train, y=y_train)

#### Confusion Matrix

In [None]:
# Predict on the held-out test set and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_4 = classifier_4.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_4))

#### K-fold Cross Validation

In [None]:
# 10-fold cross-validation on the training data; report the mean and the
# standard deviation of the fold scores as percentages.
acc_4 = cross_val_score(classifier_4, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * acc_4.mean()))
print("Standard Deviation: {:.2f} %".format(100 * acc_4.std()))

### 5) K-Nearest Neighbors

In [None]:
# k-NN with 10 neighbours; the Minkowski metric with p=2 is the Euclidean
# distance. Trained on the training split.
classifier_5 = KNeighborsClassifier(n_neighbors=10, metric="minkowski", p=2)
classifier_5.fit(X=X_train, y=y_train)

#### Confusion Matrix

In [None]:
# Predict on the held-out test set and print the confusion matrix
# (rows: true labels, columns: predicted labels).
y_pred_5 = classifier_5.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_5))

#### K-fold Cross Validation

In [None]:
# 10-fold cross-validation on the training data; report the mean and the
# standard deviation of the fold scores as percentages.
acc_5 = cross_val_score(classifier_5, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * acc_5.mean()))
print("Standard Deviation: {:.2f} %".format(100 * acc_5.std()))

### 6) Artificial Neural Network

#### Initializing

In [None]:
# Start from an empty sequential (layer-by-layer) Keras model.
ann = tf.keras.Sequential()

#### Adding layers

In [None]:
# Take the input width from the training matrix instead of hard-coding 12,
# so the network still builds if the encoded feature set changes.
n_features = X_train.shape[1]

# Three hidden layers of 6 ReLU units each.
ann.add(tf.keras.layers.Dense(units=6, activation="relu", input_dim=n_features))  # first layer
ann.add(tf.keras.layers.Dense(units=6, activation="relu"))  # second layer
ann.add(tf.keras.layers.Dense(units=6, activation="relu"))  # third layer
# A single sigmoid unit outputs a value in (0, 1) for the binary target.
ann.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # output layer

#### Compile

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy tracked as the reported metric.
ann.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Inspect the container type of the training matrix before fitting.
type(X_train)

In [None]:
# Train the network for 100 epochs using mini-batches of 32 rows.
ann.fit(x=X_train, y=y_train, batch_size=32, epochs=100)

#### Confusion Matrix

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain boolean class predictions,
# then print the confusion matrix (rows: true labels, columns: predictions).
y_pred_6 = ann.predict(X_test) > 0.5
print(confusion_matrix(y_true=y_test, y_pred=y_pred_6))

In [None]:
# Accuracy of the thresholded ANN predictions on the held-out test set.
acc_6 = accuracy_score(y_true=y_test, y_pred=y_pred_6)
print("Accuracy: {:.2f}%".format(100 * acc_6))

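There are many improvements that could still be made, such as tuning the hyperparameters, adding extra layers (for the ANN), etc. All in all, the best classification algorithms that we have for now are the Artificial Neural Network (ANN) and the Random Forest Classifier, with an accuracy score of approximately 86%. Note that the ANN accuracy is measured on the single held-out test split, whereas the other models are scored with 10-fold cross-validation on the training data, so the figures are not strictly comparable.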