In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the churn dataset inside the Kaggle input directory.
file_name = (
    "/kaggle/input/predicting-churn-for-bank-customers/"
    "Churn_Modelling.csv"
)

# Read the CSV into the DataFrame used by the rest of the notebook.
churn_df = pd.read_csv(file_name)

In [None]:
# Display one randomly drawn row as a quick look at the data
# (sample() returns a single row when no size is given).
churn_df.sample()

In [None]:
# List the distinct values that occur in the Tenure column.
# NOTE(review): the original note here was cut off mid-sentence
# ("we can see this is a ..."); state the intended conclusion once confirmed.
churn_df.Tenure.unique()

In [None]:
# Summarize the dataset: size, column names, total missing cells and the
# number of distinct values per column.
n_rows, n_cols = churn_df.shape
feature_names = churn_df.columns.tolist()
missing_total = churn_df.isnull().sum().values.sum()
unique_per_column = churn_df.nunique()

print("Rows     : ", n_rows)
print("Columns  : ", n_cols)
print("\nFeatures : \n", feature_names)
print("\nMissing values :  ", missing_total)
print("\nUnique values :  \n", unique_per_column)

In [None]:
# Count how many rows carry each value of the Exited target;
# sort=False leaves the counts unsorted by frequency.
churn_df['Exited'].value_counts(sort = False)

In [None]:
# Work on a separate frame that leaves out the columns which are not
# cross-tabulated below; churn_df itself is not modified.
churn_df_copy = churn_df.drop(
    columns=[
        "RowNumber", 'CustomerId', 'Surname', 'Tenure',
        "EstimatedSalary", "Balance", "CreditScore", "Age",
    ]
)

# Every remaining column except the last one is treated as a factor.
# NOTE(review): this relies on Exited being the final column - confirm.
factor_columns = churn_df_copy.columns[:-1]

# One frequency table (factor value x Exited value) per factor, stacked into a
# single frame whose outer index level is the factor name.
per_factor_tables = [
    pd.crosstab(churn_df_copy[factor], churn_df_copy["Exited"])
    for factor in factor_columns
]
summary = pd.concat(per_factor_tables, keys=factor_columns)
summary

In [None]:
# Share of rows in each category whose Exited value is 1, expressed on a
# 0-100 scale so the values match the "Percentage" in the column name
# (the bare ratio would be a 0-1 fraction).
summary['Churn_Percentage'] = 100 * summary[1] / (summary[0] + summary[1])
summary

In [None]:
import seaborn as sns
# Violin plot of how NumOfProducts is distributed for each value of Exited.
# catplot is the current name of seaborn's figure-level categorical API;
# factorplot was deprecated in seaborn 0.9 and is absent from recent releases.
g = sns.catplot(x="Exited", y="NumOfProducts", data=churn_df, kind="violin", palette="Pastel1")

In [None]:
import seaborn as sns
# Violin plot of how IsActiveMember is distributed for each value of Exited.
# catplot is the current name of seaborn's figure-level categorical API;
# factorplot was deprecated in seaborn 0.9 and is absent from recent releases.
g = sns.catplot(x="Exited", y="IsActiveMember", data=churn_df, kind="violin", palette="Pastel1")

In [None]:

import seaborn as sns
# Violin plot of how Age is distributed for each value of Exited.
# catplot is the current name of seaborn's figure-level categorical API;
# factorplot was deprecated in seaborn 0.9 and is absent from recent releases.
g = sns.catplot(x="Exited", y="Age", data=churn_df, kind="violin", palette="Pastel1")

In [None]:
# Correlation plot doesn't end up being too informative
import matplotlib.pyplot as plt

def plot_corr(df,size=10):
    '''Plot the pairwise correlation matrix of a DataFrame's numeric columns.

    Only numeric and boolean columns take part; text columns are left out
    before the correlation is computed, because DataFrame.corr() raises on
    them in pandas >= 2.0 instead of silently skipping them.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot

    Returns:
        None. The figure is drawn through matplotlib.'''

    corr = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(size, size))

    # Heat-map of the matrix with a colour scale next to it. No legend is
    # requested: the plot has no labelled artists, so asking for one would
    # only emit a warning.
    cax = ax.matshow(corr)
    fig.colorbar(cax)

    # One tick per column, labelled with the column name on both axes.
    tick_positions = list(range(len(corr.columns)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns, rotation='vertical')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)
    
# Draw the correlation matrix for the churn data as loaded from the CSV.
plot_corr(churn_df)

### Feature selection

In [None]:
# Show the available column names before choosing which ones to keep.
churn_df.columns

In [None]:
# Keep the customer identifier, the candidate predictors and the Exited target.
selected_columns = [
    "CustomerId", 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited',
]

# .copy() makes the result an independent frame, so the column assignments done
# in later cells do not trigger pandas' SettingWithCopyWarning.
churn_df = churn_df[selected_columns].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Customer id column: carried along for now, never encoded or scaled
Id_col = ['CustomerId']

# Target column
target_col = ["Exited"]

# Distinct-value count per column, computed once and reused for every grouping
unique_counts = churn_df.nunique()

# Categorical columns: fewer than 11 distinct values, target excluded
cat_cols = [
    name for name in unique_counts[unique_counts < 11].keys().tolist()
    if name not in target_col
]

# Numerical columns: whatever is neither categorical, target nor identifier
num_cols = [x for x in churn_df.columns if x not in cat_cols + target_col + Id_col]

# Binary columns: exactly 2 distinct values (this can include the target)
bin_cols = unique_counts[unique_counts == 2].keys().tolist()

# Categorical columns with more than 2 distinct values
multi_cols = [name for name in cat_cols if name not in bin_cols]

# Label-encode each binary column; the encoder is refitted per column
le = LabelEncoder()
for col in bin_cols:
    churn_df[col] = le.fit_transform(churn_df[col])

# One-hot encode the multi-valued columns. dtype=int keeps the indicator
# columns numeric: pandas >= 2.0 would otherwise emit bool columns, and a frame
# mixing bool with numeric columns turns into an object array under `.values`,
# which Keras cannot convert to a tensor.
churn_df = pd.get_dummies(data=churn_df, columns=multi_cols, dtype=int)
churn_df.head()

In [None]:
# Standardise the numerical columns with scikit-learn's StandardScaler
std = StandardScaler()

# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in a later cell, so test-set statistics leak into the features. Fitting
# on the training rows only would avoid that.
# Reusing churn_df's index guarantees the scaled values line up row-for-row
# when they are joined back, whatever the frame's index looks like.
scaled = pd.DataFrame(
    std.fit_transform(churn_df[num_cols]),
    columns=num_cols,
    index=churn_df.index,
)

# Keep a copy of the frame as it was before scaling
df_bank_og = churn_df.copy()

# Replace the raw numerical columns with their scaled versions; the scaled
# columns end up at the right-hand side of the frame.
churn_df = churn_df.drop(columns=num_cols).join(scaled)

#churn_df.info()
churn_df.head()

In [None]:
# Remove the identifier column and any row that still contains a missing value.
# Reassigning (rather than dropping in place) together with errors="ignore"
# keeps the cell safe to re-run: a second run no longer fails with a KeyError
# because CustomerId is already gone.
churn_df = churn_df.drop(columns=['CustomerId'], errors="ignore").dropna()

### Modeling

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Feature matrix: every column except the label. Casting to float64 guarantees
# a purely numeric array even if some encoded columns are stored as bool, which
# would otherwise produce an object array that Keras cannot consume later on.
X = churn_df.drop(columns=['Exited']).to_numpy(dtype="float64")

# Label vector
y = churn_df['Exited'].to_numpy()

# 70:30 train/test split. A fixed random_state makes every re-run produce the
# same partition, and stratify=y keeps the proportion of each Exited value the
# same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

df_train = pd.DataFrame(X_train)

# Baseline model: logistic regression with default hyper-parameters
model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
score = model.score(X_test, y_test)

print("Accuracy = " + str(score))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
# Feature importance for the logistic regression: features are ranked by the
# absolute value of their coefficient, largest first.
# NOTE: `model` must still be the fitted LogisticRegression here; a later cell
# rebinds the same name to the neural network.
coef = [abs(weight) for weight in model.coef_[0]]

# The names must come from the same columns the feature matrix was built from,
# i.e. without the Exited label. Listing every column of churn_df would shift
# each name that follows Exited by one position relative to its coefficient.
cols = list(churn_df.drop(columns=['Exited']).columns)
assert len(cols) == len(coef), "feature names and coefficients are misaligned"

# Print the feature names, most influential first
sorted_index = sorted(range(len(coef)), key=coef.__getitem__, reverse=True)
for idx in sorted_index:
    print(cols[idx])

In [None]:
# Random Forest
# Let's try a Random Forest now to see if our results get better
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Random forest with 300 trees. The tree count is passed by keyword so its
# meaning is explicit, and a fixed random_state makes the fitted forest (and
# therefore the reported metrics) reproducible across runs.
model_rf = RandomForestClassifier(n_estimators=300, random_state=42)
model_rf.fit(X_train, y_train)

# Evaluate on the held-out test split
predictions = model_rf.predict(X_test)
score = model_rf.score(X_test, y_test)

print("Accuracy = " + str(score))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
# Neural Network
import tensorflow as tf

# Create a simple model
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout

# Width of the input layer, taken from the training matrix so the network keeps
# matching the data if the set of encoded feature columns changes.
n_features = X_train.shape[1]

# Fully connected binary classifier: five hidden layers of 200 ReLU units with
# two dropout layers in between, and a single sigmoid output unit.
model = Sequential()

model.add(Dense(200, kernel_initializer="uniform", activation="relu", input_dim=n_features))
model.add(Dense(200, kernel_initializer="uniform", activation="relu"))
# Only this layer carries an L2 weight penalty
model.add(Dense(200, kernel_initializer="uniform", activation="relu", kernel_regularizer=l2(0.01)))

# Randomly drop 30% of the activations during training
model.add(Dropout(0.3))

model.add(Dense(200, kernel_initializer="uniform", activation="relu"))

model.add(Dropout(0.3))

model.add(Dense(200, kernel_initializer="uniform", activation="relu"))
model.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# Display Model Summary and Show Parameters
model.summary()

In [None]:
# Show the (rows, columns) shape of the training matrix that is fed to the network.
X_train.shape

In [None]:
# Start Training Our Classifier 

batch_size = 64
epochs = 25

# Per-epoch validation uses a slice of the training data (Keras takes the last
# 20% of the rows passed to fit) instead of the test set, so the test set stays
# unseen until the final evaluation below.
history = model.fit(X_train,
                    y_train,
                    batch_size = batch_size,
                    epochs = epochs,
                    verbose = 1,
                    validation_split = 0.2)

# Single evaluation on the held-out test set; evaluate returns the loss
# followed by the metrics given at compile time.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# The network ends in a single sigmoid unit, so predict yields one score per
# row in a column-shaped array. Threshold at 0.5, then flatten and cast to
# int so the predicted labels are a plain 1-D integer vector like the output of
# the scikit-learn models, instead of a boolean column array.
predictions = (model.predict(X_test) > 0.5).astype(int).ravel()

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))