# 1. Introduction

Summary Information about the variables and their types in the data:
* Surname : The customer surname
* CreditScore : The customer credit score 
* Geography : The country of the customer (Germany/France/Spain)
* Gender : The gender of the customer (Female/Male)
* Age : The age of the customer
* Tenure : The number of years the customer has been with the bank
* Balance : The customer's account balance
* NumOfProducts : The number of bank products that the customer uses
* HasCrCard : Does the customer have a credit card? (0=No,1=Yes)
* IsActiveMember : Does the customer have an active membership? (0=No,1=Yes)
* EstimatedSalary : The estimated salary of the customer
* Exited : Churned or not? (0=No,1=Yes)

# 2. Data Analysis

**2.1 Importing Libraries and Loading Data**

In [None]:
# data analysis libraries:
import numpy as np
import pandas as pd
# data visualization libraries:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

# Silence warnings, unless warning options were given explicitly
# (sys.warnoptions is non-empty in that case):
import sys
if not sys.warnoptions:
    import os
    import warnings
    warnings.simplefilter("ignore")
    # The environment variable is inherited by subprocesses started later.
    os.environ["PYTHONWARNINGS"] = "ignore"

# Limit float output to 3 decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Importing modelling libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Load the churn data set from a single CSV file with pd.read_csv():
data = pd.read_csv(r'../input/churn-modelling/Churn_Modelling.csv')
data.name = 'Data Set'  # plain attribute, used as a label in the printouts

# Quick overview: row count, full shape, target length and column names.
print(f'Number of Examples = {data.shape[0]}')
print(data.name, f'X Shape = {data.shape}')
print(data.name, f"y Shape = {data['Exited'].shape[0]}\n")
print(data.columns)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
data.info()

In [None]:
# Show five randomly chosen rows of the raw data.
data.sample(n=5)

In [None]:
def missing_values(data):
    """Print, for every column of `data`, how many of its entries are null.

    One line is written to stdout per column, in column order; nothing is
    returned.
    """
    for column_name in data.columns:
        null_count = data[column_name].isnull().sum()
        print(f'{column_name} column missing values: {null_count}')

# Header line followed by the per-column null counts of the data set.
print(str(data.name), 'missing values:')
missing_values(data)

**2.2 Basic summary statistics about the data**

In [None]:
# Skip the first two columns (RowNumber, CustomerId) and summarise the rest.
# An open-ended slice is used because the previous upper bound, len(data),
# is the number of ROWS and only worked while rows outnumbered columns.
data.iloc[:, 2:].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.99]).T

In [None]:
# Mean of the 0/1 Exited flag (i.e. churn rate) per country, highest first.
(
    data.groupby("Geography", as_index=False)["Exited"]
    .mean()
    .sort_values(by="Exited", ascending=False)
)

In [None]:
# Bar chart of Exited against Geography.
g = sns.catplot(
    data=data,
    x="Geography",
    y="Exited",
    kind="bar",
    height=5,
)
g.set_ylabels("Churn Probability")
plt.show()

In [None]:
# Mean of the 0/1 Exited flag (i.e. churn rate) per gender, highest first.
(
    data.groupby("Gender", as_index=False)["Exited"]
    .mean()
    .sort_values(by="Exited", ascending=False)
)

In [None]:
# Bar chart of Exited against Gender.
g = sns.catplot(
    data=data,
    x="Gender",
    y="Exited",
    kind="bar",
    height=5,
)
g.set_ylabels("Churn Probability")
plt.show()

In [None]:
# Mean of the 0/1 Exited flag (i.e. churn rate) by credit-card ownership,
# highest first.
(
    data.groupby("HasCrCard", as_index=False)["Exited"]
    .mean()
    .sort_values(by="Exited", ascending=False)
)

In [None]:
# Bar chart of Exited against HasCrCard.
g = sns.catplot(
    data=data,
    x="HasCrCard",
    y="Exited",
    kind="bar",
    height=5,
)
g.set_ylabels("Churn Probability")
plt.show()

In [None]:
# Mean of the 0/1 Exited flag (i.e. churn rate) by membership activity,
# highest first.
(
    data.groupby("IsActiveMember", as_index=False)["Exited"]
    .mean()
    .sort_values(by="Exited", ascending=False)
)

In [None]:
# Bar chart of Exited against IsActiveMember.
g = sns.catplot(
    data=data,
    x="IsActiveMember",
    y="Exited",
    kind="bar",
    height=5,
)
g.set_ylabels("Churn Probability")
plt.show()

In [None]:
# Mean of the 0/1 Exited flag (i.e. churn rate) by number of products,
# highest first.
(
    data.groupby("NumOfProducts", as_index=False)["Exited"]
    .mean()
    .sort_values(by="Exited", ascending=False)
)

In [None]:
# Bar chart of Exited against NumOfProducts.
g = sns.catplot(
    data=data,
    x="NumOfProducts",
    y="Exited",
    kind="bar",
    height=5,
)
g.set_ylabels("Churn Probability")
plt.show()

2.3 Correlation matrix

In [None]:
fig, axs = plt.subplots(figsize=(12, 6))

# Correlate the numeric columns only: at this point the frame still holds
# text columns (Surname, Geography, Gender), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
numeric_features = (
    data.drop(['CustomerId', 'RowNumber'], axis=1)
    .select_dtypes(include='number')
)

sns.heatmap(numeric_features.corr(), ax=axs, annot=True, fmt=".2f", linewidths=0.5, cmap='coolwarm')

axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=10)

axs.set_title('Data Set Correlations', size=15)

plt.show()

# 3. Data Preprocessing

In [None]:
# Remove the row counter, the surname and the customer id in a single call;
# the frame is modified in place.
data.drop(columns=["RowNumber", "Surname", "CustomerId"], inplace=True)

In [None]:
# Show five randomly chosen rows of the reduced frame.
data.sample(n=5)

3.1 Label encoding of gender variable

In [None]:
# Replace the Gender labels with integer codes; the fitted encoder is kept
# in `enc`.
enc = LabelEncoder().fit(data['Gender'])
data['Gender'] = enc.transform(data['Gender'])

3.2 One hot encoding of Geography (Country)

In [None]:
# Expand Geography into one indicator column per country.
data = pd.get_dummies(data=data, columns=["Geography"])

3.3 Splitting the data into train and test sets

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every score computed from it, reproducible between runs.
train = data.sample(frac=0.8, random_state=1000)

# Every row that was not drawn into the training sample.
test = data.drop(train.index)

In [None]:
# Stack the training rows on top of the test rows, then split off the label:
# `target` holds Exited, `all_data` keeps only the feature columns.
all_data = pd.concat([train, test])
target = all_data["Exited"]
all_data.drop(columns="Exited", inplace=True)

3.4 Scaling Features in [0,1] range

In [None]:
# Rescale every feature column of all_data to the [0, 1] range.
dataEx = MinMaxScaler(feature_range=(0, 1)).fit_transform(all_data)

In [None]:
# Fit the scaler on the training rows only, then apply it to both splits.
# Slicing an array that was scaled on train + test together would let the
# test set's min/max leak into the training features.
feature_scaler = MinMaxScaler().fit(train.drop(columns="Exited"))
x_train = feature_scaler.transform(train.drop(columns="Exited"))
# Test values may fall slightly outside [0, 1]; that is expected.
x_test = feature_scaler.transform(test.drop(columns="Exited"))

y_train = train.Exited
y_test = test.Exited

x_train.shape, y_train.shape, x_test.shape, y_test.shape

# 4. Modeling

4.1 Test Set Accuracy for the default models

In [None]:
r = 1000  # seed shared by every estimator that accepts random_state

# Each entry pairs a display label with an estimator left at its defaults
# (apart from the seed, and probability=True on the SVC).
_labelled_models = [
    ("LogisticRegression", LogisticRegression(random_state=r)),
    ("GaussianNB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("SVC", SVC(random_state=r, probability=True)),
    ("DecisionTree", DecisionTreeClassifier(random_state=r)),
    ("Random_Forest", RandomForestClassifier(random_state=r)),
    ("GBM", GradientBoostingClassifier(random_state=r)),
]
names = [label for label, _ in _labelled_models]
models = [estimator for _, estimator in _labelled_models]

In [None]:
# Fit every default model on the training split and report its accuracy (as
# a percentage) on both the training split and the held-out test split.
print('Default model train and test set accuracies:')
print('_____________________________________________________________________________________________________')
for name, model in zip(names, models):
    model.fit(x_train, y_train)
    print(name, ': Train set accuracy :', "%.3f" % round(model.score(x_train, y_train)*100, 2))
    y_pred = model.predict(x_test)
    # accuracy_score takes the true labels first, then the predictions.
    print('Test set accuracy :', "%.3f" % round(accuracy_score(y_test, y_pred)*100, 2))
    print('_____________________________________________________________________________________________________')

4.2 Cross validation accuracy and std of the default models for all data

In [None]:
# Cross-validation uses the complete scaled feature matrix.
predictors = dataEx

In [None]:
# A single shuffled 10-fold splitter with a fixed seed is shared by all
# models, so every model is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=1000)

results = []  # per-model arrays of the ten fold accuracies
print('10 fold Cross validation accuracy and std of the default models for all data:')
print('_____________________________________________________________________________________________________')
for name, model in zip(names, models):
    cv_results = cross_val_score(model, predictors, target, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    # Mean accuracy as a percentage, followed by the std of the raw scores.
    print(f"{name}: {round(cv_results.mean() * 100, 2):.3f} ({cv_results.std():.3f})")
    print('_____________________________________________________________________________________________________')