In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the bank churn CSV into a DataFrame and show summary statistics
# for its numeric columns as the cell output.
CHURN_CSV_PATH = '/kaggle/input/bank-customer-churn-modeling/Churn_Modelling.csv'
customer_df = pd.read_csv(CHURN_CSV_PATH, sep=',')
customer_df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
customer_df.info()

In [None]:
# Display every row that contains at least one missing value;
# an empty result means the dataset has no nulls.
has_missing_value = customer_df.isna().any(axis=1)
customer_df.loc[has_missing_value]


In [None]:
# One-hot encode the two categorical columns. Each call yields one indicator
# column per distinct category value, named after that value (no prefix).
Male_dummies = pd.get_dummies(customer_df['Gender'])
region_dummies = pd.get_dummies(customer_df['Geography'])
print(region_dummies)

In [None]:
# Attach the indicator columns to the customer frame by joining on the row index,
# first the gender indicators and then the geography indicators.
customer_with_gender = customer_df.merge(Male_dummies, left_index=True, right_index=True)
customer_df = customer_with_gender.merge(region_dummies, left_index=True, right_index=True)
customer_df.head()

In [None]:
# Keep only the columns used for modelling, cast the three indicator columns
# to int64, and rename the target column 'Exited' to 'Churn'.
#
# Everything is done in one chained expression that returns a new frame.
# Assigning through attributes (customer_df.Male = ...) and renaming with
# inplace=True on a frame that was just produced by column selection can make
# pandas emit SettingWithCopyWarning; the chain avoids that while yielding the
# same columns, order and dtypes.
customer_df = (
    customer_df[['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Germany', 'Spain',
                 'Male']]
    .astype({'Male': 'int64', 'Spain': 'int64', 'Germany': 'int64'})
    .rename(columns={'Exited': 'Churn'})
)

In [None]:
 # Print the columns and dtypes of the reduced frame after the selection and casts.
 # NOTE(review): this cell's source starts with a stray leading space; plain Python
 # would reject it as an unexpected indent — consider removing it.
 customer_df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the 'Churn' column.
X = customer_df.drop(['Churn'], axis=1)
y = customer_df['Churn']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the hold-out score optimistic.
# 80/20 split; random_state=0 keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# All rows scaled with the training-fitted scaler. Kept so that any cell that
# still refers to this name keeps working; it is not used to fit the models.
scaled_features = scaler.transform(X)


In [None]:

# Candidate classifiers to compare, as (short label, unfitted estimator) pairs.
# The labels are used later as tick labels and in the printed score summary.
# Both tree-based estimators get a fixed random_state so that repeated runs of
# the notebook produce the same scores; an unseeded decision tree can give
# different results from run to run.
models = [
    ('LR', LogisticRegression(solver='lbfgs')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='scale')),
    ('RF', RandomForestClassifier(n_estimators=200, random_state=0)),
]




In [None]:
# Score every candidate model with 10-fold cross-validation on the training set
# and collect the per-fold accuracies for the comparison plot.
seed = 7
scoring = 'accuracy'

# Build the splitter once (it does not depend on the model) and actually use
# the seed: with shuffle=True and an integer random_state the folds are
# shuffled reproducibly and are identical for every model, so the scores are
# directly comparable.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

results = []  # one array of 10 fold accuracies per model
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare the algorithms with one box per model, drawn from its
# cross-validation fold scores and labelled with the model's short name.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

The graph shows that the random forest classifier achieves the best cross-validated accuracy among the models compared.

In [None]:
# Train a 200-tree random forest on the training split, predict the class of
# each test row, and show the predicted class probabilities as the cell output.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_train, y_train)
predictions = classifier.predict(X_test)
classifier.predict_proba(X_test)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Fraction of test rows whose predicted label matches the true label.
# The original author reported about 86% here and considered it a reasonable start.
# For a per-class breakdown: print(classification_report(y_test, predictions))
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Build a confusion matrix from cross-validated predictions on the training set:
# get a prediction for every training row using 3-fold cross-validation, then
# tabulate true labels against those predictions.
y_train_pred = cross_val_predict(classifier, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)


In [None]:

# Label the fitted forest's importance scores with the feature column names and
# chart the 10 largest as horizontal bars; the top ones are candidates for
# feature selection.
feature_importances = pd.Series(classifier.feature_importances_, index=X.columns)
top_feature_importances = feature_importances.nlargest(10)
top_feature_importances.plot.barh()

# This model can be used to predict which customers are about to abandon the bank, so that action can be taken for those customers.
 