# Data Scientist Coding Exercise

### Predict which BankCo customers may churn in the future


## Load dataset using Pandas

In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Data visualisation and images
import matplotlib.pyplot as plt
# Read the bank-customer churn dataset from its CSV file into a DataFrame.
data = pd.read_csv('../input/predicting-churn-for-bank-customers/Churn_Modelling.csv')

In [None]:
# Print column dtypes and non-null counts to check the dataset for missing
# (NaN) values.
data.info()


We can see that there are 10000 rows and 14 columns in total. I will drop the RowNumber column because it is only a row index and not part of the data. As the output above shows, no feature in this dataset is missing for any row.

###  Explore the data

In [None]:
# Drop RowNumber: it was only the row index of the dataset in its Excel
# version, not a feature.
data = data.drop('RowNumber', axis = 1)
data.head()

### Bar Chart for Categorical Features
- CreditScore
- Geography
- Gender
- Age
- Tenure
- Balance
- NumOfProducts
- HasCrCard
- IsActiveMember
- EstimatedSalary

In [None]:
def bar_chart(feature):
    """Plot a stacked bar chart of ``feature`` for exited vs. retained customers.

    Counts the distinct values of ``feature`` separately for the rows of the
    module-level ``data`` frame where ``Exited`` is 1 and where it is 0, then
    draws one stacked bar per group ("Exited" and "Stayed").

    Args:
        feature: Name of a column in ``data``.
    """
    exited = data[data['Exited']==1][feature].value_counts()
    stayed = data[data['Exited']==0][feature].value_counts()
    # One row per group, one column per distinct value of `feature`; a value
    # that is absent from one group becomes NaN in that row.
    df = pd.DataFrame([exited,stayed])
    df.index = ['Exited','Stayed']
    df.plot(kind = 'bar', stacked = True, figsize=(10,5))

In [None]:
# Class balance: number of customers who exited vs. stayed.
bar_chart('Exited')

In [None]:
# With the data in a DataFrame we can begin the analysis. The mean of the
# Exited column is the overall fraction of customers who left the bank
# (assumes Exited holds only 0/1 values — TODO confirm).

data['Exited'].mean()

The calculation shows that 20% of BankCo customers leave the bank. We can group the data by its features to explore the main causes.

In [None]:
# Exited/stayed counts for each number of products held.
bar_chart('NumOfProducts')

In [None]:
# Mean of every numeric column per NumOfProducts value. numeric_only=True is
# needed because the frame still holds string columns (Surname, Geography,
# Gender), and pandas >= 2.0 raises TypeError when asked to average them.
nberOfProcucts_grouping = data.groupby('NumOfProducts').mean(numeric_only=True)
nberOfProcucts_grouping

In [None]:
# Churn rate (mean of Exited) for each NumOfProducts value.
nberOfProcucts_grouping['Exited'].plot.bar()

We can start drawing some interesting insights from this data.
For instance, customers with 4 products had a 100%
chance of exiting the bank, compared to a 7.58% chance for those
with only 2 products.

In [None]:
# Exited/stayed counts per gender.
bar_chart('Gender')

In [None]:
# Extend the statistical breakdown by grouping on both the number of
# products and gender. numeric_only=True skips the remaining string columns
# (Surname, Geography), which pandas >= 2.0 refuses to average (TypeError).

nberOfProcucts_genger_grouping = data.groupby(['NumOfProducts', 'Gender']).mean(numeric_only=True)
nberOfProcucts_genger_grouping

In [None]:
# Churn rate (mean of Exited) for each (NumOfProducts, Gender) pair.
nberOfProcucts_genger_grouping['Exited'].plot.bar()

### Converting Numerical Age to Categorical Variable

feature:
* Child: 0
* Young: 1
* Adult: 2
* Mid-age: 3
* Senior: 4
* Older: 5
* Very-old: 6

In [None]:
'''Function of categorizing the age of customers'''
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the Age_categ column assigned through data_age below also appears in `data`.
data_age = data
def age_cat(x):
    """Return the ordinal age-band code for an age.

    Bands have inclusive upper bounds of 20, 30, 40, 50, 60 and 70 and map to
    codes 0 through 5; anything that fits no band gets code 6.
    """
    upper_bounds = (20, 30, 40, 50, 60, 70)
    # The first band whose upper bound is not exceeded wins.
    for code, bound in enumerate(upper_bounds):
        if x <= bound:
            return code
    return len(upper_bounds)
# Store the age-band code in a new Age_categ column and preview ten rows.
data_age['Age_categ'] = data_age['Age'].apply(age_cat)
data_age.head(10)

In [None]:
# Exited/stayed counts per age-band code.
bar_chart('Age_categ')

In [None]:
# Bucket the customers' ages into ten-year bins (0, 10], (10, 20], ...,
# (80, 90] and plot the churn rate (mean of Exited) per bin. Ages outside
# these bins get no bin and are left out of the grouping.
group_by_age = pd.cut(data["Age"], np.arange(0,100,10))
# numeric_only=True: the string columns cannot be averaged (TypeError on
# pandas >= 2.0). observed=False keeps empty bins in the result and states
# the long-standing default explicitly, which avoids the pandas FutureWarning
# about that default changing.
age_grouping = data.groupby(group_by_age, observed=False).mean(numeric_only=True)
age_grouping['Exited'].plot.bar()

In [None]:
# Bar chart of exited/stayed counts per customer country (Geography).
bar_chart('Geography')

In [None]:
# Mean of every numeric column per (NumOfProducts, Geography) pair.
# numeric_only=True skips the remaining string columns (Surname, Gender),
# which pandas >= 2.0 refuses to average (TypeError).
nberOfProcucts_Geo_grouping = data.groupby(['NumOfProducts', 'Geography']).mean(numeric_only=True)
nberOfProcucts_Geo_grouping

In [None]:
# Churn rate (mean of Exited) for each (NumOfProducts, Geography) pair.
nberOfProcucts_Geo_grouping['Exited'].plot.bar()

In [None]:
# NOTE: alias of `data`, not a copy — the credit_categ column assigned through
# data_credit below is therefore added to `data` as well.
data_credit = data
def credit_cat(x):
    """Return the ordinal credit-score band code (0-5) for a score.

    Bands have inclusive upper bounds of 350, 450, 550, 650 and 750; any
    score that fits none of them gets code 5.
    """
    # Each guard only runs when every lower band has already been ruled out,
    # so checking the upper bound alone is enough.
    if x <= 350:
        return 0
    if x <= 450:
        return 1
    if x <= 550:
        return 2
    if x <= 650:
        return 3
    if x <= 750:
        return 4
    return 5
# Store the credit-score band code in a new credit_categ column and preview
# ten rows.
data_credit['credit_categ'] = data_credit['CreditScore'].apply(credit_cat)
data_credit.head(10)

In [None]:
# Exited/stayed counts per credit-score band code.
bar_chart('credit_categ')

In [None]:
# Churn rate (mean of Exited) per credit-score band. numeric_only=True skips
# the string columns (Surname, Geography, Gender), which pandas >= 2.0
# refuses to average (TypeError).
Credit_grouping = data_credit.groupby(['credit_categ']).mean(numeric_only=True)
Credit_grouping['Exited'].plot.bar()

In [None]:
# Exited/stayed counts per HasCrCard value.
bar_chart('HasCrCard')

In [None]:
# Exited/stayed counts per IsActiveMember value.
bar_chart('IsActiveMember')

In [None]:
# Exited/stayed counts per Tenure value.
bar_chart('Tenure')

In [None]:
# NOTE: alias of `data`, not a copy — the salary_categ column assigned through
# data_salary below is therefore added to `data` as well.
data_salary = data
def salary_cat(x):
    """Return the customer-class label for an estimated salary.

    Salaries up to 50000, 100000 and 150000 (bounds inclusive) map to the
    first three labels; everything else is labelled "VIP Customer".
    """
    bands = (
        (50000, "Normal Customer"),
        (100000, "Intermediate Customer"),
        (150000, "Class customer"),
    )
    # The first band whose upper bound is not exceeded wins.
    for upper, label in bands:
        if x <= upper:
            return label
    return "VIP Customer"
# Store the salary label in a new salary_categ column and preview ten rows.
data_salary['salary_categ'] = data_salary['EstimatedSalary'].apply(salary_cat)
data_salary.head(10)

In [None]:
# Exited/stayed counts per salary label.
bar_chart('salary_categ')

Here we can see that customers with 4 products, female customers,
customers aged between 50 and 60, and customers located in Germany were indeed the most likely
to exit the bank.

# Feature engineering

In the next steps I am going to use domain knowledge of the data to create features (feature vectors) that make machine learning algorithms work. Since there are no missing values in the dataset, no imputation is needed.

The Gender and Geography fields are both string values. I need to convert these strings into integer keys, which makes it easier for classification algorithms to find patterns and also facilitates processing and statistical analysis. For the same purpose I will also bin the features "Age", "CreditScore", "EstimatedSalary", "Tenure", and "Balance" into categories.

The Surname and CustomerId columns consist of non-categorical identifier values. These are difficult to use in a classification algorithm, so I will drop them from the data set.

In [None]:
# Preview the frame, including the helper columns added during exploration.
data.head()

As you can see above, some columns were added during the visualisation step, where I binned values into categories: "Age_categ", "credit_categ" and "salary_categ". I am going to drop them in order to get back to the original data set that I am going to model.

In [None]:
# Drop the helper columns that were added only for visualisation. drop()
# returns a new frame here, so `data` itself keeps those columns.
data_final = data.drop(['Age_categ', 'credit_categ', 'salary_categ'], axis=1)
data_final.head()

In [None]:
# Encode the Gender column as integers through a lookup dict:
# Male -> 0, Female -> 1.
# Indexing the dict raises KeyError for any other value, which is also what
# happens if this cell is run a second time on the already-encoded column.
gender = {'Male':0, 'Female':1}
data_final.Gender = [gender[item] for item in data_final.Gender]
data_final.head()

In [None]:
# Check the column dtypes after encoding Gender.
data_final.info()

In [None]:
# Encode the Geography column (countries) as integers through a lookup dict:
# France -> 1, Spain -> 2, Germany -> 3.
# Indexing the dict raises KeyError for any country not listed here.
geo = {'France':1, 'Spain':2, 'Germany':3}
data_final.Geography = [geo[item] for item in data_final.Geography]
data_final.head()

In [None]:
def age_cat(x):
    """Return the ordinal age-band code (0-6) for an age.

    Bands have inclusive upper bounds of 20, 30, 40, 50, 60 and 70; any age
    that fits none of them gets code 6.
    """
    # Each guard only runs when every lower band has already been ruled out,
    # so checking the upper bound alone is enough.
    if x <= 20:
        return 0
    if x <= 30:
        return 1
    if x <= 40:
        return 2
    if x <= 50:
        return 3
    if x <= 60:
        return 4
    if x <= 70:
        return 5
    return 6
# Replace the raw Age values in data_final with their age-band codes.
data_final['Age'] = data_final['Age'].apply(age_cat)

In [None]:
def salary_cat(x):
    """Return the ordinal salary-band code (0-3) for an estimated salary.

    Bands have inclusive upper bounds of 50000, 100000 and 150000; any
    salary that fits none of them gets code 3.
    """
    upper_bounds = (50000, 100000, 150000)
    # The first band whose upper bound is not exceeded wins.
    for code, bound in enumerate(upper_bounds):
        if x <= bound:
            return code
    return len(upper_bounds)
# Replace the raw EstimatedSalary values in data_final with their band codes.
data_final['EstimatedSalary'] = data_final['EstimatedSalary'].apply(salary_cat)

In [None]:
def credit_cat(x):
    """Return the ordinal credit-score band code for a score.

    Bands have inclusive upper bounds of 350, 450, 550, 650 and 750 and map
    to codes 0 through 4; anything that fits no band gets code 5.
    """
    upper_bounds = (350, 450, 550, 650, 750)
    # The first band whose upper bound is not exceeded wins.
    for code, bound in enumerate(upper_bounds):
        if x <= bound:
            return code
    return len(upper_bounds)
# Replace the raw CreditScore values in data_final with their band codes.
data_final['CreditScore'] = data_final['CreditScore'].apply(credit_cat)

In [None]:
def balance_cat(x):
    """Return the ordinal balance-band code (0-5) for an account balance.

    Bands are 50000 wide with inclusive upper bounds from 50000 to 250000;
    any balance that fits none of them gets code 5.
    """
    # Each guard only runs when every lower band has already been ruled out,
    # so checking the upper bound alone is enough.
    if x <= 50000:
        return 0
    if x <= 100000:
        return 1
    if x <= 150000:
        return 2
    if x <= 200000:
        return 3
    if x <= 250000:
        return 4
    return 5
# Replace the raw Balance values in data_final with their band codes.
data_final['Balance'] = data_final['Balance'].apply(balance_cat)

In [None]:
def Tenure_cat(x):
    """Return the ordinal tenure-band code for a tenure value.

    Bands have inclusive upper bounds of 3, 6 and 9 and map to codes 0
    through 2; anything that fits no band gets code 3.
    """
    upper_bounds = (3, 6, 9)
    # The first band whose upper bound is not exceeded wins.
    for code, bound in enumerate(upper_bounds):
        if x <= bound:
            return code
    return len(upper_bounds)
# Replace the raw Tenure values in data_final with their band codes.
data_final['Tenure'] = data_final['Tenure'].apply(Tenure_cat)

In [None]:
# Delete the unnecessary Surname feature from the dataset.
# CustomerId is still present in df at this point; it is dropped from the
# train and test features separately further down.

df = data_final.drop(['Surname'], axis=1)
df.head()

In [None]:
# Preview the frame that goes into the train/test split.
df.head()

In [None]:
# Splitting the dataset: separate the label column (Exited) from the
# feature columns.
from sklearn.model_selection import train_test_split
target = df['Exited']
features = df.drop(['Exited'],  axis=1)
# Show both shapes as a sanity check.
features.shape, target.shape

In [None]:
# Hold out 40% of the rows for testing; random_state=0 makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
     features, target, test_size=0.4, random_state=0)

# Show the shapes of the training features and labels.
X_train.shape, y_train.shape

In [None]:
# Remove CustomerId from the training features. X_test keeps the column
# until the testing step, where it is dropped as well.
X_train = X_train.drop(['CustomerId'], axis=1)

In [None]:
# Preview the first ten rows of the final training features.
X_train.head(10)

# Modelling

In [None]:
# Import classifier Modules

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

# Cross Validation (K-fold)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10 shuffled folds with a fixed seed; the same splitter is passed to every
# cross_val_score call below so all models are scored on identical folds.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# kNN

In [None]:
# k-nearest-neighbours with k=13, scored by accuracy on each of the 10 folds.
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# kNN score: mean fold accuracy as a percentage, rounded to 2 decimals.
round(np.mean(score)*100, 2)

# Decision Tree

In [None]:
# Decision tree with default settings, scored by accuracy on each of the
# 10 folds.
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Decision tree score: mean fold accuracy as a percentage, rounded to
# 2 decimals.
round(np.mean(score)*100, 2)

# Random Forest

In [None]:
# Random forest scored by accuracy on each of the 10 folds. The classifier
# must be created here: `clf` otherwise still refers to whatever model an
# earlier cell assigned. random_state=0 fixes the forest's internal
# randomness so the score is reproducible.
clf = RandomForestClassifier(random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Random Forest score: mean fold accuracy as a percentage, rounded to
# 2 decimals.
round(np.mean(score)*100, 2)

# Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings, scored by accuracy on each of
# the 10 folds.
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Naive Bayes score: mean fold accuracy as a percentage, rounded to
# 2 decimals.
round(np.mean(score)*100, 2)

# SVM

In [None]:
# Support vector classifier with default settings, scored by accuracy on
# each of the 10 folds.
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# SVM score: mean fold accuracy as a percentage, rounded to 2 decimals.
round(np.mean(score)*100,2)

The SVM (Support Vector Machine) classifier gives the best score, 84.15%, so I am going to use it to evaluate the test dataset.

# Testing

In [None]:
# Fit a fresh SVC with default settings on the whole training split.
clf = SVC()
clf.fit(X_train, y_train)

# CustomerId was removed from X_train before training, so it is dropped from
# the test features as well to keep the columns aligned.
test_data = X_test.drop("CustomerId", axis=1).copy()
prediction = clf.predict(test_data)

In [None]:
# Report the classifier's mean accuracy on the held-out test split as a
# percentage. The label carries no colon of its own because the format
# string already appends one.
print("%s: %.2f%%" % ('Accuracy', (clf.score(test_data,y_test))*100))