In [1]:
# Importing Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [2]:
# Read the customer churn records from the CSV file and preview the first five rows
churn_data = pd.read_csv(filepath_or_buffer="Customer-Churn-Records.csv")
churn_data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [3]:
# Count how many rows fall under each distinct "Geography" value
churn_data.value_counts(subset="Geography")

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [4]:
# Remove the identifier-style columns (and "Geography") that are not used as churn features.
# Note: this rebinds `churn_data`, so running the cell a second time without reloading
# the CSV raises a KeyError because the columns are already gone.
churn_data = churn_data.drop(columns=["RowNumber", "CustomerId", "Surname", "Geography"])
churn_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,Female,42,2,0.00,1,1,1,101348.88,1,1,2,DIAMOND,464
1,608,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,502,Female,42,8,159660.80,3,1,0,113931.57,1,1,3,DIAMOND,377
3,699,Female,39,1,0.00,2,0,0,93826.63,0,0,5,GOLD,350
4,850,Female,43,2,125510.82,1,1,1,79084.10,0,0,5,GOLD,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,Male,39,5,0.00,2,1,0,96270.64,0,0,1,DIAMOND,300
9996,516,Male,35,10,57369.61,1,1,1,101699.77,0,0,5,PLATINUM,771
9997,709,Female,36,7,0.00,1,0,1,42085.58,1,1,3,SILVER,564
9998,772,Male,42,3,75075.31,2,1,0,92888.52,1,1,2,GOLD,339


In [5]:
# Count the distinct values in every remaining column (one result per column)
churn_data.nunique(axis=0)

CreditScore            460
Gender                   2
Age                     70
Tenure                  11
Balance               6382
NumOfProducts            4
HasCrCard                2
IsActiveMember           2
EstimatedSalary       9999
Exited                   2
Complain                 2
Satisfaction Score       5
Card Type                4
Point Earned           785
dtype: int64

In [6]:
# Frequency of each "Tenure" value, inspected to decide whether binning is needed
tenure_vcounts = churn_data.loc[:, "Tenure"].value_counts()
tenure_vcounts

Tenure
2     1048
1     1035
7     1028
8     1025
5     1012
3     1009
4      989
9      984
6      967
10     490
0      413
Name: count, dtype: int64

In [7]:
# Frequency of each "Satisfaction Score" value, inspected to decide whether binning is needed
satisfaction = churn_data.loc[:, "Satisfaction Score"].value_counts()
satisfaction

Satisfaction Score
3    2042
2    2014
4    2008
5    2004
1    1932
Name: count, dtype: int64

In [8]:
# Frequency of each "CreditScore" value (most common first), used by the binning step
cred_score = churn_data.loc[:, "CreditScore"].value_counts()
cred_score

CreditScore
850    233
678     63
655     54
705     53
667     53
      ... 
404      1
351      1
365      1
417      1
419      1
Name: count, Length: 460, dtype: int64

In [10]:
# Keep the ten most frequent credit scores as their own categories and collect
# every other score (everything after position 10 in the frequency table) for
# replacement.
credit_scores_to_replace = list(cred_score.iloc[10:].index)
# NOTE(review): `cutoff` is assigned but never used by the binning below -- the
# selection above is positional (top 10), not count-based. It is kept only in
# case another cell reads it; confirm and remove, or switch the selection to
# `cred_score[cred_score < cutoff].index` if a count threshold was intended.
cutoff = 40

# Collapse all rare scores into one placeholder bin in a single vectorised pass
# (one `replace` call with the whole list instead of one full-column pass per score).
# Using "Below x" as I don't know what to label it yet
churn_data["CreditScore"] = churn_data["CreditScore"].replace(credit_scores_to_replace, "Below x")

# Checking if binning was successful
churn_data["CreditScore"].value_counts()

CreditScore
Below x    9296
850         233
678          63
655          54
667          53
705          53
684          52
670          50
651          50
652          48
683          48
Name: count, dtype: int64

In [12]:
# One-hot encode every non-numeric column of the frame via "pd.get_dummies".
# The binned "CreditScore" column is encoded too, which is why individual
# credit-score indicator columns show up -- that column probably needs sorting out first.
numerics = pd.get_dummies(data=churn_data)
numerics

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,...,CreditScore_684,CreditScore_705,CreditScore_850,CreditScore_Below x,Gender_Female,Gender_Male,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER
0,42,2,0.00,1,1,1,101348.88,1,1,2,...,False,False,False,True,True,False,True,False,False,False
1,41,1,83807.86,1,0,1,112542.58,0,1,3,...,False,False,False,True,True,False,True,False,False,False
2,42,8,159660.80,3,1,0,113931.57,1,1,3,...,False,False,False,True,True,False,True,False,False,False
3,39,1,0.00,2,0,0,93826.63,0,0,5,...,False,False,False,True,True,False,False,True,False,False
4,43,2,125510.82,1,1,1,79084.10,0,0,5,...,False,False,True,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39,5,0.00,2,1,0,96270.64,0,0,1,...,False,False,False,True,False,True,True,False,False,False
9996,35,10,57369.61,1,1,1,101699.77,0,0,5,...,False,False,False,True,False,True,False,False,True,False
9997,36,7,0.00,1,0,1,42085.58,1,1,3,...,False,False,False,True,True,False,False,False,False,True
9998,42,3,75075.31,2,1,0,92888.52,1,1,2,...,False,False,False,True,False,True,False,True,False,False


In [13]:
# Split the preprocessed frame into a target vector (y) and a feature matrix (x).
# The one-hot columns are boolean while the remaining columns are int/float;
# converting such a mix with `.values` gives an object-dtype array, so the
# features are cast to float64 explicitly to hand downstream steps a plain
# numeric matrix.
y = numerics["Exited"].to_numpy()
x = numerics.drop(columns="Exited").to_numpy(dtype="float64")

# Getting the training and testing datasets (default split proportions).
# A fixed random_state makes the split identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [15]:
# Build the scaler and learn its statistics from the training split only
scaler = StandardScaler()
x_scaler = scaler.fit(x_train)

# Apply the fitted scaler to both the training and the testing features
x_train_scaled, x_test_scaled = (
    x_scaler.transform(split) for split in (x_train, x_test)
)