In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the raw churn table; the relative path is resolved against the
# notebook's current working directory.
churn_df = pd.read_csv(filepath_or_buffer="customer_churn.csv")

In [4]:
# Remove the columns that are not used as model inputs (presumably pure
# identifiers -- confirm against the data dictionary).
# errors="ignore" keeps this cell safe to re-run: because the result is
# assigned back to `churn_df`, a second execution finds the columns already
# gone and would otherwise raise KeyError.
churn_df = churn_df.drop(
    columns=["RowNumber", "CustomerId", "Surname"],
    errors="ignore",
)

In [5]:
# Preview the first ten rows after the column drop.
churn_df.head(n=10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [8]:
# Split the frame into the label column ("Exited") and the remaining
# feature columns.
y = churn_df["Exited"]
X = churn_df.drop(columns=["Exited"])

# Plain-text preview of the first ten rows of each piece, with a separator
# line between them.
print(X.head(n=10))
print("******************************")
print(y.head(n=10))

   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   
5          645     Spain    Male   44       8  113755.78              2   
6          822    France    Male   50       7       0.00              2   
7          376   Germany  Female   29       4  115046.74              4   
8          501    France    Male   44       4  142051.07              2   
9          684    France    Male   27       2  134603.88              1   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               1        101348.88  
1          0               1        112542.58  
2          1               0  

In [9]:
# Numerical part of X: every column except the two categorical ones,
# which are handled separately.
X_num = X.drop(columns=["Geography", "Gender"])
X_num.head(n=10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.0,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.8,3,1,0,113931.57
3,699,39,1,0.0,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.1
5,645,44,8,113755.78,2,1,0,149756.71
6,822,50,7,0.0,2,1,1,10062.8
7,376,29,4,115046.74,4,1,0,119346.88
8,501,44,4,142051.07,2,0,1,74940.5
9,684,27,2,134603.88,1,1,1,71725.73


In [12]:
# Categorical part of X: keep only the Geography and Gender columns.
X_cat = X.filter(items=["Geography", "Gender"])
X_cat.head(n=10)

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female
5,Spain,Male
6,France,Male
7,Germany,Female
8,France,Male
9,France,Male


In [14]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is implied when all of the column's
# remaining indicators are false.
X_cat_num = pd.get_dummies(data=X_cat, drop_first=True)
X_cat_num.head(n=5)

Unnamed: 0,Geography_Germany,Geography_Spain,Gender_Male
0,False,False,False
1,False,True,False
2,False,False,False
3,False,False,False
4,False,True,False


In [15]:
# Place the numerical columns and the encoded categorical columns side by
# side; pandas aligns the two frames on their row index.
X_proc = pd.concat(objs=[X_num, X_cat_num], axis="columns")
X_proc.head(n=10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,False,True,False
5,645,44,8,113755.78,2,1,0,149756.71,False,True,True
6,822,50,7,0.0,2,1,1,10062.8,False,False,True
7,376,29,4,115046.74,4,1,0,119346.88,True,False,False
8,501,44,4,142051.07,2,0,1,74940.5,False,False,True
9,684,27,2,134603.88,1,1,1,71725.73,False,False,True


In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_proc,
    y,
    test_size=0.2,
    random_state=0,
)

X_train.head(n=10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
7389,667,34,5,0.0,2,1,0,163830.64,False,True,False
9275,427,42,1,75681.52,1,1,1,57098.0,True,False,True
2995,535,29,2,112367.34,1,1,0,185630.76,False,False,False
5316,654,40,5,105683.63,1,1,0,173617.09,False,True,True
356,850,57,8,126776.3,2,1,1,132298.49,False,True,False
49,776,37,2,103769.22,2,1,0,194099.12,True,False,False
7931,807,47,1,95120.59,1,0,0,127875.1,False,False,True
1523,598,41,8,0.0,2,1,1,161954.43,False,True,True
7552,636,76,9,126534.6,1,1,1,39789.62,False,True,True
496,622,32,6,169089.38,2,1,0,101057.95,False,False,False


In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
# Scaler that centres each feature and scales it to unit variance.
# Both flags are spelled out here but are the library defaults.
sc = StandardScaler(with_mean=True, with_std=True)

In [35]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits so the test set is scaled with
# training statistics.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)