In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('/content/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


performing univariate analysis

In [3]:
df['Geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

Handle the missing values

In [4]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Check for Categorical columns and perform encoding.

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Surname'] = le.fit_transform(df['Surname'])
df['Geography'] = le.fit_transform(df['Geography'])
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,1999,771,0,1,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,1336,516,0,1,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,1570,709,0,0,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,2345,772,1,1,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,2751,792,0,0,28,4,130142.79,1,1,0,38190.78,0


Split the data into dependent and independent variables

In [6]:
x = df.drop(['Gender', 'Surname', 'Geography', 'EstimatedSalary'],axis=1)
y = df[['EstimatedSalary']].values

In [7]:
x

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,1,15634602,619,42,2,0.00,1,1,1,1
1,2,15647311,608,41,1,83807.86,1,0,1,0
2,3,15619304,502,42,8,159660.80,3,1,0,1
3,4,15701354,699,39,1,0.00,2,0,0,0
4,5,15737888,850,43,2,125510.82,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,771,39,5,0.00,2,1,0,0
9996,9997,15569892,516,35,10,57369.61,1,1,1,0
9997,9998,15584532,709,36,7,0.00,1,0,1,1
9998,9999,15682355,772,42,3,75075.31,2,1,0,1


In [8]:
y

array([[101348.88],
       [112542.58],
       [113931.57],
       ...,
       [ 42085.58],
       [ 92888.52],
       [ 38190.78]])

scale the independent variables

In [9]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()


scaledX = scale.fit_transform(x,y)

print(scaledX) 

[[-1.73187761 -0.78321342 -0.32622142 ...  0.64609167  0.97024255
   1.97716468]
 [-1.7315312  -0.60653412 -0.44003595 ... -1.54776799  0.97024255
  -0.50577476]
 [-1.73118479 -0.99588476 -1.53679418 ...  0.64609167 -1.03067011
   1.97716468]
 ...
 [ 1.73118479 -1.47928179  0.60498839 ... -1.54776799  0.97024255
   1.97716468]
 [ 1.7315312  -0.11935577  1.25683526 ...  0.64609167 -1.03067011
   1.97716468]
 [ 1.73187761 -0.87055909  1.46377078 ...  0.64609167 -1.03067011
  -0.50577476]]


Split the data into training and testing

In [10]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.3,random_state=0)

In [11]:
ytest

array([[192852.67],
       [128702.1 ],
       [ 75732.25],
       ...,
       [167400.29],
       [ 70849.47],
       [ 33759.41]])

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [13]:
regressor = Sequential()
regressor.add(Dense(4, activation='relu')) # Input
regressor.add(Dense(12, activation='relu')) # 1st hid. layer
regressor.add(Dense(8, activation='relu')) # 2nd hid. layer
regressor.add(Dense(9, activation='relu')) # 3rd hid. layer
regressor.add(Dense(1,activation='linear')) # output layer

In [14]:
regressor.compile(optimizer='adam',loss='mse',metrics=['mse'])

In [15]:
regressor.fit(xtrain,ytrain,batch_size=10,epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f8fdad3df50>

In [16]:
ypred = regressor.predict(xtest)

In [17]:
ypred

array([[98892.17],
       [99548.85],
       [99324.61],
       ...,
       [99356.44],
       [98862.67],
       [98894.37]], dtype=float32)

In [18]:
ytest

array([[192852.67],
       [128702.1 ],
       [ 75732.25],
       ...,
       [167400.29],
       [ 70849.47],
       [ 33759.41]])

In [19]:
pd.DataFrame({'Actual values':ytest.flatten(),
              'Predicted values':ypred.flatten()})

Unnamed: 0,Actual values,Predicted values
0,192852.67,98892.171875
1,128702.10,99548.851562
2,75732.25,99324.609375
3,89368.59,99871.500000
4,135662.17,100260.117188
...,...,...
2995,147606.71,99774.617188
2996,55829.25,99073.242188
2997,167400.29,99356.437500
2998,70849.47,98862.671875
