In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [3]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


> # **x and y**

In [4]:
x = train.iloc[:,3:-1]
x.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83


In [5]:
x.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0
mean,656.454373,38.125888,5.020353,55478.086689,1.554455,0.753954,0.49777,112574.822734
std,80.10334,8.867205,2.806159,62817.663278,0.547154,0.430707,0.499997,50292.865585
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58
25%,597.0,32.0,3.0,0.0,1.0,1.0,0.0,74637.57
50%,659.0,37.0,5.0,0.0,2.0,1.0,0.0,117948.0
75%,710.0,42.0,7.0,119939.5175,2.0,1.0,1.0,155152.4675
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48


In [6]:
x['Geography'].value_counts()

Geography
France     94215
Spain      36213
Germany    34606
Name: count, dtype: int64

In [7]:
y = train.iloc[:,-1]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Exited, dtype: int64

> # **Preprocessing**

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [10]:
column_transformer = ColumnTransformer(
    transformers=[
        ('ord_en', OrdinalEncoder(), ['Gender']),
        ('One_hot_en', OneHotEncoder(), ['Geography']),
        ('std_scl', StandardScaler(), ['Balance']),
        ('Std_scl',StandardScaler(), ['EstimatedSalary'])
    ],remainder='passthrough')

In [11]:
x_trf = column_transformer.fit_transform(x)
one_hot_columns = column_transformer.named_transformers_['One_hot_en'].get_feature_names_out(['Geography'])

# Combine all column names
column_names = ['Gender'] + list(one_hot_columns) + ['Balance', 'EstimatedSalary'] +  ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

# Convert the transformed data to a DataFrame
x_trf= pd.DataFrame(x_trf, columns=column_names)

x_trf.head()

Unnamed: 0,Gender,Geography_France,Geography_Germany,Geography_Spain,Balance,EstimatedSalary,CreditScore,Age,Tenure,NumOfProducts,HasCrCard,IsActiveMember
0,1.0,1.0,0.0,0.0,-0.883163,1.369486,668.0,33.0,3.0,2.0,1.0,0.0
1,1.0,1.0,0.0,0.0,-0.883163,-1.254085,627.0,33.0,1.0,2.0,1.0,1.0
2,1.0,1.0,0.0,0.0,-0.883163,1.437422,678.0,40.0,10.0,2.0,1.0,0.0
3,1.0,1.0,0.0,0.0,1.486918,-0.557018,581.0,34.0,2.0,1.0,1.0,1.0
4,1.0,0.0,0.0,1.0,-0.883163,-1.93877,716.0,33.0,5.0,2.0,1.0,1.0


In [12]:
test_trf = column_transformer.transform(test)
one_hot_columns = column_transformer.named_transformers_['One_hot_en'].get_feature_names_out(['Geography'])

# Combine all column names
column_names = ['Gender'] + list(one_hot_columns) + ['Balance', 'EstimatedSalary'] +  ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

# Convert the transformed data to a DataFrame
test_trf= pd.DataFrame(test_trf, columns=column_names)

test_trf.head()

Unnamed: 0,Gender,Geography_France,Geography_Germany,Geography_Spain,Balance,EstimatedSalary,CreditScore,Age,Tenure,NumOfProducts,HasCrCard,IsActiveMember
0,0.0,1.0,0.0,0.0,-0.883163,0.962404,586.0,23.0,2.0,2.0,0.0,1.0
1,0.0,1.0,0.0,0.0,-0.883163,-0.795852,683.0,46.0,2.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,-0.883163,0.523083,656.0,34.0,7.0,2.0,1.0,0.0
3,1.0,1.0,0.0,0.0,-0.883163,0.026977,681.0,36.0,8.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.047249,0.533997,752.0,38.0,10.0,1.0,1.0,0.0


In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_trf,y,test_size=0.25,random_state=42)
y_train.shape

(123775,)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128,input_dim =12, activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
from tensorflow.keras.optimizers import Adam
adam = Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy',optimizer=adam,metrics=['accuracy'])

In [16]:
from tensorflow.keras.callbacks import EarlyStopping

callback = EarlyStopping(
    monitor = 'val_loss',
    min_delta = 0.00001,
    patience =10,
    verbose=1,
    mode = 'auto',
    baseline = None,
    restore_best_weights = True
)

In [17]:
history = model.fit(x_train,y_train,epochs=5000,validation_data=(x_test, y_test),callbacks=callback)

Epoch 1/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.7906 - loss: 0.7543 - val_accuracy: 0.7838 - val_loss: 0.4624
Epoch 2/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8194 - loss: 0.4147 - val_accuracy: 0.8260 - val_loss: 0.4032
Epoch 3/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8220 - loss: 0.4131 - val_accuracy: 0.8330 - val_loss: 0.4042
Epoch 4/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8214 - loss: 0.4101 - val_accuracy: 0.8322 - val_loss: 0.3942
Epoch 5/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8231 - loss: 0.4116 - val_accuracy: 0.8335 - val_loss: 0.3934
Epoch 6/5000
[1m3868/3868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8218 - loss: 0.4091 - val_accuracy: 0.8222 - val_loss: 0.4060
Epoc

In [18]:
from sklearn.metrics import accuracy_score
y_prob = model.predict(x_test)
y_pred = (y_prob > 0.5).astype(int)
accuracy_score(y_test,y_pred)

[1m1290/1290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674us/step


0.8355267941540028

In [19]:
y_prob = model.predict(test_trf)
y_pred = (y_prob > 0.5).astype(int)

[1m3439/3439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 697us/step


In [20]:

y_pred = y_pred.reshape(-1)
y_pred.shape

(110023,)

In [22]:
submission_df = pd.DataFrame({
    'id': test['id'],
    'Exited': y_pred  
})
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0
1,165035,1
2,165036,0
3,165037,0
4,165038,0


In [23]:
submission_df.to_csv('DNN_output.csv', index=False)