In [18]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Read the Excel File.
import pandas as pd
df = pd.read_excel('E Commerce Dataset.xlsx', sheet_name='E Comm')
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
1,50002,1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,120.9
2,50003,1,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120.28
3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
4,50005,1,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,129.6


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress   

In [20]:
df.nunique()

CustomerID                     5630
Churn                             2
Tenure                           36
PreferredLoginDevice              3
CityTier                          3
WarehouseToHome                  34
PreferredPaymentMode              7
Gender                            2
HourSpendOnApp                    6
NumberOfDeviceRegistered          6
PreferedOrderCat                  6
SatisfactionScore                 5
MaritalStatus                     3
NumberOfAddress                  15
Complain                          2
OrderAmountHikeFromlastYear      16
CouponUsed                       17
OrderCount                       16
DaySinceLastOrder                22
CashbackAmount                 2586
dtype: int64

In [21]:
df_cleaned = df.dropna()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3774 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   3774 non-null   int64  
 1   Churn                        3774 non-null   int64  
 2   Tenure                       3774 non-null   float64
 3   PreferredLoginDevice         3774 non-null   object 
 4   CityTier                     3774 non-null   int64  
 5   WarehouseToHome              3774 non-null   float64
 6   PreferredPaymentMode         3774 non-null   object 
 7   Gender                       3774 non-null   object 
 8   HourSpendOnApp               3774 non-null   float64
 9   NumberOfDeviceRegistered     3774 non-null   int64  
 10  PreferedOrderCat             3774 non-null   object 
 11  SatisfactionScore            3774 non-null   int64  
 12  MaritalStatus                3774 non-null   object 
 13  NumberOfAddress   

In [22]:
# Find the perccentage of missing values in each column
round((df.isnull().sum()*100 / df.shape[0]),2)

# Missing values in each column are relatively low, roughly 5% on average. We can therefore impute the missing values

CustomerID                     0.00
Churn                          0.00
Tenure                         4.69
PreferredLoginDevice           0.00
CityTier                       0.00
WarehouseToHome                4.46
PreferredPaymentMode           0.00
Gender                         0.00
HourSpendOnApp                 4.53
NumberOfDeviceRegistered       0.00
PreferedOrderCat               0.00
SatisfactionScore              0.00
MaritalStatus                  0.00
NumberOfAddress                0.00
Complain                       0.00
OrderAmountHikeFromlastYear    4.71
CouponUsed                     4.55
OrderCount                     4.58
DaySinceLastOrder              5.45
CashbackAmount                 0.00
dtype: float64

In [23]:
# use the iterative imputer to fill missing values

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=1)

columns_to_impute = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 
                     'OrderCount', 'DaySinceLastOrder']

df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

In [24]:
# merge with original df
df_imputed = pd.DataFrame(df[columns_to_impute], columns=columns_to_impute)
df_imputed = df_imputed.round()

df_updated = pd.concat([df.drop(columns=columns_to_impute), df_imputed], axis=1)
df_updated.head()

Unnamed: 0,CustomerID,Churn,PreferredLoginDevice,CityTier,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,CashbackAmount,Tenure,WarehouseToHome,HourSpendOnApp,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder
0,50001,1,Mobile Phone,3,Debit Card,Female,3,Laptop & Accessory,2,Single,9,1,159.93,4.0,6.0,3.0,11.0,1.0,1.0,5.0
1,50002,1,Phone,1,UPI,Male,4,Mobile,3,Single,7,1,120.9,8.0,8.0,3.0,15.0,0.0,1.0,0.0
2,50003,1,Phone,1,Debit Card,Male,4,Mobile,3,Single,6,1,120.28,9.0,30.0,2.0,14.0,0.0,1.0,3.0
3,50004,1,Phone,3,Debit Card,Male,4,Laptop & Accessory,5,Single,8,0,134.07,0.0,15.0,2.0,23.0,0.0,1.0,3.0
4,50005,1,Phone,1,CC,Male,3,Mobile,5,Single,3,0,129.6,0.0,12.0,3.0,11.0,1.0,1.0,3.0


In [25]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   PreferredLoginDevice         5630 non-null   object 
 3   CityTier                     5630 non-null   int64  
 4   PreferredPaymentMode         5630 non-null   object 
 5   Gender                       5630 non-null   object 
 6   NumberOfDeviceRegistered     5630 non-null   int64  
 7   PreferedOrderCat             5630 non-null   object 
 8   SatisfactionScore            5630 non-null   int64  
 9   MaritalStatus                5630 non-null   object 
 10  NumberOfAddress              5630 non-null   int64  
 11  Complain                     5630 non-null   int64  
 12  CashbackAmount               5630 non-null   float64
 13  Tenure            

In [26]:
round((df_updated.isnull().sum()*100 / df.shape[0]),2)

CustomerID                     0.0
Churn                          0.0
PreferredLoginDevice           0.0
CityTier                       0.0
PreferredPaymentMode           0.0
Gender                         0.0
NumberOfDeviceRegistered       0.0
PreferedOrderCat               0.0
SatisfactionScore              0.0
MaritalStatus                  0.0
NumberOfAddress                0.0
Complain                       0.0
CashbackAmount                 0.0
Tenure                         0.0
WarehouseToHome                0.0
HourSpendOnApp                 0.0
OrderAmountHikeFromlastYear    0.0
CouponUsed                     0.0
OrderCount                     0.0
DaySinceLastOrder              0.0
dtype: float64

In [27]:
# create dummies out of categorical variables

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

columns_to_onehot = ['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'MaritalStatus']

df_with_dummies = encoder.fit_transform(df_updated[columns_to_onehot]).toarray()

df_onehot_encoded = pd.DataFrame(df_with_dummies, columns=encoder.get_feature_names_out(columns_to_onehot))

df_final = pd.concat([df_updated.drop(columns=columns_to_onehot), df_onehot_encoded], axis=1)

df_final.head()

Unnamed: 0,CustomerID,Churn,CityTier,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,CashbackAmount,Tenure,WarehouseToHome,...,Gender_Male,PreferedOrderCat_Fashion,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,50001,1,3,3,2,9,1,159.93,4.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,50002,1,1,4,3,7,1,120.9,8.0,8.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,50003,1,1,4,3,6,1,120.28,9.0,30.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,50004,1,3,4,5,8,0,134.07,0.0,15.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,50005,1,1,3,5,3,0,129.6,0.0,12.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [28]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   CustomerID                             5630 non-null   int64  
 1   Churn                                  5630 non-null   int64  
 2   CityTier                               5630 non-null   int64  
 3   NumberOfDeviceRegistered               5630 non-null   int64  
 4   SatisfactionScore                      5630 non-null   int64  
 5   NumberOfAddress                        5630 non-null   int64  
 6   Complain                               5630 non-null   int64  
 7   CashbackAmount                         5630 non-null   float64
 8   Tenure                                 5630 non-null   float64
 9   WarehouseToHome                        5630 non-null   float64
 10  HourSpendOnApp                         5630 non-null   float64
 11  Orde

In [29]:
relevant_df = df_final.drop(["CustomerID"],axis=1)
relevant_df.head()

Unnamed: 0,Churn,CityTier,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,CashbackAmount,Tenure,WarehouseToHome,HourSpendOnApp,...,Gender_Male,PreferedOrderCat_Fashion,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,1,3,3,2,9,1,159.93,4.0,6.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,4,3,7,1,120.9,8.0,8.0,3.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,1,4,3,6,1,120.28,9.0,30.0,2.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,3,4,5,8,0,134.07,0.0,15.0,2.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,3,5,3,0,129.6,0.0,12.0,3.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [30]:
# Split our preprocessed data into our features and target arrays
X = relevant_df.drop('Churn', axis=1).values
y = relevant_df['Churn'].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [31]:
X

array([[3., 3., 2., ..., 0., 0., 1.],
       [1., 4., 3., ..., 0., 0., 1.],
       [1., 4., 3., ..., 0., 0., 1.],
       ...,
       [1., 2., 4., ..., 0., 1., 0.],
       [3., 5., 4., ..., 0., 1., 0.],
       [1., 2., 3., ..., 0., 1., 0.]])

In [32]:
import numpy as np

# get counts of 1s vs 0s to determine class imbalance
counts = np.bincount(y)

num_zeros = counts[0]
num_ones = counts[1]

print(f"Number of 0s in 'y': {num_zeros}")
print(f"Number of 1s in 'y': {num_ones}")

Number of 0s in 'y': 4682
Number of 1s in 'y': 948


In [34]:
# will need to resample training data due to class imbalance outlined above
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [40]:
counts = np.bincount(y_train_resampled)

num_zeros = counts[0]
num_ones = counts[1]

print(f"Number of 0s in 'y': {num_zeros}")
print(f"Number of 1s in 'y': {num_ones}")

Number of 0s in 'y': 3506
Number of 1s in 'y': 3506


NameError: name 'X_train_scaled' is not defined

In [39]:
# finally, scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)

# Apply the same to the testing data
X_test_scaled = scaler.transform(X_test)

In [41]:
print(len(X_train_resampled_scaled[0]))

34


In [42]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_resampled_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                2800      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5261 (20.55 KB)
Trainable params: 5261 (20.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
fit_model = nn.fit(X_train_resampled_scaled,y_train_resampled,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [45]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

44/44 - 0s - loss: 0.1735 - accuracy: 0.9503 - 122ms/epoch - 3ms/step
Loss: 0.17345690727233887, Accuracy: 0.9502840638160706


## Model with auto-optimisation

In [46]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=100,
        step=5), activation=activation, input_dim=len(X_train_resampled_scaled[0])))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=100,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [47]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=2)

Using TensorFlow backend


In [48]:
tuner.search(X_train_resampled_scaled,y_train_resampled,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 00m 16s]
val_accuracy: 0.9552556872367859

Best val_accuracy So Far: 0.9829545617103577
Total elapsed time: 00h 15m 24s


In [49]:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 81,
 'num_layers': 5,
 'units_0': 96,
 'units_1': 21,
 'units_2': 16,
 'units_3': 31,
 'units_4': 26,
 'units_5': 1,
 'tuner/epochs': 50,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [50]:
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

44/44 - 0s - loss: 0.0861 - accuracy: 0.9830 - 173ms/epoch - 4ms/step
Loss: 0.08609770238399506, Accuracy: 0.9829545617103577
