In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
import joblib

dataset = pd.read_csv('../data/E Commerce Dataset.csv')
dataset_copy = dataset.copy()
dataset.head()
Churn_col = 'Churn'

In [37]:
dataset.describe()

Unnamed: 0,CustomerID,Churn,Tenure,CityTier,WarehouseToHome,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
count,5630.0,5630.0,5366.0,5630.0,5379.0,5375.0,5630.0,5630.0,5630.0,5630.0,5365.0,5374.0,5372.0,5323.0,5630.0
mean,52815.5,0.168384,10.189899,1.654707,15.639896,2.931535,3.688988,3.066785,4.214032,0.284902,15.707922,1.751023,3.008004,4.543491,177.221492
std,1625.385339,0.37424,8.557241,0.915389,8.531475,0.721926,1.023999,1.380194,2.583586,0.451408,3.675485,1.894621,2.93968,3.654433,49.193869
min,50001.0,0.0,0.0,1.0,5.0,0.0,1.0,1.0,1.0,0.0,11.0,0.0,1.0,0.0,0.0
25%,51408.25,0.0,2.0,1.0,9.0,2.0,3.0,2.0,2.0,0.0,13.0,1.0,1.0,2.0,146.0
50%,52815.5,0.0,9.0,1.0,14.0,3.0,4.0,3.0,3.0,0.0,15.0,1.0,2.0,3.0,163.0
75%,54222.75,0.0,16.0,3.0,20.0,3.0,4.0,4.0,6.0,1.0,18.0,2.0,3.0,7.0,196.0
max,55630.0,1.0,61.0,3.0,127.0,5.0,6.0,5.0,22.0,1.0,26.0,16.0,16.0,46.0,325.0


In [38]:
dataset.shape

(5630, 20)

## Splitting the data

In [39]:
# Split the dataset into features (X) and target (y) early
X = dataset.drop(Churn_col, axis=1)
y = dataset[Churn_col]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('../data/test.csv', index=False)

## Feature Engineering

In [41]:
print(X_train.isnull().sum())
print(X_test.isnull().sum())

CustomerID                       0
Tenure                         216
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                203
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 206
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    216
CouponUsed                     203
OrderCount                     207
DaySinceLastOrder              230
CashbackAmount                   0
dtype: int64
CustomerID                      0
Tenure                         48
PreferredLoginDevice            0
CityTier                        0
WarehouseToHome                48
PreferredPaymentMode            0
Gender                          0
HourSpendOnApp                 49
NumberOfDeviceRegistered        0
PreferedOrderCat

In [42]:
# Set a threshold for null values 
null_threshold = len(X_train) * 0.5  # Drop columns with more than 50% null values

# Identify columns with null values exceeding the threshold in X_train
columns_to_drop_train = X_train.columns[X_train.isnull().sum() > null_threshold]

# Identify columns with null values exceeding the threshold in X_test
columns_to_drop_test = X_test.columns[X_test.isnull().sum() > null_threshold]

# Combine columns to drop from both X_train and X_test
columns_to_drop = set(columns_to_drop_train).union(set(columns_to_drop_test))

# Drop columns with a lot of null values from both X_train and X_test
X_train_filtered = X_train.drop(columns=columns_to_drop)
X_test_filtered = X_test.drop(columns=columns_to_drop)

# Check the updated shape of X_train and X_test
print("X_train shape after dropping columns:", X_train_filtered.shape)
print("X_test shape after dropping columns:", X_test_filtered.shape)


X_train shape after dropping columns: (4504, 19)
X_test shape after dropping columns: (1126, 19)


In [43]:
# Calculate missing value percentages for remaining columns in X_train_filtered
missing_percentage_train = (X_train_filtered.isnull().sum() / len(X_train_filtered)) * 100

# Calculate missing value percentages for remaining columns in X_test_filtered
missing_percentage_test = (X_test_filtered.isnull().sum() / len(X_test_filtered)) * 100

# Combine missing value percentages from both X_train_filtered and X_test_filtered
missing_percentage_combined = pd.concat([missing_percentage_train, missing_percentage_test], axis=1, keys=['Train', 'Test'])

# Display missing value percentages for remaining columns
print("Missing value percentages for remaining columns:")
print(missing_percentage_combined)


Missing value percentages for remaining columns:
                                Train      Test
CustomerID                   0.000000  0.000000
Tenure                       4.795737  4.262877
PreferredLoginDevice         0.000000  0.000000
CityTier                     0.000000  0.000000
WarehouseToHome              4.507105  4.262877
PreferredPaymentMode         0.000000  0.000000
Gender                       0.000000  0.000000
HourSpendOnApp               4.573712  4.351687
NumberOfDeviceRegistered     0.000000  0.000000
PreferedOrderCat             0.000000  0.000000
SatisfactionScore            0.000000  0.000000
MaritalStatus                0.000000  0.000000
NumberOfAddress              0.000000  0.000000
Complain                     0.000000  0.000000
OrderAmountHikeFromlastYear  4.795737  4.351687
CouponUsed                   4.507105  4.706927
OrderCount                   4.595915  4.529307
DaySinceLastOrder            5.106572  6.838366
CashbackAmount               0.000000  

In [44]:
# Select the six features with the lowest missing value percentages
selected_features = ['CustomerID', 'PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode', 
                     'Gender', 'NumberOfDeviceRegistered']
# Extract the selected features from the original dataset
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [45]:
# Convert the list of feature names into a DataFrame with the corresponding columns from the original dataset
selected_features_df = X_train[selected_features]

# Check the data types of the selected features DataFrame
selected_features_dtypes = selected_features_df.dtypes

# Print the data types
print(selected_features_dtypes)

CustomerID                   int64
PreferredLoginDevice        object
CityTier                     int64
PreferredPaymentMode        object
Gender                      object
NumberOfDeviceRegistered     int64
dtype: object


In [46]:
# Define the continuous and categorical features
categorical_columns = dataset[selected_features].select_dtypes(include='object').columns
categorical_columns

Index(['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender'], dtype='object')

In [47]:
continuous_columns = dataset[selected_features].select_dtypes(include='number').columns
continuous_columns

Index(['CustomerID', 'CityTier', 'NumberOfDeviceRegistered'], dtype='object')

In [48]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Create persistent preprocessing objects
scaler = StandardScaler()
onehot_encoder = OneHotEncoder(drop='first')

In [49]:
scaler = StandardScaler()
scaler.fit(X_train[continuous_columns])

In [50]:
joblib.dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [51]:
onehot_encoder.fit(X_train[categorical_columns])

In [52]:
joblib.dump(onehot_encoder, '../models/encoder.joblib')

['../models/encoder.joblib']

In [53]:
# Transform data using preprocessing objects
scaled_columns_train = scaler.transform(X_train[continuous_columns])
onehot_encoded_features_train = onehot_encoder.transform(X_train[categorical_columns])

In [54]:
# Concatenate the transformed features into a single DataFrame
X_train_processed = pd.concat([pd.DataFrame(scaled_columns_train, columns=continuous_columns),
                               pd.DataFrame(onehot_encoded_features_train.toarray(), columns=onehot_encoder.get_feature_names_out(categorical_columns))],
                              axis=1)

## Model Training

In [55]:

from sklearn.ensemble import RandomForestRegressor



model = RandomForestRegressor()
model.fit(X_train_processed, y_train)




In [56]:
model.fit(X_train_processed, y_train)

In [57]:
joblib.dump(model, '../models/model.joblib')

['../models/model.joblib']

## Model Evaluation

In [58]:
# Transform the test set using the same preprocessing objects
scaled_columns_test = scaler.transform(X_test[continuous_columns])
onehot_encoded_features_test = onehot_encoder.transform(X_test[categorical_columns])

# Concatenate the transformed features into a single DataFrame for the test set
X_test_processed = pd.concat([pd.DataFrame(scaled_columns_test, columns=continuous_columns),
                              pd.DataFrame(onehot_encoded_features_test.toarray(), columns=onehot_encoder.get_feature_names_out(categorical_columns))],
                             axis=1)

# Ensure that the column names are consistent with those during training
missing_features_test = set(X_train_processed.columns) - set(X_test_processed.columns)
for feature in missing_features_test:
    X_test_processed[feature] = 0

In [59]:
y_pred = model.predict(X_test_processed)

In [60]:
from sklearn.metrics import mean_squared_error

def compute_rmse(y_test, y_pred, precision=2):
    # Compute mean squared error
    mse = mean_squared_error(y_test, y_pred)
    
    # Compute root mean squared error
    rmse = np.sqrt(mse)
    
    return round(rmse, precision)

# Example usage
rmse = compute_rmse(y_test, y_pred)
print(f'Root Mean Squared Error on Test Set: {rmse}')


Root Mean Squared Error on Test Set: 0.39


In [61]:
from sklearn.metrics import mean_squared_log_error

def compute_msle(y_test, y_pred, precision=2):
    # Ensure that both y_test and y_pred contain only non-negative values
    y_test = np.clip(y_test, a_min=0, a_max=None)
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    
    # Compute mean squared logarithmic error
    msle = mean_squared_log_error(y_test, y_pred)
    
    return round(msle, precision)

# Example usage
msle = compute_msle(y_test, y_pred)
print(f'Mean Squared Logarithmic Error on Test Set: {msle}')

Mean Squared Logarithmic Error on Test Set: 0.08


## Model Inference

In [62]:
df_test = pd.read_csv('../data/test.csv')
df_test_copy = dataset.copy()
df_test.head()
Churn_col = 'Churn'

In [63]:
df_test.describe

<bound method NDFrame.describe of       CustomerID  Tenure PreferredLoginDevice  CityTier  WarehouseToHome  \
0          54332     1.0             Computer         3              7.0   
1          51989    15.0         Mobile Phone         1              9.0   
2          53444    13.0             Computer         1             29.0   
3          54560     5.0                Phone         1              7.0   
4          54899    13.0         Mobile Phone         1              7.0   
...          ...     ...                  ...       ...              ...   
1121       53741    16.0             Computer         1             16.0   
1122       50834     7.0         Mobile Phone         1             15.0   
1123       54442    16.0         Mobile Phone         3              7.0   
1124       54491    17.0             Computer         1             17.0   
1125       52843    27.0                Phone         3             13.0   

     PreferredPaymentMode  Gender  HourSpendOnApp  Nu

In [64]:
# Split the dataset into features (X) and target (y) early
X = df_test.drop(Churn_col, axis=1)
y = df_test[Churn_col]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1126 entries, 0 to 1125
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   1126 non-null   int64  
 1   Tenure                       1078 non-null   float64
 2   PreferredLoginDevice         1126 non-null   object 
 3   CityTier                     1126 non-null   int64  
 4   WarehouseToHome              1078 non-null   float64
 5   PreferredPaymentMode         1126 non-null   object 
 6   Gender                       1126 non-null   object 
 7   HourSpendOnApp               1077 non-null   float64
 8   NumberOfDeviceRegistered     1126 non-null   int64  
 9   PreferedOrderCat             1126 non-null   object 
 10  SatisfactionScore            1126 non-null   int64  
 11  MaritalStatus                1126 non-null   object 
 12  NumberOfAddress              1126 non-null   int64  
 13  Complain          

In [66]:
# Set a threshold for null values 
null_threshold = len(X_train) * 0.5  # Drop columns with more than 50% null values

# Identify columns with null values exceeding the threshold in X_train
columns_to_drop_train = X_train.columns[X_train.isnull().sum() > null_threshold]

# Identify columns with null values exceeding the threshold in X_test
columns_to_drop_test = X_test.columns[X_test.isnull().sum() > null_threshold]

# Combine columns to drop from both X_train and X_test
columns_to_drop = set(columns_to_drop_train).union(set(columns_to_drop_test))

# Drop columns with a lot of null values from both X_train and X_test
X_train_filtered = X_train.drop(columns=columns_to_drop)
X_test_filtered = X_test.drop(columns=columns_to_drop)

# Check the updated shape of X_train and X_test
print("X_train shape after dropping columns:", X_train_filtered.shape)
print("X_test shape after dropping columns:", X_test_filtered.shape)

X_train shape after dropping columns: (900, 19)
X_test shape after dropping columns: (226, 19)


In [67]:
selected_features = ['CustomerID', 'PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode', 
                     'Gender', 'NumberOfDeviceRegistered']
df_test = df_test[selected_features + [Churn_col]]
df_test.head()

Unnamed: 0,CustomerID,PreferredLoginDevice,CityTier,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,Churn
0,54332,Computer,3,COD,Female,6,1
1,51989,Mobile Phone,1,Debit Card,Female,1,0
2,53444,Computer,1,Credit Card,Female,4,0
3,54560,Phone,1,Debit Card,Male,5,0
4,54899,Mobile Phone,1,Debit Card,Female,5,0


In [68]:
# Convert the list of feature names into a DataFrame with the corresponding columns from the original dataset
selected_features_df = X_train[selected_features]

# Check the data types of the selected features DataFrame
selected_features_dtypes = selected_features_df.dtypes

# Print the data types
print(selected_features_dtypes)

CustomerID                   int64
PreferredLoginDevice        object
CityTier                     int64
PreferredPaymentMode        object
Gender                      object
NumberOfDeviceRegistered     int64
dtype: object


In [69]:
# Define the continuous and categorical features
categorical_columns_test = df_test[selected_features].select_dtypes(include='object').columns
categorical_columns_test
continuous_columns_test = df_test[selected_features].select_dtypes(include='number').columns
continuous_columns_test

#Loading the scaler and encoder fro the memory
loaded_scaler = joblib.load('../models/scaler.joblib')

loaded_encoder = joblib.load('../models/encoder.joblib')


In [70]:
# Load the model
loaded_model = joblib.load('../models/model.joblib')

# Transform test data using preprocessing objects
scaled_columns_test = loaded_scaler.transform(X_test_filtered[continuous_columns_test])
onehot_encoded_features_test = loaded_encoder.transform(X_test_filtered[categorical_columns_test])

# Concatenate scaled continuous features and one-hot encoded categorical features
X_test_processed = np.concatenate([scaled_columns_test, onehot_encoded_features_test.toarray()], axis=1)

# Make predictions
y_pred = loaded_model.predict(X_test_processed)

#Evaluation
def compute_rmse(y_test, y_pred, precision=2):
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return round(rmse, precision)

rmse = compute_rmse(y_test, y_pred)
print(f'Root Mean Squared Error on Test Set: {rmse}')


Root Mean Squared Error on Test Set: 0.39


