# Machine Learning Modelling Experiments

## Experiment Setup 

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to root directory.
# Later cells import from `src` and read relative `data/...` paths, so the
# kernel has to sit in the project root. The guard keeps this cell idempotent:
# an unconditional chdir("../") climbs one more level every time the cell is
# re-run in the same kernel.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): Ecommerce-Customer-Analysis


In [2]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# module setup
# Render matplotlib figures inside the notebook output
%matplotlib inline
# Show floating point values in pandas output with 3 decimal places
pd.options.display.precision = 3
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices from pandas/scikit-learn — consider narrowing it
warnings.filterwarnings("ignore")

In [3]:
# Function imports
from src.constants import CONFIGS, SCHEMA
from src.utils.basic_utils import read_yaml

In [7]:
# Load both YAML files first, then pick the section each one contributes:
# the data-preparation settings and the processed-data column schema
config_file = read_yaml(CONFIGS)
schema_file = read_yaml(SCHEMA)

configs = config_file.data_preparation
schema = schema_file.processed_data_columns

[2024-02-11 12:09:40 PM]:ProjectLogger INFO:basic_utils41 - yaml file: conf\configs.yaml loaded successfully
[2024-02-11 12:09:40 PM]:ProjectLogger INFO:basic_utils41 - yaml file: conf\schema.yaml loaded successfully


In [8]:
# View the data preparation configurations (the `data_preparation` section loaded above)
print(dict(configs))

{'processed_data_path': 'data/processed/customers.csv', 'train_path': 'data/train/train_data.csv', 'test_path': 'data/test/test_data.csv', 'test_size_pct': 0.2, 'random_seed': 42}


In [9]:
# View the schema of the processed data columns as a plain dict
print(dict(schema))

{'features': Box({'avg_session_length': 'float64', 'time_on_app': 'float64', 'time_on_website': 'float64', 'membership_length': 'float64'}), 'target': Box({'annual_amount_spent': 'float64'}), 'ignore': Box({'email': 'object', 'address': 'object', 'avatar': 'object'})}


## Data Ingestion

In [10]:
# Load the processed customers CSV from the path given in the configuration
data_path = configs.processed_data_path
customers_df_main = pd.read_csv(data_path, index_col=False)

# Leave the loaded frame untouched; all further work happens on an
# independent (deep) copy
customers_df = customers_df_main.copy()

# Preview the first five rows
customers_df.head(5)

Unnamed: 0,email,address,avatar,avg_session_length,time_on_app,time_on_website,membership_length,annual_amount_spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497,12.656,39.578,4.083,587.951
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926,11.109,37.269,2.664,392.205
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.001,11.33,37.111,4.105,487.548
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.306,13.718,36.721,3.12,581.852
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.331,12.795,37.537,4.446,599.406


In [11]:
# Column names of the working frame, as a plain Python list
list(customers_df.columns)

['email',
 'address',
 'avatar',
 'avg_session_length',
 'time_on_app',
 'time_on_website',
 'membership_length',
 'annual_amount_spent']

In [12]:
# Summarize the frame: column dtypes, non-null counts and memory usage
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   email                500 non-null    object 
 1   address              500 non-null    object 
 2   avatar               500 non-null    object 
 3   avg_session_length   500 non-null    float64
 4   time_on_app          500 non-null    float64
 5   time_on_website      500 non-null    float64
 6   membership_length    500 non-null    float64
 7   annual_amount_spent  500 non-null    float64
dtypes: float64(5), object(3)
memory usage: 31.4+ KB


## Train-Test Splits

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out a test set; the split fraction and the seed both come from the
# configuration so the split is reproducible
split_options = {
    "test_size": configs.test_size_pct,
    "random_state": configs.random_seed,
}
train_set, test_set = train_test_split(customers_df, **split_options)

In [28]:
# Report the (rows, columns) shape of each split
train_shape = train_set.shape
test_shape = test_set.shape
print(f"The shape of train set: {train_shape}")
print(f"The shape of test set: {test_shape}")

The shape of train set: (400, 8)
The shape of test set: (100, 8)


## Feature Selection

In [18]:
# The schema declares the role of every column; pull each group out as a
# list of column names (note that `target_col` is a list as well)
feature_cols = [*schema.features.keys()]
target_col = [*schema.target.keys()]
ignore_cols = [*schema.ignore.keys()]

# Report the three groups
print(f"The feature columns are:\n{feature_cols}\n")
print(f"The target column is: {target_col}\n")
print(f"The columns to ignore/remove from the dataframe are:\n{ignore_cols}")

The feature columns are:
['avg_session_length', 'time_on_app', 'time_on_website', 'membership_length']

The target column is: ['annual_amount_spent']

The columns to ignore/remove from the dataframe are:
['email', 'address', 'avatar']


## Feature Engineering

We need to create a preprocessor object that will standardize the numerical features.
The preprocessor applies the following steps, in order:

- `SimpleImputer(strategy="median")` : To impute any null values in the numerical features with the column median
- `StandardScaler()` : To standardize the numerical features (zero mean, unit variance)

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [30]:
# Numerical feature engineering pipeline: impute with the median, then scale.
# The step names ("imputer", "scalar") are the keys under which the steps are
# addressed, so they are kept exactly as they were.
median_imputer = SimpleImputer(strategy="median")
standard_scaler = StandardScaler()

num_pipeline = Pipeline(
    steps=[
        ("imputer", median_imputer),
        ("scalar", standard_scaler),
    ]
)

In [31]:
# View the pipeline object (the bare last expression is rendered as the cell output)
num_pipeline

In [32]:
# Column transformer that routes the feature columns through the numerical pipeline
numeric_transformer = ("numerical_pipeline", num_pipeline, feature_cols)
preprocessor = ColumnTransformer(transformers=[numeric_transformer])

In [33]:
# View the preprocessor object (the bare last expression is rendered as the cell output)
preprocessor

## Preprocessor Fitting & Transform

In [44]:
# Separate predictors from the target for both splits.
# `target_col` is a list, so the y selections are single-column DataFrames
# rather than Series.
X_train = train_set[feature_cols]
X_test = test_set[feature_cols]

y_train = train_set[target_col]
y_test = test_set[target_col]

In [45]:
# View the (rows, columns) shape of the feature and target frames of each split
print(f"The shape of X_train: {X_train.shape}")
print(f"The shape of y_train: {y_train.shape}")
print(f"The shape of X_test: {X_test.shape}")
print(f"The shape of y_test: {y_test.shape}")

The shape of X_train: (400, 4)
The shape of y_train: (400, 1)
The shape of X_test: (100, 4)
The shape of y_test: (100, 1)


In [56]:
# Preview the leading five rows of the train and test feature frames
print(f"The first 5 items of X_train:\n{X_train.head(5)}\n")
print(f"The first 5 items of X_test:\n{X_test.head(5)}")

The first 5 items of X_train:
     avg_session_length  time_on_app  time_on_website  membership_length
249              33.780       11.918           36.845              3.635
433              34.278       11.823           36.309              2.117
19               32.618       13.990           37.191              4.065
322              33.265       10.732           36.146              4.087
332              33.144       11.737           37.935              2.190

The first 5 items of X_test:
     avg_session_length  time_on_app  time_on_website  membership_length
361              32.078       10.348           39.045              3.435
73               32.809       12.817           37.032              3.852
374              31.447       10.102           38.043              4.238
155              32.450       13.458           37.239              2.941
104              31.390       10.994           38.074              3.429


In [57]:
# Preview the leading five rows of the train and test target frames
print(f"The first 5 items of y_train:\n{y_train.head(5)}\n")
print(f"The first 5 items of y_test:\n{y_test.head(5)}")

The first 5 items of y_train:
     annual_amount_spent
249              518.786
433              434.144
19               605.061
322              478.262
332              424.203

The first 5 items of y_test:
     annual_amount_spent
361              401.033
73               534.777
374              418.603
155              503.978
104              410.070


In [47]:
# Fit the preprocessor (median imputer + scaler) on the training features only
# and transform them in the same call
X_train_norm = preprocessor.fit_transform(X_train)

# Transform the test features with the already-fitted preprocessor; it is not
# refit here, so the test set does not influence the learned statistics
X_test_norm = preprocessor.transform(X_test)

In [48]:
# View the first 5 rows of the transformed arrays X_train_norm and X_test_norm
print(f"The first 5 items of normalized X_train:\n{X_train_norm[:5]}\n")
print(f"The first 5 items of normalized X_test:\n{X_test_norm[:5]}")

The first 5 items of normalized X_train:
[[ 0.72875642 -0.14037637 -0.17321599  0.06519749]
 [ 1.23367035 -0.23549152 -0.70076461 -1.41309514]
 [-0.44946538  1.93596147  0.16698241  0.48362047]
 [ 0.20617011 -1.32838815 -0.86089565  0.50506779]
 [ 0.08412284 -0.32135382  0.89966814 -1.34223042]]

The first 5 items of normalized X_test:
[[-0.99713193 -1.71345511  1.99174928 -0.13004556]
 [-0.25600972  0.76100262  0.01057931  0.27616869]
 [-1.63590713 -1.96022058  1.00618723  0.65286637]
 [-0.62010572  1.40296914  0.21450597 -0.61041721]
 [-1.69456072 -1.06574134  1.03668748 -0.1355977 ]]


In [49]:
# Convert the single-column target frames into 1-D numpy arrays.
# Only the column axis is squeezed: a bare squeeze() would also drop the row
# axis when a split holds a single row, turning the 1x1 frame into a scalar
# and the result into a 0-d array instead of a length-1 vector.
y_train_arr = np.array(y_train.squeeze(axis="columns"))
y_test_arr = np.array(y_test.squeeze(axis="columns"))

In [50]:
# View the first 5 elements of the target arrays y_train_arr and y_test_arr
print(f"The first 5 items of y_train:\n{y_train_arr[:5]}\n")
print(f"The first 5 items of y_test:\n{y_test_arr[:5]}")

The first 5 items of y_train:
[518.78648309 434.14420203 605.0610388  478.2621264  424.2028271 ]

The first 5 items of y_test:
[401.03313522 534.7771881  418.6027421  503.97837905 410.06961106]


## Model Training

In [58]:
# Show the class of the preprocessor object
# NOTE(review): no model-training code follows in the visible part of this
# notebook — confirm whether this cell is a leftover sanity check
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer