# Machine Learning Modelling Experiments

## Experiment Setup 

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to root directory.
# Guarded so the cell is idempotent: once the working directory already holds
# the `src` package (imported further down), re-running this cell must not
# climb another level and break the relative imports and data paths.
if not os.path.isdir("src"):
    os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): Ecommerce-Customer-Analysis


In [2]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# module setup
%matplotlib inline
pd.options.display.precision = 3
warnings.filterwarnings("ignore")

In [3]:
# Function imports
from src.constants import CONFIGS
from src.utils.basic_utils import read_yaml

In [4]:
# Load the project configuration file and keep its data-ingestion section
all_configs = read_yaml(CONFIGS)
configs = all_configs.data_ingestion

# Show the data-ingestion settings as a plain dict
print(dict(configs))

[2024-02-10 06:17:44 PM]:ProjectLogger INFO:basic_utils41 - yaml file: conf\configs.yaml loaded successfully
{'raw_data': 'data/raw/ecommerce_customers.csv'}


## Data Ingestion

In [5]:
# Location of the raw CSV, taken from the ingestion configs
raw_data_path = configs.raw_data

# Load the file; index_col=False keeps pandas from using the first column as the index
customers_df_main = pd.read_csv(raw_data_path, index_col=False)

# Work on a deep copy (the default for .copy()) so the loaded frame stays untouched
customers_df = customers_df_main.copy()

# Preview the first rows
customers_df.head()

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497,12.656,39.578,4.083,587.951
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926,11.109,37.269,2.664,392.205
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.001,11.33,37.111,4.105,487.548
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.306,13.718,36.721,3.12,581.852
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.331,12.795,37.537,4.446,599.406


In [6]:
# List the column labels of the working frame
list(customers_df.columns)

['Email',
 'Address',
 'Avatar',
 'Avg. Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [7]:
# Summarise each column's dtype and non-null count, plus overall memory usage
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Email                 500 non-null    object 
 1   Address               500 non-null    object 
 2   Avatar                500 non-null    object 
 3   Avg. Session Length   500 non-null    float64
 4   Time on App           500 non-null    float64
 5   Time on Website       500 non-null    float64
 6   Length of Membership  500 non-null    float64
 7   Yearly Amount Spent   500 non-null    float64
dtypes: float64(5), object(3)
memory usage: 31.4+ KB


## Train-Test Splits

In [8]:
# Name of the target column
target_col = "Yearly Amount Spent"

# Features: every int/float column except the target
numeric_df = customers_df.select_dtypes(include=["int", "float"])
X = numeric_df.drop(columns=target_col)

# Target: kept as a single-column DataFrame (double brackets), not a Series
y = customers_df[[target_col]]

In [9]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
0,34.497,12.656,39.578,4.083
1,31.926,11.109,37.269,2.664
2,33.001,11.33,37.111,4.105
3,34.306,13.718,36.721,3.12
4,33.331,12.795,37.537,4.446


In [10]:
# Preview the first rows of the target frame
y.head()

Unnamed: 0,Yearly Amount Spent
0,587.951
1,392.205
2,487.548
3,581.852
4,599.406


In [13]:
# Column dtypes of the full frame; the object columns are the ones left out of X
customers_df.dtypes

Email                    object
Address                  object
Avatar                   object
Avg. Session Length     float64
Time on App             float64
Time on Website         float64
Length of Membership    float64
Yearly Amount Spent     float64
dtype: object