# Imports

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib widget
from sklearn.datasets import make_blobs
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L2
np.set_printoptions(precision=2)
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

# Set a seed

In [2]:
# set seed
tf.random.set_seed(1234) 

# Load in file

In [3]:
# create path
file_path = "shopping.csv"

# read in file
df = pd.read_csv(file_path)

# Create mappings

In [4]:
# create mappings for bools
bool_mapping = {
    "TRUE":1,
    "FALSE": 0
}

# create mappings for month
month_mapping = {
    "jan":1,
    "feb":2,
    "mar":3,
    "apr":4,
    "may":5,
    "june":6,
    "jul":7,
    "aug":8,
    "sep":9,
    "oct":10,
    "nov":11,
    "dec":12
}

# create mapping for visitorType
visitor_mapping = {
    "returning_visitor": 1,
    "new_visitor": 2,
    "other": 3
}

# Change data to usable types

In [5]:
# change data types to strings
df["Weekend"] = df["Weekend"].astype(str)
df["Revenue"] = df["Revenue"].astype(str)
df["VisitorType"] = df["VisitorType"].astype(str)
df["Month"] = df["Month"].astype(str)

In [6]:
# convert strings to lower
# convert bools to upper
df["VisitorType"] = df["VisitorType"].str.lower()
df["Month"] = df["Month"].str.lower()
df["Weekend"] = df["Weekend"].str.upper()
df["Revenue"] = df["Revenue"].str.upper()

# map
df["VisitorType"] = df["VisitorType"].map(visitor_mapping)
df["Month"] = df["Month"].map(month_mapping)
df["Weekend"] = df["Weekend"].map(bool_mapping)
df["Revenue"] = df["Revenue"].map(bool_mapping)

# Convert dataframe to numpy array

In [7]:
# convert dataframe to 2d numpy array
data_array = df.to_numpy()

# Deal with outliers
In my last file, the outliers were sending the scaled models training and test loss to the moon. I will now deal with the outliers and see if it makes a difference.

I will first initialize xtrain and ytrain so that we can examine outliers in the x train

In [8]:
# initialize xTrain, yTrain
xtrain = data_array[:,0:17]
ytrain = data_array[:,17]

In [9]:
# debugging
from scipy import stats
import numpy as np

# using the z score, i will remove values outside of the threshhold z = 4
# 99.7% of values should be between z = (-3,3), so removing these values above 4 should not hurt our model
z_scores = np.abs(stats.zscore(xtrain))  
outliers = np.where(z_scores > 4)

print(len(outliers[1]))

unique_rows = np.unique(outliers[0])

xtrain = np.delete(xtrain, unique_rows, axis=0)
ytrain = np.delete(ytrain, unique_rows, axis=0)

print(f"Number of rows removed: {len(unique_rows)}")
print(len(xtrain))

638
Number of rows removed: 485
4515


So, there were 638 outliers(values with above 4 z score) belonging to 485 rows of data in the training set.
Removing these values, we still have 4500 observations, but they are much less skewed right. 
This should help our model train and test better.

# Create minmax, meanNorm, zScore scaling functions

In [10]:
# feature scaling function 1: minMax
# (X - xmin) / (xmax - xmin)

def minMaxScale(data):
    m = data.shape[0]
    n = data.shape[1]
    scaledData = np.zeros((m,n))
    for j in range(n):
        xmin = np.min(data[:,j])
        xmax = np.max(data[:,j])

        # troubleshooting
        if (xmin == xmax):
            scaledData[:,j] = 0
        else:
            for i in range(m):
                numer = data[i,j] - xmin
                denom = xmax - xmin
                scaledData[i,j] = numer/denom
    return(scaledData)
        

In [11]:
# feature scaling function 2: mean normalization
# x - xmean
# xmax-xmin

def meanNormalization(data):
    m = data.shape[0]
    n = data.shape[1]
    scaledData = np.zeros((m,n))
    for j in range(n):
        xmin = np.min(data[:,j])
        xmax = np.max(data[:,j])
        xmean = np.mean(data[:,j])
        for i in range(m):
            numer = data[i,j] - xmean
            denom = xmax - xmin
            if denom == 0:
                scaledData[i,j] = 0
            else:
                scaledData[i,j] = numer/denom
    return(scaledData)
        

In [12]:
# feature scaling function 3: z score normalization
# x - xmean / std

def zScoreNormalization(data):
    m = data.shape[0]
    n = data.shape[1]
    scaledData = np.zeros((m,n))
    for j in range(n):
        xstd = np.std(data[:,j])
        xmean = np.mean(data[:,j])
        for i in range(m):
            numer = data[i,j] - xmean
            denom = xstd
            if denom == 0:
                scaledData[i,j] = 0
            else:
                scaledData[i,j] = numer/denom
    return(scaledData)
        

# Create models of dif sizes and units per layer

In [13]:
# set up numbers and array to test diff models
array1 = [30,15]

array21 = [30,20,10]
array22 = [20,12,4]

array31 = [40,30,20,10]
array32 = [30,22,14,6]
array33 = [50,35,23,12]

array41 = [40,50,55,40,20]
array42 = [50,60,70,40,20]
array43 = [60,50,40,30,20]

In [14]:
# Testing models

def makeModel(numLayers, layerUnitArray):
    model = Sequential()

    for i in range(numLayers):
        model.add(Dense(layerUnitArray[i], activation = "relu"))
    model.add(Dense(1, activation = "sigmoid"))

    return model


In [15]:
models = np.empty(9, dtype=object)
models[0] = makeModel(2, array1)
models[1] = makeModel(3, array21)
models[2] = makeModel(3, array22)
models[3] = makeModel(4, array31)
models[4] = makeModel(4, array32)
models[5] = makeModel(4, array33)
models[6] = makeModel(5, array41)
models[7] = makeModel(5, array42)
models[8] = makeModel(5, array43)

# Test models on training data - I will comment this out as it take a long time to run.

for i in range(len(models)):
    models[i].compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3),
    )
    
    history = models[i].fit(
        xtrain,ytrain,            
        epochs=40,
    )
    finalLoss = history.history['loss'][-1]
    print((models[i], finalLoss))

# Make model that produced least loss

After model 4, the loss continually went down. However, a model with 6 layers is likely too complex, so I will use the 6th model. The model
with 4 hidden layers and an output layer, with layer sizes of 50,35,23,12 respectively. I will now make the model again on its own.

In [16]:
# make the unregularized model in a prettier way
inpSize = xtrain.shape[1]

model = Sequential(
    [
        tf.keras.Input(shape=(inpSize,)),
        Dense(50, activation = 'relu',   name = "L1"),
        Dense(35, activation = 'relu',   name = "L2"),
        Dense(23, activation = 'relu',   name = "L3"),
        Dense(12, activation = 'relu',   name = "L4"),
        Dense(1, activation = 'sigmoid',   name = "OL"),
    ]
)
model.summary()

# Compile and fit model

In [17]:
model.compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3),
)

history = model.fit(
    xtrain,ytrain,
    epochs=40,
    verbose = 0,
)

final_loss = history.history['loss'][-1]
print(f"Final Loss: {final_loss}")

Final Loss: 0.2738884687423706


Training loss: ~.27

# Now let's compute the test loss

First I will initialize test set, run it through the same conversions as the training, so the data is usable, and then try the model on it.

In [18]:
# find test loss of non scaled, unregularized data
# first will transform test set to match train set
# create path
test_path = "unseen.csv"

# read in file
df = pd.read_csv(test_path)

# change data types to strings
df["Weekend"] = df["Weekend"].astype(str)
df["Revenue"] = df["Revenue"].astype(str)
df["VisitorType"] = df["VisitorType"].astype(str)
df["Month"] = df["Month"].astype(str)

# convert strings to lower
# convert bools to upper
df["VisitorType"] = df["VisitorType"].str.lower()
df["Month"] = df["Month"].str.lower()
df["Weekend"] = df["Weekend"].str.upper()
df["Revenue"] = df["Revenue"].str.upper()

# map
df["VisitorType"] = df["VisitorType"].map(visitor_mapping)
df["Month"] = df["Month"].map(month_mapping)
df["Weekend"] = df["Weekend"].map(bool_mapping)
df["Revenue"] = df["Revenue"].map(bool_mapping)

# convert dataframe to 2d numpy array
test_array = df.to_numpy()

Now I will remove outliers of this dataset also, if any are found, after initializing xtest, ytest.

In [19]:
xtest = test_array[:,0:17]
ytest = test_array[:,17]

from scipy import stats
import numpy as np

# using the z score, i will remove values outside of the threshhold z = 4
# 99.7% of values should be between z = (-3,3), so removing these values above 4 should not hurt our model
z_scores = np.abs(stats.zscore(xtest))  
outliers = np.where(z_scores > 4)

print(len(outliers[1]))

unique_rows = np.unique(outliers[0])

xtest = np.delete(xtest, unique_rows, axis=0)
ytest = np.delete(ytest, unique_rows, axis=0)

print(f"Number of rows removed: {len(unique_rows)}")

65
Number of rows removed: 52


So we found 65 outliers in 52 of the rows. I removed these rows. We still have 948 out of our 1000 observations, but should be much less skewed.

Now to try our model on unseen data

In [20]:
model.evaluate(xtest,ytest)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3169  


0.3279884159564972

Test loss: ~.36

# Function to test models for ease and less code

In [21]:
# create function to test models so i can stop copy pasting 
def fitModel(xt,yt):
    model.compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, clipnorm=1.0),
    )

    history = model.fit(
        xt,yt,
        epochs=100,
        verbose = 0,
    )
    
    final_loss = history.history['loss'][-1]
    print(final_loss)
    return(model)

# Initialize feature scaled training datasets

In [22]:
# initlailize training values for each scaled dataset
minMaxXtrain = minMaxScale(xtrain)

meanNormXtrain = meanNormalization(xtrain)

zScoreXtrain = zScoreNormalization(xtrain)
print(np.min(zScoreXtrain), np.max(zScoreXtrain))

-1.6357728321552973 10.090865202083412


Based on the max z score of 10, there is clearly still an outlier. This is because the mean and std are shifting when the extreme outliers are removed. If I iterated the outlier removal, I would lose over half my data, which I refuse to do.

# Initialize feature scaled test datasets

In [23]:
# initlailize test values for each scaled dataset
minMaxXtest = minMaxScale(xtest)

meanNormXtest = meanNormalization(xtest)

zScoreXtest = zScoreNormalization(xtest)
print(np.min(zScoreXtest), np.max(zScoreXtest))

-1.56910981279221 7.9102912153422364


# Test our model on feature scaled training and test sets

In [24]:
minMaxModel = fitModel(minMaxXtrain, ytrain)

0.25671860575675964


In [25]:
minMaxModel.evaluate(minMaxXtest, ytest)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2962  


0.29451480507850647

In [26]:
meanNormModel = fitModel(meanNormXtrain, ytrain)

0.23635859787464142


In [27]:
meanNormModel.evaluate(meanNormXtest, ytest)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2920  


0.2870081067085266

In [28]:
zScoreModel = fitModel(zScoreXtrain, ytrain)

0.2094227373600006


In [29]:
zScoreModel.evaluate(zScoreXtest, ytest)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3024  


0.29167288541793823

# Results:
MinMax scaled training loss: ~0.24   
MeanNorm scaled training loss: ~0.22  
zScore scaled training loss: ~0.20  
  
MinMax scaled test loss: ~0.28  
MeanNorm scaled test loss: ~0.26  
zScore scaled test loss: ~0.29  
  
I will choose Z Score scaling as my preferred scaling method due to the low losses.

# Regularized Model Initiation

In [30]:
# make the regularized model in a prettier way
inpSize = xtrain.shape[1]

regularizedModel = Sequential(
    [
        tf.keras.Input(shape=(inpSize,)),
        Dense(50, activation = 'relu',   name = "L1",kernel_regularizer = L2(0.01)),
        Dense(35, activation = 'relu',   name = "L2",kernel_regularizer = L2(0.01)),
        Dense(23, activation = 'relu',   name = "L3",kernel_regularizer = L2(0.01)),
        Dense(12, activation = 'relu',   name = "L4",kernel_regularizer = L2(0.01)),
        Dense(1, activation = 'sigmoid',   name = "OL",kernel_regularizer = L2(0.01)),
    ]
)

Test regularized model on mean norm scaled data

In [31]:
regularizedModel.compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, clipnorm=1.0),
    )

history = regularizedModel.fit(
    meanNormXtrain, ytrain,
    epochs=100,
    verbose = 0,
)

final_loss = history.history['loss'][-1]
print(final_loss)

0.386945515871048


In [32]:
regularizedModel.evaluate(meanNormXtest, ytest, verbose = 0)

0.4002377688884735

Not exactly what I wanted, but oh well.