# Shoe Price Determination: Machine Learning using sklearn and Tensorflow Regression Models



Notes: 
1. RandomForestRegressor model may not be the best fit for the data.
2. Further analysis suggests swapping X and y (predicting the brand from the price), which would turn this into a classification problem.

In [1]:
#import libraries
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

Import the Kaggle dataset as a DataFrame
* The data usable for modelling is limited; the best option seems to be brand as the feature and price as the target.
* The roles could also be reversed, predicting the brand from the price.

In [2]:
# Read the Datafiniti women's shoes CSV into the raw DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm it exists in your environment.
csv_path = "/content/drive/My Drive/z_Shoe_prices/Datafiniti_Womens_Shoes.csv"
df = pd.read_csv(csv_path)

In [3]:
# Work on an independent copy so the raw frame `df` keeps its original values,
# then preview the first two raw rows.
df_temp = df.copy(deep=True)
df.head(n=2)

Unnamed: 0,id,dateAdded,dateUpdated,asins,brand,categories,primaryCategories,colors,dimension,ean,imageURLs,keys,manufacturer,manufacturerNumber,name,prices.amountMax,prices.amountMin,prices.availability,prices.color,prices.condition,prices.currency,prices.dateAdded,prices.dateSeen,prices.isSale,prices.merchant,prices.offer,prices.returnPolicy,prices.shipping,prices.size,prices.sourceURLs,sizes,sourceURLs,upc,weight
0,AVpfEf_hLJeJML431ueH,2015-05-04T12:13:08Z,2018-01-29T04:38:43Z,,Naturalizer,"Clothing,Shoes,Women's Shoes,All Women's Shoes...",Shoes,"Silver,Cream Watercolor Floral",,,https://i5.walmartimages.com/asr/861ca6cf-fa55...,"naturalizer/47147sc022,017136472311,womensnatu...",,47147SC022,Naturalizer Danya Women N/S Open Toe Synthetic...,55.99,55.99,,UWomens M Regular,,USD,2017-03-28T11:40:25Z,"2017-03-25T09:19:24.819Z,2017-03-25T09:19:19.600Z",False,Overstock.com,,,,S,https://www.overstock.com/Clothing-Shoes/Women...,"6W,9W,7.5W,12W,8.5M,9N,9M,9.5M,10.5M,10W,8.5W,...",https://www.walmart.com/ip/Naturalizer-Danya-W...,17136472311,
1,AVpi74XfLJeJML43qZAc,2017-01-27T01:23:39Z,2018-01-03T05:21:54Z,,MUK LUKS,"Clothing,Shoes,Women's Shoes,Women's Casual Sh...",Shoes,Grey,,33977050000.0,https://i5.walmartimages.com/asr/421de5d5-3a74...,"mukluks/00173650206,033977045743,muklukswomens...",Muk Luks,0017365020-6,MUK LUKS Womens Jane Suede Moccasin,47.0,35.25,In Stock,Grey,New,USD,2018-01-03T05:21:54Z,"2017-12-08T14:24:00.000Z,2017-11-01T02:52:00.000Z",True,Walmart.com,,,Standard,6,https://www.walmart.com/ip/MUK-LUKS-Womens-Jan...,107698,https://www.walmart.com/ip/MUK-LUKS-Womens-Jan...,33977045743,


In [4]:
# Feature and target for the regression model:
#   feature -> the brand of each listing
#   target  -> the midpoint between the listed maximum and minimum price
X_temp = df['brand']
y = df['prices.amountMax'].add(df['prices.amountMin']).div(2)

X_temp.head(n=5)

0    Naturalizer
1       MUK LUKS
2       MUK LUKS
3       MUK LUKS
4       MUK LUKS
Name: brand, dtype: object

In [5]:
# Check whether the brand column holds strings (i.e. is not numeric yet)
pd.api.types.is_string_dtype(df_temp.loc[:, "brand"])

True

In [6]:
# Count the missing entries in every column
df_temp.isnull().sum(axis=0)

id                         0
dateAdded                  0
dateUpdated                0
asins                   9997
brand                      0
categories                 0
primaryCategories          0
colors                  7369
dimension               9883
ean                     9329
imageURLs                  0
keys                       0
manufacturer            9473
manufacturerNumber      7518
name                       0
prices.amountMax           0
prices.amountMin           0
prices.availability     9566
prices.color               0
prices.condition        9562
prices.currency            0
prices.dateAdded         777
prices.dateSeen            0
prices.isSale              0
prices.merchant         9565
prices.offer            9879
prices.returnPolicy    10000
prices.shipping         9588
prices.size                0
prices.sourceURLs          0
sizes                      0
sourceURLs                 0
upc                      360
weight                  9701
dtype: int64

In [7]:
# Replace every non-numeric column with integer category codes.
non_numeric_columns = [
    column_name
    for column_name in df_temp.columns
    if not pd.api.types.is_numeric_dtype(df_temp[column_name])
]

for column_name in non_numeric_columns:
    category_codes = pd.Categorical(df_temp[column_name]).codes
    # pandas gives missing values the code -1; shifting by one maps them to 0
    df_temp[column_name] = category_codes + 1

df_temp.head(n=5)

Unnamed: 0,id,dateAdded,dateUpdated,asins,brand,categories,primaryCategories,colors,dimension,ean,imageURLs,keys,manufacturer,manufacturerNumber,name,prices.amountMax,prices.amountMin,prices.availability,prices.color,prices.condition,prices.currency,prices.dateAdded,prices.dateSeen,prices.isSale,prices.merchant,prices.offer,prices.returnPolicy,prices.shipping,prices.size,prices.sourceURLs,sizes,sourceURLs,upc,weight
0,390,1,210,0,45,5,1,188,0,,163,527,0,38,427,55.99,55.99,0,506,0,1,3,3,False,5,0,,0,71,686,261,631,1,0
1,456,87,8,0,40,19,1,122,0,33977050000.0,136,521,11,6,407,47.0,35.25,1,282,1,1,1307,1439,True,11,0,,5,38,743,4,628,3,0
2,456,87,8,0,40,19,1,122,0,33977050000.0,136,521,11,6,407,35.25,35.25,1,282,1,1,407,345,False,7,0,,7,38,743,4,628,3,0
3,470,89,20,0,40,10,2,13,16,33977050000.0,172,1,11,7,406,24.75,24.75,1,25,1,1,1325,1415,False,7,0,,7,38,742,4,627,4,0
4,441,89,71,0,40,10,1,122,16,33977050000.0,148,522,0,8,406,33.0,30.39,1,282,1,1,352,520,True,11,0,,1,38,741,4,626,5,0


In [8]:
# Form the final X and y for the model and set up the train/test data.
X = df_temp['brand']
y = (df['prices.amountMax'] + df['prices.amountMin']) / 2

# scikit-learn wants a 2-D feature matrix but a 1-D target for single-output
# regression; a column-vector y makes RandomForestRegressor.fit emit a
# DataConversionWarning. A fixed random_state keeps the split, and every score
# computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values.reshape(-1, 1),
    y.values,
    test_size=0.2,
    random_state=42,
)

# Setup SKlearn Model

In [None]:
# Fit the baseline Random Forest Regressor on the training split.
# random_state pins the forest's randomness so the reported scores can be
# reproduced; np.ravel guarantees the 1-D target that fit() expects
# (a column-vector y triggers a DataConversionWarning).
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(X_train, np.ravel(y_train))

In [10]:
# Functions to Evaluate Model
def rmsle(y_test, y_preds):
    """Return the root mean squared log error between true and predicted values."""
    squared_log_error = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(squared_log_error)

# Create function to evaluate our model
def show_scores(model):
    """Collect train and test metrics for a fitted regressor.

    Reads the notebook-level X_train, X_test, y_train and y_test splits and
    returns a dict holding the MAE, the RMSLE and the value of model.score()
    for each split.
    """
    predictions_train = model.predict(X_train)
    predictions_test = model.predict(X_test)

    scores = {}
    scores["Training MAE"] = mean_absolute_error(y_train, predictions_train)
    scores["Valid MAE"] = mean_absolute_error(y_test, predictions_test)
    scores["Training RMSLE"] = rmsle(y_train, predictions_train)
    scores["Valid RMSLE"] = rmsle(y_test, predictions_test)
    scores["Training R^2 (accuracy)"] = model.score(X_train, y_train)
    scores["Test R^2 (accuracy)"] = model.score(X_test, y_test)
    return scores

In [11]:
# Metrics (MAE, RMSLE, R^2) of the baseline random forest on the train and test splits
show_scores(model)

{'Test R^2 (accuracy)': 0.6774219895267201,
 'Training MAE': 7.672458186267788,
 'Training RMSLE': 0.18100585826910057,
 'Training R^2 (accuracy)': 0.6793651392598746,
 'Valid MAE': 8.14321619046561,
 'Valid RMSLE': 0.18814072759810913}

# Improve SKlearn Model with Randomized search

In [None]:
# Hyperparameter search space for the RandomForestRegressor
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where
           # it makes the candidate fit fail. For a regressor it meant "use all
           # features", which 1.0 states explicitly.
           "max_features": [0.5, 1, "sqrt", 1.0],
           }

# Randomly sample 20 candidates from the grid and score each with 5-fold CV.
# random_state on both the search and the estimator makes the run repeatable.
rs_model = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=5,
                              random_state=42,
                              verbose=True)

# np.ravel hands fit() the 1-D target it expects for single-output regression
# (a column-vector y triggers a DataConversionWarning on every candidate fit).
rs_model.fit(X_train, np.ravel(y_train))

In [13]:
# Find best Parameters: the sampled combination that scored best in the randomized search
rs_model.best_params_

{'max_depth': 10,
 'max_features': 0.5,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 60}

In [None]:
# Re-train a forest with the hyperparameters selected by the randomized search.
# Reading them from rs_model.best_params_ (instead of copying the values out of
# a previous run's output) keeps this cell in sync whenever the search is re-run.
# max_samples is left at its default (None), as before.
best_param_model = RandomForestRegressor(**rs_model.best_params_,
                                         n_jobs=-1,
                                         random_state=42)
# np.ravel guarantees the 1-D target that fit() expects.
best_param_model.fit(X_train, np.ravel(y_train))

In [22]:
# Same metrics for the tuned forest, for comparison with the baseline model's scores
show_scores(best_param_model)

{'Test R^2 (accuracy)': 0.6528212179530076,
 'Training MAE': 7.906661482272987,
 'Training RMSLE': 0.18904687728547917,
 'Training R^2 (accuracy)': 0.6455586239088651,
 'Valid MAE': 8.339397127088935,
 'Valid RMSLE': 0.19468645198291645}

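Note: The Random Forest Regressor did not perform well, and tuning did not help: in the recorded run, the model built with the best parameters from the randomized search scored slightly worse than the default model (test R^2 of about 0.653 vs. 0.677, validation MAE of about 8.34 vs. 8.14).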

# Setup Tensorflow Model

In [16]:
# TODO: Complete Tensorflow model and test data.

In [17]:
def build_model(n_features=None):
  """Build and compile a small fully connected Keras regression network.

  The network has two 64-unit ReLU hidden layers followed by a single linear
  output unit. It is compiled with RMSprop (learning rate 0.001), MSE loss
  and MAE/MSE metrics.

  Args:
    n_features: Number of input columns per sample. When omitted, the width
      of the notebook-level X_train matrix is used.

  Returns:
    The compiled keras.Sequential model.
  """
  if n_features is None:
    # X is a 1-D Series, so len(X.keys()) is the number of rows, not the
    # number of features; the 2-D training matrix carries the real width.
    n_features = X_train.shape[1]

  model = keras.Sequential([
    # input_shape must be a tuple describing one sample, not a bare int.
    keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [18]:
# model_TF = build_model()