# **Installing libraries**

In [None]:

#importing libraries
!pip install scikit-surprise
!pip install --upgrade category_encoders
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
import math
import re

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Normalizer , scale
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split , StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import precision_recall_fscore_support as metricScore
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Keras NN helpers
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )

#Encoding
import category_encoders as ce

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 269kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670923 sha256=4a3596c1075455611ab40a6471fa192d9aff09528a9a5891a1d075b87c0b69af
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1
Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)


# **Brainstorming**

# **References**

- Classification report sklearn library. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)
- SVC Sklearn [Link](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
- SVC + GridSearchCV [Link](https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0)
- Logistic Regression Sklearn [Link](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- Logistic Regression + GridSearchCV [Link](https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5) [Link](https://www.kaggle.com/enespolat/grid-search-with-logistic-regression)
- Decision Tree Sklearn [Link](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
- Decision Tree + GridSearchCV [Link](https://medium.com/analytics-vidhya/decisiontree-classifier-working-on-moons-dataset-using-gridsearchcv-to-find-best-hyperparameters-ede24a06b489)
- Random Forest Classifier [Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
- Random Forest Classifier + GridSearchCV [Link](https://jamesrledoux.com/code/grid_search)
- Keras NN Tutorial [Link](https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/)
- GridSearchCV Keras NN Tutorial [Link](https://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/)
- Keras Activation Function [Link](https://keras.io/api/layers/activations/)
- Keras Categorical CrossEntropy Loss [Link](https://keras.io/api/losses/probabilistic_losses/#categoricalcrossentropy-class)
- Keras Initializers [Link](https://keras.io/api/layers/initializers/)
- Keras Optimizers [Link](https://keras.io/api/optimizers/)
- GridSearchCV [Link](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
- Ordinal Encoding + Binary Encoding [Link](https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/)


# **Setting up helper functions**

In [None]:
# DataFrame function takes in an iterable
def findPercentageOfDataMissingInEachColumnOfDataframe(df, total):
    missingPercentageDataframe = pd.DataFrame({'numberOfCellsMissingValues':df.isnull().sum(), 'percentageMissing':(df.isnull().sum()*100)/total})
    missingPercentageDataframe = missingPercentageDataframe.sort_values(by=['percentageMissing'], ascending=False)
    print(missingPercentageDataframe)
    
# convert height from ft and inch to cm
# 1 ft = 30.48cm
# 1 inch = 2.54cm
# input value can be 5ft 6in or 5' 6"
def convertHeightIntoCm(value):
    # to handle nan which is of type float
    if isinstance(value,float):
        return value
    heightInCm=0.0
    splittedHeight = value.split()
    #ft part
    if(splittedHeight[0][-1] == "'"):
        heightInCm = int(splittedHeight[0][0:-1]) * 30.48
    else:
        heightInCm = int(splittedHeight[0][0:-2]) * 30.48
    #inch part
    if len(splittedHeight) > 1:
        if(splittedHeight[1][-1] == '"'):
            heightInCm = heightInCm + int(splittedHeight[1][0:-1]) * 2.54
        else:
            heightInCm = heightInCm + int(splittedHeight[1][0:-2]) * 2.54
#     print("Converted {} to {}".format(value,heightInCm))
    return round(heightInCm, 2)

# convert 57lbs to 57
def removeLBSfromWeight(value):
  if isinstance(value,float):
    return value
  return value[:-3]

def roundToNearestPointFive(value):
  if math.isnan(value):
    return value
  return round(value * 2) / 2

# imputing height by taking means of groups of different category of bra size, cup size & bust in increasing order of missing percentage
# Example take mean of height value where bra size is 34. Place that mean for na cells of height which have bra size 34.
# Example take mean of imputed_height value where shoe size is 34. Place that mean for na cells of imputed_height which have shoe size 34.
def imputeHeightForModCloth(df):
  df['imputed_height'] = df['height'].fillna(df.groupby('bra size')['height'].transform('mean'))
  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('cup size')['height'].transform('mean'))
  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('bust')['height'].transform('mean'))

  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('shoe size')['height'].transform('mean'))
  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('shoe width')['height'].transform('mean'))

  heightMeanValue = df['height'].mean()
  df['imputed_height'] = df['imputed_height'].fillna(heightMeanValue)
  return df

# Imputing shoe size by sequential grouping by imputed height, hips and bra size. This is in decreasing order of correlation with shoe size.
def imputeShoeSizeForModCloth(df):
  df['imputed_shoeSize'] = df['shoe size'].fillna(df.groupby('imputed_height')['shoe size'].transform('mean'))
  df['imputed_shoeSize'] = df['imputed_shoeSize'].fillna(df.groupby('hips')['shoe size'].transform('mean'))
  df['imputed_shoeSize'] = df['imputed_shoeSize'].fillna(df.groupby('bra size')['shoe size'].transform('mean'))

  shoeSizeMeanValue = df['shoe size'].mean()
  df['imputed_shoeSize'] = df['imputed_shoeSize'].fillna(shoeSizeMeanValue)
  df['imputed_shoeSize'] = df['imputed_shoeSize'].apply(roundToNearestPointFive)
  return df

# Since we have already imputed all shoe size, using it to group by and filling in with mode of each grouped category.
def imputeShoeWidthForModCloth(df):
  df['imputed_shoeWidth'] = df['shoe width'].fillna(df.groupby('imputed_shoeSize')['shoe width'].transform(lambda x: x.value_counts().index[0]))
  return df

# imputing height by taking means of groups of different category of body type, bust size & weight in increasing order of missing percentage
# Example take mean of height value where bra size is 34. Place that mean for na cells of height which have bra size 34.
# Example take mean of imputed_height value where shoe size is 34. Place that mean for na cells of imputed_height which have shoe size 34.
def imputeHeightForRentTheRunWay(df):
  df['imputed_height'] = df['height'].fillna(df.groupby('body type')['height'].transform('mean'))
  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('bust size')['height'].transform('mean'))
  df['imputed_height'] = df['imputed_height'].fillna(df.groupby('weight')['height'].transform('mean'))

  heightMeanValue = df['height'].mean()
  df['imputed_height'] = df['imputed_height'].fillna(heightMeanValue)
  return df

def imputeWeightForRentTheRunWay(df):
  df['imputed_weight'] = df['weight'].fillna(df.groupby('size')['weight'].transform('mean'))
  df['imputed_weight'] = df['imputed_weight'].fillna(df.groupby('imputed_height')['weight'].transform('mean'))
  #check correlation between bust and weight and include it in computing weight

  weightMeanValue = df['weight'].mean()
  df['imputed_weight'] = df['imputed_weight'].fillna(weightMeanValue)
  return df

def imputeAgeForRentTheRunWay(df):
  ageMeanValue = df['age'].mean()
  df['age'] = df['age'].fillna(df.groupby('size')['age'].transform('mean'))
  df['age'] = df['age'].fillna(ageMeanValue)
  return df

def imputeHipsForModCloth(df):
  hipsMeanValue = df['hips'].mean()
  df['imputed_hips'] = df['hips'].fillna(df.groupby('size')['hips'].transform('mean'))
  df['imputed_hips'] = df['imputed_hips'].fillna(hipsMeanValue)
  return df



#imputing Bra Size by considering size and imputed hips.
def imputeBraSizeForModCloth(df):
  df['imputed_brasize'] = df['bra size'].fillna(df.groupby('size')['bra size'].transform('mean'))
  df['imputed_brasize'] = df['bra size'].fillna(df.groupby('imputed_hips')['bra size'].transform('mean'))
  braSizeMeanValue = df['bra size'].mean()
  df['imputed_brasize'] = df['imputed_brasize'].fillna(braSizeMeanValue)
  return df

#round odd bra sizes to even by adding 1 to them as suggested in referred mateiral
def roundToEvenBraSize(df):
  for i in range(len(df)) : 
    if df.loc[i, 'imputed_brasize'] % 2 !=0:
      df.loc[i, 'imputed_brasize'] = df.loc[i, 'imputed_brasize'] + 1
  return df
  
#imputing by grouping by bra size categories and finding out mode
def imputeCupSizeForModCloth(df):
  df['imputed_cupsize'] = df['cup size'].fillna(df.groupby('imputed_brasize')['cup size'].transform(lambda x: x.value_counts().index[0]))
  return df

#combine bra size and cup size to create new column
def combineBustSizeForModCloth(df):
  df['imputed_bustsize']=df['imputed_brasize'].str.cat(df['imputed_cupsize'], sep ="")
  return df

#imputing bust size by grouping categories and taking mode
def imputeBustSizeForRentTheRunWay(df):
  df['imputed_bustsize'] = df['bust size'].fillna(df.groupby('category')['bust size'].transform(lambda x: x.value_counts().index[0]))
  return df

#imputing bust sizes that are not currently in universal format
# d+ -> dd/e
# dd -> dd/e
# ddd/e -> ddd/f
# f -> ddd/f
def convertBustSizeForRentTheRunWay(df):
  for i in range(len(df)) :
    # print (df.loc[i, 'imputed_bustsize'])
    if re.search("d\+$", df.loc[i, 'imputed_bustsize']):
      # print('yes')
      df.loc[i,'imputed_bustsize'] = re.sub('(d\+)$', 'dd/e',df.loc[i,'imputed_bustsize'])  
    elif re.search("dd$", df.loc[i, 'imputed_bustsize']):
      # print('yes1')
      df.loc[i,'imputed_bustsize'] = re.sub('(dd)$', 'dd/e',df.loc[i,'imputed_bustsize'])
    elif re.search("ddd\/e$", df.loc[i, 'imputed_bustsize']):
      # print('yes2')
      df.loc[i,'imputed_bustsize'] = re.sub('(ddd\/e)$', 'ddd/f',df.loc[i,'imputed_bustsize'])
    elif re.search("f$", df.loc[i, 'imputed_bustsize']):
      # print('yes3')
      df.loc[i,'imputed_bustsize'] = re.sub('(f)$', 'ddd/f',df.loc[i,'imputed_bustsize'])
  return df 

def convertBustSizeToInches(df):
  for i in range(len(df)) :
    
    a=df.loc[i, 'imputed_bustsize']
    x=slice(0,2)
    y=slice(2,len(df.loc[i, 'imputed_bustsize']))
    cup=0
    if a[y]=="aa":
      cup=0
    elif a[y]=="a":
      cup=1
    elif a[y]=="b":
      cup=2
    elif a[y]=="c":
      cup=3
    elif a[y]=="d":
      cup=4
    elif a[y]=="dd/e":
      cup=5
    elif a[y]=="ddd/f":
      cup=6
    elif a[y]=="g":
      cup=7
    elif a[y]=="h":
      cup=8
    elif a[y]=="i":
      cup=9
    elif a[y]=="j":
      cup=10
    elif a[y]=="k":
      cup=11
    
    df.loc[i,'bustsize_in_inches']=int(a[x])+cup
  return df

def imputeLengthForModCloth(df):
  df['imputed_length'] = df['length'].fillna(df.groupby(['category','fit'])['length'].transform(lambda x: x.value_counts().index[0]))
  return df

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def frequencyCountByValue(df, col, datasetName ,ax):
    df[col][df[col].notnull()].value_counts().plot(kind='bar', facecolor='r', ax=ax)
    ax.set_xlabel('{}'.format(col), fontsize=80)
    ax.set_title("{} on {}".format(col, datasetName), fontsize= 80)
    return ax

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    hue = kwargs.get( 'hue' , None )
    facet = sns.FacetGrid( df , hue=hue , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , target ,shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    hue = kwargs.get( 'hue' , None )
    facet = sns.FacetGrid( df , aspect=6 , row = row , col = col,hue = hue )
    facet.map( sns.barplot , cat , target)
    facet.set_xticklabels(rotation=90)
    facet.add_legend()

def plot_barh(df,col, cmap = None, stacked=False, norm = None):
    df.plot(kind='barh', colormap=cmap, stacked=stacked)
    fig = plt.gcf()
    fig.set_size_inches(24,12)
    plt.title("Category vs {}-feedback -  Modcloth {}".format(col, '(Normalized)' if norm else ''), fontsize= 20)
    plt.ylabel('Category', fontsize = 18)
    plot = plt.xlabel('Frequency', fontsize=18)

def plot_barhRR(df,col, cmap = None, stacked=False, norm = None):
    df.plot(kind='barh', colormap=cmap, stacked=stacked)
    fig = plt.gcf()
    fig.set_size_inches(24,12)
    plt.title("Category vs {}-feedback -  Rent the Runway  {}".format(col, '(Normalized)' if norm else ''), fontsize= 20)
    plt.ylabel('Category', fontsize = 18)
    plot = plt.xlabel('Frequency', fontsize=18)
    
def norm_counts(t):
    norms = np.linalg.norm(t.fillna(0), axis=1)
    t_norm = t[0:0]
    for row, euc in zip(t.iterrows(), norms):
        t_norm.loc[row[0]] = list(map(lambda x: x/euc, list(row[1])))
    return t_norm

def imputeBodyType(df):
  df['imputed_bodyType'] = df['body type'].fillna(df.groupby('size')['body type'].transform(lambda x: x.value_counts().index[0]))
  return df

def plotFactorPlot(arr):
  vars = arr
  for variable in vars:
      var_df = data_renttherunway_df.groupby(variable)["binary_rating"].agg(["mean", "count"]).reset_index()
      var_df.sort_values(["mean"], ascending = False, inplace = True)
      sns.factorplot(y  = variable, x = "mean", 
                    orient = "h", 
                    hue = "count",
                    palette = "Set2",
                    size = 3, aspect = 3,
                    data = var_df)
      plt.title(variable + " " + "binary rating Plot")

# **Mounting Google Drive**

I have stored the dataset files in my private Google Drive. The way to mount google drive on Google colab was studied from the Colab's [getting started tutorial](https://colab.research.google.com/notebooks/io.ipynb).

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **ModCloth Data Processing**

[ModCloth](https://www.modcloth.com/) sells women’s vintage clothing and accessories. The data was collected when user's purchased an item from category such as tops, bottoms, dresses etc. Additionally, when user returned an item, the reason was asked, and thus that transaction is included in the data too.

## Reading data from file

In [None]:
# There is one JSON object per line, and there are multiple lines, so reading entire file as one object per line
data_modcloth_df=pd.read_json('/content/drive/My Drive/CMPE256_CourseProjectShortcut/CMPE256_CourseProject/Datasets/modcloth.json', lines=True)
data_modcloth_df.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,


In [None]:
data_modcloth_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup size        76535 non-null  object 
 5   hips            56064 non-null  float64
 6   bra size        76772 non-null  float64
 7   category        82790 non-null  object 
 8   bust            11854 non-null  object 
 9   height          81683 non-null  object 
 10  user_name       82790 non-null  object 
 11  length          82755 non-null  object 
 12  fit             82790 non-null  object 
 13  user_id         82790 non-null  int64  
 14  shoe size       27915 non-null  float64
 15  shoe width      18607 non-null  object 
 16  review_summary  76065 non-null  object 
 17  review_text     76065 non-null 

In [None]:
data_modcloth_df.describe()

Unnamed: 0,item_id,waist,size,quality,hips,bra size,user_id,shoe size
count,82790.0,2882.0,82790.0,82722.0,56064.0,76772.0,82790.0,27915.0
mean,469325.22917,31.319223,12.661602,3.949058,40.358501,35.972125,498849.564718,8.145818
std,213999.803314,5.302849,8.271952,0.992783,5.827166,3.224907,286356.969459,1.336109
min,123373.0,20.0,0.0,1.0,30.0,28.0,6.0,5.0
25%,314980.0,28.0,8.0,3.0,36.0,34.0,252897.75,7.0
50%,454030.0,30.0,12.0,4.0,39.0,36.0,497913.5,8.0
75%,658440.0,34.0,15.0,5.0,43.0,38.0,744745.25,9.0
max,807722.0,50.0,38.0,5.0,60.0,48.0,999972.0,38.0


## Height

In [None]:
data_modcloth_df['height'] = data_modcloth_df['height'].apply(convertHeightIntoCm)

In [None]:
data_modcloth_df = imputeHeightForModCloth(data_modcloth_df)
data_modcloth_df = data_modcloth_df.round({'imputed_height':2})

## Shoe Size & Shoe Width

In [None]:
#Handling shoe size outlier. We have one record which has shoe size as 38. We replace it with nan before imputing unknown shoe size values
data_modcloth_df['shoe size'][data_modcloth_df['shoe size'] == 38] = float('nan')

In [None]:
data_modcloth_df = imputeShoeSizeForModCloth(data_modcloth_df)

In [None]:
# Grouping by imputed shoe size and taking mode of shoe width in each group. Then checking user's imputed size and filling in the moded shoe width
data_modcloth_df = imputeShoeWidthForModCloth(data_modcloth_df)

## Hips

In [None]:
data_modcloth_df = imputeHipsForModCloth(data_modcloth_df)
data_modcloth_df = data_modcloth_df.round({'imputed_hips':2})

## Bra Size, Cup Size & Bust Size

In [None]:
data_modcloth_df = imputeBraSizeForModCloth(data_modcloth_df) #imputing bra sizes
data_modcloth_df = data_modcloth_df.round({'imputed_brasize':0}) #rounding off the imputed values
data_modcloth_df = roundToEvenBraSize(data_modcloth_df) #convert odd bra sizes to even by adding 1 as suggested in referred material
data_modcloth_df.imputed_brasize=data_modcloth_df.imputed_brasize.astype(int) #converting from float to int
data_modcloth_df.imputed_brasize=data_modcloth_df.imputed_brasize.astype(str) #converting to string object

In [None]:
data_modcloth_df = imputeCupSizeForModCloth(data_modcloth_df) #impute cupsize
data_modcloth_df = combineBustSizeForModCloth(data_modcloth_df) #combine brasize and cupsize to form new column

In [None]:
data_modcloth_df=convertBustSizeToInches(data_modcloth_df) #convert bust sizes to inches

## Quality/Rating

In [None]:
qualityMeanModCloth = round(data_modcloth_df['quality'].mean())
data_modcloth_df['quality'] = data_modcloth_df['quality'].fillna(qualityMeanModCloth)

## Length

In [None]:
# data_modcloth_df.groupby(['category','fit'])['length'].value_counts().head(30)
print(data_modcloth_df['length'].isnull().sum())
data_modcloth_df = imputeLengthForModCloth(data_modcloth_df)
print(data_modcloth_df['imputed_length'].isnull().sum())

35
0


## Some Preprocessing (dropping extra columns)

1. Dropping column waist as it has a lot of missing values- around 95%.
2. Dropping some more columns which we feel are not needed as they won't be contributing towards our prediction for fit feature- user_name,user_id,item_id
3. Removing review_summary and review_text because though they can be of use, but to process text, Natural Language Processing (NLP) has to be applied. We are considering this section to be part of our future work.
4. Removing column length as it is one of the user's feedback, i.e. target class, and it will not be used for our predictive model. If time allows, we will be making another model to predict feature length.
5. Dropping some columns, which have already been imputed into new columns - height,shoe size, shoe width, hips, bra size, cup size, bust
6. As we have calculated bustsize in inches, we donot need the following columns anymore- imputed_brasize,imputed_cupsize, imputed_bustsize,imputed_length
7. After dropping, rename all columns by removing prefix imputed and adding '_'

In [None]:
data_modcloth_df_filtered=data_modcloth_df.copy()

In [None]:
# Dropping unnecessary columns.
data_modcloth_df_filtered.drop(['waist'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['user_name'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['user_id'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['item_id'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['review_summary'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['review_text'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['length'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['imputed_length'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['height'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['shoe size'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['shoe width'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['hips'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['bra size'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['cup size'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['bust'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['imputed_brasize'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['imputed_cupsize'], axis=1, inplace=True)
data_modcloth_df_filtered.drop(['imputed_bustsize'], axis=1, inplace=True)

In [None]:
# Renaming the columns
data_modcloth_df_filtered.columns = ['size', 'quality', 'category', 'fit', 'height', 'shoe_size', 'shoe_width', 'hips', 'bust']

In [None]:
data_modcloth_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   size        82790 non-null  int64  
 1   quality     82790 non-null  float64
 2   category    82790 non-null  object 
 3   fit         82790 non-null  object 
 4   height      82790 non-null  float64
 5   shoe_size   82790 non-null  float64
 6   shoe_width  82790 non-null  object 
 7   hips        82790 non-null  float64
 8   bust        82790 non-null  float64
dtypes: float64(5), int64(1), object(3)
memory usage: 5.7+ MB


- So now, we have 3 categorical features, i.e. category, fit and shoe_width which we need to convert into numerical values before feeding them to our ML algorithms. 
- Out of these 3, fit feature is our target variable.

In [None]:
# Finding out the unique values that our categorical features have
print(data_modcloth_df_filtered['category'].unique())
print(data_modcloth_df_filtered['fit'].unique())
print(data_modcloth_df_filtered['shoe_width'].unique())

['new' 'dresses' 'wedding' 'sale' 'tops' 'bottoms' 'outerwear']
['small' 'fit' 'large']
['average' 'wide' 'narrow']


## Encoding categorical features



1. Fit feature is our target feature and we can do ordinal encoding as we have 3 classes in order, small,fit and large, with values 0,1,2.


2. Similarly, shoe_width can also be encoded using ordinal encoding as we have 3 classes that too in order. So, narrow,average,wide can be encoded to 0,1,2.


3. For our variable category,we can perform Binary encoding on the classes.


4. This is because,if we do one hot encoding then we will have 7 new features.If there are a lot of categories for one hot encoding, it creates a phenomenon called dummy variable trap. It slows down learning of the model and also makes it ineffective. Due to the massive increase in the dataset, coding slows down the learning of the model along with deteriorating the overall performance that ultimately makes the model computationally expensive. Further, while using tree-based models these encodings are not an optimum choice.


5. Binary encoding is a combination of Hash encoding and one-hot encoding. In this encoding scheme, the categorical feature is first converted into numerical using an ordinal encoder. Then the numbers are transformed in the binary number. After that binary value is split into different columns.Binary encoding works really well when there are a high number of categories. For example the cities in a country where a company supplies its products.Binary encoding is a memory-efficient encoding scheme as it uses fewer features than one-hot encoding. Further, It reduces the curse of dimensionality for data with high cardinality.So, it can be applied to our category feature.


In [None]:
# Encoding fit category with ordinal encoding
encoder= ce.OrdinalEncoder(cols=['fit'],return_df=True,
                           mapping=[{'col':'fit','mapping':{'small':0,'fit':1,'large':2}}])
data_modcloth_df_filtered = encoder.fit_transform(data_modcloth_df_filtered)

In [None]:
# Encoding shoe_width category with ordinal encoding
encoder_shoe_width= ce.OrdinalEncoder(cols=['shoe_width'],return_df=True,
                           mapping=[{'col':'shoe_width','mapping':{'narrow':0,'average':1,'wide':2}}])
data_modcloth_df_filtered  = encoder_shoe_width.fit_transform(data_modcloth_df_filtered )

In [None]:
# Encoding category with Binary Encoding
encoder_binary= ce.BinaryEncoder(cols=['category'],return_df=True)
data_modcloth_df_filtered =encoder_binary.fit_transform(data_modcloth_df_filtered ) 

In [None]:
data_modcloth_df_filtered.head(10)

Unnamed: 0,size,quality,category_0,category_1,category_2,category_3,fit,height,shoe_size,shoe_width,hips,bust
0,7,5.0,0,0,0,1,0,167.64,8.5,1,38.0,38.0
1,13,3.0,0,0,0,1,0,157.48,7.0,1,30.0,38.0
2,7,2.0,0,0,0,1,0,170.18,9.0,1,36.97,34.0
3,21,5.0,0,0,0,1,1,165.53,8.0,1,44.79,45.0
4,18,5.0,0,0,0,1,0,157.48,7.0,1,44.13,38.0
5,11,5.0,0,0,0,1,0,162.56,8.0,1,41.0,39.0
6,5,1.0,0,0,0,1,2,160.02,7.5,1,35.29,34.0
7,11,5.0,0,0,0,1,0,165.1,8.5,1,42.0,42.0
8,30,4.0,0,0,0,1,0,177.8,11.0,2,50.0,46.0
9,13,5.0,0,0,0,1,1,167.64,9.0,1,41.0,41.0


In [None]:
data_modcloth_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   size        82790 non-null  int64  
 1   quality     82790 non-null  float64
 2   category_0  82790 non-null  int64  
 3   category_1  82790 non-null  int64  
 4   category_2  82790 non-null  int64  
 5   category_3  82790 non-null  int64  
 6   fit         82790 non-null  int64  
 7   height      82790 non-null  float64
 8   shoe_size   82790 non-null  float64
 9   shoe_width  82790 non-null  int64  
 10  hips        82790 non-null  float64
 11  bust        82790 non-null  float64
dtypes: float64(5), int64(7)
memory usage: 7.6 MB


- As we can see above, though category had 6 classes, after binary encoding , it got converted to only 4 new variables. This helps in dimensionality reduction and prevents dummy variable trap
- As of now we only have 12 features to work on.
- Also, all the features are numerical now.
- We can now pass our dataframe data_modcloth_df_filtered to our Machine Learning Algorithms

# **RenTheRunWay Data Processing**

## Reading data from file

In [None]:
# There is one JSON object per line, and there are multiple lines, so reading entire file as one object per line
data_renttherunway_df=pd.read_json('/content/drive/My Drive/CMPE256_CourseProjectShortcut/CMPE256_CourseProject/Datasets/renttherunway.json', lines=True)
data_renttherunway_df.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


In [None]:
data_renttherunway_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             192544 non-null  object 
 1   user_id         192544 non-null  int64  
 2   bust size       174133 non-null  object 
 3   item_id         192544 non-null  int64  
 4   weight          162562 non-null  object 
 5   rating          192462 non-null  float64
 6   rented for      192534 non-null  object 
 7   review_text     192544 non-null  object 
 8   body type       177907 non-null  object 
 9   review_summary  192544 non-null  object 
 10  category        192544 non-null  object 
 11  height          191867 non-null  object 
 12  size            192544 non-null  int64  
 13  age             191584 non-null  float64
 14  review_date     192544 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 22.0+ MB


In [None]:
data_renttherunway_df.describe()

Unnamed: 0,user_id,item_id,rating,size,age
count,192544.0,192544.0,192462.0,192544.0,191584.0
mean,499494.100149,1045684.0,9.092371,12.245175,33.871017
std,289059.719328,805314.8,1.430044,8.494877,8.058083
min,9.0,123373.0,2.0,0.0,0.0
25%,250654.25,195076.0,8.0,8.0,29.0
50%,499419.0,948396.0,10.0,12.0,32.0
75%,750974.0,1678888.0,10.0,16.0,37.0
max,999997.0,2966087.0,10.0,58.0,117.0


## Height

In [None]:
data_renttherunway_df['height'] = data_renttherunway_df['height'].apply(convertHeightIntoCm)
data_renttherunway_df.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,172.72,14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,167.64,12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,162.56,4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,165.1,8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,175.26,12,27.0,"September 26, 2016"


Removing lbs from weight column and making it float type

In [None]:
data_renttherunway_df['weight'] = data_renttherunway_df['weight'].apply(removeLBSfromWeight)
data_renttherunway_df['weight'] = data_renttherunway_df['weight'].astype(float)

In [None]:
# imputing values for height using other user body measurements - body type, bust size & weight - in inc. order of missing percentage
data_renttherunway_df = imputeHeightForRentTheRunWay(data_renttherunway_df)
data_renttherunway_df = data_renttherunway_df.round({'imputed_height':2})

## Bra Size, Cup Size & Bust Size

In [None]:
data_renttherunway_df = imputeBustSizeForRentTheRunWay(data_renttherunway_df) #imputed missing bust sizes based on grouping category column and taking mode

In [None]:
data_renttherunway_df = convertBustSizeForRentTheRunWay(data_renttherunway_df) #converting inconsistent sizes

In [None]:
data_renttherunway_df=convertBustSizeToInches(data_renttherunway_df) #convert bust sizes to inches

## Weight

In [None]:
data_renttherunway_df = imputeWeightForRentTheRunWay(data_renttherunway_df)

## Age

In [None]:
data_renttherunway_df = imputeAgeForRentTheRunWay(data_renttherunway_df)

### Handling age outlier

In [None]:
ageOutliersCount = data_renttherunway_df.loc[((data_renttherunway_df['age'] == 0) | (data_renttherunway_df['age'] > 90), 'age')].shape[0]
ageOutliersCount

111

In [None]:
ageMeanValueRounded = round(data_renttherunway_df['age'].mean())
ageMeanValueRounded

34

In [None]:
data_renttherunway_df.loc[((data_renttherunway_df['age'] == 0) | (data_renttherunway_df['age'] > 90), 'age')] = ageMeanValueRounded

## Quality/Rating

In [None]:
qualityMeanRentTheRunWay = round(data_renttherunway_df['rating'].mean())
data_renttherunway_df['imputedRating'] = data_renttherunway_df['rating'].fillna(qualityMeanRentTheRunWay)
data_renttherunway_df['imputedRating'] = data_renttherunway_df['imputedRating'].div(2).round()

## Body Type

In [None]:
data_renttherunway_df=imputeBodyType(data_renttherunway_df)

In [None]:
data_renttherunway_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fit                 192544 non-null  object 
 1   user_id             192544 non-null  int64  
 2   bust size           174133 non-null  object 
 3   item_id             192544 non-null  int64  
 4   weight              162562 non-null  float64
 5   rating              192462 non-null  float64
 6   rented for          192534 non-null  object 
 7   review_text         192544 non-null  object 
 8   body type           177907 non-null  object 
 9   review_summary      192544 non-null  object 
 10  category            192544 non-null  object 
 11  height              191867 non-null  float64
 12  size                192544 non-null  int64  
 13  age                 192544 non-null  float64
 14  review_date         192544 non-null  object 
 15  imputed_height      192544 non-nul

## Some Processing (Dropping extra columns)

1. Dropping some columns which we feel are not needed as they won't be contributing towards our prediction for fit feature- user_id,item_id


2. Removing review_summary, review_text, review_date because though they can be of use, but to process text, Natural Language Processing (NLP) has to be applied. We are considering this section to be part of our future work.


3. Dropping some columns, which have already been imputed into new columns - height,bust size,rating, body type,weight


4. As we have calculated bustsize in inches, we donot need the following column anymore- imputed_bustsize


5. Removing column age as it does not contribute towards our predictive model as for a wide age range like 20-35 or maybe 40, alot of people can have the same fit.


6. Dropping column rented for as well, again for the same reason. It has no role in predicting if a cloth would fit the user or not.


7. After dropping, rename all columns by removing prefix imputed and adding '_'

In [None]:
# Removing unnecessary columns
data_renttherunway_df_filtered=data_renttherunway_df.copy()
data_renttherunway_df_filtered.drop(['user_id'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['item_id'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['review_text'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['review_summary'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['review_date'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['height'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['bust size'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['rating'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['body type'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['imputed_bustsize'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['age'], axis=1, inplace=True)
data_renttherunway_df_filtered.drop(['rented for'], axis=1, inplace=True)

In [None]:
data_renttherunway_df_filtered.drop(['weight'], axis=1, inplace=True)

In [None]:
# Renaming the columns
data_renttherunway_df_filtered.columns = ['fit','category','size','height','bust_size','weight','rating','body_type']

In [None]:
data_renttherunway_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   fit        192544 non-null  object 
 1   category   192544 non-null  object 
 2   size       192544 non-null  int64  
 3   height     192544 non-null  float64
 4   bust_size  192544 non-null  float64
 5   weight     192544 non-null  float64
 6   rating     192544 non-null  float64
 7   body_type  192544 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 11.8+ MB


- So now, we have 3 categorical features, i.e. category, fit and body_type which we need to convert into numerical values before feeding them to our ML algorithms. 
- Out of these 3, fit feature is our target variable.

In [None]:
# Seeing what unique values do our categorical features have.
print(data_renttherunway_df_filtered['category'].unique())
print(data_renttherunway_df_filtered['fit'].unique())
print(data_renttherunway_df_filtered['body_type'].unique())

['romper' 'gown' 'sheath' 'dress' 'leggings' 'top' 'jumpsuit' 'sweater'
 'jacket' 'shirtdress' 'maxi' 'shift' 'pants' 'shirt' 'mini' 'skirt'
 'pullover' 'blouse' 'suit' 'coat' 'trench' 'bomber' 'cape' 'blazer'
 'vest' 'duster' 'ballgown' 'tank' 'poncho' 'frock' 'tunic' 'cardigan'
 'culottes' 'down' 'trouser' 'midi' 'pant' 'legging' 'print' 'knit'
 'culotte' 'sweatshirt' 'peacoat' 'kaftan' 'overalls' 'jogger' 'tee'
 'combo' 'henley' 'cami' 'blouson' 'turtleneck' 'trousers' 'overcoat'
 'hoodie' 't-shirt' 'caftan' 'tight' 'kimono' 'for' 'crewneck' 'skirts'
 'parka' 'buttondown' 'skort' 'sweatershirt' 'sweatpants' 'jeans']
['fit' 'small' 'large']
['hourglass' 'straight & narrow' 'petite' 'pear' 'athletic' 'full bust'
 'apple']


In [None]:
data_renttherunway_df_filtered['category'][data_renttherunway_df_filtered['category'].notnull()].value_counts(normalize=True,dropna=False).head(10)

dress       0.482404
gown        0.230498
sheath      0.100320
shift       0.027864
jumpsuit    0.026924
top         0.025610
maxi        0.017882
romper      0.015944
jacket      0.012485
mini        0.009094
Name: category, dtype: float64

- Here, top 9 contitute 93.95% of data

In [None]:
data_renttherunway_df_filtered['fit'][data_renttherunway_df_filtered['fit'].notnull()].value_counts(normalize=True,dropna=False).head(10)

fit      0.737795
small    0.133886
large    0.128319
Name: fit, dtype: float64

In [None]:
data_renttherunway_df_filtered['body_type'][data_renttherunway_df_filtered['body_type'].notnull()].value_counts(normalize=True,dropna=False).head(10)

hourglass            0.327385
athletic             0.245617
petite               0.131949
pear                 0.114982
full bust            0.077967
straight & narrow    0.076762
apple                0.025340
Name: body_type, dtype: float64

## Encoding categorical Features

1. Fit feature is our target feature and we can do ordinal encoding as we have 3 classes in order, small,fit and large, with values 0,1,2.


2. For body type, we can perform Binary Encoding. As of now, we have 7 classes, which would be reduced to a fewer columns after encoding.


3. For our variable category,we can perform Binary encoding on the classes after some preprocessing. As we can see, top 9 categories account for 93.35% of the data, so there is no point taking all the 68 categories. This would merely increase the number of dummy variables if we do one hot encoding. Even if we do Binary Encoding on 68 features, it would boil down to 7-8 features after encoding.So, we will take only those rows which have top 9 categories and perform Binary Encoding on top of it.





In [None]:
# Encoding fit category with ordinal encoding
encoder= ce.OrdinalEncoder(cols=['fit'],return_df=True,
                           mapping=[{'col':'fit','mapping':{'small':0,'fit':1,'large':2}}])
data_renttherunway_df_filtered = encoder.fit_transform(data_renttherunway_df_filtered)

In [None]:
# Encoding fit category with Binary encoding
encoder_binary= ce.BinaryEncoder(cols=['body_type'],return_df=True)
data_renttherunway_df_filtered =encoder_binary.fit_transform(data_renttherunway_df_filtered) 

In [None]:
# Removing the rows that do not contain the top 9 categories
data_renttherunway_df_filtered = data_renttherunway_df_filtered[data_renttherunway_df_filtered['category'].isin(['dress','gown','sheath','shift','jumpsuit','top','maxi','romper','jacket'])]


In [None]:
print(data_renttherunway_df_filtered['category'].unique())

['romper' 'gown' 'sheath' 'dress' 'top' 'jumpsuit' 'jacket' 'maxi' 'shift']


In [None]:
# Encoding category with Binary Encoding
encoder_binary_category= ce.BinaryEncoder(cols=['category'],return_df=True)
data_renttherunway_df_filtered =encoder_binary_category.fit_transform(data_renttherunway_df_filtered) 

In [None]:
data_renttherunway_df_filtered.head()

Unnamed: 0,fit,category_0,category_1,category_2,category_3,category_4,size,height,bust_size,weight,rating,body_type_0,body_type_1,body_type_2,body_type_3
0,1,0,0,0,0,1,14,172.72,38.0,137.0,5.0,0,0,0,1
1,1,0,0,0,1,0,12,167.64,36.0,132.0,5.0,0,0,1,0
2,1,0,0,0,1,1,4,162.56,36.0,120.507222,5.0,0,0,1,1
3,1,0,0,1,0,0,8,165.1,37.0,135.0,4.0,0,1,0,0
4,1,0,0,0,1,0,12,175.26,36.0,145.0,5.0,0,1,0,1


In [None]:
data_renttherunway_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180978 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   fit          180978 non-null  int64  
 1   category_0   180978 non-null  int64  
 2   category_1   180978 non-null  int64  
 3   category_2   180978 non-null  int64  
 4   category_3   180978 non-null  int64  
 5   category_4   180978 non-null  int64  
 6   size         180978 non-null  int64  
 7   height       180978 non-null  float64
 8   bust_size    180978 non-null  float64
 9   weight       180978 non-null  float64
 10  rating       180978 non-null  float64
 11  body_type_0  180978 non-null  int64  
 12  body_type_1  180978 non-null  int64  
 13  body_type_2  180978 non-null  int64  
 14  body_type_3  180978 non-null  int64  
dtypes: float64(4), int64(11)
memory usage: 22.1 MB


- As we can see above, though category had 9 classes, after binary encoding , it got converted to only 5 new variables. This helps in dimensionality reduction and prevents dummy variable trap
- As of now we only have 15 features to work on.
- Also, all the features are numerical now.
- We can now pass our dataframe data_modcloth_df_filtered to our Machine Learning Algorithms

# **Assembling final datasets for modelling**

## ModCloth

In [None]:
modCloth_full_X = data_modcloth_df_filtered.loc[:, data_modcloth_df_filtered.columns != 'fit']
modCloth_full_X.head()

Unnamed: 0,size,quality,category_0,category_1,category_2,category_3,height,shoe_size,shoe_width,hips,bust
0,7,5.0,0,0,0,1,167.64,8.5,1,38.0,38.0
1,13,3.0,0,0,0,1,157.48,7.0,1,30.0,38.0
2,7,2.0,0,0,0,1,170.18,9.0,1,36.97,34.0
3,21,5.0,0,0,0,1,165.53,8.0,1,44.79,45.0
4,18,5.0,0,0,0,1,157.48,7.0,1,44.13,38.0


In [None]:
modCloth_full_Y = data_modcloth_df_filtered.filter(['fit'], axis=1)
# modCloth_full_Y['fit'] = LabelEncoder().fit_transform(modCloth_full_Y['fit'])
modCloth_full_Y.head()

Unnamed: 0,fit
0,0
1,0
2,0
3,1
4,0


In [None]:
train_modCloth_X , valid_modCloth_X , train_modCloth_Y , valid_modCloth_Y = train_test_split( modCloth_full_X , modCloth_full_Y , train_size = .7 )

print (modCloth_full_X.shape , train_modCloth_X.shape , valid_modCloth_X.shape , train_modCloth_Y.shape , valid_modCloth_Y.shape)

(82790, 11) (57952, 11) (24838, 11) (57952, 1) (24838, 1)


## RentTheRunWay

In [None]:
rentTheRunWay_full_X = data_renttherunway_df_filtered.loc[:, data_renttherunway_df_filtered.columns != 'fit']
rentTheRunWay_full_X.head()

Unnamed: 0,category_0,category_1,category_2,category_3,category_4,size,height,bust_size,weight,rating,body_type_0,body_type_1,body_type_2,body_type_3
0,0,0,0,0,1,14,172.72,38.0,137.0,5.0,0,0,0,1
1,0,0,0,1,0,12,167.64,36.0,132.0,5.0,0,0,1,0
2,0,0,0,1,1,4,162.56,36.0,120.507222,5.0,0,0,1,1
3,0,0,1,0,0,8,165.1,37.0,135.0,4.0,0,1,0,0
4,0,0,0,1,0,12,175.26,36.0,145.0,5.0,0,1,0,1


In [None]:
rentTheRunWay_full_Y = data_renttherunway_df_filtered.filter(['fit'], axis=1)
rentTheRunWay_full_Y.head()

Unnamed: 0,fit
0,1
1,1
2,1
3,1
4,1


In [None]:
train_rentTheRunWay_X , valid_rentTheRunWay_X , train_rentTheRunWay_Y , valid_rentTheRunWay_Y = train_test_split( rentTheRunWay_full_X , rentTheRunWay_full_Y , train_size = .7 )

print (rentTheRunWay_full_X.shape , train_rentTheRunWay_X.shape , valid_rentTheRunWay_X.shape , train_rentTheRunWay_Y.shape , valid_rentTheRunWay_Y.shape)

(180978, 14) (126684, 14) (54294, 14) (126684, 1) (54294, 1)


# **Modelling**

## **Logistic Regression**



### **ModCloth**

#### GridSearchCV

Following parameters can be varied for logistic regression to get the best params:
- Penalty : It specifies the norm for penalties. There are 4 types of penalties: l1, l2, elasticnet and none. It also means type of regularization.
- Solver: It describes which algorithm has to be used for optimization. Some solvers work with some penalties and not others. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- C: This describes the regularization strength/ size of penalty.

In [None]:
# The default solver for logistive regression is lbfgs. It only works with l2 penalty
grid_values_logistic_regression = [{'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear']},
                                   {'penalty': ['l1'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['liblinear', 'saga']},
                                   {'penalty': ['elasticnet'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['saga']}]

In [None]:
model_logistic_regression = LogisticRegression()

In [None]:
gs_logistic_regression = GridSearchCV(model_logistic_regression, grid_values_logistic_regression, cv=5)

In [None]:
gs_logistic_regression.fit(train_modCloth_X,train_modCloth_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l2'],
                          'solver': ['newton-cg', 'lbfgs', 'sag', 'saga',
                                     'liblinear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l1'],
                          'solver': ['liblinear', 'saga']},
            

In [None]:
print("Best parameters for logistic regression: ",gs_logistic_regression.best_params_)

Best parameters for logistic regression:  {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


#### Best params model

In [None]:
bestParams_logistic_regression = LogisticRegression(C=gs_logistic_regression.best_params_['C'], penalty=gs_logistic_regression.best_params_['penalty'])

In [None]:
bestParams_logistic_regression.fit(train_modCloth_X,train_modCloth_Y)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
logistic_regression_predictions_valid_X = bestParams_logistic_regression.predict(valid_modCloth_X)

#### Classification Report

In [None]:
print(classification_report(valid_modCloth_Y, logistic_regression_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.40127   0.03217   0.05956      3917
           1    0.68169   0.99164   0.80796     16856
           2    0.25000   0.00025   0.00049      4065

    accuracy                        0.67807     24838
   macro avg    0.44432   0.34135   0.28934     24838
weighted avg    0.56682   0.67807   0.55778     24838



### **RentTheRunWay**

#### GridSearchCV

Following parameters can be varied for logistic regression to get the best params:
- Penalty : It specifies the norm for penalties. There are 4 types of penalties: l1, l2, elasticnet and none. It also means type of regularization.
- Solver: It describes which algorithm has to be used for optimization. Some solvers work with some penalties and not others. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- C: This describes the regularization strength/ size of penalty.

In [None]:
# The default solver for logistive regression is lbfgs. It only works with l2 penalty
grid_values_logistic_regression = [{'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear']},
                                   {'penalty': ['l1'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['liblinear', 'saga']},
                                   {'penalty': ['elasticnet'], 'C': [0.001, 0.01, 0.1, 1, 10], 'solver':['saga']}]

In [None]:
model_logistic_regression = LogisticRegression()

In [None]:
gs_logistic_regression = GridSearchCV(model_logistic_regression, grid_values_logistic_regression, cv=5)

In [None]:
gs_logistic_regression.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l2'],
                          'solver': ['newton-cg', 'lbfgs', 'sag', 'saga',
                                     'liblinear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l1'],
                          'solver': ['liblinear', 'saga']},
            

In [None]:
print("Best parameters for logistic regression: ",gs_logistic_regression.best_params_)

Best parameters for logistic regression:  {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


#### Best params model

In [None]:
bestParams_logistic_regression = LogisticRegression(C=gs_logistic_regression.best_params_['C'], penalty=gs_logistic_regression.best_params_['penalty'])

In [None]:
bestParams_logistic_regression.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
logistic_regression_predictions_valid_X = bestParams_logistic_regression.predict(valid_rentTheRunWay_X)

#### Classification Report

In [None]:
print(classification_report(valid_rentTheRunWay_Y, logistic_regression_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.39225   0.04437   0.07971      7303
           1    0.74992   0.98910   0.85307     40287
           2    0.34639   0.01715   0.03269      6704

    accuracy                        0.74202     54294
   macro avg    0.49619   0.35021   0.32182     54294
weighted avg    0.65199   0.74202   0.64775     54294



## **Decision Tree**

### **ModCloth**

#### GridSearchCV

Following parameters have to be considered to find the best parameters for decision tree classifier:
- Criterion: It describes the quality of split. It has two poissible values: gini (gini impurity) and entropy (information gain)
- Splitter: To choose the split strategy at each node. It has two possible values: best and random.
- max_depth: It describes the max depth of the tree.
- min_samples_split: It describes the minimum samples required to split a node.

In [None]:
grid_values_dt = {'criterion':['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': range(1,20,2), 'min_samples_split' : range(50,300,5)}

In [None]:
dt=DecisionTreeClassifier()

In [None]:
gs_dt = GridSearchCV(dt, grid_values_dt, cv=5)

In [None]:
gs_dt.fit(train_modCloth_X, train_modCloth_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                   

In [None]:
print("Best parameters for decision tree: ",gs_dt.best_params_)

Best parameters for decision tree:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 50, 'splitter': 'best'}


#### Best Params Model

In [None]:
bestParams_dt = DecisionTreeClassifier(criterion=gs_dt.best_params_['criterion'], max_depth=gs_dt.best_params_['max_depth'], min_samples_split=gs_dt.best_params_['min_samples_split'])

In [None]:
bestParams_dt.fit(train_modCloth_X,train_modCloth_Y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
dt_predictions_valid_X = bestParams_dt.predict(valid_modCloth_X)

#### Classification report

In [None]:
print(classification_report(valid_modCloth_Y, dt_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.41459   0.06382   0.11062      3917
           1    0.68537   0.98541   0.80845     16856
           2    0.00000   0.00000   0.00000      4065

    accuracy                        0.67880     24838
   macro avg    0.36666   0.34974   0.30636     24838
weighted avg    0.53050   0.67880   0.56609     24838



### **RentTheRunWay**

#### GridSearchCV

Following parameters have to be considered to find the best parameters for decision tree classifier:
- Criterion: It describes the quality of split. It has two poissible values: gini (gini impurity) and entropy (information gain)
- Splitter: To choose the split strategy at each node. It has two possible values: best and random.
- max_depth: It describes the max depth of the tree.
- min_samples_split: It describes the minimum samples required to split a node.

In [None]:
grid_values_dt = {'criterion':['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': range(1,20,2), 'min_samples_split' : range(50,300,5)}

In [None]:
dt=DecisionTreeClassifier()

In [None]:
gs_dt = GridSearchCV(dt, grid_values_dt, cv=5)

In [None]:
gs_dt.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                   

In [None]:
print("Best parameters for decision tree: ",gs_dt.best_params_)

Best parameters for decision tree:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 50, 'splitter': 'best'}


#### Best Params Model

In [None]:
bestParams_dt = DecisionTreeClassifier(criterion=gs_dt.best_params_['criterion'], max_depth=gs_dt.best_params_['max_depth'], min_samples_split=gs_dt.best_params_['min_samples_split'])

In [None]:
bestParams_dt.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
dt_predictions_valid_X = bestParams_dt.predict(valid_rentTheRunWay_X)

#### Classification report

In [None]:
print(classification_report(valid_rentTheRunWay_Y, dt_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.41345   0.04546   0.08191      7303
           1    0.75065   0.98997   0.85386     40287
           2    0.51389   0.02760   0.05238      6704

    accuracy                        0.74410     54294
   macro avg    0.55933   0.35434   0.32938     54294
weighted avg    0.67606   0.74410   0.65106     54294



## **LinearSVC**

### **ModCloth**

#### GridSearchCV

Following params can be used to find the best params for SVC:
- C: This is the regularization strength.
- kernel: It describes the kernel type to be used for the algo. It has 5 possible values: linear, poly, rbf, sigmoid, precomputed.
- gamma - It is the kernel coefficient for three types of kernels (rbf, poly and sigmoid).

In [None]:
grid_values_linearSVC = [{'C': [0.01, 0.1, 1, 10]}]

In [None]:
model_linearSVC = LinearSVC()

In [None]:
gs_linearSVC = GridSearchCV(model_linearSVC, grid_values_linearSVC, cv=5)

In [None]:
gs_linearSVC.fit(train_modCloth_X, train_modCloth_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.01, 0.1, 1, 10]}], pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [None]:
print("Best parameters for linear svc: ",gs_linearSVC.best_params_)

Best parameters for linear svc:  {'C': 0.01}


#### Best Params model

In [None]:
bestParams_linearSVC = LinearSVC(C=gs_linearSVC.best_params_['C'])

In [None]:
bestParams_linearSVC.fit(train_modCloth_X, train_modCloth_Y)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
svc_predictions_valid_X = bestParams_linearSVC.predict(valid_modCloth_X)

#### Classification report

In [None]:
print(classification_report(valid_modCloth_Y, svc_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.30000   0.00154   0.00306      3896
           1    0.68578   0.99924   0.81335     17032
           2    0.00000   0.00000   0.00000      3910

    accuracy                        0.68544     24838
   macro avg    0.32859   0.33359   0.27214     24838
weighted avg    0.51731   0.68544   0.55822     24838



### **RentTheRunWay**

#### GridSearchCV

Following params can be used to find the best params for SVC:
- C: This is the regularization strength.
- kernel: It describes the kernel type to be used for the algo. It has 5 possible values: linear, poly, rbf, sigmoid, precomputed.
- gamma - It is the kernel coefficient for three types of kernels (rbf, poly and sigmoid).

In [None]:
grid_values_linearSVC = [{'C': [0.01, 0.1, 1, 10]}]

In [None]:
model_linearSVC = LinearSVC()

In [None]:
gs_linearSVC = GridSearchCV(model_linearSVC, grid_values_linearSVC, cv=5)

In [None]:
gs_linearSVC.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.01, 0.1, 1, 10]}], pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [None]:
print("Best parameters for linear svc: ",gs_linearSVC.best_params_)

Best parameters for linear svc:  {'C': 0.01}


#### Best Params model

In [None]:
bestParams_linearSVC = LinearSVC(C=gs_linearSVC.best_params_['C'])

In [None]:
bestParams_linearSVC.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
svc_predictions_valid_X = bestParams_linearSVC.predict(valid_rentTheRunWay_X)

#### Classification report

In [None]:
print(classification_report(valid_rentTheRunWay_Y, svc_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.29412   0.00134   0.00267      7447
           1    0.80492   0.69372   0.74520     40029
           2    0.17843   0.51716   0.26532      6818

    accuracy                        0.57658     54294
   macro avg    0.42582   0.40408   0.33773     54294
weighted avg    0.65619   0.57658   0.58309     54294



## **RandomForest Classifier**

### **ModCloth**

#### GridSearchCV

Following parameters can be varied to find best paramets for random forest classifier:
- n_estimators: Number of trees in the forest.
- Criterion: It describes the quality of split. It has two poissible values: gini (gini impurity) and entropy (information gain)
- max_depth: It describes the max depth of the tree.
- min_samples_split: It describes the minimum samples required to split a node.
- min_samples_leaf: Minimum number of samples required to be at leaf node.
- max_features: Number of features to consider when finding best split. Possible values: auto, sqrt, log2, or any int or float

In [None]:
grid_values_rfc = {'criterion':['gini', 'entropy'], 
                   'max_depth': [3, 5, 7],
                   'max_features': ['auto', 'sqrt', 'log2'],
                   'min_samples_leaf': [3, 5, 7],
                   'min_samples_split': [5, 10],
                   'n_estimators': [100, 200, 500]}

In [None]:
rfc=RandomForestClassifier()

In [None]:
gs_rfc = GridSearchCV(rfc, grid_values_rfc, cv=5)

In [None]:
gs_rfc.fit(train_modCloth_X, train_modCloth_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [None]:
print("Best parameters for rfc: ",gs_rfc.best_params_)

Best parameters for rfc:  {'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 5, 'n_estimators': 100}


#### Best Params Model

In [None]:
bestParams_rfc = RandomForestClassifier(criterion=gs_rfc.best_params_['criterion'],
                                        max_depth=gs_rfc.best_params_['max_depth'],
                                        min_samples_split=gs_rfc.best_params_['min_samples_split'],
                                        max_features=gs_rfc.best_params_['max_features'],
                                        min_samples_leaf=gs_rfc.best_params_['min_samples_leaf'],
                                        n_estimators=gs_rfc.best_params_['n_estimators'])

In [None]:
bestParams_rfc.fit(train_modCloth_X,train_modCloth_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
rfc_predictions_valid_X = bestParams_rfc.predict(valid_modCloth_X)

#### Classification report

In [None]:
print(classification_report(valid_modCloth_Y, rfc_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.00051   0.00102      3904
           1    0.68292   1.00000   0.81159     16961
           2    0.00000   0.00000   0.00000      3973

    accuracy                        0.68295     24838
   macro avg    0.56097   0.33350   0.27087     24838
weighted avg    0.62352   0.68295   0.55437     24838



### **RentTheRunWay**

#### GridSearchCV

Following parameters can be varied to find best paramets for random forest classifier:
- n_estimators: Number of trees in the forest.
- Criterion: It describes the quality of split. It has two poissible values: gini (gini impurity) and entropy (information gain)
- max_depth: It describes the max depth of the tree.
- min_samples_split: It describes the minimum samples required to split a node.
- min_samples_leaf: Minimum number of samples required to be at leaf node.
- max_features: Number of features to consider when finding best split. Possible values: auto, sqrt, log2, or any int or float

In [None]:
grid_values_rfc = {'criterion':['gini', 'entropy'], 
                   'max_depth': [5, 7],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [5, 7],
                   'min_samples_split': [5, 10]}

In [None]:
rfc=RandomForestClassifier()

In [None]:
gs_rfc = GridSearchCV(rfc, grid_values_rfc, cv=5)

In [None]:
gs_rfc.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [None]:
print("Best parameters for rfc: ",gs_rfc.best_params_)

Best parameters for rfc:  {'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 5}


#### Best Params Model

In [None]:
bestParams_rfc = RandomForestClassifier(criterion=gs_rfc.best_params_['criterion'],
                                        max_depth=gs_rfc.best_params_['max_depth'],
                                        min_samples_split=gs_rfc.best_params_['min_samples_split'],
                                        max_features=gs_rfc.best_params_['max_features'],
                                        min_samples_leaf=gs_rfc.best_params_['min_samples_leaf'],
                                        n_estimators=100)

In [None]:
bestParams_rfc.fit(train_rentTheRunWay_X, train_rentTheRunWay_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
rfc_predictions_valid_X = bestParams_rfc.predict(valid_rentTheRunWay_X)

#### Classification report

In [None]:
print(classification_report(valid_rentTheRunWay_Y, rfc_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.00013   0.00027      7462
           1    0.73844   1.00000   0.84954     40090
           2    1.00000   0.00044   0.00089      6742

    accuracy                        0.73846     54294
   macro avg    0.91281   0.33353   0.28357     54294
weighted avg    0.80687   0.73846   0.62744     54294



## **Keras NN Model**

### **ModCloth**

#### Convert categorical output to numerical via one-hot encoding

In [None]:
# # Encoding categorical unique values as integers
# encoder = LabelEncoder()
# encoder.fit(train_modCloth_Y)
# encoded_train_modCloth_Y = encoder.transform(train_modCloth_Y)

# # Performing one-hot encoding
# dummy_train_modCloth_Y = np_utils.to_categorical(encoded_train_modCloth_Y)

dummy_train_modCloth_Y = np_utils.to_categorical(train_modCloth_Y, num_classes=3)

#### Defining our NN model

In [None]:
def threeLayeredModel(optimizer='adam', init='glorot_uniform'):
  model = Sequential()

  # Defining model layers
  model.add(Dense(11, input_dim=11, kernel_initializer=init, activation='relu'))
  model.add(Dense(7, kernel_initializer=init, activation='relu'))
  model.add(Dense(3,kernel_initializer=init, activation='sigmoid'))
  
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model

#### GridSearchCV

In [None]:
grid_values_keras_NN = {
  'optimizer': ['sgd', 'rmsprop', 'adam'],
  'init': ['glorot_uniform', 'random_normal', 'random_uniform'],
  'batch_size': [50, 100],
  'epochs': [5]
}

In [None]:
estimator = KerasClassifier(build_fn=threeLayeredModel, verbose=1)

In [None]:
gs_keras_NN = GridSearchCV(estimator=estimator, param_grid=grid_values_keras_NN)

In [None]:
gs_keras_NN.fit(train_modCloth_X, dummy_train_modCloth_Y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


GridSearchCV(cv=None, error_score=nan,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f6dc188eb00>,
             iid='deprecated', n_jobs=None,
             param_grid={'batch_size': [50, 100], 'epochs': [5],
                         'init': ['glorot_uniform', 'random_normal',
                                  'random_uniform'],
                         'optimizer': ['sgd', 'rmsprop', 'adam']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
print("Best parameters for logistic regression: ",gs_keras_NN.best_params_)

Best parameters for logistic regression:  {'batch_size': 100, 'epochs': 5, 'init': 'random_normal', 'optimizer': 'rmsprop'}


#### Best Params Model

In [None]:
bestParams_keras_NN = KerasClassifier(build_fn=threeLayeredModel,
                                      optimizer=gs_keras_NN.best_params_['optimizer'],
                                      init=gs_keras_NN.best_params_['init'],
                                      epochs=250,
                                      batch_size=gs_keras_NN.best_params_['batch_size'],
                                      verbose=0)

In [None]:
bestParams_keras_NN.fit(train_modCloth_X, dummy_train_modCloth_Y)

<tensorflow.python.keras.callbacks.History at 0x7f6dc1430978>

In [None]:
keras_NN_predictions_valid_X = bestParams_keras_NN.predict(valid_modCloth_X)
# decoded_keras_NN_predictions_valid_X = encoder.inverse_transform(keras_NN_predictions_valid_X)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


#### Classification Report

In [None]:
# print(classification_report(valid_modCloth_Y, decoded_keras_NN_predictions_valid_X, digits=5))
print(classification_report(valid_modCloth_Y, keras_NN_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.42009   0.04743   0.08524      3879
           1    0.69119   0.98979   0.81397     17039
           2    0.00000   0.00000   0.00000      3920

    accuracy                        0.68641     24838
   macro avg    0.37043   0.34574   0.29974     24838
weighted avg    0.53977   0.68641   0.57170     24838



### **RentTheRunWay**

#### Convert categorical output to numerical via one-hot encoding

In [None]:
# # Encoding categorical unique values as integers
# encoder = LabelEncoder()
# encoder.fit(train_rentTheRunWay_Y)
# encoded_train_rentTheRunWay_Y = encoder.transform(train_rentTheRunWay_Y)

# # Performing one-hot encoding
# dummy_train_rentTheRunWay_Y = np_utils.to_categorical(encoded_train_rentTheRunWay_Y)

dummy_train_rentTheRunWay_Y = np_utils.to_categorical(train_rentTheRunWay_Y, num_classes=3)

#### Defining our NN model

In [None]:
def threeLayeredModel(optimizer='adam', init='glorot_uniform'):
  model = Sequential()

  # Defining model layers
  model.add(Dense(14, input_dim=14, kernel_initializer=init, activation='relu'))
  model.add(Dense(9, kernel_initializer=init, activation='relu'))
  model.add(Dense(3,kernel_initializer=init, activation='sigmoid'))
  
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model

#### GridSearchCV

In [None]:
grid_values_keras_NN = {
  'optimizer': ['sgd', 'rmsprop', 'adam'],
  'init': ['glorot_uniform', 'random_normal', 'random_uniform'],
  'batch_size': [50, 100],
  'epochs': [5]
}

In [None]:
estimator = KerasClassifier(build_fn=threeLayeredModel, verbose=1)

In [None]:
gs_keras_NN = GridSearchCV(estimator=estimator, param_grid=grid_values_keras_NN)

In [None]:
gs_keras_NN.fit(train_rentTheRunWay_X, dummy_train_rentTheRunWay_Y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


GridSearchCV(cv=None, error_score=nan,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f6dc17b2400>,
             iid='deprecated', n_jobs=None,
             param_grid={'batch_size': [50, 100], 'epochs': [5],
                         'init': ['glorot_uniform', 'random_normal',
                                  'random_uniform'],
                         'optimizer': ['sgd', 'rmsprop', 'adam']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
print("Best parameters for logistic regression: ",gs_keras_NN.best_params_)

Best parameters for logistic regression:  {'batch_size': 50, 'epochs': 5, 'init': 'random_uniform', 'optimizer': 'adam'}


#### Best Params Model

In [None]:
bestParams_keras_NN = KerasClassifier(build_fn=threeLayeredModel,
                                      optimizer=gs_keras_NN.best_params_['optimizer'],
                                      init=gs_keras_NN.best_params_['init'],
                                      epochs=250,
                                      batch_size=gs_keras_NN.best_params_['batch_size'],
                                      verbose=0)

In [None]:
bestParams_keras_NN.fit(train_rentTheRunWay_X, dummy_train_rentTheRunWay_Y)

<tensorflow.python.keras.callbacks.History at 0x7f6dc0962d30>

In [None]:
keras_NN_predictions_valid_X = bestParams_keras_NN.predict(valid_rentTheRunWay_X)
# decoded_keras_NN_predictions_valid_X = encoder.inverse_transform(keras_NN_predictions_valid_X)

#### Classification Report

In [None]:
print(classification_report(valid_rentTheRunWay_Y, keras_NN_predictions_valid_X, digits=5))

              precision    recall  f1-score   support

           0    0.41916   0.05658   0.09970      7423
           1    0.75081   0.98446   0.85191     40158
           2    0.43171   0.04097   0.07483      6713

    accuracy                        0.74095     54294
   macro avg    0.53389   0.36067   0.34215     54294
weighted avg    0.66601   0.74095   0.65299     54294

