## Data Preprocessing

### Import Packages

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import nltk
import re

### Import Data

In [2]:
# Load the outfit combinations: each row ties one product to an outfit
outfit_path = "outfit_combinations.csv"
outfit = pd.read_csv(outfit_path)
outfit.head(n=3)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel


In [3]:
# Load the main product catalogue
full = pd.read_csv("full_data.csv")

# Discard the bookkeeping columns that are not used further down
unused_full_columns = [
    "mpn", "created_at", "updated_at", "deleted_at",
    "brand_canonical_url", "labels", "bc_product_id",
]
full = full.drop(columns=unused_full_columns)

full.head(3)

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an..."
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection Case and cleaning cloth inc...


In [4]:
# Load the supplementary product catalogue
extra = pd.read_csv("extra_data.csv")

# Drop the unused columns, rename "name" to match full_data, and put the
# remaining columns in the same order as full_data
unused_extra_columns = [
    "mpn", "created_at", "updated_at", "saleable",
    "brand_canonical_url", "labels", "bc_product_id", "notes",
]
shared_columns = ['product_id', 'brand', 'product_full_name', 'description', 'brand_category', 'details']
extra = (
    extra
    .drop(columns=unused_extra_columns)
    .rename(columns={"name": "product_full_name"})
    .loc[:, shared_columns]
)

print(extra.shape)
extra.head(1)

(6621, 6)


Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,A.L.C.,Lennox High Waist Cotton & Linen Pants,High-rise trousers tailored from a cool Italia...,Unknown,"True to size. High rise.\n31"" inseam; 14"" leg ..."


In [5]:
# Stack full_data and extra_data into one product table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own row labels, so the combined table would contain every
# low label twice and label-based lookups would return two rows.

fullextra = pd.concat([full, extra], axis=0, ignore_index=True)
fullextra.head(5)

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an..."
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection Case and cleaning cloth inc...
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",Canvas upper Round toe Lace-up vamp SmartFOAM ...
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,100% UV protection Gradient lenses Adjustable ...


In [6]:
# Inner-join the product table (fullextra) with the outfit table on product_id
df = fullextra.merge(outfit, how="inner", on="product_id")
print(df.shape)
df.head(3)

(15363, 10)


Unnamed: 0,product_id,brand_x,product_full_name_x,description,brand_category,details,outfit_id,outfit_item_type,brand_y,product_full_name_y
0,01DVA59VHYAPT4PVX32NXW91G5,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,women:SHOES:MULES,As seen on the Pre-Fall ‘19 runway Heel measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules
1,01DVA59VHYAPT4PVX32NXW91G5,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,women:SHOES:MULES,As seen on the Pre-Fall ‘19 runway Heel measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules
2,01DVA59VHYAPT4PVX32NXW91G5,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,women:SHOES:MULES,\nAs seen on the Pre-Fall ‘19 runway\nHeel mea...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules


In [7]:
# Keep the product table's brand/name columns (the "_x" side of the merge) under
# their plain names, discard the outfit table's copies ("_y"), and retain only
# the first row of every (product_id, outfit_id) pair
df = (
    df
    .rename(columns={"brand_x": "brand", "product_full_name_x": "product_full_name"})
    .drop(columns=["brand_y", "product_full_name_y"])
    .drop_duplicates(subset=["product_id", "outfit_id"], keep="first")
)
print(df.shape)
df.head(3)

(5199, 8)


Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,outfit_id,outfit_item_type
0,01DVA59VHYAPT4PVX32NXW91G5,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,women:SHOES:MULES,As seen on the Pre-Fall ‘19 runway Heel measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe
3,01DVA4XY7A0QMMSK3V3SBR52J9,Alexandre Birman,Clarita Bow-Embellished Suede Sandals,Alexandre Birman's 'Clarita' sandals have quic...,women:SHOES:SANDALS,Heel height measures approximately 50mm / 2 in...,01DVA8GAYP45BCEMYGEK7FXGDQ,shoe
4,01DVA4XY7A0QMMSK3V3SBR52J9,Alexandre Birman,Clarita Bow-Embellished Suede Sandals,Alexandre Birman's 'Clarita' sandals have quic...,women:SHOES:SANDALS,Heel height measures approximately 50mm / 2 in...,01DWJE4FDNYRV6ZJBG25HJFYY2,shoe


### Data Cleaning

In [8]:
# Normalise the text columns.
# Every str.replace call states `regex=` explicitly: the default changed from
# regex=True to regex=False in pandas 2.0, and with a literal match the category
# separators below would silently never be replaced.
df["product_id"] = df["product_id"].str.upper()
df["brand"] = df["brand"].str.lower()
df["product_full_name"] = df["product_full_name"].str.lower()
df["description"] = df["description"].str.lower().str.replace("\n", " ", regex=False)
# Category separators "/", "," and ":" become spaces; the result is lower-cased
df["brand_category"] = df["brand_category"].str.replace(r"[/,:]", " ", regex=True).str.lower()
df["details"] = df["details"].str.title().str.replace("\n", " ", regex=False)
df["outfit_id"] = df["outfit_id"].str.upper()
df["outfit_item_type"] = df["outfit_item_type"].str.lower()

# Missing values become empty strings so the string concatenations below work
df = df.fillna("")
# Remove the placeholder text "unknown" / "Unknown" wherever it occurs
df = df.replace(r"(unknown|Unknown)", "", regex=True)
# The previous row labels are kept as an "index" column (dropped again later)
df = df.reset_index()

df["outfit_id"] = df["outfit_id"].astype(str)
df["product_id"] = df["product_id"].astype(str)

In [9]:
# Build the text used for matching: brand, product_full_name, description,
# details and outfit_item_type joined by single spaces.
# `details` is lower-cased for this merged text only (the column itself stays
# title-cased): the other parts are already lower case and user queries are
# lower-cased before matching, so title-cased words could never match and would
# slip past the lower-case stop-word list.

df["new_column"] = (
    df["brand"] + " " + df["product_full_name"] + " " + df["description"]
    + " " + df["details"].str.lower() + " " + df["outfit_item_type"]
)
df.head(3)

Unnamed: 0,index,product_id,brand,product_full_name,description,brand_category,details,outfit_id,outfit_item_type,new_column
0,0,01DVA59VHYAPT4PVX32NXW91G5,tibi,juan embossed mules,tibi's juan embossed mules are made from shiny...,women shoes mules,As Seen On The Pre-Fall ‘19 Runway Heel Measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,tibi juan embossed mules tibi's juan embossed ...
1,3,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DVA8GAYP45BCEMYGEK7FXGDQ,shoe,alexandre birman clarita bow-embellished suede...
2,4,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DWJE4FDNYRV6ZJBG25HJFYY2,shoe,alexandre birman clarita bow-embellished suede...


In [10]:
# Display label per row: "<outfit_item_type>: <product_full_name> (<product_id>)"

item_type = df['outfit_item_type']
item_name = df['product_full_name']
item_id = df['product_id']
df['output_name'] = item_type + ': ' + item_name + ' (' + item_id + ')'
df.head(3)

Unnamed: 0,index,product_id,brand,product_full_name,description,brand_category,details,outfit_id,outfit_item_type,new_column,output_name
0,0,01DVA59VHYAPT4PVX32NXW91G5,tibi,juan embossed mules,tibi's juan embossed mules are made from shiny...,women shoes mules,As Seen On The Pre-Fall ‘19 Runway Heel Measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,tibi juan embossed mules tibi's juan embossed ...,shoe: juan embossed mules (01DVA59VHYAPT4PVX32...
1,3,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DVA8GAYP45BCEMYGEK7FXGDQ,shoe,alexandre birman clarita bow-embellished suede...,shoe: clarita bow-embellished suede sandals (0...
2,4,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DWJE4FDNYRV6ZJBG25HJFYY2,shoe,alexandre birman clarita bow-embellished suede...,shoe: clarita bow-embellished suede sandals (0...


In [11]:
# Split each merged text into tokens. A token is either a word (optionally
# containing an apostrophe or hyphen) that ends in a lowercase letter, or a
# number that may contain hyphens.

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"(\w+['-]?[a-zA-Z']*[a-z]|[0-9]+-*[0-9]*)")
df["new_column"] = df["new_column"].map(tokenizer.tokenize)

In [12]:
# Remove stopwords

from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy of the stop-word list, built once, for constant-time membership tests
stop_lookup = frozenset(stop)

def remove_stopwords(text):
    """Return the tokens of `text` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens in their original order with stop words left out.

    The stop-word list is loaded once above; it is not re-read from NLTK for
    every token.
    """
    return [w for w in text if w not in stop_lookup]

cleaned = [remove_stopwords(item) for item in df["new_column"]]

df["new_column"] = cleaned

In [13]:
# Reduce every token to its WordNet lemma

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Return a list holding the lemma of each token in `text`, in order."""
    return list(map(lemmatizer.lemmatize, text))

df["new_column"] = df["new_column"].map(word_lemmatizer)

In [14]:
# Remove the "index" column from df
df.drop(columns=["index"], inplace=True)
df.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,outfit_id,outfit_item_type,new_column,output_name
0,01DVA59VHYAPT4PVX32NXW91G5,tibi,juan embossed mules,tibi's juan embossed mules are made from shiny...,women shoes mules,As Seen On The Pre-Fall ‘19 Runway Heel Measur...,01DVA879D7TQ59VPTTGCMJWWSK,shoe,"[tibi, juan, embossed, mule, tibi's, juan, emb...",shoe: juan embossed mules (01DVA59VHYAPT4PVX32...
1,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DVA8GAYP45BCEMYGEK7FXGDQ,shoe,"[alexandre, birman, clarita, bow-embellished, ...",shoe: clarita bow-embellished suede sandals (0...
2,01DVA4XY7A0QMMSK3V3SBR52J9,alexandre birman,clarita bow-embellished suede sandals,alexandre birman's 'clarita' sandals have quic...,women shoes sandals,Heel Height Measures Approximately 50Mm / 2 In...,01DWJE4FDNYRV6ZJBG25HJFYY2,shoe,"[alexandre, birman, clarita, bow-embellished, ...",shoe: clarita bow-embellished suede sandals (0...
3,01DVBP9AHVQTZXJSBNJ0N2NYJP,khaite,leather ankle boots,heel measures approximately 50mm/ 2 inches bla...,shoes boots ankle,"Fits True To Size, Take Your Normal Size Ital...",01DVBPKBK15VKQYKXV1YWR5PZ1,shoe,"[khaite, leather, ankle, boot, heel, measure, ...",shoe: leather ankle boots (01DVBP9AHVQTZXJSBNJ...
4,01DVBP9AHVQTZXJSBNJ0N2NYJP,khaite,leather ankle boots,heel measures approximately 50mm/ 2 inches bla...,shoes boots ankle,"Fits True To Size, Take Your Normal Size Ital...",01DVBPKBK1CN1AMKDTSKJKGY7S,shoe,"[khaite, leather, ankle, boot, heel, measure, ...",shoe: leather ankle boots (01DVBP9AHVQTZXJSBNJ...


## Define functions for different kinds of input

### Recommend by product_id using fuzzywuzzy

In [15]:
# !pip install fuzzywuzzy

In [16]:
import random
import json
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import warnings;
warnings.filterwarnings('ignore')
from fuzzywuzzy import process
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def recommend_id(test):
    """Return the outfit containing the product whose id best matches `test`.

    The input is fuzzy-matched (fuzzywuzzy ``process.extractOne``) against the
    distinct values of ``df['product_id']``, so a slightly mistyped id still
    resolves to a product.

    Parameters
    ----------
    test : str
        Product id entered by the user.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` that shares the outfit_id of the first row of the
        best-matching product. An empty frame with the columns of ``df`` is
        returned when there is no product id to match against.
    """
    # drop_duplicates keeps first-appearance order, so the candidate order (and
    # with it any tie-breaking between equally good matches) is the same on
    # every run; a set of strings has no such stable order.
    candidate_ids = df['product_id'].drop_duplicates().tolist()

    # Only the single best match is needed, so one fuzzy scan is enough
    best_match = process.extractOne(test, candidate_ids) if candidate_ids else None
    if best_match is None:
        return df.iloc[0:0]

    # extractOne returns a tuple whose first element is the matched choice
    matched_id = best_match[0]

    # A product can belong to several outfits; use the first one listed
    first_outfit = df.loc[df['product_id'] == matched_id, 'outfit_id'].iloc[0]

    return df[df['outfit_id'] == first_outfit]

### Recommend by description using TF-IDF

In [18]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities

In [19]:
def clean(words):
    """Parse the string form of a token list back into a list of tokens.

    `words` is text such as "['red', 'dress']", i.e. what ``str()`` produces
    for a list of strings. Square brackets, single and double quotes and
    spaces are removed and the remainder is split on commas.

    Both brackets are stripped, so no "]" stays attached to the last token, and
    empty pieces (for example from "[]") are left out.

    Parameters
    ----------
    words : str
        String representation of a list of tokens.

    Returns
    -------
    list of str
        The individual tokens.
    """
    for unwanted in ("[", "]", "'", '"', " "):
        words = words.replace(unwanted, "")
    return [token for token in words.split(",") if token]

In [20]:
def _description_tokens(entry):
    """Return one `new_column` entry as a list of tokens.

    Token lists are used as they are; any other value is parsed from its
    string form with `clean`.
    """
    if isinstance(entry, (list, tuple)):
        return list(entry)
    return clean(str(entry))


def recommend_description(test, max_similarity=0.98):
    """Return the outfit of the product whose text is most similar to `test`.

    A TF-IDF model is built over the token lists in ``df['new_column']`` and the
    query is compared with every row by cosine similarity.

    The query is prepared with the notebook-level `tokenizer`, `stop` and
    `lemmatizer` objects - the same objects used to build ``new_column`` - so
    that query words and catalogue words are normalised identically (for
    example plural query words are reduced to the lemma stored in the corpus).

    Parameters
    ----------
    test : str
        Free-text product description entered by the user.
    max_similarity : float, optional
        Rows scoring above this value are skipped. The default 0.98 keeps the
        original cut-off. NOTE(review): presumably this was meant to skip rows
        that are near-verbatim copies of the query - confirm the intent.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` that shares the outfit_id of the best-scoring row.
        An empty frame with the columns of ``df`` is returned when ``df`` has no
        rows or no row scores at or below `max_similarity`.
    """
    # Same pipeline as the corpus: lower-case, regex tokenisation, stop-word
    # removal, lemmatisation
    query_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(test.lower())
        if token not in stop
    ]

    # One token list per row, in row order
    documents = [_description_tokens(entry) for entry in df["new_column"]]
    if not documents:
        return df.iloc[0:0]

    # Vocabulary and bag-of-words corpus
    dictionary = corpora.Dictionary(documents, prune_at=2000000)
    corpus_bow = [dictionary.doc2bow(document) for document in documents]

    # TF-IDF weights for the corpus and for the query
    tfidf_model = models.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf_model[corpus_bow]
    query_tfidf = tfidf_model[dictionary.doc2bow(query_tokens)]

    # One similarity score per row; position i belongs to the i-th row of df.
    # num_features is passed so gensim does not have to scan the corpus for it.
    index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
    scores = pd.Series(index[query_tfidf])

    candidates = scores[scores <= max_similarity]
    if candidates.empty:
        return df.iloc[0:0]

    # idxmax returns the first row among equally scored ones; rows with identical
    # text (one product listed in several outfits) score identically, so the
    # first outfit listed for that product is chosen.
    best_position = candidates.idxmax()
    outfit_id = df["outfit_id"].iloc[best_position]

    # All products that belong to the same outfit
    return df[df["outfit_id"] == outfit_id]

### Functions to get user inputs for outfit recommendations

In [23]:
def get_recommendation(input_str):
    """Print the recommended outfit for a product id or a free-text description.

    Input whose first character is a digit is treated as a product id and
    handled by `recommend_id`; anything else is handled by
    `recommend_description`. One line is printed per outfit item in the form
    "<outfit_item_type> : <product_full_name> (<product_id>)".

    Parameters
    ----------
    input_str : str
        Text entered by the user. Surrounding whitespace is ignored.

    Returns
    -------
    None
        The outfit is printed, not returned. For empty input a short notice is
        printed and no lookup is performed.
    """
    # Work from the argument itself rather than from any notebook-level
    # variable, so the function can be called with arbitrary text
    query = (input_str or "").strip()
    if not query:
        print("\tNo input given - nothing to recommend.")
        return

    finder = recommend_id if query[0].isdigit() else recommend_description
    result = finder(query)

    result_short = result[['outfit_item_type', 'product_full_name', 'product_id']]
    # Unpack each row in the selected column order; this avoids integer keys on
    # a label-indexed Series, which pandas has deprecated
    for item_type, name, product_id in result_short.itertuples(index=False, name=None):
        print(f'\t{item_type} : {name} ({product_id})')

def more_rec_details(input_str, query=None):
    """Return the detailed outfit table when the user answered yes.

    Parameters
    ----------
    input_str : str
        The user's answer to the "more details?" question. Any answer that
        starts with "y" (ignoring case and surrounding whitespace) counts as
        yes.
    query : str, optional
        The product id or description to look up. When omitted, the
        notebook-level variable `test` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    pandas.DataFrame or None
        The columns product_full_name, brand, outfit_item_type, product_id,
        details and description of the recommended outfit; None when the
        answer is not yes or no query text is available.
    """
    answer = (input_str or "").strip().lower()
    if not answer.startswith('y'):
        return None

    if query is None:
        # Backward compatibility: fall back to the notebook-level `test`
        query = globals().get('test')
    query = (query or "").strip()
    if not query:
        return None

    # Same routing rule as get_recommendation: a leading digit means product id
    finder = recommend_id if query[0].isdigit() else recommend_description
    result = finder(query)
    return result[['product_full_name','brand','outfit_item_type','product_id','details','description']]

## Try it!
## Get your outfit recommendation now!

In [24]:
# Ask for a product id or a free-text description and print the matching outfit.
# The variable must stay named `test`: more_rec_details looks the query up under
# that notebook-level name instead of receiving it from this cell.
test = input("Enter your content (product_id or product descriptions/details): \n")
get_recommendation(test)

# An answer starting with "y" makes more_rec_details return the detailed table;
# it is displayed because the call is the last expression of the cell.
more_details = input("\n\nDo you want more details on the outfit (y/n): \n")
more_rec_details(more_details)

Enter your content (product_id or product descriptions/details): 
panelope mid cap toe punp
	bottom : slim knit skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
	top : rib mock neck tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
	accessory1 : medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
	shoe : penelope mid cap toe pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


Do you want more details on the outfit (y/n): 
y


Unnamed: 0,product_full_name,brand,outfit_item_type,product_id,details,description
1717,slim knit skirt,eileen fisher,bottom,01DMBRYVA2P5H24WK0HTK4R0A1,,a nice skirt
1803,rib mock neck tank,eileen fisher,top,01DMBRYVA2PEPWFTT7RMP5AA1T,,a nice tank
3975,medium margaux leather satchel,kate spade new york,accessory1,01DMBRYVA2S5T9W793F4CY41HE,,a nice bag
5061,penelope mid cap toe pump,tory burch,shoe,01DMBRYVA2ZFDYRYY5TRQZJTBD,,a nice shoe
