# Getting Product Labels

## IMPORT LIBRARIES

In [87]:
import pandas as pd
import numpy as np

## LOAD DATASETS

In [88]:
# Load the cleaned women's and men's collection CSVs into DataFrames.
women = pd.read_csv(r'./../data/cleaned_data/womens_collection.csv')
men = pd.read_csv(r'./../data/cleaned_data/mens_collection.csv')

## FUNCTION TO EXTRACT LABEL FROM PRODUCT DESCRIPTION

> There will be 12 different labels applied to the datasets:
> 1. jewelry
> 
> 2. hair accessory
> 
> 3. accessories
> 
> 4. bags
> 
> 5. swimsuit
> 
> 6. underwear
> 
> 7. sleepwear
> 
> 8. top
> 
> 9. bottom
> 
> 10. dress
> 
> 11. overalls
> 
> 12. coat

> **The actual product categories will also be scraped from the website, but because this requires visiting each individual product link, it takes quite some time. In order to move on to the next steps, we will temporarily use labels derived from the `product` column.**

In [90]:
# lists of items per label/category
# Each list holds the keywords (English and French) that extract_label looks
# for in a product name; the variable name is the label assigned on a match.

# accessory-type labels
jewelry = ["medallion", "hoops", "watch", "cuff", "pearl", "pearly", "pendant", "bracelet", "bracelets", "earring", "earrings", "chain", "necklace", "phone accessory", "strap", "leash", "ring", "rings"]
hair_accessory = ["hair clip", "headband", "scrunchie", "scrunchies", "barrettes", "claw", "katla", "hairclip", "chouchou", "bandana", "bandeau", "hair"]
accessories = ["docker", "umbrella", "bangle", "mat", "tapis", "fan", "glasses", "sunglasses", "hat", "cap", "casquette", "bonnet", "cagoule", "balaclava", "berret", "beret", "belt", "ceinture", "beanie", "mitten", "mittens", "gloves", "scarf", "foulard", "echarpe", "warmer", "snood", "socks", "chaussettes"]
bags = ["trousse", "panier", "pochette", "tote", "totebag", "purse", "backpack", "sac a dos", "bag", "banane", "sac", "basket", "portefeuille", "porte carte", "wallet", "cabas", "pouch", "bumbag", "clutch", "pack", "handbag", "pencil case", "case"]

# swimwear, underwear and sleepwear
swimsuit = ["swimsuit", "bikini", "short de bain", "swim shorts", "trunks", "maillot"]
underwear = ["underwear", "thong", "panty", "panties", "bustier", "soutien-gorge", "bra", "briefs", "body", "brassiere", "bodysuit", "tanga",  "culotte", "caleçon", "boxer", "thongs", "knickers", "g-strings"]
sleepwear = ["nightgown", "pyjama", "pyjamas", "mask", "sleepdress", "kimono"]

# clothing
top = ["overshirt", "polo", "t-shirt", "debardeur", "blouse", "shirt", "cardigan", "top", "pull", "jumper", "sweater", "pullover", "chemise", "hoodie", "sweat", "fleece", "gilet", "sweatshirt", "sweatshirts"]
bottom = ["jupe", "bermuda", "pants", "skirt", "skirts", "pantskirt", "short", "shorts", "pantalon", "jean", "jeans", "legging", "leggings", "joggers", "jogger", "sweatpants", "jogging", "trousers"]
dress= ["dress", "robe", "sundress"]
overalls = ["jumpsuit", "overall", "overalls", "playsuit"]
coat = ["windcheater", "coat", "jacket", "parka", "manteau", "raincoat", "trench", "cape", "rain", "doudoune", "blouson", "veste", "windbreaker", "coupe-vent", "blazer", "vest"]

In [115]:
# function to extract labels
def extract_label(df, categories=None):
    """Assign a category label to every row based on keywords in its product name.

    The ``product`` text of each row is split on whitespace and scanned from
    left to right. At each word position the longest keyword phrase starting
    there is tried first, then shorter ones down to the single word, so
    multi-word keywords such as "swim shorts" or "porte carte" can match.
    The first position that yields a match decides the label; rows with no
    match (or with a non-string product) get NaN.

    The share and count of unlabelled rows is printed so the keyword lists
    can be reviewed if too much data is lost.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``product`` column. A ``label`` column is added to
        (or overwritten on) this frame in place.
    categories : dict, optional
        Mapping of label name to its list of keywords. When a keyword is
        listed under several labels, the label that comes first in the
        mapping wins. Defaults to the module-level keyword lists, with
        ``coat`` checked first.

    Returns
    -------
    tuple
        ``(df, missing)``: the input frame with its ``label`` column, and the
        subset of its rows whose label is missing.
    """
    if categories is None:
        # Insertion order is the matching priority between categories.
        categories = {
            "coat": coat,
            "jewelry": jewelry,
            "hair_accessory": hair_accessory,
            "accessories": accessories,
            "bags": bags,
            "swimsuit": swimsuit,
            "underwear": underwear,
            "sleepwear": sleepwear,
            "top": top,
            "bottom": bottom,
            "dress": dress,
            "overalls": overalls,
        }

    # Flatten to a single keyword -> label lookup; setdefault keeps the first
    # category that claims a keyword.
    keyword_to_label = {}
    max_words = 1
    for name, keywords in categories.items():
        for keyword in keywords:
            words = keyword.split()
            if not words:
                continue
            keyword_to_label.setdefault(" ".join(words), name)
            max_words = max(max_words, len(words))

    # Iterate over values (not index labels) so repeated index labels are harmless.
    df['label'] = [
        _match_label(product, keyword_to_label, max_words)
        for product in df['product']
    ]
    missing = df[df['label'].isna()]

    # An empty frame has nothing missing; avoid dividing by zero.
    share = round((len(missing) / len(df)) * 100, 2) if len(df) else 0.0
    print(share, "% missing labels :", len(missing))

    return df, missing


def _match_label(product, keyword_to_label, max_words):
    """Return the label of the first keyword phrase found in *product*, else NaN."""
    if not isinstance(product, str):
        return np.nan

    words = product.split()
    for start in range(len(words)):
        longest = min(max_words, len(words) - start)
        # Longest phrase first, so "short de bain" is not cut short by "short".
        for size in range(longest, 0, -1):
            phrase = " ".join(words[start:start + size])
            if phrase in keyword_to_label:
                return keyword_to_label[phrase]
    return np.nan

In [92]:
# women's collection
# extract_label adds the 'label' column to `women` itself and returns that same
# frame, together with the rows that were left without a label.
women_labelled, women_missing = extract_label(women)

2.22 % missing labels : 163


In [93]:
# reset index: replace it in place with a fresh 0..n-1 range, discarding the old one
women_labelled.reset_index(drop=True, inplace=True)

In [94]:
#women_labelled.to_csv('women_labels.csv', index=False)

In [95]:
# men's collection
# extract_label adds the 'label' column to `men` itself and returns that same
# frame, together with the rows that were left without a label.
men_labelled, men_missing = extract_label(men)

2.7 % missing labels : 43


In [96]:
# reset index: replace it in place with a fresh 0..n-1 range, discarding the old one
men_labelled.reset_index(drop=True, inplace=True)

In [97]:
#men_labelled.to_csv('men_labels.csv', index=False)

In [98]:
# tag every row of the women's frame with gender "w"
women_labelled["gender"] = "w"

In [99]:
# tag every row of the men's frame with gender "m"
men_labelled["gender"] = "m"

In [100]:
# stack the women's and men's frames into one; ignore_index is not passed, so
# each frame keeps its own row index and index values repeat in the result
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the notebook - consider renaming it here and in the cells that use it.
all = pd.concat([women_labelled, men_labelled])

In [101]:
# count missing values per column
all.isna().sum()

link            0
img             0
product         0
color           0
brand           0
greenable       0
discounted      0
disc_price      0
orig_price      0
collection      0
label         206
gender          0
dtype: int64

In [102]:
# count rows that exactly repeat an earlier row across all columns
all.duplicated().sum()

1

In [103]:
# show the rows flagged as duplicates (the first occurrence of each is not flagged)
all[all.duplicated()]

Unnamed: 0,link,img,product,color,brand,greenable,discounted,disc_price,orig_price,collection,label,gender
5332,/en/product/sac-billy-nano-taureau-raspberry-r...,https://static.smallable.com/1701061-648x648q8...,sac billy nano taureau,raspberry red,jérôme dreyfuss,1,0,670.0,670.0,fw,bags,w


In [104]:
# remove exact duplicate rows in place, keeping the first occurrence
all.drop_duplicates(inplace=True)

> **NOTE: There are probably more duplicates to be found. We will check them in more detail later on.**

In [105]:
# keep only rows with no missing value in any column (this drops the unlabelled products)
all_collection = all.dropna()

In [106]:
# renumber rows 0..n-1 in place, discarding the repeated index values left by concat
all_collection.reset_index(drop=True, inplace=True)

In [107]:
# print a summary of the final frame: row count, columns, non-null counts and dtypes
all_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8735 entries, 0 to 8734
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   link        8735 non-null   object 
 1   img         8735 non-null   object 
 2   product     8735 non-null   object 
 3   color       8735 non-null   object 
 4   brand       8735 non-null   object 
 5   greenable   8735 non-null   int64  
 6   discounted  8735 non-null   int64  
 7   disc_price  8735 non-null   float64
 8   orig_price  8735 non-null   float64
 9   collection  8735 non-null   object 
 10  label       8735 non-null   object 
 11  gender      8735 non-null   object 
dtypes: float64(2), int64(2), object(8)
memory usage: 819.0+ KB


In [108]:
#all_collection.to_csv("all_collection.csv", index=False)