# Create new category column (generalize categories taken from web scraping)

## IMPORT LIBRARIES

In [83]:
import pandas as pd
import re
import numpy as np

## LOAD DATA

In [84]:
all = pd.read_csv(r'./../data/cleaned_data/all_categories_unqiue.csv')

In [85]:
all.head()

Unnamed: 0,link,img,product,color,brand,greenable,discounted,disc_price,orig_price,collection,label,gender,product_category
0,/en/product/debardeur-namast-hey-white-yuj-331457,https://static.smallable.com/1730424-648x648q8...,debardeur namast'hey,white,yuj,1,0,48.0,48.0,ss,top,w,sports top
1,/en/product/tapis-chakra-power-purple-yuj-331460,https://static.smallable.com/1722207-648x648q8...,tapis chakra power,purple,yuj,1,0,79.0,79.0,ss,accessories,w,yoga mats
2,/en/product/legging-manipura-marled-blue-yuj-3...,https://static.smallable.com/1722188-648x648q8...,legging manipura,marled blue,yuj,1,0,85.0,85.0,ss,bottom,w,sports leggings & shorts
3,/en/product/brassiere-ajna-marled-blue-yuj-331458,https://static.smallable.com/1722184-648x648q8...,brassiere ajna,marled blue,yuj,1,0,59.0,59.0,ss,underwear,w,sports bras
4,/en/product/denize-bubble-crepe-dress-sand-bel...,https://static.smallable.com/1686202-648x648q8...,denize bubble crepe dress,sand,bellepiece,0,1,139.3,199.0,ss,dress,w,long dresses


## PREPARING GENERALIZED LABELS

In [86]:
# check all categories taken from the website
all.product_category.unique()

array(['sports top', 'yoga mats', 'sports leggings & shorts',
       'sports bras', 'long dresses', 'long sleeve shirts',
       'short sleeve blouses', 'necklaces', 'straight leg trousers',
       'shorts', 'caps', 'earrings', 'long skirts', 'shoulder bags',
       'straight', 'short sleeve t-shirts', 'belts, braces',
       'long sleeve blouses', 'handbags', 'bikini bottom', 'fanny packs',
       'bikinis', 'small leather goods', 'bras', 'bracelets', 'jackets',
       'hair clips', '1 piece swimsuits', 'pencil cases', 'pyjamas',
       'rings', 'pouches', 'uv protective clothing', 'tank tops',
       'tote bags', 'umbrellas', 'round neck jumpers',
       'long sleeve t-shirts', 'sleeveless tops', 'hats', 'sunglasses',
       'short dresses', 'short sleeve shirts', 'bikini top',
       'toiletry bags, pouches', 'fleece sweatshirts', 'sets',
       'round neck sweatshirts', 'jogging bottoms', 'jumpsuits',
       'short skirts', 'beanies', 'capes, ponchos',
       'skateboards, scooters

In [87]:
# lists of keywords to look for in the product_category column to apply different labels
sports_accessories = [r"mat[s]{0,1}", r"sport[s]{0,1}"]
hair_accessories = [r"clip[s]{0,1}", r'headband[s]{0,1}']
wallets = [r"leather goods", r"case[s]{0,1}"]
pouches = [r"pouch[es]{0,1}", r'toilet[s]{0,1}',]
outdoor_toys = [r"skateboard[s]{0,1}", r"scooter[s]{0,1}"]
umbrellas = [r"umbrella[s]{0,1}"]

swimsuits = [r"bikini[s]{0,1}", r"swimsuit[s]{0,1}", r"uv", r"swim"]
underwears = [r"boxer[s]{0,1}", r"brief[s]{0,1}",  r"bra[s]{0,1}", r"knicker[s]{0,1}", r"culotte[s]{0,1}", r"tight[s]{0,1}", r"underwear[s]{0,1}"]
sleepwear = [r"pyjama[s]{0,1}", r"kimono[s]{0,1}", r"nightdress[es]{0,1}", r"gown[s]{0,1}", r"robe[s]{0,1}"]

tops = [r"top[s]{0,1}", r"blouse[s]{0,1}", r"[t-]{0,1}shirt[s]{0,1}"]
jumpers = [r"jumper[s]{0,1}", r"cardigan[s]{0,1}",  r"sweatshirt[s]{0,1}", r"sweater[s]{0,1}", r"pull[s]{0,1}", r"hood[s]{0,1}"]
bottoms = [r"bottom[s]{0,1}", r"trouser[s]{0,1}", r"legging[s]{0,1}", r"shorts", r"pant[s]{0,1}", r"jean[s]{0,1}", r'skirt[s]{0,1}', r'chino[s]{0,1}', r'flare[s]{0,1}', r'slim[s]{0,1}', r'loose[s]{0,1}', r'tapered', r'bottom[s]{0,1}', r"jogger[s]{0,1}", r'straight[s]{0,1}', r"boyfriend[s]{0,1}"]
dress_overalls = [r"dress[es]{0,1}", r"dungaree[s]{0,1}", r"bodysuit[s]{0,1}", r"playsuit[s]{0,1}", r"set[s]{0,1}", r"jumpsuit[s]{0,1}"]
coats = [r"capes", r"ponchos", r"coat[s]{0,1}", r"jacket[s]{0,1}", r"parka[s]{0,1}"]

jewelries = [r'necklace[s]{0,1}', r"earring[s]{0,1}", r"bracelet[s]{0,1}", r"ring[s]{0,1}", r"watch[es]{0,1}", r"jewellery"]
accessories = [r"caps", r"[sun]{0,1}hat[s]{0,1}", r"beanie[s]{0,1}", r"belt[s]{0,1}", r"[sun]{0,1}glass[es]{0,1}", r"scarf", r"scarves", r"accessories", r"foulard[s]{0,1}", r"snood[s]{0,1}", r"glove[s]{0,1}", r"mitten[s]{0,1}", r"sock[s]{0,1}"]
bags = [r"bag[s]{0,1}", r"[back]{0,1}pack[s]{0,1}"]

> **NOTE: Although everything is being marked for their categories, we will only keep products with the following results:**
> | category     |label|
> |----------------|-----|
> | accessories    | 0 |
> | bag            | 1 |
> | bottom         | 2 |
> | coat           | 3 |
> | dress_overalls | 4 |
> | jewelry        | 5 |
> | jumper         | 6 |
> | top            | 7 |

In [88]:
# function to extract labels
def extract_label(df):
    """This function will use the lists provided above to extract the category of a product using the product column of the dataframe.
    Depending on the percentage of the data lost during the process, we will either review the list or proceed with the dataset acquired."""
    
    categories = [sports_accessories, hair_accessories, wallets,  pouches, outdoor_toys, umbrellas, swimsuits, underwears, sleepwear, 
                accessories, dress_overalls, bottoms, jewelries, jumpers, tops,  coats, bags]
    labels = ["sportswear", "hair_accessories", "wallet", "pouch", "scooter", "umbrella", "swimwear", "underwear", "sleepwear", 
              "accessories", "dress_overalls", "bottom", "jewelry", "jumper", "top", "coat", "bag"]
    
    # empty lists to store the data
    index = []
    category = []
    
    for ind in df.index:
        found=False
        for c, l in zip(categories, labels):
            for r in c:
                if re.search(r, df.product_category[ind]):
                    index.append(ind)
                    category.append(l)
                    found = True 
                    break # move on to the next row and avoid duplicates
                
                elif re.search(r, df['product'][ind]):
                    index.append(ind)
                    category.append(l)
                    found = True 
                    break # move on to the next row and avoid duplicates
                        
            if found: # check if flag is True
                break
            
        if not found:
            index.append(ind)
            category.append(np.nan) # fill missing labels with NaN

    # Create new label column 
    new_df = pd.DataFrame(category, index=index, columns=['category'])
    df_add = pd.concat([df, new_df], axis=1)
    missing = df_add[df_add.category.isna()==True]
    print(round((len(missing)/len(df))*100, 2), "% missing labels :",len(missing))
    
    return new_df, df_add, missing

In [89]:
new_df, all1, missing = extract_label(all)

0.33 % missing labels : 27


In [90]:
missing

Unnamed: 0,link,img,product,color,brand,greenable,discounted,disc_price,orig_price,collection,label,gender,product_category,category
712,/en/product/menstrual-tanga-antheia-light-flow...,https://static.smallable.com/1666374-648x648q8...,menstrual tanga antheia - light flow,black,réjeanne,1,0,39.0,39.0,ss,underwear,w,/,
713,/en/product/menstrual-thong-antheia-light-flow...,https://static.smallable.com/1666367-648x648q8...,menstrual thong antheia - light flow,black,réjeanne,1,0,39.0,39.0,ss,underwear,w,/,
1121,/en/product/scrunchie-lilac-bond-eye-301484,https://static.smallable.com/1616781-648x648q8...,scrunchie,lilac,bond-eye,1,0,35.0,35.0,ss,hair_accessory,w,/,
1627,/en/product/menstrual-feel-thong-light-flow-bl...,https://static.smallable.com/1628209-648x648q8...,menstrual feel thong - light flow,black,pantys,1,0,32.0,32.0,ss,underwear,w,/,
1682,/en/product/menstrual-feel-thong-light-flow-pi...,https://static.smallable.com/1628212-648x648q8...,menstrual feel thong - light flow,pink,pantys,1,0,32.0,32.0,ss,underwear,w,/,
2256,/en/product/sirocco-fan-black-lastelier-274888,https://static.smallable.com/1439264-648x648q8...,sirocco fan,black,lastelier,1,0,105.0,105.0,ss,accessories,w,/,
2258,/en/product/sirocco-fan-brown-lastelier-274886,https://static.smallable.com/1439249-648x648q8...,sirocco fan,brown,lastelier,1,0,105.0,105.0,ss,accessories,w,/,
2327,/en/product/akimmi-scrunchie-blue-louise-misha...,https://static.smallable.com/1601432-648x648q8...,akimmi scrunchie,blue,louise misha,0,1,10.2,17.0,ss,hair_accessory,w,/,
2689,/en/product/scrunchie-grass-green-bond-eye-301483,https://static.smallable.com/1616779-648x648q8...,scrunchie,grass green,bond-eye,1,0,35.0,35.0,ss,hair_accessory,w,/,
2726,/en/product/scrunchie-navy-blue-call-it-by-you...,https://static.smallable.com/1380518-648x648q8...,scrunchie,navy blue,call it by your name,1,0,15.0,15.0,ss,hair_accessory,w,/,


> **NOTE:** Since the number of missing values is minimal and most of them are not in the categories we wish to keep for the recommender, we will opt to drop them.

In [91]:
all2 = all1.dropna().reset_index(drop=True)

In [92]:
# save to csv
all2.to_csv('all_labelled_complete.csv', index=False)

In [93]:
# list of categories to keep
categories_to_keep = ["jumper", "top", "bottom", "dress_overalls", "coat", "jewelry", "accessories", "bag"]

In [94]:
# new df to take only items in the categories above
all_filtered = all2[all2['category'].isin(categories_to_keep)]

In [95]:
# double-checking unique values in the category column
all_filtered.category.unique() # seems to be in order

array(['dress_overalls', 'top', 'jewelry', 'bottom', 'accessories', 'bag',
       'coat', 'jumper'], dtype=object)

In [96]:
all_filtered.category.value_counts()

category
top               1447
bottom             983
accessories        764
dress_overalls     643
jumper             601
bag                548
jewelry            547
coat               345
Name: count, dtype: int64

In [97]:
# reset index
all_filtered.reset_index(drop=True, inplace=True)

In [98]:
all_filtered.head()

Unnamed: 0,link,img,product,color,brand,greenable,discounted,disc_price,orig_price,collection,label,gender,product_category,category
0,/en/product/denize-bubble-crepe-dress-sand-bel...,https://static.smallable.com/1686202-648x648q8...,denize bubble crepe dress,sand,bellepiece,0,1,139.3,199.0,ss,dress,w,long dresses,dress_overalls
1,/en/product/cabane-printed-shirt-blue-hartford...,https://static.smallable.com/1588974-648x648q8...,cabane printed shirt,blue,hartford,0,0,155.0,155.0,ss,top,w,long sleeve shirts,top
2,/en/product/garden-striped-cotton-poplin-top-b...,https://static.smallable.com/1677878-648x648q8...,garden striped cotton poplin top,blue,girls of dust,0,1,108.0,135.0,ss,top,w,short sleeve blouses,top
3,/en/product/tsikis-x-alma-deia-exclusive-blava...,https://static.smallable.com/1641315-648x648q8...,tsikis x alma deia exclusive - blava body chain,orange,alma deia,0,0,215.0,215.0,ss,underwear,w,necklaces,jewelry
4,/en/product/peggy-ibiza-striped-dress-yellow-t...,https://static.smallable.com/1669582-648x648q8...,peggy ibiza striped dress,yellow,the label edition,0,1,308.0,440.0,ss,dress,w,long dresses,dress_overalls


In [99]:
# save to csv
all_filtered.to_csv('filtered_all.csv', index=False)

In [102]:
all_filtered.iloc[2199]

link                /en/product/solid-rain-cape-lavender-beck-sond...
img                 https://static.smallable.com/1604505-648x648q8...
product                                               solid rain cape
color                                                        lavender
brand                                                 becksöndergaard
greenable                                                           0
discounted                                                          1
disc_price                                                       50.0
orig_price                                                      100.0
collection                                                         ss
label                                                            coat
gender                                                              w
product_category                                       capes, ponchos
category                                                         coat
Name: 2199, dtype: o

In [101]:
all_filtered.iloc[4979]

link                /en/product/t-shirt-homme-coton-bio-dusty-pink...
img                 https://static.smallable.com/1697376-648x648q8...
product                                       t-shirt homme coton bio
color                                                      dusty pink
brand                                                smallable basics
greenable                                                           1
discounted                                                          0
disc_price                                                       45.0
orig_price                                                       45.0
collection                                                         ss
label                                                             top
gender                                                              m
product_category                                short sleeve t-shirts
category                                                          top
Name: 4979, dtype: o