In [1]:
# @copyright 2022, Shardul Rajhans.

# This is a research study for the course 'Master of Science in Data Science' performed by Shardul Rajhans 
# under the supervision of Dr Sandeep Raghuwanshi.

In [2]:
# Importing Necessary Libraries.

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Regular expressions, used later to standardize price values across datasets
import re

In [3]:
# Read one CSV export per retailer site.
aerie = pd.read_csv("Dataset/ae_com.csv")
calvin_klein = pd.read_csv("Dataset/calvinklein_com.csv")
amazon = pd.read_csv("Dataset/amazon_com.csv")
btemptd = pd.read_csv("Dataset/btemptd_com.csv")
hanky_panky = pd.read_csv("Dataset/hankypanky_com.csv")
macys_com = pd.read_csv("Dataset/macys_com.csv")
nordstrom_com = pd.read_csv("Dataset/shop_nordstrom_com.csv")
topshop = pd.read_csv("Dataset/us_topshop_com.csv")
victoria_secret = pd.read_csv("Dataset/victoriassecret_com.csv")

# Collect the frames in load order; the next cell stacks them into one dataset.
df_list = [
    aerie,
    calvin_klein,
    amazon,
    btemptd,
    hanky_panky,
    macys_com,
    nordstrom_com,
    topshop,
    victoria_secret,
]

In [4]:
# Stack the per-retailer frames into a single dataset with one concat call.
# (DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.)
# The original row labels of each file are kept, so index values repeat
# across retailers exactly as they did with the append loop.
fashion_data = pd.concat(df_list) if df_list else pd.DataFrame()

In [5]:
# Printing shape
fashion_data.shape

(159772, 14)

In [41]:
# Displaying the information of all the columns
fashion_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159772 entries, 0 to 14
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   product_name      159772 non-null  object 
 1   mrp               159772 non-null  object 
 2   price             159772 non-null  object 
 3   pdp_url           159772 non-null  object 
 4   brand_name        159772 non-null  object 
 5   product_category  159772 non-null  object 
 6   retailer          159772 non-null  object 
 7   description       159772 non-null  object 
 8   rating            95822 non-null   float64
 9   review_count      84296 non-null   float64
 10  style_attributes  156675 non-null  object 
 11  total_sizes       159772 non-null  object 
 12  available_size    159772 non-null  object 
 13  color             159095 non-null  object 
dtypes: float64(2), object(12)
memory usage: 18.3+ MB


In [42]:
# Displaying the information of all the numeric columns
fashion_data.describe()

Unnamed: 0,rating,review_count
count,95822.0,84296.0
mean,4.381627,190.682915
std,0.55616,337.040504
min,0.0,1.0
25%,4.2,3.0
50%,4.4,19.0
75%,4.8,247.0
max,5.0,1894.0


# Cleaning the Dataset
In this section, we will analyze the missing values, remove the unnecessary columns, and perform all the necessary handling before Exploratory Data Analysis.

In [43]:
# A few columns are not needed anywhere in this study, so they are removed up front.
columns_to_drop = ['pdp_url', 'retailer', 'rating', 'review_count', 'style_attributes']
fashion_data = fashion_data.drop(columns=columns_to_drop)
fashion_data.head()

Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color
0,Aerie Everyday Loves Lace Cheeky,12.50 USD,12.50 USD,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Rugged Green
1,Aerie Everyday Loves Lace Cheeky,12.50 USD,12.50 USD,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Natural Nude
2,Aerie Everyday Loves Lace Cheeky,12.50 USD,12.50 USD,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",True Black
3,Aerie Everyday Loves Lace Cheeky,12.50 USD,12.50 USD,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",White
4,Aerie Everyday Loves Lace Cheeky,12.50 USD,12.50 USD,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Royal Navy


In [44]:
# Conversion to USD for the MRP and Price columns
# Reference: https://www.kaggle.com/code/justinekays/eda-for-lingerie?scriptVersionId=8012060&cellId=14

# Fixed exchange rates applied to prices carrying a "₡" or "rp" marker.
# NOTE(review): presumably colón and Indonesian rupiah to USD — confirm the rates and their date.
colon_to_dollar_conversion_rate = 0.0017
ind_rp_to_dollar_conversion_rate = 0.000066

def extract_usd_value(value_str):
    """Reduce a raw price string to a single amount expressed in US dollars.

    Processing steps for string input:
      1. trim, lower-case and remove the 'usd' / '$' markers;
      2. for a range ("10 - 20" or "10 – 20") keep only the part before the dash;
      3. cut everything from the first whitespace character onwards;
      4. if a "₡" or "rp" marker remains, remove it and multiply the number by
         the matching conversion rate.

    Parameters
    ----------
    value_str : str or any
        Raw price as read from the CSV files.

    Returns
    -------
    str, float, or the input unchanged
        The cleaned numeric text for plain USD values (the caller converts it with
        ``pd.to_numeric``), a float for converted "₡" / "rp" values, and the
        untouched input when it is not a string (e.g. NaN or an already numeric value).
    """
    if not isinstance(value_str, str):
        return value_str

    cleaned = value_str.strip().lower().replace('usd', '').replace('$', '').strip()
    # Price ranges: keep the lower bound (ASCII hyphen and en dash are both used).
    cleaned = re.sub(r"-.*", "", cleaned)
    cleaned = re.sub(r"–.*", "", cleaned)
    # Drop anything after the first whitespace; no whitespace can remain afterwards.
    cleaned = re.sub(r"\s.*", "", cleaned)

    if "₡" in cleaned:
        amount = cleaned.replace("₡", "").strip()
        return pd.to_numeric(amount) * colon_to_dollar_conversion_rate
    if "rp" in cleaned:
        amount = cleaned.replace("rp", "").strip()
        return pd.to_numeric(amount) * ind_rp_to_dollar_conversion_rate

    return cleaned
    
# Both price columns go through the same clean-up and are then cast to numbers.
for price_column in ('mrp', 'price'):
    usd_values = fashion_data[price_column].apply(extract_usd_value)
    fashion_data[price_column] = usd_values.apply(pd.to_numeric)
fashion_data.head(10)

Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color
0,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Rugged Green
1,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Natural Nude
2,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",True Black
3,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",White
4,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Royal Navy
5,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",Slab Gray
6,Aerie Lace Hi-Rise Boybrief,12.5,12.5,AERIE,Boybriefs,Laced with love and extra comfort. No cheek pe...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L""]",White
7,Aerie Lace Hi-Rise Boybrief,12.5,12.5,AERIE,Boybriefs,Laced with love and extra comfort. No cheek pe...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M""]",Natural Nude
8,Aerie Lace Hi-Rise Boybrief,12.5,12.5,AERIE,Boybriefs,Laced with love and extra comfort. No cheek pe...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S""]",True Black
9,Aerie Lace Hi-Rise Boybrief,12.5,12.5,AERIE,Boybriefs,Laced with love and extra comfort. No cheek pe...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M""]",Buff


In [45]:
# Lower-case the colour names so the same colour is not counted under several spellings.
lowercase_colors = fashion_data['color'].str.lower()
fashion_data['color'] = lowercase_colors
fashion_data.head()

Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color
0,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",rugged green
1,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",natural nude
2,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",true black
3,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",white
4,Aerie Everyday Loves Lace Cheeky,12.5,12.5,AERIE,Cheekies,Introducing Everyday Loves™: Made with love. E...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",royal navy


In [46]:
fashion_data['color'].unique().shape[0]

1138

In [47]:
# Categorizing the Products into Sub-Groups for generalization.
# Reference: https://www.kaggle.com/code/jkokatjuhha/data-driven-lingerie-shopping?scriptVersionId=6849300&cellId=16
def manipul_regex(str_array):
    """Build a regex that matches any of the given keywords as a whole token.

    A keyword matches when it is preceded by the start of the string or a
    whitespace character and followed by a whitespace character or the end
    of the string. Keywords are inserted verbatim (not escaped).

    Parameters
    ----------
    str_array : iterable of str
        Keywords / phrases to search for.

    Returns
    -------
    str
        Pattern of the form ``(^|\\s)(kw1|kw2|...)(\\s|$)``.
    """
    string = '|'.join(str_array)
    return r'(^|\s)(' + string + r')(\s|$)'

def categorize_product(data):
    """Assign a broad product group to every row based on its product name.

    The ``product_name`` column is searched (case-insensitively) for the
    keywords of each group, and the group name is written to the
    ``product_category_wide`` column. Groups are applied in the order of
    ``categories_list``; when a name matches several groups, the group that
    appears LATER in the list wins because it overwrites the earlier value.
    Rows matching no group (including rows with a missing name) keep a
    missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``product_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``product_category_wide`` filled in.
    """
    panties = manipul_regex(['thong','g string','pant','v kini','boypant','pants','panty','thongs','panties',
                             'ladypant','knickers','thong','twist knicker','brief','boyshort',
                             'lace v front short','signature lace bike short','side tie bikini',
                             'signature lace string bikini','tanga','panty','hipster','vikini',
                             'cheekster', 'boypants','ladypants', 'boyshorts', 'hiphugger', 
                             'pink high leg logo bikini', 'pink shortie', 'pink logo bikini',
                             'lace sexy shortie', 'body base shorty', 'bikini bottom', 'lace mini bikini', 
                             'ruched mini bikini', 'high leg bikini'])  
    
    bodys = manipul_regex(['bodysuit', 'teddy', 'wink plaything', 'legging', 'dress'])
    bras = manipul_regex(['bra', 'bustier', 'strapless', 'balconette', 'bandeau', 'body by victoria unlined demi', 
                          'push-up', 'push up', 'push', 'pink seamless lightly lined racerback',
                          'body by wacoal seamless underwire','basic beauty wireless contour',
                          'pink lace lightly lined triangle', 'lace cross front unlined halter', 
                          'high neck keyhole halter', 'high-neck wrap'])
    
    activewear = manipul_regex(['sports bra', 'sport bra', 'sport bralette', 'sports bralette', 'strappybralette', 'full-zip'])
    suspenders = manipul_regex(['suspenders','belt'])
    bralettes = manipul_regex(['bralettes','bralette', 'bralet', 'silicone petal'])
    tops = manipul_regex(['tops','top','tee', 'tunic'])
    babydoll = manipul_regex(['babydoll','camisole and bikini set by bluebella','chemise'])
    shorts = manipul_regex(['short','shorts','chiffon tap pant'])
    slip = manipul_regex(['slip', 'waist slip', 'half-slip', 'slips', 'half-slips', 'petticoat'])
    robe = manipul_regex(['robe', 'kimonos','kimono'])
    camisole = manipul_regex(['camisole','cami','tank'])
    rompers = manipul_regex(['romper']) 
    onepiece = manipul_regex(['One-Piece', 'one piece', 'One-Piece Halter'])
    
    # Order matters: later entries overwrite earlier ones on overlapping matches.
    categories_list = [['slip',slip], ['shorts',shorts], ['robe', robe], ['tops',tops], ['suspenders',suspenders],
                 ['rompers', rompers], ['babydoll', babydoll], ['bodys',bodys], ['bralettes', bralettes], 
                 ['activewear', activewear], ['camisoles',camisole], ['bras',bras], ['panties',panties], ['onepiece', onepiece]]

    # Create the target column with object dtype up front so that writing the
    # group names does not depend on upcasting an all-NaN float column.
    if 'product_category_wide' not in data.columns:
        data['product_category_wide'] = np.full(len(data), np.nan, dtype=object)

    for naming, reg in categories_list:
        # na=False: a missing product name simply matches nothing.
        # A plain boolean array is used so no index alignment is involved.
        matches = data['product_name'].str.contains(reg, case=False, na=False).to_numpy(dtype=bool)
        data.loc[matches, 'product_category_wide'] = naming
    return data

In [48]:
def manipul_regex(str_array):
    """Build a regex that matches any of the given keywords as a whole token.

    A keyword matches when it is preceded by the start of the string or a
    whitespace character and followed by a whitespace character or the end
    of the string. Keywords are inserted verbatim (not escaped).

    Parameters
    ----------
    str_array : iterable of str
        Keywords / phrases to search for.

    Returns
    -------
    str
        Pattern of the form ``(^|\\s)(kw1|kw2|...)(\\s|$)``.
    """
    string = '|'.join(str_array)
    return r'(^|\s)(' + string + r')(\s|$)'

def categorize_colors(data):
    """Map the free-text ``color`` column onto a small set of colour groups.

    The ``color`` column is searched (case-insensitively) for the keywords of
    each group and the group name is written to the ``color_group`` column.
    Groups are applied in the order of ``colors_list``; when a colour matches
    several groups, the group that appears LATER in the list wins because it
    overwrites the earlier value. Colours matching no group (including
    missing colours) keep a missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``color`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``color_group`` filled in.
    """
    green = manipul_regex(["green", "emerald", "fir", "bayberry", "cocoon", "olive", "turquoise", "basil", 
                           "seafoam glow", "smokey pearl", "cactus"])
    blue = manipul_regex(["blue", "navy", "teal", "denim", "azure", "celeste", "turkish", "sea", "cloud", 
                          "neon", "ensign", "sky", "sapphire", "saphire", "blues", "ink", "aqua", "watercolor", 
                          "bluebell", "northstar", "bluebird", "inkblot", "nordic", "Evening Tide", "Marine"])
    white = manipul_regex(["white", "ivory", "cashew", "coconut", "marshmallow", "marble", "White/Victoria", 
                           "sugar", "suger", "moon", "Dragonfruit", "pillow", "white/rose", "White/Red"])
    red = manipul_regex(["red", "candy apple", "ginger glaze", "plum", "maroon", "ruby","cherry", "strawberry", 
                         "rose", "apple", "berry", "crimson"])
    yellow = manipul_regex(["yellow", "gold", "buff", "coral rock", 'daisy', "haze", "lime", "leopard", "bronzer", 
                            "ginger", "mango"])
    orange = manipul_regex(["orange", "peach", "sunset", "melon", "floral", "tropical", "teak", "peel", "fireworks", "firework"])
    pink = manipul_regex(["pink", "rosewater", "fuschia", "blush", "peach", "lotus", "fair orchid", "plum dust", "begonia", 
                          "hello lovely", "forever young", "lip", "lipsmacker", "dazzle", "geo", "celestial", 
                          "galaxy", "flamingo", "cream", "pinky", "mauvelous"])
    grey = manipul_regex(["gray", "grey", "pewter", "slate", "silver", "grey/blush", "heather", "Black/White", "marl", 
                          "shadow", "charcoal", "Marl/Victoria", "White/Black"])
    brown = manipul_regex(["brown", "taupe", "chai", "cappuccino", "sienna", "toast", "french roast"])
    maroon = manipul_regex(["maroon", "kir", "cinnamon"])
    purple = manipul_regex(["mauve", "lilac", "purple", "violet", "grape", "blackberry", "amethyst", "lavender", 
                            "lavendar", "red/Blue", "ignited", "eggplant", "fair orichid"])
    black = manipul_regex(["black", "midnight", "night", "Black/Victoria", "dark", "Sapphire/Black", "Black/Mesh", 
                           "Black/Zig Zag", "Black/", "black/rose", "Black/Rock", "Black/Red"])
    nude = manipul_regex(["nude", "bare", "champagne", "light", "neutral", "infinity", "dust", "flake", "Mint", 
                          "nightlife", "Snowflake", "cool", "clean", "Warm"])
    multicolor = manipul_regex(["multi", "blend", "blended", "radiating", "print", "stripe", "stripes", "chevron", 
                                "abstract", "colorblock", "spraypaint", "americana", "mesh", "spring", 
                                "french", "arrows", "arrrows", "moonray", "blocks", "curves", "coral", "rainbow", 
                                "metropolis", "border", "niagara", "aztec", "Striples", "bliss", "kaleidoscope", "party",
                               "paradise palm", "gradient", "2VN", "Seychelles", "multifolk", "C3K", "Multicolor"])
    
    # Order matters: later entries overwrite earlier ones on overlapping matches.
    colors_list = [['green',green], ['blue',blue], ['white', white], ['black',black], ['red',red],
                 ['yellow', yellow], ['orange', orange], ['pink',pink], ['nude', nude], 
                 ['grey', grey], ['brown',brown], ['maroon',maroon], ['purple',purple], ['multicolor', multicolor]]
    
    # Create the target column with object dtype up front so that writing the
    # group names does not depend on upcasting an all-NaN float column.
    if 'color_group' not in data.columns:
        data['color_group'] = np.full(len(data), np.nan, dtype=object)

    for naming, reg in colors_list:
        # na=False: a missing colour simply matches nothing.
        # A plain boolean array is used so no index alignment is involved.
        matches = data['color'].str.contains(reg, case=False, na=False).to_numpy(dtype=bool)
        data.loc[matches, 'color_group'] = naming
    return data

In [49]:
# Rows without a colour get the placeholder 'other' before grouping.
# Assigning the result back (instead of an in-place fillna on the attribute-accessed
# column) guarantees that the frame itself is updated, also under pandas Copy-on-Write.
fashion_data['color'] = fashion_data['color'].fillna('other')
fashion_data = categorize_colors(fashion_data)
fashion_data['color_group'].unique()

array(['green', 'nude', 'black', 'white', 'blue', 'grey', 'yellow', nan,
       'red', 'multicolor', 'maroon', 'pink', 'brown', 'purple', 'orange'],
      dtype=object)

In [50]:
# Keep only rows that have a product category. The explicit copy makes the
# filtered frame independent, because categorize_product writes a new column into it.
fashion_data = fashion_data[fashion_data['product_category'].notna()].copy()
fashion_data = categorize_product(fashion_data)
fashion_data['product_category_wide'].unique()

array([nan, 'panties', 'bras', 'bralettes', 'babydoll', 'tops',
       'camisoles', 'shorts', 'rompers', 'bodys', 'suspenders', 'slip',
       'robe', 'activewear'], dtype=object)

In [51]:
fashion_data['product_category_wide'].value_counts()

panties       55881
bras          53948
bralettes     16928
camisoles      2751
babydoll       2681
bodys          1510
slip            703
tops            577
shorts          529
suspenders      249
robe            241
rompers          56
activewear       47
Name: product_category_wide, dtype: int64

In [52]:
fashion_data['product_category_wide'].isnull().sum()

23671

In [53]:
# Products that matched no group (bottles, basketballs, etc.) are not relevant
# for this study and are dropped, together with the 'suspenders' group.
# NOTE(review): the original note said suspenders has only 2 entries, but the
# value_counts output above lists 249 — confirm the reason for dropping it.
has_category = fashion_data['product_category_wide'].notna()
not_suspenders = fashion_data['product_category_wide'] != 'suspenders'
fashion_data = fashion_data[has_category & not_suspenders]
fashion_data.head()

Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color,color_group,product_category_wide
16,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",true black,black,panties
17,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",slab gray,grey,panties
18,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",coral,multicolor,panties
19,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",bright cobalt,,panties
20,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""XL"", ""XXL""]",valentine,,panties


In [54]:
# Colours that matched no colour group are labelled 'other'.
# Assigning the result back (instead of an in-place fillna on the attribute-accessed
# column) guarantees that the frame itself is updated, also under pandas Copy-on-Write.
fashion_data['color_group'] = fashion_data['color_group'].fillna('other')
fashion_data.head()

Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color,color_group,product_category_wide
16,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",true black,black,panties
17,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",slab gray,grey,panties
18,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",coral,multicolor,panties
19,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",bright cobalt,other,panties
20,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""XL"", ""XXL""]",valentine,other,panties


In [55]:
fashion_data['product_category_wide'].value_counts()

panties       55881
bras          53948
bralettes     16928
camisoles      2751
babydoll       2681
bodys          1510
slip            703
tops            577
shorts          529
robe            241
rompers          56
activewear       47
Name: product_category_wide, dtype: int64

# Feature Engineering

In [57]:
fashion_data.drop_duplicates().shape

(36114, 11)

In [59]:
fashion_data.brand_name.value_counts()

HankyPanky                                                                                             31311
Wacoal                                                                                                 28418
AERIE                                                                                                  18302
Calvin Klein                                                                                           15283
Hanky Panky                                                                                             7876
Calvin-Klein                                                                                            6651
WACOAL                                                                                                  4782
b.tempt'd by Wacoal                                                                                     4525
HANKY PANKY                                                                                             3972
b-temptd           

In [23]:
# Rows sold under either of the two Victoria's Secret brand names.
victoria_secret_brands = ["Victoria's Secret", "Victoria's Secret Pink"]
victoria_secret = fashion_data[fashion_data['brand_name'].isin(victoria_secret_brands)]
victoria_secret.shape

(15, 11)

In [231]:
# Dropping unnecessary sizes from the dataframe.
# Each comparison must be parenthesised: `|` binds tighter than `==`, so without
# the parentheses the expression is not the intended element-wise OR.
victoria_secret = fashion_data[(fashion_data['brand_name']=="Victoria's Secret") | 
                               (fashion_data['brand_name']=="Victoria's Secret Pink")]
# Distinct size lists that occur for Victoria's Secret products.
sizes = pd.DataFrame(victoria_secret['total_sizes'].unique())
sizes.columns = ['total_sizes']
# NOTE(review): a left merge on the key column alone keeps every row of
# fashion_data (it only resets the index), so the null filter below removes
# nothing. If the intent is to keep only the sizes seen for Victoria's Secret,
# how='inner' is probably what is wanted — confirm before changing.
fashion_data = pd.merge(left=fashion_data, right=sizes, on='total_sizes', how='left')
print(fashion_data.shape[0])
fashion_data = fashion_data[~fashion_data['total_sizes'].isnull()]
print(fashion_data.shape[0])
fashion_data.head()

35867
35867


Unnamed: 0,product_name,mrp,price,brand_name,product_category,description,total_sizes,available_size,color,color_group,product_category_wide
0,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",true black,black,panties
1,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]",slab gray,grey,panties
2,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",coral,multicolor,panties
3,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""S"", ""XL"", ""XXL""]",bright cobalt,other,panties
4,Aerie Seamless Thong,12.5,12.5,AERIE,Thongs,Total cheek! Smooth: No seams. All comfort. Re...,"[""XS"", ""S"", ""M"", ""L"", ""XL"", ""XXL""]","[""XS"", ""XL"", ""XXL""]",valentine,other,panties


In [94]:
# Number of rows for every (product, available-size list) combination.
size_count = (
    fashion_data
    .groupby(['product_name', 'available_size'])
    .size()
    .reset_index(name='size_count')
)
size_count.head()

Unnamed: 0,product_name,available_size,size_count
0,"""I DO"" Signature Lace Cheeky Hipster with Gift...","[""Select"", ""XS"", ""S"", ""M"", ""L""]",2
1,"""I DO"" Signature Lace Cheeky Hipster with Gift...","[""XS"", ""S"", ""M"", ""L""]",2
2,'After Midnight' Open Gusset Lace Thong,[nil],3
3,'Annabelle' Lace Camisole,"[""1X"", ""2X"", ""3X""]",1
4,'B Delighted' Convertible Underwire Contour Bra,"[""34B"", ""36B"", ""32C"", ""34DDD"", ""34C"", ""36C"", ""...",4


In [95]:
# Share of all rows that carry each available-size value (row count / total rows).
total_rows = fashion_data.shape[0]
size_index = (fashion_data.groupby('available_size').size() / total_rows).to_frame('size_index')
size_index.head()

Unnamed: 0_level_0,size_index
available_size,Unnamed: 1_level_1
10,0.000129
1012,7.5e-05
1014,1.1e-05
12,0.000226
14,1.1e-05


In [96]:
# Number of rows for every (product, colour) combination.
color_count = (
    fashion_data
    .groupby(['product_name', 'color'])
    .size()
    .reset_index(name='color_count')
)
color_count.head()

Unnamed: 0,product_name,color,color_count
0,"""I DO"" Signature Lace Cheeky Hipster with Gift...",white/clear crystals,4
1,'After Midnight' Open Gusset Lace Thong,leopard,3
2,'Annabelle' Lace Camisole,white/ baby blue,1
3,'B Delighted' Convertible Underwire Contour Bra,night,4
4,'B Delighted' Convertible Underwire Contour Bra,praline,4


In [97]:
# Share of all rows that carry each colour (row count / total rows).
total_rows = fashion_data.shape[0]
color_index = (fashion_data.groupby('color').size() / total_rows).to_frame('color_index')
color_index.head()

Unnamed: 0_level_0,color_index
color,Unnamed: 1_level_1
2vn,0.000216
80s print,0.000485
abstract palm,0.000259
acid peel,8.6e-05
admiral navy,2.2e-05


In [98]:
# Number of entries in each row's total_sizes list.
# NOTE(review): eval() executes whatever text is stored in the column; since the
# values come from CSV files, a literal parser (ast.literal_eval / json.loads)
# would be safer — confirm the column only ever holds list literals.
fashion_data['total_sizes_count'] = fashion_data.total_sizes.apply(eval).apply(len)

# Attach the per-product size count, then the overall share of that size value.
fashion_data = fashion_data.merge(size_count, on=['product_name', 'available_size'])
fashion_data = fashion_data.merge(size_index, on='available_size')

# Size weight = per-product count of the size value x its overall share.
fashion_data['size_weight'] = fashion_data['size_count'] * fashion_data['size_index']

# Attach the per-product colour count, then the overall share of that colour.
fashion_data = fashion_data.merge(color_count, on=['product_name', 'color'])
fashion_data = fashion_data.merge(color_index, on='color')

# Colour weight = per-product count of the colour x its overall share.
fashion_data['color_weight'] = fashion_data['color_count'] * fashion_data['color_index']

fashion_data.head()

AttributeError: 'DataFrame' object has no attribute 'total_sizes'

In [207]:
# size_popularity / color_popularity: per-product means of the size and colour
# weights (mrp and price are averaged the same way).
group_keys = ['product_name', 'product_category', 'description', 'product_category_wide']
selected_columns = ['product_name', 'mrp', 'price', 'product_category',
                    'description', 'product_category_wide', 'size_weight', 'color_weight']

victoria_secret_data = (
    fashion_data[selected_columns]
    .groupby(group_keys)
    .mean()
    .reset_index()
)
# Positional rename: the four group keys come first, followed by the averaged columns.
victoria_secret_data.columns = ['product_name', 'product_category', 'description', 'product_category_group', 
                'mrp', 'price', 'size_popularity', 'color_popularity']
victoria_secret_data.head()

Unnamed: 0,product_name,product_category,description,product_category_group,mrp,price,size_popularity,color_popularity
0,Allover Lace from Cotton Lingerie NEW! Dotted ...,Dotted Mesh Bikini Panty,Dotted mesh in the ultimate everyday shape mak...,panties,10.5,10.5,0.126413,0.154872
1,Allover Lace from Cotton Lingerie NEW! Dotted ...,Dotted Mesh Thong Panty,Dotted mesh in a sexy little shape makes this ...,panties,10.5,10.5,0.126413,0.154872
2,Body by Victoria Cheekini Panty,Cheekini Panty,"The comfiest way to show a little cheek peek, ...",panties,14.5,7.142542,0.754805,0.163567
3,Body by Victoria Daisy Lace Slip,Daisy Lace Slip,What dreams are made of: this little slip in f...,slip,52.0,34.99,0.126413,0.155092
4,Body by Victoria Demi Bra,Demi Bra,A lower cut makes this supersoft bra perfect f...,bras,50.617021,50.617021,1.535736,0.311891


In [208]:
# Discount of the selling price relative to the MRP, in percent, rounded to 2 decimals.
price_to_mrp = victoria_secret_data['price'] / victoria_secret_data['mrp']
victoria_secret_data['%discount'] = ((1 - price_to_mrp) * 100).round(2)
victoria_secret_data.head()

Unnamed: 0,product_name,product_category,description,product_category_group,mrp,price,size_popularity,color_popularity,%discount
0,Allover Lace from Cotton Lingerie NEW! Dotted ...,Dotted Mesh Bikini Panty,Dotted mesh in the ultimate everyday shape mak...,panties,10.5,10.5,0.126413,0.154872,0.0
1,Allover Lace from Cotton Lingerie NEW! Dotted ...,Dotted Mesh Thong Panty,Dotted mesh in a sexy little shape makes this ...,panties,10.5,10.5,0.126413,0.154872,0.0
2,Body by Victoria Cheekini Panty,Cheekini Panty,"The comfiest way to show a little cheek peek, ...",panties,14.5,7.142542,0.754805,0.163567,50.74
3,Body by Victoria Daisy Lace Slip,Daisy Lace Slip,What dreams are made of: this little slip in f...,slip,52.0,34.99,0.126413,0.155092,32.71
4,Body by Victoria Demi Bra,Demi Bra,A lower cut makes this supersoft bra perfect f...,bras,50.617021,50.617021,1.535736,0.311891,0.0


In [209]:
victoria_secret_data.shape

(669, 9)