# Data Analysis

>Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Data Cleaning & Transformation

>Importing and viewing the fashion dataset


In [2]:
# Load the raw product catalogue from the notebook's working directory.
# A plain relative path is used instead of the Windows-only ".\" prefix so the
# cell also runs on Linux/macOS.
fashion_dataset = pd.read_csv("fashion dataset.csv")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Finding the number of unique brands in the fashion dataset

In [3]:
# Number of distinct values in `brand`; dropna=False keeps NaN in the tally,
# so a missing brand is counted as one extra "value".
fashion_dataset['brand'].nunique(dropna=False)

1021

>Importing the brand details dataset

In [4]:
# Load the brand lookup table from the notebook's working directory.
# A plain relative path is used instead of the Windows-only ".\" prefix so the
# cell also runs on Linux/macOS.
brand_details = pd.read_excel("fashion brand details.xlsx")
brand_details

Unnamed: 0,brand_id,brand_name
0,1,513
1,2,109F
2,3,20Dresses
3,4,250 Designs
4,5,3Pin
...,...,...
1015,1016,Ziva Fashion
1016,1017,Zivame
1017,1018,Ziyaa
1018,1019,Zoella


>Counting the number of unique brand names in the brand details dataset

In [5]:
# Number of distinct brand names in the lookup table (NaN, if any, would be counted too).
brand_details['brand_name'].nunique(dropna=False)

1020

>Finding the number of null and duplicated values in each dataset

In [6]:
# Missing-value count for each column of the brand lookup table.
brand_details.isnull().sum()

brand_id      0
brand_name    0
dtype: int64

In [7]:
# Missing-value count for each column of the product catalogue.
fashion_dataset.isnull().sum()

p_id              18
name              19
price             19
colour            22
brand             24
ratingCount     7748
avg_rating      7748
description       19
p_attributes      19
dtype: int64

In [8]:
# Rows of the brand table that repeat an earlier row in every column.
brand_details.duplicated(keep="first").sum()

0

In [9]:
# Rows of the catalogue that repeat an earlier row in every column.
fashion_dataset.duplicated(keep="first").sum()

59

>Testing out duplicate dropping

In [10]:
# Dry run: de-duplicate into a separate frame (keeping the last occurrence) and
# re-count the distinct brand values to see whether any brand disappears.
df_dupl = fashion_dataset.drop_duplicates(keep="last")
df_dupl["brand"].nunique(dropna=False)

1021

>Dropping duplicates in fashion dataset

In [11]:
# Remove fully duplicated rows, keeping the last occurrence of each.
fashion_dataset = fashion_dataset.drop_duplicates(keep="last")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


> Expanding the p_attributes column


In [12]:
# Expand the stringified attribute dictionaries in `p_attributes` into one column per key.
# Rows whose `p_attributes` is missing are skipped and end up NaN in every expanded column.
not_null = fashion_dataset['p_attributes'].notna()
fash3 = fashion_dataset.loc[not_null, 'p_attributes'].apply(ast.literal_eval)
temp = pd.DataFrame(fash3.tolist(), index=fash3.index)
# Append only attribute columns that are not already present, so re-running this
# cell does not add a second copy of every attribute column.
temp = temp.loc[:, ~temp.columns.isin(fashion_dataset.columns)]
fashion_dataset = pd.concat([fashion_dataset, temp], axis=1)

>Viewing all columns of the dataset

In [13]:
# Print every column name on one line (the default repr would truncate the list).
print(fashion_dataset.columns.tolist())

['p_id', 'name', 'price', 'colour', 'brand', 'ratingCount', 'avg_rating', 'description', 'p_attributes', 'Occasion', 'Pattern', 'Print or Pattern Type', 'Body Shape ID', 'Body or Garment Size', 'Closure', 'Fabric', 'Hemline', 'Hood', 'Length', 'Neck', 'Number of Pockets', 'Pocket', 'Sleeve Length', 'Surface Styling', 'Type', 'Wash Care', 'Bottom Fabric', 'Bottom Pattern', 'Dupatta Border', 'Dupatta Fabric', 'Dupatta Pattern', 'Kurta Fabric', 'Kurta Pattern', 'Ornamentation', 'Set Content', 'Stitch', 'Technique', 'Add-Ons', 'Brand Fit Name', 'Character', 'Fabric 2', 'Features', 'Fit', 'Fly Type', 'Main Trend', 'Multipack Set', 'Sustainable', 'Type of Pleat', 'Waist Rise', 'Weave Type', 'Distress', 'Effects', 'Fabric 3', 'Fade', 'Reversible', 'Shade', 'Stretch', 'Type of Distress', 'Waistband', 'Center Front Open', 'Fabric Type', 'Lining', 'Sleeve Styling', 'Transparency', 'taxMaterial', 'Front Styling', 'Knit or Woven', 'Sport', 'Technology', 'Blouse Closure', 'Blouse Fabric', 'Choli St

>Renaming columns for use in SQL and MapReduce

In [14]:
# Replace spaces and hyphens in the attribute column names that later cells query,
# so they can be referenced directly in SQL.
fashion_dataset = fashion_dataset.rename(columns={
    'Where-to-wear': 'Where_to_wear',
    'Dupatta Pattern': 'Dupatta_Pattern',
    'Kurta Pattern': 'Kurta_Pattern',
    'Bottom Pattern': 'Bottom_Pattern',
    'Top Pattern': 'Top_Pattern',
    'Print or Pattern Type': 'Print_Pattern_Type',
    'Knit or Woven': 'Knit_or_Woven',
    'Weave Type': 'Weave_Type',
    'Weave Pattern': 'Weave_Pattern',
    'Dupatta Fabric': 'Dupatta_Fabric',
    'Bottom Fabric': 'Bottom_Fabric',
    'Top Fabric': 'Top_Fabric',
    'Fabric Purity': 'Fabric_Purity',
    'Fabric Type': 'Fabric_Type',
    'Blouse Fabric': 'Blouse_Fabric',
    'Better Cotton Initiative': 'Better_Cotton',
    'Saree Fabric': 'Saree_Fabric',
    'Colour Family': 'Colour_Family',
})

In [15]:
# How many rows have no Kurta_Pattern value.
fashion_dataset['Kurta_Pattern'].isnull().sum()

13657

In [16]:
# How many rows have no Weave_Type value.
fashion_dataset['Weave_Type'].isnull().sum()

8923

In [17]:
# How many rows have no Knit_or_Woven value.
fashion_dataset['Knit_or_Woven'].isnull().sum()

12353

In [18]:
# How many rows have no Weave_Pattern value.
fashion_dataset['Weave_Pattern'].isnull().sum()

13087

### Filling the Fabric column

In [19]:
# Distinct values currently stored in Fabric (NaN included).
fashion_dataset['Fabric'].unique()

array([nan, 'Cotton', 'Polyester', 'Poly Crepe', 'Polyviscose',
       'Viscose Rayon', 'Wool', 'Pure Cotton', 'Cotton Blend',
       'Poly Chiffon', 'Linen', 'Poly Georgette', 'Georgette',
       'Silk Blend', 'Fleece', 'Net', 'Modal', 'Art Silk', 'Acrylic',
       'Other', 'Polycotton', 'Nylon', 'Denim', 'Leather', 'Velvet',
       'Liva', 'Poly Silk', 'Silk', 'Pure Silk', 'PU', 'Cotton Silk',
       'Corduroy', 'Organza', 'Viscose Georgette', 'Wool Blend',
       'Chiffon', 'Organic Cotton', 'Brocade', 'Faux Fur', 'Suede',
       'Satin', 'Pure Georgette', 'Chanderi Silk', 'Crepe', 'Tencel',
       'Elastane', 'Dupion', 'Jacquard', 'Synthetic Leather', 'Voile',
       'Dupion Silk', 'Velour', 'Pashmina', 'Pure Crepe', 'Lyocell',
       'Raw Silk', 'Polyester PU Coated', 'Hemp', 'Livaeco'], dtype=object)

In [20]:
# Count rows whose Fabric holds the literal string 'NA' (a text placeholder,
# as opposed to a real missing value).
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

Unnamed: 0,count(Fabric)
0,0


In [21]:
# Baseline: rows with no Fabric value before any fallback column is applied.
fashion_dataset['Fabric'].isnull().sum()

3882

In [22]:
# Distinct values stored in Bottom_Fabric (NaN included).
fashion_dataset['Bottom_Fabric'].unique()

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Satin',
       'Polyester', 'Poly Georgette', 'Shantoon', 'Silk Georgette',
       'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Art Silk', 'Pure Silk', 'Pure Wool', 'Poly Chiffon', 'Net',
       'Silk Crepe', 'Silk Chiffon', 'Linen', 'NA', 'Pashmina',
       'Jute Cotton', 'Nylon', 'Velvet', 'Voile'], dtype=object)

In [23]:
# Treat the literal 'NA' placeholder in Bottom_Fabric as a missing value.
fashion_dataset['Bottom_Fabric'] = fashion_dataset['Bottom_Fabric'].mask(fashion_dataset['Bottom_Fabric'] == 'NA', None)

In [24]:
# Distinct values stored in Blouse_Fabric (NaN included).
fashion_dataset['Blouse_Fabric'].unique()

array([nan, 'Viscose Rayon', 'Organza', 'Poly Silk', 'Silk Blend',
       'Cotton', 'Linen Blend', 'Poly Georgette', 'Pure Georgette',
       'Art Silk', 'Net', 'Silk', 'Satin', 'NA', 'Pure Silk', 'Polyester',
       'Brocade', 'Tissue', 'Cotton Blend', 'Silk Cotton', 'Velvet',
       'Poly Crepe', 'Pure Cotton', 'Raw Silk', 'Polycotton',
       'Pure Chiffon', 'Jute Silk', 'Poly Chiffon', 'Liva', 'Pure Crepe',
       'Pure Linen', 'Supernet', 'Jute Cotton'], dtype=object)

In [25]:
# Treat the literal 'NA' placeholder in Blouse_Fabric as a missing value.
fashion_dataset['Blouse_Fabric'] = fashion_dataset['Blouse_Fabric'].mask(fashion_dataset['Blouse_Fabric'] == 'NA', None)

In [26]:
# Distinct values stored in Top_Fabric (NaN included).
fashion_dataset['Top_Fabric'].unique()

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Polyester',
       'Liva', 'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Poly Georgette', 'Art Silk', 'Pure Wool', 'Net', 'Silk Chiffon',
       'Linen', 'Poly Chanderi', 'Pure Silk', 'Chanderi Cotton',
       'Chanderi Silk', 'Acrylic', 'Silk Georgette', 'Nylon', 'Velvet',
       'Poly Chiffon', 'Voile'], dtype=object)

In [27]:
# Distinct values stored in Fabric_Type (NaN included).
fashion_dataset['Fabric_Type'].unique()

array([nan, 'Georgette', 'NA', 'Denim', 'Cotton', 'Lace', 'Crepe',
       'Chiffon', 'Cotton Cambric', 'Net', 'Dobby', 'Scuba', 'Satin',
       'Liva', 'Twill', 'Linen', 'Chambray', 'Velvet', 'Jacquard',
       'Corduroy', 'Schiffli', 'Terry'], dtype=object)

In [28]:
# Treat the literal 'NA' placeholder in Fabric_Type as a missing value.
fashion_dataset['Fabric_Type'] = fashion_dataset['Fabric_Type'].mask(fashion_dataset['Fabric_Type'] == 'NA', None)

In [29]:
# Where Fabric is missing, fall back to the row's Top_Fabric.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['Top_Fabric'])
)

In [30]:
# Fabric values still missing after the Top_Fabric fallback.
fashion_dataset['Fabric'].isnull().sum()

2708

In [31]:
# Where Fabric is still missing, fall back to the row's Bottom_Fabric.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['Bottom_Fabric'])
)

In [32]:
# Fabric values still missing after the Bottom_Fabric fallback.
fashion_dataset['Fabric'].isnull().sum()

2076

In [33]:
# Where Fabric is still missing, fall back to the row's Fabric_Type.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['Fabric_Type'])
)

In [34]:
# Fabric values still missing after the Fabric_Type fallback.
fashion_dataset['Fabric'].isnull().sum()

2025

In [35]:
# Where Fabric is still missing, fall back to the row's Blouse_Fabric.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['Blouse_Fabric'])
)

In [36]:
# Fabric values still missing after the Blouse_Fabric fallback.
fashion_dataset['Fabric'].isnull().sum()

202

In [37]:
# Distinct values stored in Better_Cotton (NaN included).
fashion_dataset['Better_Cotton'].unique()

array([nan, 'Regular', 'Better Cotton Initiative'], dtype=object)

In [38]:
# List the Better_Cotton label of every row that still has no Fabric but does
# carry one of the two known Better_Cotton labels.
sqldf("select Better_Cotton from fashion_dataset where Fabric is null and Better_Cotton in('Better Cotton Initiative','Regular')")

Unnamed: 0,Better_Cotton
0,Regular
1,Regular
2,Regular


In [39]:
# Build a helper column from Better_Cotton: 'Regular' becomes 'Cotton' and
# 'Better Cotton Initiative' becomes a missing value.
# NOTE(review): treating 'Regular' as cotton while discarding
# 'Better Cotton Initiative' looks inverted — confirm the intended mapping.
fashion_dataset['New_Cotton'] = fashion_dataset['Better_Cotton'].replace(['Regular', 'Better Cotton Initiative'], ['Cotton',None])

In [40]:
# Where Fabric is still missing, fall back to the derived New_Cotton column.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['New_Cotton'])
)

In [41]:
# Fabric values still missing after the New_Cotton fallback.
fashion_dataset['Fabric'].isnull().sum()

199

In [42]:
# Treat the literal 'NA' placeholder in Dupatta_Fabric as a missing value.
fashion_dataset['Dupatta_Fabric'] = fashion_dataset['Dupatta_Fabric'].where(fashion_dataset['Dupatta_Fabric'] != 'NA', None)
# Show the remaining distinct values. The inspection is the last expression so the
# notebook actually displays it; on the first line its result was silently discarded.
pd.unique(fashion_dataset['Dupatta_Fabric'])

In [43]:
# Rows that still have no Fabric but do have a Dupatta_Fabric value.
sqldf("select Dupatta_Fabric from fashion_dataset where Dupatta_Fabric is not null and Fabric is null")

Unnamed: 0,Dupatta_Fabric
0,Net


In [None]:
# Show Weave_Pattern next to Fabric for every row, for visual comparison.
sqldf("select Weave_Pattern, Fabric from fashion_dataset")

In [None]:
# Where Fabric is still missing, fall back to the row's Weave_Pattern.
# NOTE(review): Weave_Pattern appears to describe the weave rather than the material
# (a later cell maps values such as 'Regular' and 'Dobby' to 'Woven'), so this may write
# non-fabric labels into Fabric, and it runs before the Dupatta/Saree fabric fallbacks —
# confirm this is intended.
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Weave_Pattern'])

In [None]:
# Where Fabric is still missing, fall back to the row's Dupatta_Fabric.
fashion_dataset = fashion_dataset.assign(
    Fabric=fashion_dataset['Fabric'].combine_first(fashion_dataset['Dupatta_Fabric'])
)

In [None]:
# Fabric values still missing after the Dupatta_Fabric fallback.
fashion_dataset['Fabric'].isnull().sum()

In [None]:
# Rows that still have no Fabric but do have a Saree_Fabric value.
sqldf("select Saree_Fabric from fashion_dataset where Saree_Fabric is not null and Fabric is null")

In [None]:
# Where Fabric is still missing, fall back to the row's Saree_Fabric.
# Unlike the other *_Fabric fallbacks, Saree_Fabric is combined without any prior
# 'NA' check, so the literal 'NA' placeholder is masked out here to keep it from
# being copied into Fabric (a no-op if the column has no such placeholder).
saree_fabric = fashion_dataset['Saree_Fabric'].where(fashion_dataset['Saree_Fabric'] != 'NA', None)
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(saree_fabric)

In [None]:
# Fabric values still missing after the Saree_Fabric fallback.
fashion_dataset['Fabric'].isnull().sum()

In [None]:
# Most frequent Fabric value(s), ignoring missing entries.
fashion_dataset['Fabric'].mode(dropna=True)

In [None]:
# Set every remaining missing Fabric to 'Cotton' (presumably the mode shown by the
# previous cell — verify), then confirm that no missing values are left.
fashion_dataset = fashion_dataset.fillna({'Fabric': 'Cotton'})
fashion_dataset['Fabric'].isnull().sum()

In [None]:
# Re-count rows whose Fabric is the literal string 'NA' now that the fills are done.
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

### Viewing Weave Type & Knit or Woven

In [None]:
# Distinct values stored in Weave_Type (NaN included).
fashion_dataset['Weave_Type'].unique()

In [None]:
# Distinct values stored in Weave_Type (this repeats the previous cell's inspection).
fashion_dataset['Weave_Type'].unique()

In [None]:
# Distinct values stored in Knit_or_Woven (NaN included).
fashion_dataset['Knit_or_Woven'].unique()

In [None]:
# Distinct values stored in Weave_Pattern (NaN included).
fashion_dataset['Weave_Pattern'].unique()

### Formatting Weave Pattern and Weave Type to fill Knit or Woven

In [None]:
# Collapse each listed weave pattern onto the single label 'Woven'; any value not
# listed is left unchanged.
fashion_dataset['Weave_Pattern_2'] = fashion_dataset['Weave_Pattern'].replace({
    'Regular': 'Woven',
    'Jacquard': 'Woven',
    'Brocade': 'Woven',
    'Dobby': 'Woven',
    'Khadi': 'Woven',
})

In [None]:
# Check which labels remain in the collapsed Weave_Pattern_2 column.
fashion_dataset['Weave_Pattern_2'].unique()

In [None]:
# Translate each listed weave type into a knit/woven label ('Lace' is the only one
# mapped to 'Knitted'); any value not listed is left unchanged.
fashion_dataset['Weave_Type_2'] = fashion_dataset['Weave_Type'].replace({
    'Machine Weave': 'Woven',
    'Velvet': 'Woven',
    'Denim': 'Woven',
    'Handloom': 'Woven',
    'Lace': 'Knitted',
    'Chambray': 'Woven',
    'Corduroy': 'Woven',
})

In [None]:
# Check which labels remain in the translated Weave_Type_2 column.
fashion_dataset['Weave_Type_2'].unique()

### Filling Knit_or_Woven

In [None]:
# Where Knit_or_Woven is missing, fall back to the collapsed Weave_Pattern_2 label.
fashion_dataset = fashion_dataset.assign(
    Knit_or_Woven=fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['Weave_Pattern_2'])
)

In [None]:
# Where Knit_or_Woven is still missing, fall back to the translated Weave_Type_2 label.
fashion_dataset = fashion_dataset.assign(
    Knit_or_Woven=fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['Weave_Type_2'])
)

In [None]:
# Knit_or_Woven values still missing after both weave-based fallbacks.
fashion_dataset['Knit_or_Woven'].isnull().sum()

In [None]:
# Distinct labels now present in Knit_or_Woven (NaN included).
fashion_dataset['Knit_or_Woven'].unique()

In [None]:
# Number of distinct Fabric values (NaN, if any, would be counted too).
fashion_dataset['Fabric'].nunique(dropna=False)

In [None]:
# Distinct fabrics that occur on rows labelled 'Knitted'.
fabrics_knit = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Knitted'")

In [None]:
# Check the knitted-fabric list for missing entries.
fabrics_knit.isnull().sum()

In [None]:
# Distinct fabrics that occur on rows labelled 'Woven'.
fabrics_weave = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Woven'")

In [None]:
# Check the woven-fabric list for missing entries.
fabrics_weave.isnull().sum()

In [None]:
# Distinct fabrics that occur on rows explicitly labelled 'Knitted and Woven'.
fabrics_both = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Knitted and Woven'")

In [None]:
# Check the knitted-and-woven fabric list for missing entries.
fabrics_both.isnull().sum()

In [None]:
# Fabrics that appear in both the knitted list and the woven list.
knit_n_woven = sqldf("select * from fabrics_knit where fabrics_knit.Fabric in(select fabrics_weave.Fabric from fabrics_weave)")

In [None]:
# Drop the catch-all 'Other' fabric from the overlap list, then display what is left.
knit_n_woven = sqldf("select * from knit_n_woven where Fabric not in ('Other')")
knit_n_woven

In [None]:
# Keep only the explicitly 'Knitted and Woven' fabrics that are not already in the
# overlap list, so the two lists can be stacked without repeating a fabric.
fabrics_both = sqldf("select * from fabrics_both where Fabric not in (select Fabric from knit_n_woven)")

In [None]:
# Stack the overlap list and the remaining explicit 'Knitted and Woven' fabrics
# into one table (same sqldf entry point as the other cells).
both_fabrics = sqldf("select * from knit_n_woven union all select * from fabrics_both")

In [None]:
# Tag every fabric in the combined list with its weave label.
both_fabrics['weave'] = 'Knitted and Woven'

In [None]:
# Knitted-only fabrics: remove those already classed as both, and the catch-all 'Other'.
fabrics_knit = sqldf("select * from fabrics_knit where Fabric not in (select Fabric from both_fabrics) and Fabric != 'Other'")
# Tag every remaining fabric with its weave label.
fabrics_knit['weave'] = 'Knitted'

In [None]:
# Display the knitted-only fabric lookup table.
fabrics_knit

In [None]:
# Woven-only fabrics: remove those already classed as both, and the catch-all 'Other'.
fabrics_weave = sqldf("select * from fabrics_weave where Fabric not in (select Fabric from both_fabrics) and Fabric != 'Other'")
# Tag every remaining fabric with its weave label.
fabrics_weave['weave'] = 'Woven'

In [None]:
# Display the woven-only fabric lookup table.
fabrics_weave

In [None]:
# Stack the woven, knitted and knitted-and-woven lookup tables into a single
# Fabric -> weave table, then display it.
fabric_weave_type = sqldf("select * from fabrics_weave union all select * from fabrics_knit union all select * from both_fabrics")
fabric_weave_type

In [None]:
# Attach the looked-up `weave` label to every product with a left join on Fabric;
# products whose Fabric is not in the lookup get a null `weave`.
# NOTE(review): a left join repeats a product row for every matching lookup row, so this
# assumes each Fabric appears at most once in fabric_weave_type — verify.
# NOTE(review): fashion_dataset is replaced by the SQL result, so the original DataFrame
# index is presumably not preserved — confirm nothing downstream relies on it.
fashion_dataset = sqldf("select fashion_dataset.*, fabric_weave_type.weave from fashion_dataset left join fabric_weave_type on (fabric_weave_type.Fabric=fashion_dataset.Fabric)")

In [None]:
# Where Knit_or_Woven is still missing, fall back to the fabric-based `weave` label.
fashion_dataset = fashion_dataset.assign(
    Knit_or_Woven=fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['weave'])
)

In [None]:
# Knit_or_Woven values still missing after the fabric-based fallback.
fashion_dataset['Knit_or_Woven'].isnull().sum()

### Filling Occasion

In [None]:
# Distinct values stored in Occasion (missing values included).
fashion_dataset['Occasion'].unique()

In [None]:
# Distinct values stored in Where_to_wear (missing values included).
fashion_dataset['Where_to_wear'].unique()

In [None]:
# (Where_to_wear, Occasion) pairs from rows where both fields are present;
# displayed for inspection.
where_to = sqldf("select Where_to_wear, Occasion from fashion_dataset where Occasion is not null and Where_to_wear is not null")
where_to

In [None]:
# Distinct Where_to_wear values among rows that have both Where_to_wear and Occasion.
wear = sqldf("select distinct Where_to_wear from fashion_dataset where Occasion is not null and Where_to_wear is not null")

In [None]:
# Attach the Occasion values observed for each Where_to_wear value. The join yields
# one output row per matching row of where_to, so a Where_to_wear value can repeat.
wear = sqldf("select wear.*, where_to.Occasion from wear left join where_to on (wear.Where_to_wear=where_to.Where_to_wear)")

In [None]:
# Print a summary of the joined table (row count, column dtypes, non-null counts).
wear.info()

In [None]:
# FIXME: unfinished query — the statement has no column list or FROM clause, so SQLite
# will presumably reject it and this cell will raise; complete or remove it before
# running the notebook end to end.
sqldf("select ")