# Data Analysis

>Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Data Cleaning & Transformation

>Importing and viewing the fashion dataset


In [2]:
fashion_dataset= pd.read_csv(r".\fashion dataset.csv")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Finding the number of unique brands in the fashion dataset

In [3]:
len(pd.unique(fashion_dataset['brand']))

1021

>Importing the brand details dataset

In [4]:
brand_details = pd.read_excel(r".\fashion brand details.xlsx")
brand_details

Unnamed: 0,brand_id,brand_name
0,1,513
1,2,109F
2,3,20Dresses
3,4,250 Designs
4,5,3Pin
...,...,...
1015,1016,Ziva Fashion
1016,1017,Zivame
1017,1018,Ziyaa
1018,1019,Zoella


>Counting the number of unique brand names in the brand details dataset

In [5]:
len(pd.unique(brand_details['brand_name']))

1020

>Finding the number of null and duplicated values in each dataset

In [6]:
brand_details.isna().sum()

brand_id      0
brand_name    0
dtype: int64

In [7]:
fashion_dataset.isna().sum()

p_id              18
name              19
price             19
colour            22
brand             24
ratingCount     7748
avg_rating      7748
description       19
p_attributes      19
dtype: int64

In [8]:
brand_details.duplicated().sum()

0

In [9]:
fashion_dataset.duplicated().sum()

59

>Testing out duplicate dropping

In [10]:
df_dupl = fashion_dataset.drop_duplicates(keep = "last")
len(pd.unique(df_dupl["brand"]))

1021

>Dropping duplicates in fashion dataset

In [11]:
fashion_dataset.drop_duplicates(inplace = True, keep = "last")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


> Expanding the p_attributes column


In [12]:
fashion_dataset['p_attributes'] = fashion_dataset['p_attributes']
not_null = pd.notnull(fashion_dataset['p_attributes'])
fash3 = fashion_dataset[not_null]['p_attributes'].apply(lambda x: ast.literal_eval(x))
temp = pd.DataFrame([*fash3], fash3.index)
fashion_dataset = pd.concat([fashion_dataset, temp], axis=1)

>Renaming columns for use in SQL and MapReduce

In [13]:
fashion_dataset.rename(columns={'Dupatta Pattern':'Dupatta_Pattern', 'Kurta Pattern':'Kurta_Pattern','Bottom Pattern': 'Bottom_Pattern', 'Top Pattern': 'Top_Pattern','Print or Pattern Type': 'Print_Pattern_Type', 'Knit or Woven':'Knit_or_Woven', 'Weave Type':'Weave_Type', 'Weave Pattern':'Weave_Pattern','Dupatta Fabric':'Dupatta_Fabric','Bottom Fabric':'Bottom_Fabric', 'Top Fabric':'Top_Fabric','Fabric Purity':'Fabric_Purity', 'Fabric Type':'Fabric_Type','Blouse Fabric':'Blouse_Fabric', 'Better Cotton Initiative':'Better_Cotton', 'Saree Fabric':'Saree_Fabric', 'Colour Family':'Colour_Family'}, inplace=True)

In [14]:
fashion_dataset['Kurta_Pattern'].isna().sum()

13657

In [15]:
fashion_dataset['Weave_Type'].isna().sum()

8923

In [16]:
fashion_dataset['Knit_or_Woven'].isna().sum()

12353

In [17]:
fashion_dataset['Weave_Pattern'].isna().sum()

13087

>Viewing all columns of the dataset

In [18]:
print(list(fashion_dataset.columns))

['p_id', 'name', 'price', 'colour', 'brand', 'ratingCount', 'avg_rating', 'description', 'p_attributes', 'Occasion', 'Pattern', 'Print_Pattern_Type', 'Body Shape ID', 'Body or Garment Size', 'Closure', 'Fabric', 'Hemline', 'Hood', 'Length', 'Neck', 'Number of Pockets', 'Pocket', 'Sleeve Length', 'Surface Styling', 'Type', 'Wash Care', 'Bottom_Fabric', 'Bottom_Pattern', 'Dupatta Border', 'Dupatta_Fabric', 'Dupatta_Pattern', 'Kurta Fabric', 'Kurta_Pattern', 'Ornamentation', 'Set Content', 'Stitch', 'Technique', 'Add-Ons', 'Brand Fit Name', 'Character', 'Fabric 2', 'Features', 'Fit', 'Fly Type', 'Main Trend', 'Multipack Set', 'Sustainable', 'Type of Pleat', 'Waist Rise', 'Weave_Type', 'Distress', 'Effects', 'Fabric 3', 'Fade', 'Reversible', 'Shade', 'Stretch', 'Type of Distress', 'Waistband', 'Center Front Open', 'Fabric_Type', 'Lining', 'Sleeve Styling', 'Transparency', 'taxMaterial', 'Front Styling', 'Knit_or_Woven', 'Sport', 'Technology', 'Blouse Closure', 'Blouse_Fabric', 'Choli Stitc

### Filling the Fabric column

In [19]:
pd.unique(fashion_dataset['Fabric'])

array([nan, 'Cotton', 'Polyester', 'Poly Crepe', 'Polyviscose',
       'Viscose Rayon', 'Wool', 'Pure Cotton', 'Cotton Blend',
       'Poly Chiffon', 'Linen', 'Poly Georgette', 'Georgette',
       'Silk Blend', 'Fleece', 'Net', 'Modal', 'Art Silk', 'Acrylic',
       'Other', 'Polycotton', 'Nylon', 'Denim', 'Leather', 'Velvet',
       'Liva', 'Poly Silk', 'Silk', 'Pure Silk', 'PU', 'Cotton Silk',
       'Corduroy', 'Organza', 'Viscose Georgette', 'Wool Blend',
       'Chiffon', 'Organic Cotton', 'Brocade', 'Faux Fur', 'Suede',
       'Satin', 'Pure Georgette', 'Chanderi Silk', 'Crepe', 'Tencel',
       'Elastane', 'Dupion', 'Jacquard', 'Synthetic Leather', 'Voile',
       'Dupion Silk', 'Velour', 'Pashmina', 'Pure Crepe', 'Lyocell',
       'Raw Silk', 'Polyester PU Coated', 'Hemp', 'Livaeco'], dtype=object)

In [20]:
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

Unnamed: 0,count(Fabric)
0,0


In [21]:
fashion_dataset['Fabric'].isna().sum()

3882

In [22]:
pd.unique(fashion_dataset['Bottom_Fabric'])

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Satin',
       'Polyester', 'Poly Georgette', 'Shantoon', 'Silk Georgette',
       'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Art Silk', 'Pure Silk', 'Pure Wool', 'Poly Chiffon', 'Net',
       'Silk Crepe', 'Silk Chiffon', 'Linen', 'NA', 'Pashmina',
       'Jute Cotton', 'Nylon', 'Velvet', 'Voile'], dtype=object)

In [23]:
fashion_dataset['Bottom_Fabric'] = fashion_dataset['Bottom_Fabric'].where(fashion_dataset['Bottom_Fabric'] != 'NA', None)

In [24]:
pd.unique(fashion_dataset['Blouse_Fabric'])

array([nan, 'Viscose Rayon', 'Organza', 'Poly Silk', 'Silk Blend',
       'Cotton', 'Linen Blend', 'Poly Georgette', 'Pure Georgette',
       'Art Silk', 'Net', 'Silk', 'Satin', 'NA', 'Pure Silk', 'Polyester',
       'Brocade', 'Tissue', 'Cotton Blend', 'Silk Cotton', 'Velvet',
       'Poly Crepe', 'Pure Cotton', 'Raw Silk', 'Polycotton',
       'Pure Chiffon', 'Jute Silk', 'Poly Chiffon', 'Liva', 'Pure Crepe',
       'Pure Linen', 'Supernet', 'Jute Cotton'], dtype=object)

In [25]:
fashion_dataset['Blouse_Fabric'] = fashion_dataset['Blouse_Fabric'].where(fashion_dataset['Blouse_Fabric'] != 'NA', None)

In [26]:
pd.unique(fashion_dataset['Top_Fabric'])

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Polyester',
       'Liva', 'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Poly Georgette', 'Art Silk', 'Pure Wool', 'Net', 'Silk Chiffon',
       'Linen', 'Poly Chanderi', 'Pure Silk', 'Chanderi Cotton',
       'Chanderi Silk', 'Acrylic', 'Silk Georgette', 'Nylon', 'Velvet',
       'Poly Chiffon', 'Voile'], dtype=object)

In [27]:
pd.unique(fashion_dataset['Fabric_Type'])

array([nan, 'Georgette', 'NA', 'Denim', 'Cotton', 'Lace', 'Crepe',
       'Chiffon', 'Cotton Cambric', 'Net', 'Dobby', 'Scuba', 'Satin',
       'Liva', 'Twill', 'Linen', 'Chambray', 'Velvet', 'Jacquard',
       'Corduroy', 'Schiffli', 'Terry'], dtype=object)

In [28]:
fashion_dataset['Fabric_Type'] = fashion_dataset['Fabric_Type'].where(fashion_dataset['Fabric_Type'] != 'NA', None)

In [29]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Top_Fabric'])

In [30]:
fashion_dataset['Fabric'].isna().sum()

2708

In [31]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Bottom_Fabric'])

In [32]:
fashion_dataset['Fabric'].isna().sum()

2076

In [33]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Fabric_Type'])

In [34]:
fashion_dataset['Fabric'].isna().sum()

2025

In [35]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Blouse_Fabric'])

In [36]:
fashion_dataset['Fabric'].isna().sum()

202

In [37]:
pd.unique(fashion_dataset['Better_Cotton'])

array([nan, 'Regular', 'Better Cotton Initiative'], dtype=object)

In [38]:
sqldf("select Better_Cotton from fashion_dataset where Fabric is null and Better_Cotton in('Better Cotton Initiative','Regular')")

Unnamed: 0,Better_Cotton
0,Regular
1,Regular
2,Regular


In [39]:
fashion_dataset['Better_Cotton'] = fashion_dataset['Better_Cotton'].replace(['Regular', 'Better Cotton Initiative'], ['Cotton',None])

In [40]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Better_Cotton'])

In [41]:
fashion_dataset['Fabric'].isna().sum()

199

In [42]:
pd.unique(fashion_dataset['Dupatta_Fabric'])
fashion_dataset['Dupatta_Fabric'] = fashion_dataset['Dupatta_Fabric'].where(fashion_dataset['Dupatta_Fabric'] != 'NA', None)

In [43]:
sqldf("select Dupatta_Fabric from fashion_dataset where Dupatta_Fabric is not null and Fabric is null")

Unnamed: 0,Dupatta_Fabric
0,Net


In [44]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Dupatta_Fabric'])

In [45]:
fashion_dataset['Fabric'].isna().sum()

198

In [46]:
sqldf("select Saree_Fabric from fashion_dataset where Saree_Fabric is not null and Fabric is null")

Unnamed: 0,Saree_Fabric
0,Poly Georgette
1,Pure Linen
2,Pure Cotton
3,Cotton Blend
4,Cotton Blend
5,Pure Linen
6,Pure Cotton
7,Pure Chiffon
8,Poly Chiffon
9,Pure Cotton


In [47]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Saree_Fabric'])

In [109]:
fashion_dataset['Fabric'].isna().sum()

143

In [117]:
fashion_dataset.fillna({'Fabric':'fashion_dataset.mode()'}, inplace=True)
fashion_dataset['Fabric'].isna().sum()

0

In [49]:
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

Unnamed: 0,count(Fabric)
0,0


### Viewing Weave Type & Knit or Woven

In [50]:
pd.unique(fashion_dataset['Weave_Type'])

array([nan, 'Woven', 'Machine Weave', 'Knitted', 'Knitted and Woven',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'],
      dtype=object)

In [51]:
pd.unique(fashion_dataset['Weave_Type'])

array([nan, 'Woven', 'Machine Weave', 'Knitted', 'Knitted and Woven',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'],
      dtype=object)

In [52]:
pd.unique(fashion_dataset['Knit_or_Woven'])

array([nan, 'Knitted', 'Knitted and Woven', 'Woven'], dtype=object)

In [53]:
pd.unique(fashion_dataset['Weave_Pattern'])

array([nan, 'Regular', 'Jacquard', 'Brocade', 'Dobby', 'Khadi'],
      dtype=object)

### Formatting Weave Pattern  and Weave Type to fill Knit or Woven

In [54]:
fashion_dataset['Weave_Pattern'] = fashion_dataset['Weave_Pattern'].replace(['Regular', 'Jacquard', 'Brocade', 'Dobby', 'Khadi'], ['Woven','Woven','Woven','Woven','Woven'])

In [55]:
pd.unique(fashion_dataset['Weave_Pattern'])

array([nan, 'Woven'], dtype=object)

In [56]:
fashion_dataset['Weave_Type'] = fashion_dataset['Weave_Type'].replace(['Machine Weave',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'], ['Woven','Woven','Woven','Woven','Knitted','Woven','Woven'])

In [57]:
pd.unique(fashion_dataset['Weave_Type'])

array([nan, 'Woven', 'Knitted', 'Knitted and Woven'], dtype=object)

> Use 'Occasion', 'Print or Pattern Type', 'Pattern', 'Top Pattern', 'Bottom Pattern', 'Dupatta Pattern', 'Kurta Pattern', 'Weave Pattern', 'Knit or Woven', 'Sustainable', 'Sport', 'Fusion_Wear'

### Merging the datasets

In [58]:
merged_dataset = ps.sqldf("select fashion_dataset.p_id, fashion_dataset.name, fashion_dataset.price, fashion_dataset.colour,fashion_dataset.Colour_Family, brand_details.*, fashion_dataset.brand, fashion_dataset.ratingCount, fashion_dataset.avg_rating, fashion_dataset.Occasion, fashion_dataset.Print_Pattern_Type, fashion_dataset.Pattern, fashion_dataset.Top_Pattern, fashion_dataset.Bottom_Pattern, fashion_dataset.Dupatta_Pattern, fashion_dataset.Kurta_Pattern, fashion_dataset.Sustainable, fashion_dataset.Weave_Type, fashion_dataset.Knit_or_Woven,fashion_dataset.Weave_Pattern, fashion_dataset.Fabric, fashion_dataset.Fabric_Purity from fashion_dataset left join brand_details on (fashion_dataset.brand = brand_details.brand_name)")
merged_dataset

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand_id,brand_name,brand,ratingCount,avg_rating,...,Top_Pattern,Bottom_Pattern,Dupatta_Pattern,Kurta_Pattern,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,,242.0,Dupatta Bazaar,Dupatta Bazaar,1321.0,4.548827,...,,,,,,,,,,
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,,750.0,Roadster,Roadster,5462.0,4.313255,...,,,,,,,,,Cotton,
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,,389.0,Inddus,Inddus,145.0,4.068966,...,,Woven Design,Woven Design,Woven Design,,,,,Cotton Blend,
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,,,,SASSAFRAS,9124.0,4.147523,...,,,,,Regular,Woven,,,,
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,,482.0,Kotty,Kotty,12260.0,4.078467,...,,,,,,,,,Cotton,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14265,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,,880.0,The Chennai Silks,The Chennai Silks,,,...,,,,,Regular,,,,Jute Cotton,
14266,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,,471.0,Kinder Kids,Kinder Kids,,,...,Printed,Printed,,,Regular,,,,Cotton,
14267,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,,,,KLOTTHE,,,...,,,,,Regular,Woven,,,Polycotton,
14268,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,,,,InWeave,,,...,,,,,Regular,,Woven,,Viscose Rayon,


### Dropping unnecessary null values

In [59]:
merged_dataset.isna().sum()

p_id                      1
name                      2
price                     2
colour                    5
Colour_Family         13807
brand_id               6199
brand_name             6199
brand                     7
ratingCount            7707
avg_rating             7707
Occasion               1101
Print_Pattern_Type     4277
Pattern                3807
Top_Pattern           12112
Bottom_Pattern        11477
Dupatta_Pattern       12132
Kurta_Pattern         13657
Sustainable            1303
Weave_Type             8923
Knit_or_Woven         12353
Weave_Pattern         13087
Fabric                  143
Fabric_Purity         13727
dtype: int64

In [60]:
sqldf("select * from merged_dataset where p_id is null")

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand_id,brand_name,brand,ratingCount,avg_rating,...,Top_Pattern,Bottom_Pattern,Dupatta_Pattern,Kurta_Pattern,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity
0,,,,,,,,,,,...,,,,,,,,,,


In [61]:
merged_dataset.dropna(subset = "p_id", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      1
price                     1
colour                    4
Colour_Family         13806
brand_id               6198
brand_name             6198
brand                     6
ratingCount            7706
avg_rating             7706
Occasion               1100
Print_Pattern_Type     4276
Pattern                3806
Top_Pattern           12111
Bottom_Pattern        11476
Dupatta_Pattern       12131
Kurta_Pattern         13656
Sustainable            1302
Weave_Type             8922
Knit_or_Woven         12352
Weave_Pattern         13086
Fabric                  142
Fabric_Purity         13726
dtype: int64

In [62]:
sqldf("select * from merged_dataset where name is null")

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand_id,brand_name,brand,ratingCount,avg_rating,...,Top_Pattern,Bottom_Pattern,Dupatta_Pattern,Kurta_Pattern,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity
0,19068208.0,,,,,,,,,,...,,,,,,,,,,


In [63]:
merged_dataset.dropna(subset = "name", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
Colour_Family         13805
brand_id               6197
brand_name             6197
brand                     5
ratingCount            7705
avg_rating             7705
Occasion               1099
Print_Pattern_Type     4275
Pattern                3805
Top_Pattern           12110
Bottom_Pattern        11475
Dupatta_Pattern       12130
Kurta_Pattern         13655
Sustainable            1301
Weave_Type             8921
Knit_or_Woven         12351
Weave_Pattern         13085
Fabric                  141
Fabric_Purity         13725
dtype: int64

>Upon inspection, the rows where brand was null were duplicate rows, and were therefore dropped.

In [64]:
merged_dataset.dropna(subset = "brand", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
Colour_Family         13800
brand_id               6192
brand_name             6192
brand                     0
ratingCount            7703
avg_rating             7703
Occasion               1099
Print_Pattern_Type     4275
Pattern                3805
Top_Pattern           12105
Bottom_Pattern        11470
Dupatta_Pattern       12125
Kurta_Pattern         13650
Sustainable            1300
Weave_Type             8917
Knit_or_Woven         12350
Weave_Pattern         13080
Fabric                  141
Fabric_Purity         13720
dtype: int64

>Finding why there are null brand name and brand ID values

In [65]:
sqldf("select distinct brand from merged_dataset where brand_id is null")

Unnamed: 0,brand
0,SASSAFRAS
1,KASSUALLY
2,Saree mall
3,MONTREZ
4,DOLCE CRUDO
...,...
515,SARIYA
516,kasee
517,SHIVANGI clothing
518,ARTICALE


In [66]:
sqldf("select distinct brand, brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand,brand_id
0,SASSAFRAS,
1,KASSUALLY,
2,Saree mall,
3,MONTREZ,
4,DOLCE CRUDO,
...,...,...
1015,Zigo,1011.0
1016,Zima Leto,1013.0
1017,Zink London,1014.0
1018,Ziva Fashion,1016.0


In [67]:
sqldf("select distinct brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand_id
0,
1,1.0
2,2.0
3,3.0
4,7.0
...,...
496,1011.0
497,1013.0
498,1014.0
499,1016.0


In [68]:
sqldf("select brand_id from brand_details where brand_name = 'KASSUALLY'")

Unnamed: 0,brand_id


In [69]:
sqldf("select distinct * from merged_dataset where brand is null")

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand_id,brand_name,brand,ratingCount,avg_rating,...,Top_Pattern,Bottom_Pattern,Dupatta_Pattern,Kurta_Pattern,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity


In [70]:
sqldf("select * from merged_dataset where brand_name is null")

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand_id,brand_name,brand,ratingCount,avg_rating,...,Top_Pattern,Bottom_Pattern,Dupatta_Pattern,Kurta_Pattern,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity
0,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,,,,SASSAFRAS,9124.0,4.147523,...,,,,,Regular,Woven,,,,
1,12742100.0,KASSUALLY Women Black & Pink Printed Basic Jum...,2199.0,Black,,,,KASSUALLY,6297.0,4.349214,...,,,,,Regular,,,,Polyester,
2,13842966.0,Sassafras Brown & Red Geometric Printed George...,1499.0,Brown,,,,SASSAFRAS,7358.0,4.395352,...,,,,,Regular,Woven,,,Polyester,
3,16595858.0,Saree Mall Floral Saree,3599.0,Pink,,,,Saree mall,1005.0,3.980100,...,,,,,Regular,,,,Organza,
4,18601482.0,MONTREZ Women White Black Open Front Jacket,1999.0,White,,,,MONTREZ,61.0,4.377049,...,,,,,Regular,Woven,,,Cotton,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6187,18055840.0,tantkatha Black Front Closure Saree Blouse Wit...,1699.0,Black,,,,tantkatha,,,...,,,,,Regular,,,,Dupion,
6188,19361072.0,BoStreet Women Green Solid Mom Fit Trousers,2599.0,Green,,,,BoStreet,,,...,,,,,Regular,Knitted,,,Polyester,
6189,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,,,,KLOTTHE,,,...,,,,,Regular,Woven,,,Polycotton,
6190,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,,,,InWeave,,,...,,,,,Regular,,Woven,,Viscose Rayon,


In [71]:
sqldf("select count(distinct brand_id) from merged_dataset")

Unnamed: 0,count(distinct brand_id)
0,500


In [72]:
sqldf("select max(brand_id), brand_name from brand_details")

Unnamed: 0,max(brand_id),brand_name
0,1020,Zola


### Filling brand ID

>Getting rid of null values in brand and brand ID by assigning new IDs

In [73]:
new_id = sqldf("select distinct brand, brand_id from merged_dataset order by brand asc")
new_id

Unnamed: 0,brand,brand_id
0,109F,2.0
1,20Dresses,3.0
2,250 DESIGNS,
3,3PIN,
4,513,1.0
...,...,...
1015,trueBrowns,
1016,urSense,
1017,wild U,
1018,zebu,


In [74]:
new_id.loc[:, 'brandID'] = range(1, 1021)
new_id.drop('brand_id', axis = 'columns')

Unnamed: 0,brand,brandID
0,109F,1
1,20Dresses,2
2,250 DESIGNS,3
3,3PIN,4
4,513,5
...,...,...
1015,trueBrowns,1016
1016,urSense,1017
1017,wild U,1018
1018,zebu,1019


In [75]:
dataset_final = sqldf("select merged_dataset.*, new_id.brandID from merged_dataset left join new_id on (merged_dataset.brand = new_id.brand)")
dataset_final.drop(['brand_id','brand_name'], axis = 'columns', inplace = True)
dataset_final.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
Colour_Family         13800
brand                     0
ratingCount            7703
avg_rating             7703
Occasion               1099
Print_Pattern_Type     4275
Pattern                3805
Top_Pattern           12105
Bottom_Pattern        11470
Dupatta_Pattern       12125
Kurta_Pattern         13650
Sustainable            1300
Weave_Type             8917
Knit_or_Woven         12350
Weave_Pattern         13080
Fabric                  141
Fabric_Purity         13720
brandID                   0
dtype: int64

### Filling Print_Pattern_Type null values

In [76]:
pd.unique(dataset_final['Print_Pattern_Type'])

array(['Floral', 'Solid', 'Ethnic Motifs', None, 'Geometric', 'Graphic',
       'Self Design', 'Checked', 'Typography', 'Washed', 'Kalamkari',
       'Embellished', 'Humour and Comic', 'Animal', 'Paisley', 'Ribbed',
       'Striped', 'Bandhani', 'Abstract', 'Camouflage', 'Tie and Dye',
       'Woven Design', 'Colourblocked', 'Vertical Stripes', 'Polka Dots',
       'Leheriya', 'Cable Knit', 'Brand Logo', 'Open Knit',
       'Alphanumeric', 'Other', 'Conversational', 'NA', 'Chevron',
       'Ombre', 'Horizontal Stripes', 'Textured', 'Superhero', 'Warli',
       'Boucle', 'Batik', 'Quirky', 'Tribal', 'Ajrak', 'Embroidered',
       'Sequinned Stripes', 'Botanical', 'Shimmer', 'Bagh', 'Houndstooth',
       'Fair Isle', 'Dabu', 'Cartoon Characters', 'Argyle'], dtype=object)

In [77]:
sqldf("select Print_Pattern_Type, Pattern, Top_Pattern, Bottom_Pattern from dataset_final where Print_Pattern_Type is null and Top_Pattern is not null or Bottom_Pattern is not null or Pattern is not null")

Unnamed: 0,Print_Pattern_Type,Pattern,Top_Pattern,Bottom_Pattern
0,Floral,Embroidered,,
1,Solid,Solid,,
2,Ethnic Motifs,,,Woven Design
3,Solid,Solid,,
4,,Printed,,
...,...,...,...,...
13246,Floral,Woven Design,,
13247,,,Printed,Printed
13248,Floral,Printed,,
13249,Floral,Printed,,


In [78]:
dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final['Pattern'])

In [79]:
dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final['Bottom_Pattern'])

In [80]:
dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final['Top_Pattern'])

In [81]:
dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final['Kurta_Pattern'])

In [82]:
dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final['Dupatta_Pattern'])

In [83]:
dataset_final.drop(['Pattern','Top_Pattern','Bottom_Pattern', 'Kurta_Pattern','Dupatta_Pattern'], axis = 'columns', inplace=True)

In [84]:
dataset_final.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
Colour_Family         13800
brand                     0
ratingCount            7703
avg_rating             7703
Occasion               1099
Print_Pattern_Type     1000
Sustainable            1300
Weave_Type             8917
Knit_or_Woven         12350
Weave_Pattern         13080
Fabric                  141
Fabric_Purity         13720
brandID                   0
dtype: int64

### Filling Knit_or_Woven

In [85]:
#dataset_final['Weave_Type'] = dataset_final['Weave_Type'].combine_first(dataset_final['Weave_Pattern'])

In [86]:
#dataset_final['Weave_Type'] = dataset_final['Weave_Type'].combine_first(dataset_final['Knit_or_Woven'])

In [87]:
#dataset_final.isna().sum()

In [88]:
pd.unique(dataset_final['Weave_Type'])

array([None, 'Woven', 'Knitted', 'Knitted and Woven'], dtype=object)

In [89]:
fabrics_knit = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Knitted'")
fabrics_knit
#fabrics_knit.dropna(inplace =True)

Unnamed: 0,Fabric
0,Polyester
1,Cotton
2,Pure Cotton
3,Cotton Blend
4,
5,Viscose Rayon
6,Fleece
7,Art Silk
8,Poly Georgette
9,Linen


In [90]:
#fabrics_knit.loc[:, 'weave'] = 'Knitted'

In [91]:
sqldf("select Fabric, Weave_Type, Weave_pattern, Print_Pattern_Type, Knit_or_Woven, brand from dataset_final where Fabric = 'Other'")

Unnamed: 0,Fabric,Weave_Type,Weave_Pattern,Print_Pattern_Type,Knit_or_Woven,brand
0,Other,,,Floral,,Sangria
1,Other,Woven,,Solid,,La Aimee
2,Other,Woven,,Solid,,La Aimee
3,Other,,,Solid,,Swtantra
4,Other,,,Embroidered,,SALWAR STUDIO
5,Other,,,Solid,,Triyah
6,Other,,,Solid,,Swtantra
7,Other,,,Printed,,Triyah
8,Other,,,Solid,,Swtantra
9,Other,,,Printed,,Saaki


In [92]:
fabrics_weave = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Woven'")
fabrics_weave

Unnamed: 0,Fabric
0,
1,Polyester
2,Viscose Rayon
3,Cotton
4,Pure Cotton
5,Linen
6,Georgette
7,Poly Crepe
8,Modal
9,Poly Silk


In [93]:
sqldf("select * from fabrics_weave where Fabric in (select Fabric from fabrics_knit)")

Unnamed: 0,Fabric
0,Polyester
1,Viscose Rayon
2,Cotton
3,Pure Cotton
4,Linen
5,Georgette
6,Modal
7,Silk Blend
8,Cotton Blend
9,Nylon


In [94]:
#fabrics_weave.dropna(inplace = True)

In [95]:
#fabrics_weave.loc[:,'weave'] = 'Woven'

In [96]:
fabrics_both = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Knitted and Woven'")
fabrics_both
#fabrics_both.dropna(inplace=True)

Unnamed: 0,Fabric
0,Pure Cotton
1,Viscose Rayon
2,Fleece
3,Art Silk
4,Cotton
5,Polyester
6,Cotton Blend
7,Chanderi Silk
8,Cotton Silk
9,Satin


In [97]:
#fabrics_weave.loc[:,'weave'] = 'Knitted and Woven'

In [98]:
#dataset_final = sqldf("select dataset_final.*, fabrics_knit.weave from dataset_final left join fabrics_knit on (fabrics_knit.Fabric=dataset_final.Fabric)")
#dataset_final.isna().sum()

In [99]:
#dataset_final.drop(['Weave_Pattern','Knit_or_Woven'], axis = 'columns', inplace=True)

>Filling in remaining null values

In [100]:
sqldf("select * from dataset_final where colour is null")

Unnamed: 0,p_id,name,price,colour,Colour_Family,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Weave_Type,Knit_or_Woven,Weave_Pattern,Fabric,Fabric_Purity,brandID
0,19145038.0,Baisacrafts Women Pure Cotton Kurta with Trous...,5450.0,,,Baisacrafts,,,Festive,Geometric,Regular,Woven,,Woven,Pure Cotton,,109
1,19142060.0,LIVE OK Women Boyfriend Fit High-Rise Stretcha...,1999.0,,,LIVE OK,,,Casual,,Regular,,,,Cotton,,463
2,16124786.0,MANGO Women Hooded Sweatshirt,2390.0,,,MANGO,,,Casual,Solid,Regular,Knitted,,,Polyester,,498


In [101]:
avg_rating_mean = round(dataset_final["avg_rating"].mean(),6)

In [102]:
dataset_final["ratingCount"].median()

23.0

In [103]:
dataset_final["ratingCount"].mode()

0    5.0
Name: ratingCount, dtype: float64

In [104]:
ratingCount_mean = dataset_final["ratingCount"].mean()

In [105]:
dataset_final.fillna({"colour":"dataset_final.mode()","avg_rating":avg_rating_mean, "ratingCount":ratingCount_mean, "Print_Pattern_Type":"dataset_final.mode()", "Occasion":"dataset_final.mode()"}, inplace = True)
dataset_final.isna().sum()

p_id                      0
name                      0
price                     0
colour                    0
Colour_Family         13800
brand                     0
ratingCount               0
avg_rating                0
Occasion                  0
Print_Pattern_Type        0
Sustainable            1300
Weave_Type             8917
Knit_or_Woven         12350
Weave_Pattern         13080
Fabric                  141
Fabric_Purity         13720
brandID                   0
dtype: int64

In [106]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14263 entries, 0 to 14262
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p_id                14263 non-null  float64
 1   name                14263 non-null  object 
 2   price               14263 non-null  float64
 3   colour              14263 non-null  object 
 4   Colour_Family       463 non-null    object 
 5   brand               14263 non-null  object 
 6   ratingCount         14263 non-null  float64
 7   avg_rating          14263 non-null  float64
 8   Occasion            14263 non-null  object 
 9   Print_Pattern_Type  14263 non-null  object 
 10  Sustainable         12963 non-null  object 
 11  Weave_Type          5346 non-null   object 
 12  Knit_or_Woven       1913 non-null   object 
 13  Weave_Pattern       1183 non-null   object 
 14  Fabric              14122 non-null  object 
 15  Fabric_Purity       543 non-null    object 
 16  bran

>Exporting data for analysis using Apache MapReduce

In [107]:
dataset_final.to_csv("final_dataset.csv", index = False)

In [108]:
new_dataset = dataset_final.loc[:,['p_id', 'name','price','colour','brand','ratingCount','avg_rating','brand_id']]
new_dataset.to_csv("new_dataset.csv", index = False, header = False)

KeyError: "['brand_id'] not in index"

In [None]:
dataset_final.info()

# Machine Learning

## Regression Modelling

## Pre-processing

In [None]:
regression_data = dataset_final.loc[:,['price','avg_rating','ratingCount', 'brand_id','name','colour','brand','description','p_attributes']]

In [None]:
le = preprocessing.LabelEncoder()

for i in regression_data.columns:
        if regression_data[i].dtype == object:
            regression_data[i] = le.fit_transform(regression_data[i])
        else:
            pass

In [None]:
regression_data

In [None]:
x_reg = regression_data.loc[:,['price','avg_rating','ratingCount','name','colour','description','p_attributes']]
y_reg = regression_data.loc[:,['brand_id']]

>Creating the training and testing datasets

In [None]:
x_reg_train, x_reg_test, y_reg_train, y_reg_test = train_test_split(x_reg, y_reg, test_size=0.25)

print("Your independent training dataset contains ", x_reg_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_reg_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_reg_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_reg_test.shape, " rows and columns.")

### Linear Regression

>Training the model

In [None]:
LR = LinearRegression()

In [None]:
LR.fit(x_reg_train, y_reg_train)

>Testing the model

In [None]:
LR_predict = LR.predict(x_reg_test)
LR_predict

In [None]:
print("Linear Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, LR_predict))

### Bayesian Ridge Regression

In [None]:
BayRidge = BayesianRidge()
BayRidge.fit(x_reg_train, y_reg_train)

In [None]:
BayRidge_predict = BayRidge.predict(x_reg_test)
BayRidge_predict

In [None]:
print("Bayesian Ridge Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, BayRidge_predict))

In [None]:
dataset_final.info()

## Classification Modelling

>Pre-processing

In [None]:
classification_data = dataset_final.loc[:,['name','colour','brand', 'brand_id','description','p_attributes']]
classification_data

In [None]:
le = preprocessing.LabelEncoder()

for i in classification_data.columns:
        if classification_data[i].dtype == object:
            classification_data[i] = le.fit_transform(classification_data[i])
        else:
            pass

In [None]:
classification_data

>Creating the training datasets

In [None]:
x_class = classification_data.loc[:,['name','colour','description','p_attributes']]
y_class = classification_data.loc[:,['brand']]

In [None]:
x_class

In [None]:
x_class_train, x_class_test, y_class_train, y_class_test = train_test_split(x_class, y_class, test_size=0.25)

print("Your independent training dataset contains ", x_class_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_class_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_class_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_class_test.shape, " rows and columns.")

### Gaussian Naive Bayes Classifier

In [None]:
GNB = GaussianNB()

In [None]:
GNB.fit(x_class_train, y_class_train)

In [None]:
GNB_predict = GNB.predict(x_class_test)

In [None]:
print("Gaussian Naive Bayes Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, GNB_predict))*100,2),"%")

### Decision Tree Classifier

In [None]:
dtree = DecisionTreeClassifier()
dtree = dtree.fit(x_class_train, y_class_train)

In [None]:
dtree_predict = dtree.predict(x_class_test)

In [None]:
print("Decision Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, dtree_predict))*100,2),"%")