# Data Analysis

>Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Data Cleaning & Transformation

>Importing and viewing the fashion dataset


In [2]:
fashion_dataset= pd.read_csv(r".\fashion dataset.csv")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Finding the number of unique brands in the fashion dataset

In [3]:
len(pd.unique(fashion_dataset['brand']))

1021

>Importing the brand details dataset

In [4]:
brand_details = pd.read_excel(r".\fashion brand details.xlsx")
brand_details

Unnamed: 0,brand_id,brand_name
0,1,513
1,2,109F
2,3,20Dresses
3,4,250 Designs
4,5,3Pin
...,...,...
1015,1016,Ziva Fashion
1016,1017,Zivame
1017,1018,Ziyaa
1018,1019,Zoella


>Counting the number of unique brand names in the brand details dataset

In [5]:
len(pd.unique(brand_details['brand_name']))

1020

>Finding the number of null and duplicated values in each dataset

In [6]:
brand_details.isna().sum()

brand_id      0
brand_name    0
dtype: int64

In [7]:
fashion_dataset.isna().sum()

p_id              18
name              19
price             19
colour            22
brand             24
ratingCount     7748
avg_rating      7748
description       19
p_attributes      19
dtype: int64

In [8]:
brand_details.duplicated().sum()

0

In [9]:
fashion_dataset.duplicated().sum()

59

>Testing out duplicate dropping

In [10]:
df_dupl = fashion_dataset.drop_duplicates(keep = "last")
len(pd.unique(df_dupl["brand"]))

1021

>Dropping duplicates in fashion dataset

In [11]:
fashion_dataset.drop_duplicates(inplace = True, keep = "last")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Dropping duplicate product names and IDs

In [12]:
fashion_dataset.drop_duplicates(subset=['name','p_id'], inplace=True)

In [13]:
fashion_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14223 entries, 0 to 14328
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   p_id          14222 non-null  float64
 1   name          14221 non-null  object 
 2   price         14221 non-null  float64
 3   colour        14218 non-null  object 
 4   brand         14218 non-null  object 
 5   ratingCount   6535 non-null   float64
 6   avg_rating    6535 non-null   float64
 7   description   14221 non-null  object 
 8   p_attributes  14221 non-null  object 
dtypes: float64(4), object(5)
memory usage: 1.1+ MB


> Expanding the p_attributes column


In [14]:
fashion_dataset['p_attributes'] = fashion_dataset['p_attributes']
not_null = pd.notnull(fashion_dataset['p_attributes'])
fashion2 = fashion_dataset[not_null]['p_attributes'].apply(lambda x: ast.literal_eval(x))
temp = pd.DataFrame([*fashion2], fashion2.index)
fashion_dataset = pd.concat([fashion_dataset, temp], axis=1)

>Viewing all columns of the dataset

In [15]:
print(list(fashion_dataset.columns))

['p_id', 'name', 'price', 'colour', 'brand', 'ratingCount', 'avg_rating', 'description', 'p_attributes', 'Occasion', 'Pattern', 'Print or Pattern Type', 'Body Shape ID', 'Body or Garment Size', 'Closure', 'Fabric', 'Hemline', 'Hood', 'Length', 'Neck', 'Number of Pockets', 'Pocket', 'Sleeve Length', 'Surface Styling', 'Type', 'Wash Care', 'Bottom Fabric', 'Bottom Pattern', 'Dupatta Border', 'Dupatta Fabric', 'Dupatta Pattern', 'Kurta Fabric', 'Kurta Pattern', 'Ornamentation', 'Set Content', 'Stitch', 'Technique', 'Add-Ons', 'Brand Fit Name', 'Character', 'Fabric 2', 'Features', 'Fit', 'Fly Type', 'Main Trend', 'Multipack Set', 'Sustainable', 'Type of Pleat', 'Waist Rise', 'Weave Type', 'Distress', 'Effects', 'Fabric 3', 'Fade', 'Reversible', 'Shade', 'Stretch', 'Type of Distress', 'Waistband', 'Center Front Open', 'Fabric Type', 'Lining', 'Sleeve Styling', 'Transparency', 'taxMaterial', 'Front Styling', 'Knit or Woven', 'Sport', 'Technology', 'Blouse Closure', 'Blouse Fabric', 'Choli St

>Renaming columns for use in SQL and MapReduce

In [16]:
fashion_dataset.rename(columns={'Where-to-wear':'Where_to_wear','Dupatta Pattern':'Dupatta_Pattern', 'Kurta Pattern':'Kurta_Pattern','Bottom Pattern': 'Bottom_Pattern', 'Top Pattern': 'Top_Pattern','Print or Pattern Type': 'Print_Pattern_Type', 'Knit or Woven':'Knit_or_Woven', 'Weave Type':'Weave_Type', 'Weave Pattern':'Weave_Pattern','Dupatta Fabric':'Dupatta_Fabric','Bottom Fabric':'Bottom_Fabric', 'Top Fabric':'Top_Fabric','Fabric Purity':'Fabric_Purity', 'Fabric Type':'Fabric_Type','Blouse Fabric':'Blouse_Fabric', 'Better Cotton Initiative':'Better_Cotton', 'Saree Fabric':'Saree_Fabric', 'Colour Family':'Colour_Family'}, inplace=True)

In [17]:
fashion_dataset['Kurta_Pattern'].isna().sum()

13610

In [18]:
fashion_dataset['Weave_Type'].isna().sum()

8884

In [19]:
fashion_dataset['Knit_or_Woven'].isna().sum()

12345

In [20]:
fashion_dataset['Weave_Pattern'].isna().sum()

13040

### Filling the Fabric column

In [21]:
pd.unique(fashion_dataset['Fabric'])

array([nan, 'Cotton', 'Polyester', 'Poly Crepe', 'Polyviscose',
       'Viscose Rayon', 'Wool', 'Pure Cotton', 'Cotton Blend',
       'Poly Chiffon', 'Linen', 'Poly Georgette', 'Georgette',
       'Silk Blend', 'Fleece', 'Net', 'Modal', 'Art Silk', 'Acrylic',
       'Other', 'Polycotton', 'Nylon', 'Denim', 'Leather', 'Velvet',
       'Liva', 'Poly Silk', 'Silk', 'Pure Silk', 'PU', 'Cotton Silk',
       'Corduroy', 'Organza', 'Viscose Georgette', 'Wool Blend',
       'Chiffon', 'Organic Cotton', 'Brocade', 'Faux Fur', 'Suede',
       'Satin', 'Pure Georgette', 'Chanderi Silk', 'Crepe', 'Tencel',
       'Elastane', 'Dupion', 'Jacquard', 'Synthetic Leather', 'Voile',
       'Dupion Silk', 'Velour', 'Pashmina', 'Pure Crepe', 'Lyocell',
       'Raw Silk', 'Polyester PU Coated', 'Hemp', 'Livaeco'], dtype=object)

In [22]:
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

Unnamed: 0,count(Fabric)
0,0


In [23]:
fashion_dataset['Fabric'].isna().sum()

3876

In [24]:
pd.unique(fashion_dataset['Bottom_Fabric'])

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Satin',
       'Polyester', 'Poly Georgette', 'Shantoon', 'Silk Georgette',
       'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Art Silk', 'Pure Silk', 'Pure Wool', 'Poly Chiffon', 'Net',
       'Silk Crepe', 'Silk Chiffon', 'Linen', 'NA', 'Pashmina',
       'Jute Cotton', 'Nylon', 'Velvet', 'Voile'], dtype=object)

In [25]:
fashion_dataset['Bottom_Fabric'] = fashion_dataset['Bottom_Fabric'].where(fashion_dataset['Bottom_Fabric'] != 'NA', None)

In [26]:
pd.unique(fashion_dataset['Blouse_Fabric'])

array([nan, 'Viscose Rayon', 'Organza', 'Poly Silk', 'Silk Blend',
       'Cotton', 'Linen Blend', 'Poly Georgette', 'Pure Georgette',
       'Art Silk', 'Net', 'Silk', 'Satin', 'NA', 'Pure Silk', 'Polyester',
       'Brocade', 'Tissue', 'Cotton Blend', 'Silk Cotton', 'Velvet',
       'Poly Crepe', 'Pure Cotton', 'Raw Silk', 'Polycotton',
       'Pure Chiffon', 'Jute Silk', 'Poly Chiffon', 'Liva', 'Pure Crepe',
       'Pure Linen', 'Supernet', 'Jute Cotton'], dtype=object)

In [27]:
fashion_dataset['Blouse_Fabric'] = fashion_dataset['Blouse_Fabric'].where(fashion_dataset['Blouse_Fabric'] != 'NA', None)

In [28]:
pd.unique(fashion_dataset['Top_Fabric'])

array([nan, 'Cotton Blend', 'Viscose Rayon', 'Pure Cotton', 'Polyester',
       'Liva', 'Poly Crepe', 'Organic Cotton', 'Poly Silk', 'Silk Blend',
       'Poly Georgette', 'Art Silk', 'Pure Wool', 'Net', 'Silk Chiffon',
       'Linen', 'Poly Chanderi', 'Pure Silk', 'Chanderi Cotton',
       'Chanderi Silk', 'Acrylic', 'Silk Georgette', 'Nylon', 'Velvet',
       'Poly Chiffon', 'Voile'], dtype=object)

In [29]:
pd.unique(fashion_dataset['Fabric_Type'])

array([nan, 'Georgette', 'NA', 'Denim', 'Cotton', 'Lace', 'Crepe',
       'Chiffon', 'Cotton Cambric', 'Net', 'Dobby', 'Scuba', 'Satin',
       'Liva', 'Twill', 'Linen', 'Chambray', 'Velvet', 'Jacquard',
       'Corduroy', 'Schiffli', 'Terry'], dtype=object)

In [30]:
fashion_dataset['Fabric_Type'] = fashion_dataset['Fabric_Type'].where(fashion_dataset['Fabric_Type'] != 'NA', None)

In [31]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Top_Fabric'])

In [32]:
fashion_dataset['Fabric'].isna().sum()

2702

In [33]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Bottom_Fabric'])

In [34]:
fashion_dataset['Fabric'].isna().sum()

2070

In [35]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Fabric_Type'])

In [36]:
fashion_dataset['Fabric'].isna().sum()

2022

In [37]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Blouse_Fabric'])

In [38]:
fashion_dataset['Fabric'].isna().sum()

199

In [39]:
pd.unique(fashion_dataset['Better_Cotton'])

array([nan, 'Regular', 'Better Cotton Initiative'], dtype=object)

In [40]:
sqldf("select Better_Cotton from fashion_dataset where Fabric is null and Better_Cotton in('Better Cotton Initiative','Regular')")

Unnamed: 0,Better_Cotton
0,Regular
1,Regular
2,Regular


In [41]:
fashion_dataset['New_Cotton'] = fashion_dataset['Better_Cotton'].replace(['Regular', 'Better Cotton Initiative'], ['Cotton',None])

In [42]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['New_Cotton'])

In [43]:
fashion_dataset['Fabric'].isna().sum()

196

In [44]:
pd.unique(fashion_dataset['Dupatta_Fabric'])
fashion_dataset['Dupatta_Fabric'] = fashion_dataset['Dupatta_Fabric'].where(fashion_dataset['Dupatta_Fabric'] != 'NA', None)

In [45]:
sqldf("select Dupatta_Fabric from fashion_dataset where Dupatta_Fabric is not null and Fabric is null")

Unnamed: 0,Dupatta_Fabric
0,Net


In [46]:
sqldf("select Weave_Pattern, Fabric from fashion_dataset")

Unnamed: 0,Weave_Pattern,Fabric
0,,
1,,Cotton
2,,Cotton Blend
3,,
4,,Cotton
...,...,...
14218,,Jute Cotton
14219,,Cotton
14220,,Polycotton
14221,,Viscose Rayon


In [47]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Weave_Pattern'])

In [48]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Dupatta_Fabric'])

In [49]:
fashion_dataset['Fabric'].isna().sum()

194

In [50]:
sqldf("select Saree_Fabric from fashion_dataset where Saree_Fabric is not null and Fabric is null")

Unnamed: 0,Saree_Fabric
0,Poly Georgette
1,Pure Linen
2,Pure Cotton
3,Cotton Blend
4,Cotton Blend
5,Pure Linen
6,Pure Cotton
7,Pure Chiffon
8,Poly Chiffon
9,Pure Cotton


In [51]:
fashion_dataset['Fabric'] = fashion_dataset['Fabric'].combine_first(fashion_dataset['Saree_Fabric'])

In [52]:
fashion_dataset['Fabric'].isna().sum()

139

In [53]:
fashion_dataset['Fabric'].mode()

0    Cotton
Name: Fabric, dtype: object

In [54]:
fashion_dataset.fillna({'Fabric':'Cotton'}, inplace=True)
fashion_dataset['Fabric'].isna().sum()

0

In [55]:
sqldf("select count(Fabric) from fashion_dataset where Fabric = 'NA'")

Unnamed: 0,count(Fabric)
0,0


### Viewing Weave Pattern, Weave Type & Knit or Woven

In [56]:
pd.unique(fashion_dataset['Weave_Type'])

array([nan, 'Woven', 'Machine Weave', 'Knitted', 'Knitted and Woven',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'],
      dtype=object)

In [57]:
pd.unique(fashion_dataset['Weave_Type'])

array([nan, 'Woven', 'Machine Weave', 'Knitted', 'Knitted and Woven',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'],
      dtype=object)

In [58]:
pd.unique(fashion_dataset['Knit_or_Woven'])

array([nan, 'Knitted', 'Knitted and Woven', 'Woven'], dtype=object)

In [59]:
pd.unique(fashion_dataset['Weave_Pattern'])

array([nan, 'Regular', 'Jacquard', 'Brocade', 'Dobby', 'Khadi'],
      dtype=object)

### Formatting Weave Pattern  and Weave Type to fill Knit or Woven

In [60]:
fashion_dataset['Weave_Pattern_2'] = fashion_dataset['Weave_Pattern'].replace(['Regular', 'Jacquard', 'Brocade', 'Dobby', 'Khadi'], ['Woven','Woven','Woven','Woven','Woven'])

In [61]:
pd.unique(fashion_dataset['Weave_Pattern_2'])

array([nan, 'Woven'], dtype=object)

In [62]:
fashion_dataset['Weave_Type_2'] = fashion_dataset['Weave_Type'].replace(['Machine Weave',
       'Velvet', 'Denim', 'Handloom', 'Lace', 'Chambray', 'Corduroy'], ['Woven','Woven','Woven','Woven','Knitted','Woven','Woven'])

In [63]:
pd.unique(fashion_dataset['Weave_Type_2'])

array([nan, 'Woven', 'Knitted', 'Knitted and Woven'], dtype=object)

### Filling Knit_or_Woven

In [64]:
fashion_dataset['Knit_or_Woven'] = fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['Weave_Pattern_2'])

In [65]:
fashion_dataset['Knit_or_Woven'] = fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['Weave_Type_2'])

In [66]:
fashion_dataset['Knit_or_Woven'].isna().sum()

7006

In [67]:
pd.unique(fashion_dataset['Knit_or_Woven'])

array([nan, 'Woven', 'Knitted', 'Knitted and Woven'], dtype=object)

In [68]:
len(pd.unique(fashion_dataset['Fabric']))

75

In [69]:
fabrics_knit = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Knitted'")

In [70]:
fabrics_knit.isna().sum()

Fabric    0
dtype: int64

In [71]:
fabrics_weave = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Woven'")

In [72]:
fabrics_weave.isna().sum()

Fabric    0
dtype: int64

In [73]:
fabrics_both = sqldf("select distinct Fabric from fashion_dataset where Knit_or_Woven = 'Knitted and Woven'")

In [74]:
fabrics_both.isna().sum()

Fabric    0
dtype: int64

In [75]:
knit_n_woven = sqldf("select * from fabrics_knit where fabrics_knit.Fabric in(select fabrics_weave.Fabric from fabrics_weave)")

In [76]:
knit_n_woven = sqldf("select * from knit_n_woven where Fabric not in ('Other')")
knit_n_woven

Unnamed: 0,Fabric
0,Cotton
1,Polyester
2,Denim
3,Pure Cotton
4,Cotton Blend
5,Viscose Rayon
6,Fleece
7,Acrylic
8,Linen
9,Nylon


In [77]:
fabrics_both = sqldf("select * from fabrics_both where Fabric not in (select Fabric from knit_n_woven)")

In [78]:
both_fabrics = ps.sqldf("select * from knit_n_woven union all select * from fabrics_both")

In [79]:
both_fabrics.loc[:,'weave'] = 'Knitted and Woven'

In [80]:
fabrics_knit = sqldf("select * from fabrics_knit where Fabric not in (select Fabric from both_fabrics) and Fabric != 'Other'")
fabrics_knit.loc[:,['weave']]='Knitted'

In [81]:
fabrics_knit

Unnamed: 0,Fabric,weave
0,Scuba,Knitted
1,Velour,Knitted


In [82]:
fabrics_weave = sqldf("select * from fabrics_weave where Fabric not in (select Fabric from both_fabrics) and Fabric != 'Other'")
fabrics_weave.loc[:,['weave']]='Woven'

In [83]:
fabrics_weave

Unnamed: 0,Fabric,weave
0,Poly Crepe,Woven
1,Crepe,Woven
2,Poly Silk,Woven
3,Poly Georgette,Woven
4,Art Silk,Woven
5,Leather,Woven
6,Regular,Woven
7,Net,Woven
8,Silk Chiffon,Woven
9,Cotton Silk,Woven


In [84]:
other_fabrics = sqldf("select Fabric from fashion_dataset where Fabric = 'Other'")

In [85]:
other_woven_fabrics, other_knit_fabrics = train_test_split(other_fabrics, test_size = 0.5,random_state=1)

In [86]:
other_woven_fabrics.loc[:,['weave']]='Woven'

In [87]:
other_knit_fabrics.loc[:,['weave']]= 'Knit'

In [88]:
other_fabrics = sqldf("select * from other_knit_fabrics union all select * from other_woven_fabrics")

In [89]:
fabric_weave_type = sqldf("select * from fabrics_weave union all select * from fabrics_knit union all select * from both_fabrics union all select * from other_fabrics")
fabric_weave_type

Unnamed: 0,Fabric,weave
0,Poly Crepe,Woven
1,Crepe,Woven
2,Poly Silk,Woven
3,Poly Georgette,Woven
4,Art Silk,Woven
...,...,...
97,Other,Woven
98,Other,Woven
99,Other,Woven
100,Other,Woven


In [90]:
fabric_weave_type.duplicated().sum()

48

In [91]:
fashion_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14223 entries, 0 to 14328
Columns: 118 entries, p_id to Weave_Type_2
dtypes: float64(4), object(114)
memory usage: 12.9+ MB


In [92]:
fashion_dataset = sqldf("select fashion_dataset.*, fabric_weave_type.weave from fashion_dataset left join fabric_weave_type on (fabric_weave_type.Fabric=fashion_dataset.Fabric)")

In [93]:
fashion_dataset.drop_duplicates(inplace=True)

In [94]:
fashion_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14273 entries, 0 to 16672
Columns: 119 entries, p_id to weave
dtypes: float64(4), object(115)
memory usage: 13.1+ MB


In [95]:
fashion_dataset['Knit_or_Woven'] = fashion_dataset['Knit_or_Woven'].combine_first(fashion_dataset['weave'])

In [96]:
fashion_dataset['Knit_or_Woven'].isna().sum()

478

### Filling Print_Pattern_Type null values

In [97]:
pd.unique(fashion_dataset['Print_Pattern_Type'])

array(['Floral', 'Solid', 'Ethnic Motifs', None, 'Geometric', 'Graphic',
       'Self Design', 'Checked', 'Typography', 'Washed', 'Kalamkari',
       'Embellished', 'Humour and Comic', 'Animal', 'Paisley', 'Ribbed',
       'Striped', 'Bandhani', 'Abstract', 'Camouflage', 'Tie and Dye',
       'Woven Design', 'Colourblocked', 'Vertical Stripes', 'Polka Dots',
       'Leheriya', 'Cable Knit', 'Brand Logo', 'Open Knit',
       'Alphanumeric', 'Other', 'Conversational', 'NA', 'Chevron',
       'Ombre', 'Horizontal Stripes', 'Textured', 'Superhero', 'Warli',
       'Boucle', 'Batik', 'Quirky', 'Tribal', 'Ajrak', 'Embroidered',
       'Sequinned Stripes', 'Botanical', 'Shimmer', 'Bagh', 'Houndstooth',
       'Fair Isle', 'Dabu', 'Cartoon Characters', 'Argyle'], dtype=object)

In [98]:
sqldf("select Print_Pattern_Type, Pattern, Top_Pattern, Bottom_Pattern from fashion_dataset where Print_Pattern_Type is null and Top_Pattern is not null or Bottom_Pattern is not null or Pattern is not null")

Unnamed: 0,Print_Pattern_Type,Pattern,Top_Pattern,Bottom_Pattern
0,Floral,Embroidered,,
1,Solid,Solid,,
2,Ethnic Motifs,,,Woven Design
3,Solid,Solid,,
4,,Printed,,
...,...,...,...,...
13254,Floral,Woven Design,,
13255,,,Printed,Printed
13256,Floral,Printed,,
13257,Floral,Printed,,


In [99]:
fashion_dataset['Print_Pattern_Type'] = fashion_dataset['Print_Pattern_Type'].combine_first(fashion_dataset['Pattern'])

In [100]:
fashion_dataset['Print_Pattern_Type'] = fashion_dataset['Print_Pattern_Type'].combine_first(fashion_dataset['Bottom_Pattern'])

In [101]:
fashion_dataset['Print_Pattern_Type'] = fashion_dataset['Print_Pattern_Type'].combine_first(fashion_dataset['Top_Pattern'])

In [102]:
fashion_dataset['Print_Pattern_Type'] = fashion_dataset['Print_Pattern_Type'].combine_first(fashion_dataset['Kurta_Pattern'])

In [103]:
fashion_dataset['Print_Pattern_Type'] = fashion_dataset['Print_Pattern_Type'].combine_first(fashion_dataset['Dupatta_Pattern'])

In [104]:
#fashion_dataset.drop(['Pattern','Top_Pattern','Bottom_Pattern', 'Kurta_Pattern','Dupatta_Pattern'], axis = 'columns', inplace=True)

In [105]:
fashion_dataset['Print_Pattern_Type'].isna().sum()

1002

### Filling remaining null values in pattern, weave, and occasion with mode

In [106]:
fashion_dataset['Print_Pattern_Type'].mode()

0    Solid
Name: Print_Pattern_Type, dtype: object

In [107]:
fashion_dataset['Occasion'].mode()

0    Casual
Name: Occasion, dtype: object

In [108]:
fashion_dataset['Knit_or_Woven'].mode()

0    Woven
Name: Knit_or_Woven, dtype: object

In [109]:
fashion_dataset.fillna({'Print_Pattern_Type':'Solid','Occasion':'Casual','Knit_or_Woven':'Woven'}, inplace=True)
fashion_dataset[['Print_Pattern_Type','Occasion','Knit_or_Woven']].isna().sum()

Print_Pattern_Type    0
Occasion              0
Knit_or_Woven         0
dtype: int64

### Merging the datasets

In [4]:
merged_dataset = ps.sqldf("select fashion_dataset.p_id, fashion_dataset.name, fashion_dataset.price, fashion_dataset.colour, brand_details.*, fashion_dataset.brand, fashion_dataset.ratingCount, fashion_dataset.avg_rating, fashion_dataset.Occasion, fashion_dataset.Print_Pattern_Type, fashion_dataset.Sustainable, fashion_dataset.Knit_or_Woven, fashion_dataset.Fabric, fashion_dataset.Fabric_Purity from fashion_dataset left join brand_details on (fashion_dataset.brand = brand_details.brand_name)")
merged_dataset

Unnamed: 0,p_id,name,price,colour,brand_id,brand_name,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,242.0,Dupatta Bazaar,Dupatta Bazaar,1321.0,4.548827,Daily,Floral,,Knitted and Woven,Cotton,
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,750.0,Roadster,Roadster,5462.0,4.313255,Casual,Solid,,Knitted and Woven,Cotton,
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,389.0,Inddus,Inddus,145.0,4.068966,Daily,Ethnic Motifs,,Knitted and Woven,Cotton Blend,
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,,,SASSAFRAS,9124.0,4.147523,Casual,Solid,Regular,Woven,Cotton,
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,482.0,Kotty,Kotty,12260.0,4.078467,Casual,Solid,,Knitted and Woven,Cotton,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14268,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,880.0,The Chennai Silks,The Chennai Silks,,,Festive,Floral,Regular,Woven,Jute Cotton,
14269,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,471.0,Kinder Kids,Kinder Kids,,,Casual,Printed,Regular,Knitted and Woven,Cotton,
14270,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,,,KLOTTHE,,,Western,Floral,Regular,Woven,Polycotton,
14271,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,,,InWeave,,,Casual,Floral,Regular,Woven,Viscose Rayon,


### Dropping unnecessary null values

In [5]:
merged_dataset.isna().sum()

p_id                      1
name                      2
price                     2
colour                    5
brand_id               6192
brand_name             6192
brand                     5
ratingCount            7725
avg_rating             7725
Occasion                  0
Print_Pattern_Type        8
Sustainable            1302
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13729
dtype: int64

In [6]:
sqldf("select * from merged_dataset where p_id is null")

Unnamed: 0,p_id,name,price,colour,brand_id,brand_name,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity
0,,,,,,,,,,Casual,Solid,,Knitted and Woven,Cotton,


In [7]:
merged_dataset.dropna(subset = "p_id", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      1
price                     1
colour                    4
brand_id               6191
brand_name             6191
brand                     4
ratingCount            7724
avg_rating             7724
Occasion                  0
Print_Pattern_Type        8
Sustainable            1301
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13728
dtype: int64

In [8]:
sqldf("select * from merged_dataset where name is null")

Unnamed: 0,p_id,name,price,colour,brand_id,brand_name,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity
0,19068208.0,,,,,,,,,Casual,Solid,,Knitted and Woven,Cotton,


In [9]:
merged_dataset.dropna(subset = "name", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
brand_id               6190
brand_name             6190
brand                     3
ratingCount            7723
avg_rating             7723
Occasion                  0
Print_Pattern_Type        8
Sustainable            1300
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13727
dtype: int64

>Upon inspection, the rows where brand was null were duplicate rows, and were therefore dropped.

In [10]:
merged_dataset.dropna(subset = "brand", inplace = True)
merged_dataset.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
brand_id               6187
brand_name             6187
brand                     0
ratingCount            7722
avg_rating             7722
Occasion                  0
Print_Pattern_Type        8
Sustainable            1300
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13724
dtype: int64

>Finding why there are null brand name and brand ID values

In [11]:
sqldf("select distinct brand from merged_dataset where brand_id is null")

Unnamed: 0,brand
0,SASSAFRAS
1,KASSUALLY
2,Saree mall
3,MONTREZ
4,DOLCE CRUDO
...,...
515,SARIYA
516,kasee
517,SHIVANGI clothing
518,ARTICALE


In [12]:
sqldf("select distinct brand, brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand,brand_id
0,SASSAFRAS,
1,KASSUALLY,
2,Saree mall,
3,MONTREZ,
4,DOLCE CRUDO,
...,...,...
1015,Zigo,1011.0
1016,Zima Leto,1013.0
1017,Zink London,1014.0
1018,Ziva Fashion,1016.0


In [13]:
sqldf("select distinct brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand_id
0,
1,1.0
2,2.0
3,3.0
4,7.0
...,...
496,1011.0
497,1013.0
498,1014.0
499,1016.0


In [14]:
sqldf("select brand_id from brand_details where brand_name = 'KASSUALLY'")

Unnamed: 0,brand_id


In [15]:
sqldf("select distinct * from merged_dataset where brand is null")

Unnamed: 0,p_id,name,price,colour,brand_id,brand_name,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity


In [16]:
sqldf("select * from merged_dataset where brand_name is null")

Unnamed: 0,p_id,name,price,colour,brand_id,brand_name,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity
0,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,,,SASSAFRAS,9124.0,4.147523,Casual,Solid,Regular,Woven,Cotton,
1,12742100.0,KASSUALLY Women Black & Pink Printed Basic Jum...,2199.0,Black,,,KASSUALLY,6297.0,4.349214,Casual,Printed,Regular,Knitted and Woven,Polyester,
2,13842966.0,Sassafras Brown & Red Geometric Printed George...,1499.0,Brown,,,SASSAFRAS,7358.0,4.395352,Casual,Geometric,Regular,Woven,Polyester,
3,16595858.0,Saree Mall Floral Saree,3599.0,Pink,,,Saree mall,1005.0,3.980100,Party,Floral,Regular,Woven,Organza,
4,18601482.0,MONTREZ Women White Black Open Front Jacket,1999.0,White,,,MONTREZ,61.0,4.377049,Casual,Graphic,Regular,Woven,Cotton,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6182,18055840.0,tantkatha Black Front Closure Saree Blouse Wit...,1699.0,Black,,,tantkatha,,,Party,Solid,Regular,Woven,Dupion,
6183,19361072.0,BoStreet Women Green Solid Mom Fit Trousers,2599.0,Green,,,BoStreet,,,Casual,Solid,Regular,Knitted,Polyester,
6184,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,,,KLOTTHE,,,Western,Floral,Regular,Woven,Polycotton,
6185,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,,,InWeave,,,Casual,Floral,Regular,Woven,Viscose Rayon,


In [17]:
sqldf("select count(distinct brand_id) from merged_dataset")

Unnamed: 0,count(distinct brand_id)
0,500


In [18]:
sqldf("select max(brand_id), brand_name from brand_details")

Unnamed: 0,max(brand_id),brand_name
0,1020,Zola


### Filling brand ID

>Getting rid of null values in brand and brand ID by assigning new IDs

In [19]:
new_id = sqldf("select distinct brand, brand_id from merged_dataset order by brand asc")
new_id

Unnamed: 0,brand,brand_id
0,109F,2.0
1,20Dresses,3.0
2,250 DESIGNS,
3,3PIN,
4,513,1.0
...,...,...
1015,trueBrowns,
1016,urSense,
1017,wild U,
1018,zebu,


In [20]:
new_id.loc[:, 'brandID'] = range(1, 1021)
new_id.drop('brand_id', axis = 'columns')

Unnamed: 0,brand,brandID
0,109F,1
1,20Dresses,2
2,250 DESIGNS,3
3,3PIN,4
4,513,5
...,...,...
1015,trueBrowns,1016
1016,urSense,1017
1017,wild U,1018
1018,zebu,1019


In [21]:
merged_dataset = sqldf("select merged_dataset.*, new_id.brandID from merged_dataset left join new_id on (merged_dataset.brand = new_id.brand)")
merged_dataset.drop(['brand_id','brand_name'], axis = 'columns', inplace = True)
merged_dataset.isna().sum()
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14268 entries, 0 to 14267
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p_id                14268 non-null  float64
 1   name                14268 non-null  object 
 2   price               14268 non-null  float64
 3   colour              14265 non-null  object 
 4   brand               14268 non-null  object 
 5   ratingCount         6546 non-null   float64
 6   avg_rating          6546 non-null   float64
 7   Occasion            14268 non-null  object 
 8   Print_Pattern_Type  14260 non-null  object 
 9   Sustainable         12968 non-null  object 
 10  Knit_or_Woven       14268 non-null  object 
 11  Fabric              14268 non-null  object 
 12  Fabric_Purity       544 non-null    object 
 13  brandID             14268 non-null  int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 1.5+ MB


In [3]:
merged_dataset.isna().sum()

p_id                      0
name                      0
price                     0
colour                    3
brand                     0
ratingCount            7722
avg_rating             7722
Occasion                  0
Print_Pattern_Type        8
Sustainable            1300
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13724
brandID                   0
dtype: int64

### Filling Fabric_Purity and Sustainable columns

In [4]:
pd.unique(merged_dataset['Fabric_Purity'])

array([nan, 'Blended', 'Pure', 'Synthetic'], dtype=object)

In [5]:
pd.unique(merged_dataset['Sustainable'])

array([nan, 'Regular', 'Sustainable'], dtype=object)

> Creating new column for fabric purity based on sustainable

In [6]:
merged_dataset.loc[:,['pur']]=merged_dataset['Sustainable'].replace(['Regular', 'Sustainable'],['Synthetic','Pure'])

>Filling Fabric Purity with new values

In [7]:
merged_dataset['Fabric_Purity'].isna().sum()

13724

In [8]:
merged_dataset['Fabric_Purity']=merged_dataset['Fabric_Purity'].combine_first(merged_dataset['pur'])

In [9]:
merged_dataset['Fabric_Purity'].isna().sum()

1239

>Creating new column for sustainable values based on  fabric purity values

In [10]:
merged_dataset.loc[:,['sus']]=merged_dataset['Fabric_Purity'].replace(['Blended', 'Pure', 'Synthetic'],['Regular', 'Sustainable','Regular'])

>Filling Sustainable with new values

In [11]:
merged_dataset['Sustainable'].isna().sum()

1300

In [12]:
merged_dataset['Sustainable']=merged_dataset['Sustainable'].combine_first(merged_dataset['sus'])

In [13]:
merged_dataset['Sustainable'].isna().sum()

1239

>Dropping newly created columns

In [14]:
merged_dataset.drop(['pur','sus'], axis='columns', inplace=True)

### Filling in remaining null values

In [15]:
print(merged_dataset['colour'].mode())
print(round(merged_dataset['avg_rating'].mean(),6))
print(merged_dataset['ratingCount'].mean())
print(merged_dataset['Sustainable'].mode())
print(merged_dataset['Fabric_Purity'].mode())

0    Black
Name: colour, dtype: object
4.100788
183.90742438130155
0    Regular
Name: Sustainable, dtype: object
0    Synthetic
Name: Fabric_Purity, dtype: object


In [16]:
merged_dataset.fillna({'colour':'Black',"avg_rating":4.101193, "ratingCount":183.36121433078594, 'Sustainable':'Regular','Fabric_Purity':'Synthetic'}, inplace = True)
merged_dataset.isna().sum()

p_id                  0
name                  0
price                 0
colour                0
brand                 0
ratingCount           0
avg_rating            0
Occasion              0
Print_Pattern_Type    8
Sustainable           0
Knit_or_Woven         0
Fabric                0
Fabric_Purity         0
brandID               0
dtype: int64

### Converting ratingCount to int

In [17]:
merged_dataset['ratingCount']=merged_dataset['ratingCount'].astype(int)

In [18]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14268 entries, 0 to 14267
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p_id                14268 non-null  float64
 1   name                14268 non-null  object 
 2   price               14268 non-null  float64
 3   colour              14268 non-null  object 
 4   brand               14268 non-null  object 
 5   ratingCount         14268 non-null  int32  
 6   avg_rating          14268 non-null  float64
 7   Occasion            14268 non-null  object 
 8   Print_Pattern_Type  14260 non-null  object 
 9   Sustainable         14268 non-null  object 
 10  Knit_or_Woven       14268 non-null  object 
 11  Fabric              14268 non-null  object 
 12  Fabric_Purity       14268 non-null  object 
 13  brandID             14268 non-null  int64  
dtypes: float64(3), int32(1), int64(1), object(9)
memory usage: 1.5+ MB


In [19]:
merged_dataset.to_csv('./new_dataset.csv',index=False)

# Machine Learning

## Creating ML dataset

In [3]:
dataset_final = merged_dataset

## Pre-processing

In [4]:
le = preprocessing.LabelEncoder()

for i in dataset_final.columns:
        if dataset_final[i].dtype == object:
            dataset_final[i] = le.fit_transform(dataset_final[i])
        else:
            pass

In [5]:
dataset_final

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity,brandID
0,1518329.0,2986,899.0,47,220,1321,4.548827,1,24,0,2,7,2,221
1,5829334.0,9059,1199.0,27,701,5462,4.313255,0,43,0,2,7,2,702
2,10340119.0,4865,5799.0,33,362,145,4.068966,1,22,0,2,8,2,363
3,10856380.0,9330,1499.0,2,719,9124,4.147523,0,43,0,3,7,2,720
4,12384822.0,5811,1999.0,2,445,12260,4.078467,0,43,0,2,7,2,446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,17029604.0,11238,3999.0,34,829,183,4.101193,3,24,0,3,20,2,830
14264,17600212.0,5758,2050.0,3,438,183,4.101193,0,37,0,2,7,2,439
14265,18159266.0,5655,1659.0,17,422,183,4.101193,11,24,0,3,41,2,423
14266,18921114.0,4838,2399.0,36,361,183,4.101193,0,24,0,3,71,2,362


In [6]:
dataset_final.columns

Index(['p_id', 'name', 'price', 'colour', 'brand', 'ratingCount', 'avg_rating',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'brandID'],
      dtype='object')

# Machine Learning

## Regression Modelling

## Pre-processing

In [None]:
regression_data = dataset_final

In [None]:
le = preprocessing.LabelEncoder()

for i in regression_data.columns:
        if regression_data[i].dtype == object:
            regression_data[i] = le.fit_transform(regression_data[i])
        else:
            pass

In [None]:
regression_data

In [None]:
x_reg = regression_data.loc[:,[ 'p_id','avg_rating', 'colour', 'ratingCount',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'price']]
y_reg = regression_data.loc[:,['brand_id']]

>Creating the training and testing datasets

In [None]:
x_reg_train, x_reg_test, y_reg_train, y_reg_test = train_test_split(x_reg, y_reg, test_size=0.25)

print("Your independent training dataset contains ", x_reg_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_reg_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_reg_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_reg_test.shape, " rows and columns.")

### Linear Regression

>Training the model

In [None]:
LR = LinearRegression()

In [None]:
LR.fit(x_reg_train, y_reg_train)

>Testing the model

In [None]:
LR_predict = LR.predict(x_reg_test)
LR_predict

In [None]:
print("Linear Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, LR_predict))

### Bayesian Ridge Regression

In [None]:
BayRidge = BayesianRidge()
BayRidge.fit(x_reg_train, y_reg_train)

In [None]:
BayRidge_predict = BayRidge.predict(x_reg_test)
BayRidge_predict

In [None]:
print("Bayesian Ridge Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, BayRidge_predict))

In [None]:
dataset_final.info()

## Classification Modelling

>Pre-processing

In [None]:
classification_data = dataset_final
classification_data

In [None]:
le = preprocessing.LabelEncoder()

for i in classification_data.columns:
        if classification_data[i].dtype == object:
            classification_data[i] = le.fit_transform(classification_data[i])
        else:
            pass

In [None]:
classification_data

>Creating the training datasets

In [None]:
x_class = classification_data.loc[:,[ 'p_id', 'name','avg_rating', 'colour', 'ratingCount',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'price']]
y_class = classification_data.loc[:,['brand']]

In [None]:
x_class

In [None]:
x_class_train, x_class_test, y_class_train, y_class_test = train_test_split(x_class, y_class, test_size=0.25)

print("Your independent training dataset contains ", x_class_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_class_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_class_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_class_test.shape, " rows and columns.")

### Gaussian Naive Bayes Classifier

In [None]:
GNB = GaussianNB()

In [None]:
GNB.fit(x_class_train, y_class_train)

In [None]:
GNB_predict = GNB.predict(x_class_test)

In [None]:
print("Gaussian Naive Bayes Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, GNB_predict))*100,2),"%")

### Decision Tree Classifier

In [None]:
dtree = DecisionTreeClassifier()
dtree = dtree.fit(x_class_train, y_class_train)

In [None]:
dtree_predict = dtree.predict(x_class_test)

In [None]:
print("Decision Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, dtree_predict))*100,2),"%")