# Data Analysis

>Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Data Cleaning & Transformation

>Importing and viewing the fashion dataset


In [2]:
fashion_dataset= pd.read_csv(r".\fashion dataset.csv")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Finding the number of unique brands in the fashion dataset

In [3]:
len(pd.unique(fashion_dataset['brand']))

1021

>Importing the brand details dataset

In [4]:
brand_details = pd.read_excel(r".\fashion brand details.xlsx")
brand_details

Unnamed: 0,brand_id,brand_name
0,1,513
1,2,109F
2,3,20Dresses
3,4,250 Designs
4,5,3Pin
...,...,...
1015,1016,Ziva Fashion
1016,1017,Zivame
1017,1018,Ziyaa
1018,1019,Zoella


>Counting the number of unique brand names in the brand details dataset

In [5]:
len(pd.unique(brand_details['brand_name']))

1020

>Finding the number of null and duplicated values in each dataset

In [6]:
brand_details.isna().sum()

brand_id      0
brand_name    0
dtype: int64

In [7]:
fashion_dataset.isna().sum()

p_id              18
name              19
price             19
colour            22
brand             24
ratingCount     7748
avg_rating      7748
description       19
p_attributes      19
dtype: int64

In [8]:
brand_details.duplicated().sum()

0

In [9]:
fashion_dataset.duplicated().sum()

59

>Testing out duplicate dropping

In [10]:
df_dupl = fashion_dataset.drop_duplicates(keep = "last")
len(pd.unique(df_dupl["brand"]))

1021

>Dropping duplicates in fashion dataset

In [11]:
fashion_dataset.drop_duplicates(inplace = True, keep = "last")
fashion_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...,...,...,...
14324,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14325,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14326,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14327,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


>Merging the datasets

In [12]:
merged_dataset = ps.sqldf("select fashion_dataset.*, brand_details.* from fashion_dataset left join brand_details on (fashion_dataset.brand = brand_details.brand_name)")
merged_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'...",242.0,Dupatta Bazaar
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm...",750.0,Roadster
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt...",389.0,Inddus
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod...",,
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo...",482.0,Kotty
...,...,...,...,...,...,...,...,...,...,...,...
14265,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous...",880.0,The Chennai Silks
14266,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot...",471.0,Kinder Kids
14267,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...,,
14268,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42...",,


>Dropping unnecessary null values

In [13]:
merged_dataset.isna().sum()

p_id               1
name               2
price              2
colour             5
brand              7
ratingCount     7707
avg_rating      7707
description        2
p_attributes       2
brand_id        6199
brand_name      6199
dtype: int64

In [14]:
merged_dataset.head(5)

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'...",242.0,Dupatta Bazaar
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm...",750.0,Roadster
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt...",389.0,Inddus
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod...",,
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo...",482.0,Kotty


In [15]:
sqldf("select * from merged_dataset where p_id is null")

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name
0,,,,,,,,,,,


In [16]:
merged_dataset.dropna(subset = "p_id", inplace = True)
merged_dataset.isna().sum()

p_id               0
name               1
price              1
colour             4
brand              6
ratingCount     7706
avg_rating      7706
description        1
p_attributes       1
brand_id        6198
brand_name      6198
dtype: int64

In [17]:
sqldf("select * from merged_dataset where name is null")

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name
0,19068208.0,,,,,,,,,,


In [18]:
merged_dataset.dropna(subset = "name", inplace = True)
merged_dataset.isna().sum()

p_id               0
name               0
price              0
colour             3
brand              5
ratingCount     7705
avg_rating      7705
description        0
p_attributes       0
brand_id        6197
brand_name      6197
dtype: int64

>Upon inspection, the rows where brand was null were duplicate rows, and were therefore dropped.

In [19]:
merged_dataset.dropna(subset = "brand", inplace = True)
merged_dataset.isna().sum()

p_id               0
name               0
price              0
colour             3
brand              0
ratingCount     7703
avg_rating      7703
description        0
p_attributes       0
brand_id        6192
brand_name      6192
dtype: int64

>Finding why there are null brand name and brand ID values

In [20]:
sqldf("select distinct brand from merged_dataset where brand_id is null")

Unnamed: 0,brand
0,SASSAFRAS
1,KASSUALLY
2,Saree mall
3,MONTREZ
4,DOLCE CRUDO
...,...
515,SARIYA
516,kasee
517,SHIVANGI clothing
518,ARTICALE


In [21]:
sqldf("select distinct brand, brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand,brand_id
0,SASSAFRAS,
1,KASSUALLY,
2,Saree mall,
3,MONTREZ,
4,DOLCE CRUDO,
...,...,...
1015,Zigo,1011.0
1016,Zima Leto,1013.0
1017,Zink London,1014.0
1018,Ziva Fashion,1016.0


In [22]:
sqldf("select distinct brand_id from merged_dataset order by brand_id asc")

Unnamed: 0,brand_id
0,
1,1.0
2,2.0
3,3.0
4,7.0
...,...
496,1011.0
497,1013.0
498,1014.0
499,1016.0


In [23]:
sqldf("select brand_id from brand_details where brand_name = 'KASSUALLY'")

Unnamed: 0,brand_id


In [24]:
sqldf("select distinct * from merged_dataset where brand is null")

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name


In [25]:
sqldf("select * from merged_dataset where brand_name is null")

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id,brand_name
0,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod...",,
1,12742100.0,KASSUALLY Women Black & Pink Printed Basic Jum...,2199.0,Black,KASSUALLY,6297.0,4.349214,"Black printed basic jumpsuit, has a V-neck, lo...",{'Body or Garment Size': 'Garment Measurements...,,
2,13842966.0,Sassafras Brown & Red Geometric Printed George...,1499.0,Brown,SASSAFRAS,7358.0,4.395352,<ul><li>Brown and red crop wrap top</li><li>Pr...,"{'Body Shape ID': '333,424', 'Body or Garment ...",,
3,16595858.0,Saree Mall Floral Saree,3599.0,Pink,Saree mall,1005.0,3.980100,Dress up or dress down this modish saree for a...,"{'Blouse': 'Blouse Piece', 'Blouse Fabric': 'O...",,
4,18601482.0,MONTREZ Women White Black Open Front Jacket,1999.0,White,MONTREZ,61.0,4.377049,"White graphic printed open front jacket, has ...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod...",,
...,...,...,...,...,...,...,...,...,...,...,...
6187,18055840.0,tantkatha Black Front Closure Saree Blouse Wit...,1699.0,Black,tantkatha,,,Black Front Closure Blouse With Embellished Ne...,{'Body or Garment Size': 'Garment Measurements...,,
6188,19361072.0,BoStreet Women Green Solid Mom Fit Trousers,2599.0,Green,BoStreet,,,<ul><li>Green knitted parallel</li><li>Mom fit...,"{'Add-Ons': 'NA', 'Body or Garment Size': 'To-...",,
6189,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...,,
6190,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42...",,


In [26]:
sqldf("select count(distinct brand_id) from merged_dataset")

Unnamed: 0,count(distinct brand_id)
0,500


In [27]:
sqldf("select max(brand_id), brand_name from brand_details")

Unnamed: 0,max(brand_id),brand_name
0,1020,Zola


>Getting rid of null values in brand and brand ID by assigning new IDs

In [28]:
new_id = sqldf("select distinct brand, brand_id from merged_dataset order by brand asc")
new_id

Unnamed: 0,brand,brand_id
0,109F,2.0
1,20Dresses,3.0
2,250 DESIGNS,
3,3PIN,
4,513,1.0
...,...,...
1015,trueBrowns,
1016,urSense,
1017,wild U,
1018,zebu,


In [29]:
new_id.loc[:, 'brand_id'] = range(1, 1021)
new_id

Unnamed: 0,brand,brand_id
0,109F,1
1,20Dresses,2
2,250 DESIGNS,3
3,3PIN,4
4,513,5
...,...,...
1015,trueBrowns,1016
1016,urSense,1017
1017,wild U,1018
1018,zebu,1019


In [30]:
dataset_final = sqldf("select p_id, merged_dataset.name, merged_dataset.price, merged_dataset.colour, merged_dataset.brand, merged_dataset.ratingCount, merged_dataset.avg_rating, merged_dataset.description, merged_dataset.p_attributes, new_id.brand_id from merged_dataset left join new_id on (merged_dataset.brand = new_id.brand)")
dataset_final.isna().sum()

p_id               0
name               0
price              0
colour             3
brand              0
ratingCount     7703
avg_rating      7703
description        0
p_attributes       0
brand_id           0
dtype: int64

>Filling in remaining null values

In [31]:
sqldf("select * from dataset_final where colour is null")

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id
0,19145038.0,Baisacrafts Women Pure Cotton Kurta with Trous...,5450.0,,Baisacrafts,,,Solid Kurta with Trousers with dupatta<br><br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",109
1,19142060.0,LIVE OK Women Boyfriend Fit High-Rise Stretcha...,1999.0,,LIVE OK,,,"<ul> <li> Dark shade, no fade jeans </li> <li...","{'Add-Ons': 'NA', 'Body or Garment Size': 'To-...",463
2,16124786.0,MANGO Women Hooded Sweatshirt,2390.0,,MANGO,,,"Solid sweatshirt has a hooded, short drop shou...","{'Body Shape ID': '443,424,324', 'Body or Garm...",498


In [32]:
avg_rating_mean = round(dataset_final["avg_rating"].mean(),6)

In [33]:
dataset_final["ratingCount"].median()

23.0

In [34]:
dataset_final["ratingCount"].mode()

0    5.0
Name: ratingCount, dtype: float64

In [35]:
ratingCount_mean = dataset_final["ratingCount"].mean()

In [36]:
dataset_final.fillna({"colour":"dataset_final.mode()","avg_rating":avg_rating_mean, "ratingCount":ratingCount_mean}, inplace = True)
dataset_final.isna().sum()

p_id            0
name            0
price           0
colour          0
brand           0
ratingCount     0
avg_rating      0
description     0
p_attributes    0
brand_id        0
dtype: int64

In [37]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14263 entries, 0 to 14262
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   p_id          14263 non-null  float64
 1   name          14263 non-null  object 
 2   price         14263 non-null  float64
 3   colour        14263 non-null  object 
 4   brand         14263 non-null  object 
 5   ratingCount   14263 non-null  float64
 6   avg_rating    14263 non-null  float64
 7   description   14263 non-null  object 
 8   p_attributes  14263 non-null  object 
 9   brand_id      14263 non-null  int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 1.1+ MB


>Exporting data for analysis using Apache Spark

In [38]:
dataset_final.to_csv("final_dataset.csv", index = False)

In [67]:
new_dataset = dataset_final.loc[:,['p_id', 'name','price','colour','brand','ratingCount','avg_rating','brand_id']]
new_dataset.to_csv("new_dataset.csv", index = False, header = False)

In [40]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14263 entries, 0 to 14262
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   p_id          14263 non-null  float64
 1   name          14263 non-null  object 
 2   price         14263 non-null  float64
 3   colour        14263 non-null  object 
 4   brand         14263 non-null  object 
 5   ratingCount   14263 non-null  float64
 6   avg_rating    14263 non-null  float64
 7   description   14263 non-null  object 
 8   p_attributes  14263 non-null  object 
 9   brand_id      14263 non-null  int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 1.1+ MB


# Machine Learning

## Regression Modelling

## Pre-processing

In [41]:
regression_data = dataset_final.loc[:,['price','avg_rating','ratingCount', 'brand_id','name','colour','brand','description','p_attributes']]

In [42]:
le = preprocessing.LabelEncoder()

for i in regression_data.columns:
        if regression_data[i].dtype == object:
            regression_data[i] = le.fit_transform(regression_data[i])
        else:
            pass

In [43]:
regression_data

Unnamed: 0,price,avg_rating,ratingCount,brand_id,name,colour,brand,description,p_attributes
0,899.0,4.548827,1321.000000,221,2987,47,220,13718,13085
1,1199.0,4.313255,5462.000000,702,9061,27,701,10086,9354
2,5799.0,4.068966,145.000000,363,4866,33,362,11203,12678
3,1499.0,4.147523,9124.000000,720,9333,2,719,6656,1703
4,1999.0,4.078467,12260.000000,446,5812,2,445,6076,4887
...,...,...,...,...,...,...,...,...,...
14258,3999.0,4.101193,183.785366,830,11241,34,829,732,4942
14259,2050.0,4.101193,183.785366,439,5759,3,438,6820,5376
14260,1659.0,4.101193,183.785366,423,5656,17,422,2965,10500
14261,2399.0,4.101193,183.785366,362,4839,36,361,1784,455


In [44]:
x_reg = regression_data.loc[:,['price','avg_rating','ratingCount','name','colour','description','p_attributes']]
y_reg = regression_data.loc[:,['brand_id']]

>Creating the training and testing datasets

In [45]:
x_reg_train, x_reg_test, y_reg_train, y_reg_test = train_test_split(x_reg, y_reg, test_size=0.25)

print("Your independent training dataset contains ", x_reg_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_reg_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_reg_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_reg_test.shape, " rows and columns.")

Your independent training dataset contains  (10697, 7)  rows and columns.
Your independent testing dataset contains  (3566, 7)  rows and columns.
Your dependent training dataset contains  (10697, 1)  rows and columns.
Your dependent testing dataset contains  (3566, 1)  rows and columns.


### Linear Regression

>Training the model

In [46]:
LR = LinearRegression()

In [47]:
LR.fit(x_reg_train, y_reg_train)

LinearRegression()

>Testing the model

In [48]:
LR_predict = LR.predict(x_reg_test)
LR_predict

array([[149.68382877],
       [288.50145653],
       [ 96.37101945],
       ...,
       [192.75725189],
       [219.55851387],
       [936.87822163]])

In [49]:
print("Linear Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, LR_predict))

Linear Regressor
Mean Absolute Error:  15.206787803899974


### Bayesian Ridge Regression

In [50]:
BayRidge = BayesianRidge()
BayRidge.fit(x_reg_train, y_reg_train)

  y = column_or_1d(y, warn=True)


BayesianRidge()

In [51]:
BayRidge_predict = BayRidge.predict(x_reg_test)
BayRidge_predict

array([149.73384683, 288.5362661 ,  96.00550286, ..., 194.5087062 ,
       219.45627164, 936.83707284])

In [52]:
print("Bayesian Ridge Regressor")
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_reg_test, BayRidge_predict))

Bayesian Ridge Regressor
Mean Absolute Error:  15.214003232317511


In [53]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14263 entries, 0 to 14262
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   p_id          14263 non-null  float64
 1   name          14263 non-null  object 
 2   price         14263 non-null  float64
 3   colour        14263 non-null  object 
 4   brand         14263 non-null  object 
 5   ratingCount   14263 non-null  float64
 6   avg_rating    14263 non-null  float64
 7   description   14263 non-null  object 
 8   p_attributes  14263 non-null  object 
 9   brand_id      14263 non-null  int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 1.1+ MB


## Classification Modelling

>Pre-processing

In [54]:
classification_data = dataset_final.loc[:,['name','colour','brand', 'brand_id','description','p_attributes']]
classification_data

Unnamed: 0,name,colour,brand,brand_id,description,p_attributes
0,Dupatta Bazaar White Embroidered Chiffon Dupatta,White,Dupatta Bazaar,221,White embroidered&nbsp;dupattaChiffon<br>Hand-...,"{'Occasion': 'Daily', 'Pattern': 'Embroidered'..."
1,Roadster Women Mustard Yellow Solid Hooded Swe...,Mustard,Roadster,702,"Mustard yellow solid sweatshirt, has a hood, t...","{'Body Shape ID': '443,424,324', 'Body or Garm..."
2,Inddus Peach-Coloured & Beige Unstitched Dress...,Peach,Inddus,363,Peach-Coloured and beige woven design unstitch...,"{'Bottom Fabric': 'Cotton Blend', 'Bottom Patt..."
3,SASSAFRAS Women Black Parallel Trousers,Black,SASSAFRAS,720,"Black solid woven high-rise parallel trousers,...","{'Add-Ons': 'NA', 'Body Shape ID': '424', 'Bod..."
4,Kotty Women Black Wide Leg High-Rise Clean Loo...,Black,Kotty,446,"Black dark wash 4-pocket high-rise jeans, clea...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Clo..."
...,...,...,...,...,...,...
14258,The Chennai Silks Pink & Silver-Toned Floral Z...,Pink,The Chennai Silks,830,<b>Design Details </b><ul><li>Pink and silver-...,"{'Better Cotton Initiative': 'Regular', 'Blous..."
14259,Kinder Kids Girls Blue & Green Printed Foil Pr...,Blue,Kinder Kids,439,"Blue and green printed lehenga choli, has foi...","{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."
14260,KLOTTHE Women Green & Black Floral Printed Pal...,Green,KLOTTHE,423,<ul> <li> Green and black woven palazzos </li>...,{'Body or Garment Size': 'To-Fit Denotes Body ...
14261,InWeave Women Red Printed A-Line Skirt,Red,InWeave,362,"<p>Red printed A-line skirt, has drawstring cl...","{'Add-Ons': 'NA', 'Body Shape ID': '324,333,42..."


In [55]:
le = preprocessing.LabelEncoder()

for i in classification_data.columns:
        if classification_data[i].dtype == object:
            classification_data[i] = le.fit_transform(classification_data[i])
        else:
            pass

In [56]:
classification_data

Unnamed: 0,name,colour,brand,brand_id,description,p_attributes
0,2987,47,220,221,13718,13085
1,9061,27,701,702,10086,9354
2,4866,33,362,363,11203,12678
3,9333,2,719,720,6656,1703
4,5812,2,445,446,6076,4887
...,...,...,...,...,...,...
14258,11241,34,829,830,732,4942
14259,5759,3,438,439,6820,5376
14260,5656,17,422,423,2965,10500
14261,4839,36,361,362,1784,455


>Creating the training datasets

In [57]:
x_class = classification_data.loc[:,['name','colour','description','p_attributes']]
y_class = classification_data.loc[:,['brand']]

In [58]:
x_class

Unnamed: 0,name,colour,description,p_attributes
0,2987,47,13718,13085
1,9061,27,10086,9354
2,4866,33,11203,12678
3,9333,2,6656,1703
4,5812,2,6076,4887
...,...,...,...,...
14258,11241,34,732,4942
14259,5759,3,6820,5376
14260,5656,17,2965,10500
14261,4839,36,1784,455


In [59]:
x_class_train, x_class_test, y_class_train, y_class_test = train_test_split(x_class, y_class, test_size=0.25)

print("Your independent training dataset contains ", x_class_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_class_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_class_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_class_test.shape, " rows and columns.")

Your independent training dataset contains  (10697, 4)  rows and columns.
Your independent testing dataset contains  (3566, 4)  rows and columns.
Your dependent training dataset contains  (10697, 1)  rows and columns.
Your dependent testing dataset contains  (3566, 1)  rows and columns.


### Gaussian Naive Bayes Classifier

In [60]:
GNB = GaussianNB()

In [61]:
GNB.fit(x_class_train, y_class_train)

  y = column_or_1d(y, warn=True)


GaussianNB()

In [62]:
GNB_predict = GNB.predict(x_class_test)

In [63]:
print("Gaussian Naive Bayes Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, GNB_predict))*100,2),"%")

Gaussian Naive Bayes Classifier
Accuracy: 84.02 %


### Decision Tree Classifier

In [64]:
dtree = DecisionTreeClassifier()
dtree = dtree.fit(x_class_train, y_class_train)

In [65]:
dtree_predict = dtree.predict(x_class_test)

In [66]:
print("Decision Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, dtree_predict))*100,2),"%")

Decision Classifier
Accuracy: 91.36 %
