### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

### Importing data

In [2]:
merged_dataset = pd.read_csv(r"./new_dataset.csv")

# Machine Learning

## Creating ML dataset

In [3]:
dataset_final = merged_dataset

## Pre-processing

In [4]:
le = preprocessing.LabelEncoder()

for i in dataset_final.columns:
        if dataset_final[i].dtype == object:
            dataset_final[i] = le.fit_transform(dataset_final[i])
        else:
            pass

In [5]:
dataset_final

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity,brandID
0,1518329.0,2986,899.0,47,220,1321,4.548827,1,24,0,2,7,2,221
1,5829334.0,9059,1199.0,27,701,5462,4.313255,0,43,0,2,7,2,702
2,10340119.0,4865,5799.0,33,362,145,4.068966,1,22,0,2,8,2,363
3,10856380.0,9330,1499.0,2,719,9124,4.147523,0,43,0,3,7,2,720
4,12384822.0,5811,1999.0,2,445,12260,4.078467,0,43,0,2,7,2,446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,17029604.0,11238,3999.0,34,829,183,4.101193,3,24,0,3,20,2,830
14264,17600212.0,5758,2050.0,3,438,183,4.101193,0,37,0,2,7,2,439
14265,18159266.0,5655,1659.0,17,422,183,4.101193,11,24,0,3,41,2,423
14266,18921114.0,4838,2399.0,36,361,183,4.101193,0,24,0,3,71,2,362


In [6]:
dataset_final.columns

Index(['p_id', 'name', 'price', 'colour', 'brand', 'ratingCount', 'avg_rating',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'brandID'],
      dtype='object')

## Regression Modelling

>Selecting x & y variables

In [7]:
x_regression = dataset_final.loc[:,['p_id', 'name', 'colour', 'brand', 'brandID', 'ratingCount',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'price']]

In [8]:
y_regression = dataset_final.loc[:,['avg_rating']]

>Creating training & testing datasets

In [9]:
x_reg_train, x_reg_test, y_reg_train, y_reg_test = train_test_split(x_regression, y_regression, test_size=0.25, random_state=23)

print("Your independent training dataset contains ", x_reg_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_reg_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_reg_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_reg_test.shape, " rows and columns.")

Your independent training dataset contains  (10701, 13)  rows and columns.
Your independent testing dataset contains  (3567, 13)  rows and columns.
Your dependent training dataset contains  (10701, 1)  rows and columns.
Your dependent testing dataset contains  (3567, 1)  rows and columns.


### Linear Regression

>Training the model

In [10]:
LR = LinearRegression()

In [11]:
LR.fit(x_reg_train, y_reg_train)

LinearRegression()

>Testing the model

In [12]:
LR_predict = LR.predict(x_reg_test)
LR_predict

array([[4.11616567],
       [4.08136338],
       [4.11260547],
       ...,
       [4.1264848 ],
       [4.09208029],
       [4.0706807 ]])

### Bayesian Ridge Regression

In [13]:
BayRidge = BayesianRidge()
BayRidge.fit(x_reg_train, y_reg_train)

  y = column_or_1d(y, warn=True)


BayesianRidge()

In [14]:
BayRidge_predict = BayRidge.predict(x_reg_test)
BayRidge_predict

array([4.10684468, 4.09751242, 4.09457996, ..., 4.14321696, 4.1045506 ,
       4.07961781])

### Comparing Error Values

In [15]:
print("Linear Regressor MAE: ", metrics.mean_absolute_error(y_reg_test, LR_predict))
print("Bayesian Ridge Regressor MAE: ", metrics.mean_absolute_error(y_reg_test, BayRidge_predict))

Linear Regressor MAE:  0.163897769316485
Bayesian Ridge Regressor MAE:  0.16194423146395756


## Classification Modelling

### Choosing the x & y variables

> To check for brand

In [16]:
x_class_brand = dataset_final.loc[:,['p_id', 'brandID','avg_rating', 'colour', 'ratingCount',
       'Occasion', 'Print_Pattern_Type', 'Sustainable', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'price']]

In [17]:
y_class_brand = dataset_final.loc[:,['brand']]

>To check for sustainability

In [18]:
x_class_sustain = dataset_final.loc[:,['p_id', 'brand','brandID', 'avg_rating', 'colour', 'ratingCount',
       'Occasion', 'Print_Pattern_Type', 'Knit_or_Woven',
       'Fabric', 'Fabric_Purity', 'price']]

In [19]:
y_class_sustain = dataset_final.loc[:,['Sustainable']]

### Creating training & testing datasets

> To check for brand

In [20]:
x_train_brand, x_test_brand, y_train_brand, y_test_brand = train_test_split(x_class_brand, y_class_brand, test_size=0.25, random_state=23)

print("Your independent training dataset contains ", x_train_brand.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_test_brand.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_train_brand.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_test_brand.shape, " rows and columns.")

Your independent training dataset contains  (10701, 12)  rows and columns.
Your independent testing dataset contains  (3567, 12)  rows and columns.
Your dependent training dataset contains  (10701, 1)  rows and columns.
Your dependent testing dataset contains  (3567, 1)  rows and columns.


>To check for sustainability

In [21]:
x_train_sustain, x_test_sustain, y_train_sustain, y_test_sustain = train_test_split(x_class_sustain, y_class_sustain, test_size=0.25, random_state=23)

print("Your independent training dataset contains ", x_train_sustain.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_test_sustain.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_train_sustain.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_test_sustain.shape, " rows and columns.")

Your independent training dataset contains  (10701, 12)  rows and columns.
Your independent testing dataset contains  (3567, 12)  rows and columns.
Your dependent training dataset contains  (10701, 1)  rows and columns.
Your dependent testing dataset contains  (3567, 1)  rows and columns.


### Gaussian Naive Bayes Classifier for brand

In [22]:
GNB = GaussianNB()

In [23]:
GNB.fit(x_train_brand, y_train_brand)

  y = column_or_1d(y, warn=True)


GaussianNB()

In [24]:
GNB_predict1 = GNB.predict(x_test_brand)

### Decision Tree Classifier for brand

In [25]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train_brand, y_train_brand)

DecisionTreeClassifier()

In [26]:
dtree_predict1 = dtree.predict(x_test_brand)

### Random Forest Classifier for brand

In [40]:
rforest = RandomForestClassifier(n_estimators=60)
rforest.fit(x_train_brand, y_train_brand)

  rforest.fit(x_train_brand, y_train_brand)


RandomForestClassifier(n_estimators=60)

In [41]:
rforest_predict1 = rforest.predict(x_test_brand)

### Comparing Accuracy Scores

In [42]:
print("Gaussian Naive Bayes Classifier Accuracy:", round((metrics.accuracy_score(y_test_brand, GNB_predict1))*100,2),"%")
print("Decision Classifier Accuracy:", round((metrics.accuracy_score(y_test_brand, dtree_predict1))*100,2),"%")
print("Random Forest Classifier Accuracy:", round((metrics.accuracy_score(y_test_brand, rforest_predict1))*100,2),"%")

Gaussian Naive Bayes Classifier Accuracy: 20.21 %
Decision Classifier Accuracy: 94.28 %
Random Forest Classifier Accuracy: 73.76 %


### Gaussian Naive Bayes Classifier for sustainability

In [29]:
GNB = GaussianNB()

In [30]:
GNB.fit(x_train_sustain, y_train_sustain)

  y = column_or_1d(y, warn=True)


GaussianNB()

In [31]:
GNB_predict2 = GNB.predict(x_test_sustain)

### Decision Tree Classifier for sustainability

In [32]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train_sustain, y_train_sustain)

DecisionTreeClassifier()

In [33]:
dtree_predict2 = dtree.predict(x_test_sustain)

### Random Forest Classifier for sustainability

In [40]:
rforest = RandomForestClassifier(n_estimators=60)
rforest.fit(x_train_sustain, y_train_sustain)

  rforest.fit(x_train_brand, y_train_brand)


RandomForestClassifier(n_estimators=60)

In [41]:
rforest_predict2 = rforest.predict(x_test_sustain)

### Comparing Accuracy Scores

In [34]:
print("Gaussian Naive Bayes Classifier Accuracy:", round((metrics.accuracy_score(y_test_sustain, GNB_predict2))*100,2),"%")
print("Decision Classifier Accuracy:", round((metrics.accuracy_score(y_test_sustain, dtree_predict2))*100,2),"%")
print("Random Forest Classifier Accuracy:", round((metrics.accuracy_score(y_test_brand, rforest_predict2))*100,2),"%")

Gaussian Naive Bayes Classifier Accuracy: 97.39 %
Decision Classifier Accuracy: 99.5 %
