# Data Analysis

>Importing libraries

In [None]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Data Cleaning & Transformation

>Importing and viewing the fashion dataset


In [None]:
# Load the product catalogue from the working directory.
# A plain relative filename is used instead of the Windows-only ".\" prefix so
# the notebook also runs on Linux/macOS, where "\" is not a path separator.
fashion_dataset = pd.read_csv("fashion dataset.csv")
fashion_dataset

>Finding the number of unique brands in the fashion dataset

In [None]:
# Number of distinct values in the brand column (a missing value counts as one).
len(fashion_dataset['brand'].unique())

>Importing the brand details dataset

In [None]:
# Load the brand reference table from the working directory.
# A plain relative filename is used instead of the Windows-only ".\" prefix so
# the notebook also runs on Linux/macOS, where "\" is not a path separator.
brand_details = pd.read_excel("fashion brand details.xlsx")
brand_details

>Counting the number of unique brand names in the brand details dataset

In [None]:
# Number of distinct values in brand_name (a missing value counts as one).
len(brand_details['brand_name'].unique())

>Finding the number of null and duplicated values in each dataset

In [None]:
# Per-column count of missing values in the brand details table.
brand_details.isnull().sum()

In [None]:
# Per-column count of missing values in the product table.
fashion_dataset.isnull().sum()

In [None]:
# Number of fully duplicated rows (every occurrence after the first) in brand_details.
brand_details.duplicated(keep="first").sum()

In [None]:
# Number of fully duplicated rows (every occurrence after the first) in fashion_dataset.
fashion_dataset.duplicated(keep="first").sum()

>Testing out duplicate dropping

In [None]:
# Dry run on a copy: drop duplicate rows (keeping the last occurrence) and
# check how many distinct brands remain afterwards.
df_dupl = fashion_dataset.drop_duplicates(keep="last")
len(df_dupl["brand"].unique())

>Dropping duplicates in fashion dataset

In [None]:
# Remove duplicate rows for real, keeping the last occurrence of each.
fashion_dataset = fashion_dataset.drop_duplicates(keep="last")
fashion_dataset

> Expanding the p_attributes column


In [None]:
# Expand the p_attributes column into one column per attribute.
# Each non-null entry is a Python-literal string (presumably a dict of
# attribute name -> value - confirm against the raw data); literal_eval parses
# it without executing arbitrary code.
not_null = pd.notnull(fashion_dataset['p_attributes'])
fash3 = fashion_dataset.loc[not_null, 'p_attributes'].apply(ast.literal_eval)
# Re-use the original row index so the new columns line up with their rows;
# rows whose p_attributes was null get NaN in every expanded column.
temp = pd.DataFrame([*fash3], fash3.index)
fashion_dataset = pd.concat([fashion_dataset, temp], axis=1)

>Renaming columns for use in SQL and MapReduce

In [None]:
# Column names containing spaces are awkward in SQL queries, so give the
# attribute columns used later underscore-separated names.
column_renames = {
    'Dupatta Pattern': 'Dupatta_Pattern',
    'Kurta Pattern': 'Kurta_Pattern',
    'Bottom Pattern': 'Bottom_Pattern',
    'Top Pattern': 'Top_Pattern',
    'Print or Pattern Type': 'Print_Pattern_Type',
    'Knit or Woven': 'Knit_or_Woven',
    'Weave Type': 'Weave_Type',
    'Weave Pattern': 'Weave_Pattern',
}
fashion_dataset = fashion_dataset.rename(columns=column_renames)

In [None]:
# Missing-value counts for the pattern/weave columns, computed in one call
# instead of four separate single-column cells.
fashion_dataset[['Kurta_Pattern', 'Weave_Type', 'Knit_or_Woven', 'Weave_Pattern']].isna().sum()

>Viewing all columns of the dataset

In [None]:
# Show every column name on one line (the default DataFrame repr truncates them).
print(fashion_dataset.columns.tolist())

### Viewing Weave Type & Knit or Woven

>Weave Type can also be filled in from Fabric: check which fabrics are woven and which are knitted.

In [None]:
# Distinct Fabric values, in order of first appearance.
fashion_dataset['Fabric'].unique()

In [None]:
# Distinct Weave_Type values, in order of first appearance.
# (The identical cell that followed this one was a redundant repeat and has been folded in.)
pd.unique(fashion_dataset['Weave_Type'])

In [None]:
# Distinct Knit_or_Woven values, in order of first appearance.
fashion_dataset['Knit_or_Woven'].unique()

In [None]:
# Distinct Weave_Pattern values, in order of first appearance.
fashion_dataset['Weave_Pattern'].unique()

### Formatting Weave Pattern and Weave Type to fill Knit or Woven

In [None]:
# Collapse the listed weave patterns into the single category 'Woven'.
woven_patterns = dict.fromkeys(['Regular', 'Jacquard', 'Brocade', 'Dobby', 'Khadi'], 'Woven')
fashion_dataset['Weave_Pattern'] = fashion_dataset['Weave_Pattern'].replace(woven_patterns)

In [None]:
# Check the Weave_Pattern categories that remain after the replacement.
fashion_dataset['Weave_Pattern'].unique()

In [None]:
# Map each listed weave type onto 'Woven' or 'Knitted'.
weave_type_categories = {
    'Machine Weave': 'Woven',
    'Velvet': 'Woven',
    'Denim': 'Woven',
    'Handloom': 'Woven',
    'Lace': 'Knitted',
    'Chambray': 'Woven',
    'Corduroy': 'Woven',
}
fashion_dataset['Weave_Type'] = fashion_dataset['Weave_Type'].replace(weave_type_categories)

In [None]:
# Check the Weave_Type categories that remain after the replacement.
fashion_dataset['Weave_Type'].unique()

> Use 'Occasion', 'Print or Pattern Type', 'Pattern', 'Top Pattern', 'Bottom Pattern', 'Dupatta Pattern', 'Kurta Pattern', 'Weave Pattern', 'Knit or Woven', 'Sustainable', 'Sport', 'Fusion_Wear'

### Merging the datasets

In [None]:
# Left-join every product row to brand_details on brand = brand_name, keeping
# the listed product columns plus all brand_details columns. Products whose
# brand has no match keep their row, with the brand_details columns null.
# NOTE(review): description and p_attributes are not selected here, yet the
# machine-learning cells later select them from the frame derived from this
# one - confirm whether they should be added to this query.
merged_dataset = ps.sqldf("select fashion_dataset.p_id, fashion_dataset.name, fashion_dataset.price, fashion_dataset.colour, brand_details.*, fashion_dataset.brand, fashion_dataset.ratingCount, fashion_dataset.avg_rating, fashion_dataset.Occasion, fashion_dataset.Print_Pattern_Type, fashion_dataset.Pattern, fashion_dataset.Top_Pattern, fashion_dataset.Bottom_Pattern, fashion_dataset.Dupatta_Pattern, fashion_dataset.Kurta_Pattern, fashion_dataset.Sustainable, fashion_dataset.Weave_Type, fashion_dataset.Knit_or_Woven,fashion_dataset.Weave_Pattern, fashion_dataset.Fabric from fashion_dataset left join brand_details on (fashion_dataset.brand = brand_details.brand_name)")
merged_dataset

### Dropping unnecessary null values

In [None]:
# Per-column count of missing values in the merged table.
merged_dataset.isnull().sum()

In [None]:
# Inspect the merged rows that have no product ID before deciding to drop them.
sqldf("select * from merged_dataset where p_id is null")

In [None]:
# Drop rows without a product ID, then re-check the remaining null counts.
# subset is passed as a list: a bare string label is only accepted by newer
# pandas releases, whereas a list works on every version.
merged_dataset.dropna(subset=["p_id"], inplace=True)
merged_dataset.isna().sum()

In [None]:
# Inspect the merged rows that have no product name before deciding to drop them.
sqldf("select * from merged_dataset where name is null")

In [None]:
# Drop rows without a product name, then re-check the remaining null counts.
# subset is passed as a list: a bare string label is only accepted by newer
# pandas releases, whereas a list works on every version.
merged_dataset.dropna(subset=["name"], inplace=True)
merged_dataset.isna().sum()

>Upon inspection, the rows where brand was null were duplicate rows, and were therefore dropped.

In [None]:
# Drop rows without a brand, then re-check the remaining null counts.
# subset is passed as a list: a bare string label is only accepted by newer
# pandas releases, whereas a list works on every version.
merged_dataset.dropna(subset=["brand"], inplace=True)
merged_dataset.isna().sum()

>Finding why there are null brand name and brand ID values

In [None]:
# List the brands that received no brand_id from the join, i.e. product brands
# with no matching brand_name in brand_details.
sqldf("select distinct brand from merged_dataset where brand_id is null")

In [None]:
# Every distinct (brand, brand_id) pairing, ordered by brand_id.
sqldf("select distinct brand, brand_id from merged_dataset order by brand_id asc")

In [None]:
# The distinct brand_id values present after the merge, in ascending order.
sqldf("select distinct brand_id from merged_dataset order by brand_id asc")

In [None]:
# Spot check: look up the brand_id that brand_details stores for 'KASSUALLY'.
sqldf("select brand_id from brand_details where brand_name = 'KASSUALLY'")

In [None]:
# Distinct merged rows whose brand is null.
# NOTE(review): rows with a null brand were already dropped from merged_dataset
# in an earlier cell, so this is expected to return no rows when run in order.
sqldf("select distinct * from merged_dataset where brand is null")

In [None]:
# Merged rows with no brand_name, i.e. products whose brand found no match in brand_details.
sqldf("select * from merged_dataset where brand_name is null")

In [None]:
# How many distinct non-null brand_id values the merged table contains
# (count(distinct ...) ignores nulls).
sqldf("select count(distinct brand_id) from merged_dataset")

In [None]:
# Largest brand_id in brand_details, with a brand_name shown beside it.
# NOTE(review): brand_name is a bare (non-aggregated) column here; this relies on
# the SQL engine returning it from the row holding the maximum - confirm.
sqldf("select max(brand_id), brand_name from brand_details")

### Filling brand ID

>Getting rid of null values in brand and brand ID by assigning new IDs

In [None]:
# Build a lookup of every distinct (brand, brand_id) pair, sorted by brand name;
# it is the base table for assigning fresh sequential brand IDs.
new_id = sqldf("select distinct brand, brand_id from merged_dataset order by brand asc")
new_id

In [None]:
# Give every row of the brand lookup a fresh sequential ID starting at 1.
# The upper bound is derived from the table length rather than hard-coded, so
# the cell keeps working when the number of distinct brands changes.
new_id['brandID'] = range(1, len(new_id) + 1)
# drop() returns a new frame, so the result must be assigned for the old
# brand_id column to actually be removed from the lookup.
new_id = new_id.drop('brand_id', axis='columns')
new_id

In [None]:
# Attach the newly assigned brandID to every merged row (matched on brand), then
# discard the original brand_id / brand_name columns it replaces.
dataset_final = sqldf("select merged_dataset.*, new_id.brandID from merged_dataset left join new_id on (merged_dataset.brand = new_id.brand)")
dataset_final = dataset_final.drop(columns=['brand_id','brand_name'])
dataset_final.isna().sum()

### Filling Print_Pattern_Type null values

In [None]:
# Distinct Print_Pattern_Type values, in order of first appearance.
dataset_final['Print_Pattern_Type'].unique()

In [None]:
# Rows where Print_Pattern_Type is missing but at least one of the other
# pattern columns has a value to fill it from.
# The OR conditions are parenthesised: AND binds tighter than OR in SQL, so
# without them the "is null" filter applied only to the Top_Pattern condition.
sqldf("select Print_Pattern_Type, Pattern, Top_Pattern, Bottom_Pattern from dataset_final where Print_Pattern_Type is null and (Top_Pattern is not null or Bottom_Pattern is not null or Pattern is not null)")

In [None]:
# Fill missing Print_Pattern_Type values from the other pattern columns.
# The order matters: an earlier column in the list wins when several have a value.
for fallback_column in ['Pattern', 'Bottom_Pattern', 'Top_Pattern', 'Kurta_Pattern', 'Dupatta_Pattern']:
    dataset_final['Print_Pattern_Type'] = dataset_final['Print_Pattern_Type'].combine_first(dataset_final[fallback_column])

In [None]:
# The individual pattern columns have been folded into Print_Pattern_Type and
# are no longer needed.
dataset_final = dataset_final.drop(columns=['Pattern','Top_Pattern','Bottom_Pattern', 'Kurta_Pattern','Dupatta_Pattern'])

In [None]:
# Per-column count of missing values after the pattern columns were merged.
dataset_final.isnull().sum()

### Filling Knit_or_Woven

In [None]:
#dataset_final['Weave_Type'] = dataset_final['Weave_Type'].combine_first(dataset_final['Weave_Pattern'])

In [None]:
#dataset_final['Weave_Type'] = dataset_final['Weave_Type'].combine_first(dataset_final['Knit_or_Woven'])

In [None]:
#dataset_final.isna().sum()

In [None]:
# Distinct Weave_Type values present in the final dataset.
dataset_final['Weave_Type'].unique()

In [None]:
# Distinct fabrics that occur on rows whose Weave_Type is 'Knitted'.
fabrics_knit = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Knitted'")
fabrics_knit
#fabrics_knit.dropna(inplace =True)

In [None]:
#fabrics_knit.loc[:, 'weave'] = 'Knitted'

In [None]:
# Distinct fabrics that occur on rows whose Weave_Type is 'Woven'.
fabrics_weave = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Woven'")
fabrics_weave

In [None]:
#fabrics_weave.dropna(inplace = True)

In [None]:
#fabrics_weave.loc[:,'weave'] = 'Woven'

In [None]:
# Distinct fabrics that occur on rows whose Weave_Type is 'Knitted and Woven'.
fabrics_both = sqldf("select distinct Fabric from dataset_final where Weave_Type = 'Knitted and Woven'")
fabrics_both
#fabrics_both.dropna(inplace=True)

In [None]:
#fabrics_weave.loc[:,'weave'] = 'Knitted and Woven'

In [None]:
#dataset_final = sqldf("select dataset_final.*, fabrics_knit.weave from dataset_final left join fabrics_knit on (fabrics_knit.Fabric=dataset_final.Fabric)")
#dataset_final.isna().sum()

In [None]:
#dataset_final.drop(['Weave_Pattern','Knit_or_Woven'], axis = 'columns', inplace=True)

>Filling in remaining null values

In [None]:
# Inspect the rows that have no colour before filling the gaps.
sqldf("select * from dataset_final where colour is null")

In [None]:
# Mean of avg_rating, rounded to six decimals; used below to fill missing ratings.
avg_rating_mean = dataset_final["avg_rating"].mean()
avg_rating_mean = round(avg_rating_mean, 6)

In [None]:
# Median number of ratings per product (for comparison with the mean).
dataset_final.loc[:, "ratingCount"].median()

In [None]:
# Most frequent ratingCount value(s) (for comparison with the mean).
dataset_final.loc[:, "ratingCount"].mode()

In [None]:
# Mean number of ratings per product; used below to fill missing ratingCount values.
ratingCount_mean = dataset_final.loc[:, "ratingCount"].mean()

In [None]:
# Fill the remaining gaps: numeric columns with their mean, categorical columns
# with their most frequent value.
# The categorical fills are computed from each column's mode; the previous
# version inserted the literal text "dataset_final.mode()" into the data.
fill_values = {"avg_rating": avg_rating_mean, "ratingCount": ratingCount_mean}
for categorical_column in ("colour", "Print_Pattern_Type", "Occasion"):
    column_modes = dataset_final[categorical_column].mode()
    # mode() is empty when the whole column is null; leave such a column untouched.
    if not column_modes.empty:
        fill_values[categorical_column] = column_modes.iloc[0]
dataset_final.fillna(fill_values, inplace=True)
dataset_final.isna().sum()

In [None]:
# Summarise column dtypes and non-null counts after the fill.
dataset_final.info()

>Exporting data for analysis using Apache Hadoop MapReduce

In [None]:
# Write the cleaned dataset, with a header row and without the index, to the working directory.
dataset_final.to_csv(path_or_buf="final_dataset.csv", index=False)

In [None]:
# Export a reduced, header-less copy of the cleaned data.
# The brand identifier column is brandID: the original brand_id column was
# dropped when dataset_final was built, so selecting it raised a KeyError.
new_dataset = dataset_final.loc[:,['p_id', 'name','price','colour','brand','ratingCount','avg_rating','brandID']]
new_dataset.to_csv("new_dataset.csv", index = False, header = False)

In [None]:
# Re-check column dtypes and non-null counts before modelling.
dataset_final.info()

# Machine Learning

## Regression Modelling

## Pre-processing

In [None]:
regression_data = dataset_final.loc[:,['price','avg_rating','ratingCount', 'brand_id','name','colour','brand','description','p_attributes']]

In [None]:
le = preprocessing.LabelEncoder()

for i in regression_data.columns:
        if regression_data[i].dtype == object:
            regression_data[i] = le.fit_transform(regression_data[i])
        else:
            pass

In [None]:
regression_data

In [None]:
x_reg = regression_data.loc[:,['price','avg_rating','ratingCount','name','colour','description','p_attributes']]
y_reg = regression_data.loc[:,['brand_id']]

>Creating the training and testing datasets

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported errors, reproducible between runs.
x_reg_train, x_reg_test, y_reg_train, y_reg_test = train_test_split(
    x_reg, y_reg, test_size=0.25, random_state=42
)

print("Your independent training dataset contains ", x_reg_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_reg_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_reg_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_reg_test.shape, " rows and columns.")

### Linear Regression

>Training the model

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
LR = LinearRegression()

In [None]:
# Fit the linear model on the training split.
LR.fit(x_reg_train, y_reg_train)

>Testing the model

In [None]:
# Predict the target for the held-out test rows.
LR_predict = LR.predict(x_reg_test)
LR_predict

In [None]:
# Report the mean absolute error of the linear model on the test split.
lr_mae = metrics.mean_absolute_error(y_reg_test, LR_predict)
print("Linear Regressor")
print("Mean Absolute Error: ", lr_mae)

### Bayesian Ridge Regression

In [None]:
# Bayesian ridge regression with scikit-learn's default settings.
BayRidge = BayesianRidge()
# The target is flattened to 1-D: BayesianRidge expects y of shape (n_samples,),
# and passing the single-column frame directly triggers a DataConversionWarning.
BayRidge.fit(x_reg_train, np.ravel(y_reg_train))

In [None]:
# Predict the target for the held-out test rows.
BayRidge_predict = BayRidge.predict(x_reg_test)
BayRidge_predict

In [None]:
# Report the mean absolute error of the Bayesian ridge model on the test split.
bayridge_mae = metrics.mean_absolute_error(y_reg_test, BayRidge_predict)
print("Bayesian Ridge Regressor")
print("Mean Absolute Error: ", bayridge_mae)

In [None]:
# Re-check column dtypes and non-null counts before building the classification data.
dataset_final.info()

## Classification Modelling

>Pre-processing

In [None]:
# Select the columns for the classifiers. The brand identifier is brandID: the
# original brand_id column was dropped when dataset_final was built, so
# selecting it raised a KeyError.
# NOTE(review): description and p_attributes are not among the columns the
# visible merge query selects - confirm they exist in dataset_final.
classification_data = dataset_final.loc[:,['name','colour','brand', 'brandID','description','p_attributes']]
classification_data

In [None]:
le = preprocessing.LabelEncoder()

# Label-encode every text (object-dtype) column; numeric columns stay as they are.
# The encoder is re-fitted per column, so codes are independent between columns.
text_columns = [
    column
    for column in classification_data.columns
    if classification_data[column].dtype == object
]
for column in text_columns:
    classification_data[column] = le.fit_transform(classification_data[column])

In [None]:
classification_data

>Creating the training and testing datasets

In [None]:
# Features are the encoded product fields; the target is the encoded brand.
feature_columns = ['name','colour','description','p_attributes']
x_class = classification_data[feature_columns]
y_class = classification_data[['brand']]

In [None]:
x_class

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported accuracies, reproducible between runs.
x_class_train, x_class_test, y_class_train, y_class_test = train_test_split(
    x_class, y_class, test_size=0.25, random_state=42
)

print("Your independent training dataset contains ", x_class_train.shape, " rows and columns.")
print("Your independent testing dataset contains ", x_class_test.shape, " rows and columns.")
print("Your dependent training dataset contains ", y_class_train.shape, " rows and columns.")
print("Your dependent testing dataset contains ", y_class_test.shape, " rows and columns.")

### Gaussian Naive Bayes Classifier

In [None]:
# Gaussian naive Bayes classifier with scikit-learn's default settings.
GNB = GaussianNB()

In [None]:
# Fit on the training split. The target is flattened to 1-D: GaussianNB expects
# y of shape (n_samples,), and passing the single-column frame directly
# triggers a DataConversionWarning.
GNB.fit(x_class_train, np.ravel(y_class_train))

In [None]:
# Predict the brand label for the held-out test rows.
GNB_predict = GNB.predict(x_class_test)

In [None]:
# Report test-set accuracy as a percentage rounded to two decimals.
gnb_accuracy = metrics.accuracy_score(y_class_test, GNB_predict)
print("Gaussian Naive Bayes Classifier")
print("Accuracy:", round(gnb_accuracy * 100, 2), "%")

### Decision Tree Classifier

In [None]:
# Decision tree with default settings, fitted on the training split
# (fit returns the estimator itself, so the calls can be chained).
dtree = DecisionTreeClassifier().fit(x_class_train, y_class_train)

In [None]:
# Predict the brand label for the held-out test rows.
dtree_predict = dtree.predict(x_class_test)

In [None]:
# Report test-set accuracy as a percentage rounded to two decimals.
# The heading now names the model correctly (it previously read "Decision Classifier").
print("Decision Tree Classifier")
print("Accuracy:", round((metrics.accuracy_score(y_class_test, dtree_predict))*100,2),"%")