In [1]:
import pandas as pd
import numpy as np
import pandasql as ps
from pandasql import sqldf
import ast
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# importing datasets

In [2]:
# Read cleaned_2.csv into merged_dataset and display the resulting frame.
merged_dataset = pd.read_csv(filepath_or_buffer=r"./cleaned_2.csv")
merged_dataset

Unnamed: 0,p_id,name,price,colour,brand,ratingCount,avg_rating,Occasion,Print_Pattern_Type,Sustainable,Knit_or_Woven,Fabric,Fabric_Purity,brandID
0,1518329.0,Dupatta Bazaar White Embroidered Chiffon Dupatta,899.0,White,Dupatta Bazaar,1321.0,4.548827,Daily,Floral,,Knitted and Woven,Cotton,,221
1,5829334.0,Roadster Women Mustard Yellow Solid Hooded Swe...,1199.0,Mustard,Roadster,5462.0,4.313255,Casual,Solid,,Knitted and Woven,Cotton,,702
2,10340119.0,Inddus Peach-Coloured & Beige Unstitched Dress...,5799.0,Peach,Inddus,145.0,4.068966,Daily,Ethnic Motifs,,Knitted and Woven,Cotton Blend,,363
3,10856380.0,SASSAFRAS Women Black Parallel Trousers,1499.0,Black,SASSAFRAS,9124.0,4.147523,Casual,Solid,Regular,Woven,Cotton,,720
4,12384822.0,Kotty Women Black Wide Leg High-Rise Clean Loo...,1999.0,Black,Kotty,12260.0,4.078467,Casual,Solid,,Knitted and Woven,Cotton,,446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,17029604.0,The Chennai Silks Pink & Silver-Toned Floral Z...,3999.0,Pink,The Chennai Silks,,,Festive,Floral,Regular,Woven,Jute Cotton,,830
14264,17600212.0,Kinder Kids Girls Blue & Green Printed Foil Pr...,2050.0,Blue,Kinder Kids,,,Casual,Printed,Regular,Knitted and Woven,Cotton,,439
14265,18159266.0,KLOTTHE Women Green & Black Floral Printed Pal...,1659.0,Green,KLOTTHE,,,Western,Floral,Regular,Woven,Polycotton,,423
14266,18921114.0,InWeave Women Red Printed A-Line Skirt,2399.0,Red,InWeave,,,Casual,Floral,Regular,Woven,Viscose Rayon,,362


In [3]:
# Count missing values per column to see which fields need filling.
merged_dataset.isnull().sum()

p_id                      0
name                      0
price                     0
colour                    3
brand                     0
ratingCount            7722
avg_rating             7722
Occasion                  0
Print_Pattern_Type        8
Sustainable            1300
Knit_or_Woven             0
Fabric                    0
Fabric_Purity         13724
brandID                   0
dtype: int64

### Filling Fabric_Purity and Sustainable columns

In [4]:
# Distinct Fabric_Purity labels (NaN included) that occur in the data.
merged_dataset['Fabric_Purity'].unique()

array([nan, 'Blended', 'Pure', 'Synthetic'], dtype=object)

In [5]:
# Distinct Sustainable labels (NaN included) that occur in the data.
merged_dataset['Sustainable'].unique()

array([nan, 'Regular', 'Sustainable'], dtype=object)

In [6]:
# List the rows where both labels are present, to compare Fabric_Purity
# against Sustainable side by side.
both_labels_known_query = 'select Fabric_Purity, Sustainable from merged_dataset where Sustainable is not null and Fabric_Purity is not null'
sqldf(both_labels_known_query)

Unnamed: 0,Fabric_Purity,Sustainable
0,Pure,Regular
1,Pure,Regular
2,Pure,Regular
3,Pure,Regular
4,Pure,Regular
...,...,...
478,Blended,Regular
479,Pure,Regular
480,Blended,Regular
481,Pure,Regular


> Creating new column for fabric purity based on sustainable

In [7]:
# Temporary helper column: translate each Sustainable label into a purity
# label ('Regular' -> 'Synthetic', 'Sustainable' -> 'Pure'). Missing values
# are not touched by replace and stay NaN.
merged_dataset['pur'] = merged_dataset['Sustainable'].replace(
    {'Regular': 'Synthetic', 'Sustainable': 'Pure'}
)

>Filling Fabric Purity with new values

In [8]:
# Number of missing Fabric_Purity values before they are filled from 'pur'.
merged_dataset['Fabric_Purity'].isnull().sum()

13724

In [9]:
# Existing Fabric_Purity labels win; only the gaps take the value from the
# 'pur' helper column on the same row.
merged_dataset['Fabric_Purity'] = merged_dataset['Fabric_Purity'].fillna(
    merged_dataset['pur']
)

In [10]:
# Number of Fabric_Purity values that are still missing after the fill.
merged_dataset['Fabric_Purity'].isnull().sum()

1239

> Creating new column for sustainable values based on fabric purity values

In [11]:
# Temporary helper column: translate each Fabric_Purity label into a
# Sustainable label. Only 'Pure' maps to 'Sustainable'; 'Blended' and
# 'Synthetic' both map to 'Regular'. Missing values stay NaN.
merged_dataset['sus'] = merged_dataset['Fabric_Purity'].replace(
    {'Blended': 'Regular', 'Pure': 'Sustainable', 'Synthetic': 'Regular'}
)

>Filling Sustainable with new values

In [12]:
# Number of missing Sustainable values before they are filled from 'sus'.
merged_dataset['Sustainable'].isnull().sum()

1300

In [13]:
# Existing Sustainable labels win; only the gaps take the value from the
# 'sus' helper column on the same row.
merged_dataset['Sustainable'] = merged_dataset['Sustainable'].fillna(
    merged_dataset['sus']
)

In [14]:
# Number of Sustainable values that are still missing after the fill.
merged_dataset['Sustainable'].isnull().sum()

1239

>Dropping newly created columns

In [15]:
# Remove the two temporary helper columns now that their values have been
# merged into Fabric_Purity and Sustainable. Mutates merged_dataset in place.
merged_dataset.drop(columns=['pur', 'sus'], inplace=True)

In [16]:
# Re-check missing values per column after the cross-fill of
# Fabric_Purity and Sustainable.
merged_dataset.isnull().sum()

p_id                     0
name                     0
price                    0
colour                   3
brand                    0
ratingCount           7722
avg_rating            7722
Occasion                 0
Print_Pattern_Type       8
Sustainable           1239
Knit_or_Woven            0
Fabric                   0
Fabric_Purity         1239
brandID                  0
dtype: int64

### Filling in remaining null values

In [28]:
# Print one summary statistic per column that still has gaps: the most
# frequent value for the label columns, the mean for the rating columns.
# The order below is the order in which results are printed.
fill_candidates = (
    ('colour', 'mode'),
    ('avg_rating', 'mean'),
    ('ratingCount', 'mean'),
    ('Sustainable', 'mode'),
    ('Fabric_Purity', 'mode'),
    ('Print_Pattern_Type', 'mode'),
)
for column_name, statistic_name in fill_candidates:
    column_values = merged_dataset[column_name]
    print(getattr(column_values, statistic_name)())

0    Black
Name: colour, dtype: object
4.1003784693013685
183.41631623212783
0    Regular
Name: Sustainable, dtype: object
0    Synthetic
Name: Fabric_Purity, dtype: object
0    Solid
Name: Print_Pattern_Type, dtype: object


In [18]:
# Fill every remaining gap with a value derived from the column itself:
# the most frequent label for the categorical columns and the mean for the
# two rating columns. The values are computed here instead of being pasted
# in as literals, so they always match the data that is actually loaded
# (the previously hard-coded means did not match the means printed by the
# notebook). mode() can return several values on a tie; the first is used.
fill_values = {
    'colour': merged_dataset['colour'].mode().iloc[0],
    'avg_rating': merged_dataset['avg_rating'].mean(),
    'ratingCount': merged_dataset['ratingCount'].mean(),
    'Sustainable': merged_dataset['Sustainable'].mode().iloc[0],
    'Fabric_Purity': merged_dataset['Fabric_Purity'].mode().iloc[0],
    'Print_Pattern_Type': merged_dataset['Print_Pattern_Type'].mode().iloc[0],
}
# Columns without gaps are left untouched, so re-running this cell is a no-op.
merged_dataset.fillna(fill_values, inplace=True)
# Verify that no column has missing values left.
merged_dataset.isnull().sum()

p_id                  0
name                  0
price                 0
colour                0
brand                 0
ratingCount           0
avg_rating            0
Occasion              0
Print_Pattern_Type    0
Sustainable           0
Knit_or_Woven         0
Fabric                0
Fabric_Purity         0
brandID               0
dtype: int64

### Converting ratingCount to int

In [19]:
# Cast ratingCount from float to integer. astype(int) discards the fractional
# part, so rows that were filled with a non-integer value earlier are
# truncated rather than rounded. The integer width is the platform default
# (the info() output further down reports int32).
merged_dataset['ratingCount']=merged_dataset['ratingCount'].astype(int)

In [20]:
# Show dtypes and non-null counts to confirm the fill and the integer cast.
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14268 entries, 0 to 14267
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p_id                14268 non-null  float64
 1   name                14268 non-null  object 
 2   price               14268 non-null  float64
 3   colour              14268 non-null  object 
 4   brand               14268 non-null  object 
 5   ratingCount         14268 non-null  int32  
 6   avg_rating          14268 non-null  float64
 7   Occasion            14268 non-null  object 
 8   Print_Pattern_Type  14268 non-null  object 
 9   Sustainable         14268 non-null  object 
 10  Knit_or_Woven       14268 non-null  object 
 11  Fabric              14268 non-null  object 
 12  Fabric_Purity       14268 non-null  object 
 13  brandID             14268 non-null  int64  
dtypes: float64(3), int32(1), int64(1), object(9)
memory usage: 1.5+ MB


### Rounding avg_rating to one decimal place

In [21]:
# Keep one decimal place in avg_rating (rounds to the nearest tenth).
merged_dataset['avg_rating'] = merged_dataset['avg_rating'].round(decimals=1)

In [22]:
# Display the avg_rating column to check the rounded values.
merged_dataset['avg_rating']

0        4.5
1        4.3
2        4.1
3        4.1
4        4.1
        ... 
14263    4.1
14264    4.1
14265    4.1
14266    4.1
14267    4.1
Name: avg_rating, Length: 14268, dtype: float64

In [23]:
# Save the cleaned frame with a header row and without the index column.
merged_dataset.to_csv(path_or_buf='./new_dataset.csv', index=False)

### Exporting dataset for use in Apache Hive and MapReduce

In [24]:
# Write the same frame with neither header row nor index column, i.e. data
# rows only.
merged_dataset.to_csv(path_or_buf='hive_data.csv', header=False, index=False)

In [25]:
# Keep only the id, price, rating and brand-id columns for the final export.
export_columns = ['p_id', 'price', 'ratingCount', 'avg_rating', 'brandID']
final_dataset = merged_dataset.loc[:, export_columns]

In [26]:
# Write the reduced frame with neither header row nor index column.
final_dataset.to_csv(path_or_buf='./final_dataset.csv', header=False, index=False)