# Importing Libraries and Data

>Importing Spark libraries

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, explode
# Start (or reuse) a Spark session that runs locally, using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table instead of its plain repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

>Importing the dataset, inferring schema, and checking data types

In [2]:
# Load the product CSV: the first row is the header, fields are comma-separated,
# a double quote escapes quotes inside quoted values, quoted values may span
# several lines, and column types are inferred from the data.
final_dataset = (
    spark.read
    .options(header=True, sep=",", inferSchema=True, escape='"', multiline='true')
    .csv("./final dataset.csv")
)
# Print the inferred column names and data types.
final_dataset.printSchema()

root
 |-- p_id: double (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- colour: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- ratingCount: double (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- description: string (nullable = true)
 |-- p_attributes: string (nullable = true)
 |-- brand_id: integer (nullable = true)



>Viewing the top 5 rows of the first two columns (`p_id` and `name`)

In [3]:
# Preview the product id and name for the first five rows.
final_dataset.select('p_id', 'name').show(5)

+-----------+--------------------+
|       p_id|                name|
+-----------+--------------------+
|  1518329.0|Dupatta Bazaar Wh...|
|  5829334.0|Roadster Women Mu...|
|1.0340119E7|Inddus Peach-Colo...|
| 1.085638E7|SASSAFRAS Women B...|
|1.2384822E7|Kotty Women Black...|
+-----------+--------------------+
only showing top 5 rows



>Grouping by brand & brand ID

In [4]:
# Count the number of rows for each (brand_id, brand) pair.
final_dataset.groupBy(['brand_id', 'brand']).count()

brand_id,brand,count
178,Cottinfab,31
399,Jockey,12
987,iki chic,31
760,Saffron Threads,17
321,Granthva Fab,10
470,LYKKEIN,5
83,Arrabi,2
227,EXTRA LOVE BY LIBAS,7
410,KANNAHI,1
530,Marigold by Fable...,4


In [5]:
# Display the DataFrame; the eager-evaluation setting enabled in cell In [1]
# makes this bare expression render as a table.
final_dataset

p_id,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,brand_id
1518329.0,Dupatta Bazaar Wh...,899.0,White,Dupatta Bazaar,1321.0,4.548826646,White embroidered...,{'Occasion': 'Dai...,221
5829334.0,Roadster Women Mu...,1199.0,Mustard,Roadster,5462.0,4.313255218,Mustard yellow so...,{'Body Shape ID':...,702
10340119.0,Inddus Peach-Colo...,5799.0,Peach,Inddus,145.0,4.068965517,Peach-Coloured an...,{'Bottom Fabric':...,363
10856380.0,SASSAFRAS Women B...,1499.0,Black,SASSAFRAS,9124.0,4.147523016,Black solid woven...,"{'Add-Ons': 'NA',...",720
12384822.0,Kotty Women Black...,1999.0,Black,Kotty,12260.0,4.078466558,Black dark wash 4...,"{'Add-Ons': 'NA',...",446
12742100.0,KASSUALLY Women B...,2199.0,Black,KASSUALLY,6297.0,4.349213911,Black printed bas...,{'Body or Garment...,412
13842966.0,Sassafras Brown &...,1499.0,Brown,SASSAFRAS,7358.0,4.395351998,<ul><li>Brown and...,{'Body Shape ID':...,720
14021452.0,Sera Women Multic...,1494.0,Multi,Sera,750.0,4.288,Brown and blue pr...,{'Body or Garment...,771
14063026.0,Tokyo Talkies Wom...,699.0,Black,Tokyo Talkies,1856.0,4.530711207,Black solid mid-r...,{'Body or Garment...,845
14324806.0,Anouk Stylish Bla...,4699.0,Black,Anouk,84.0,3.80952381,Stay fashionable ...,{'Blouse Closure'...,76


In [6]:
# Remove the description and p_attributes columns in a single call;
# all other columns are kept unchanged.
final_dataset = final_dataset.drop('description', 'p_attributes')
# Display the trimmed DataFrame.
final_dataset

p_id,name,price,colour,brand,ratingCount,avg_rating,brand_id
1518329.0,Dupatta Bazaar Wh...,899.0,White,Dupatta Bazaar,1321.0,4.548826646,221
5829334.0,Roadster Women Mu...,1199.0,Mustard,Roadster,5462.0,4.313255218,702
10340119.0,Inddus Peach-Colo...,5799.0,Peach,Inddus,145.0,4.068965517,363
10856380.0,SASSAFRAS Women B...,1499.0,Black,SASSAFRAS,9124.0,4.147523016,720
12384822.0,Kotty Women Black...,1999.0,Black,Kotty,12260.0,4.078466558,446
12742100.0,KASSUALLY Women B...,2199.0,Black,KASSUALLY,6297.0,4.349213911,412
13842966.0,Sassafras Brown &...,1499.0,Brown,SASSAFRAS,7358.0,4.395351998,720
14021452.0,Sera Women Multic...,1494.0,Multi,Sera,750.0,4.288,771
14063026.0,Tokyo Talkies Wom...,699.0,Black,Tokyo Talkies,1856.0,4.530711207,845
14324806.0,Anouk Stylish Bla...,4699.0,Black,Anouk,84.0,3.80952381,76


>Grouping by colour