In [1]:
!pip install pyspark findspark
import findspark
findspark.init()


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=563b2fc0dfa5df3562ed86852d78a3d741f727ebd7aeda3f6b9bf43e337aaaba
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: findspark, pyspark
Successfully installed findspark-2.0.1 pyspark-3.5.2


In [2]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("flipkart_project").getOrCreate()

In [3]:
from google.colab import files
uploaded = files.upload()


Saving flipkar_dataset.csv to flipkar_dataset.csv


In [4]:
file_name = list(uploaded.keys())[0]

# Read the CSV file using PySpark
flipkart_df = spark.read.csv(file_name, header=True, inferSchema=True)

# Display the data (use show() instead of display)
flipkart_df.show(2)

+-----+--------------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+
|   id|        Product_name|Rating|maincateg|platform|price1|Price|Discount|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+-----+--------------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+
|16695|Fashionable & Com...|   3.9|    Women|Flipkart|   698|  999|  30.13%|       38|         7|     17|      9|      6|      3|      3|         0|
| 5120|Combo Pack of 4 C...|   3.8|      Men|Flipkart|   999| 1999|  50.03%|      531|        69|    264|     92|     73|     29|     73|         1|
+-----+--------------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+
only showing top 2 rows



In [5]:
flipkart_df.printSchema()

flipkart_df.describe().show(2)

root
 |-- id: integer (nullable = true)
 |-- Product_name: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- maincateg: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- price1: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Discount: string (nullable = true)
 |-- norating1: integer (nullable = true)
 |-- noreviews1: integer (nullable = true)
 |-- star_5f: integer (nullable = true)
 |-- star_4f: integer (nullable = true)
 |-- star_3f: integer (nullable = true)
 |-- star_2f: integer (nullable = true)
 |-- star_1f: integer (nullable = true)
 |-- fulfilled1: integer (nullable = true)

+-------+------------------+------------+-----------------+---------+--------+-----------------+------------------+--------+------------------+------------------+------------------+-----------------+------------------+----------------+-----------------+------------------+
|summary|                id|Product_name|           Rating|maincateg|platform|

In [6]:
#handling the missing data
flipkart_df.select([count(when(col(c).isNull(), c)).alias(c) for c in flipkart_df.columns]).show()



+---+------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+
| id|Product_name|Rating|maincateg|platform|price1|Price|Discount|norating1|noreviews1|star_5f|star_4f|star_3f|star_2f|star_1f|fulfilled1|
+---+------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+
|  0|           0|     0|      526|       0|     0|    0|       0|      678|       578|    588|    539|    231|      0|      0|         0|
+---+------------+------+---------+--------+------+-----+--------+---------+----------+-------+-------+-------+-------+-------+----------+



In [7]:
#drop the rows that is missing
flipkart_df_clean=flipkart_df.dropna()

#filling specific values to the nan columns or missing columns
flipkart_df_filled=flipkart_df.fillna({"maincateg":"Men","norating1":0,"noreviews1":0,"star_5f":0,"star_4f":0,"star_3f":0})

In [8]:
#Data Transformation

# Calculate the effective price after discount
flipkart_df_transformed = flipkart_df.withColumn("EffectivePrice", expr("Price - (Price * Discount / 100)"))

# Show the updated DataFrame
flipkart_df_transformed.select("Product_name", "Price", "Discount", "EffectivePrice").show(5)

+--------------------+-----+--------+--------------+
|        Product_name|Price|Discount|EffectivePrice|
+--------------------+-----+--------+--------------+
|Fashionable & Com...|  999|  30.13%|          NULL|
|Combo Pack of 4 C...| 1999|  50.03%|          NULL|
|Cilia Mode Leo Sn...| 4999|  45.01%|          NULL|
|Men Black Sports ...|  724|  15.85%|          NULL|
|Men Green Sports ...| 2299|  40.02%|          NULL|
+--------------------+-----+--------+--------------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import regexp_replace, col, expr

# Remove the '%' symbol from the 'Discount' column and cast it to float
flipkart_df_cleaned = flipkart_df.withColumn("Discount", regexp_replace(col("Discount"), "%", "").cast("float"))

# Calculate the EffectivePrice by applying the discount
flipkart_df_transformed = flipkart_df_cleaned.withColumn("EffectivePrice", expr("Price - (Price * Discount / 100)"))

# Show the updated DataFrame
flipkart_df_transformed.select("Product_name", "Price", "Discount", "EffectivePrice").show(5)

+--------------------+-----+--------+-----------------+
|        Product_name|Price|Discount|   EffectivePrice|
+--------------------+-----+--------+-----------------+
|Fashionable & Com...|  999|   30.13|698.0013085937501|
|Combo Pack of 4 C...| 1999|   50.03|      998.9003125|
|Cilia Mode Leo Sn...| 4999|   45.01|    2748.95015625|
|Men Black Sports ...|  724|   15.85|  609.24599609375|
|Men Green Sports ...| 2299|   40.02|    1378.94015625|
+--------------------+-----+--------+-----------------+
only showing top 5 rows



In [10]:
#group by the category and calculte the average rating

avg_rating_by_category=flipkart_df_filled.groupBy("maincateg").agg(avg("Rating"))
avg_rating_by_category.show()

+---------+------------------+
|maincateg|       avg(Rating)|
+---------+------------------+
|      Men|3.9687724852496715|
|    Women| 4.047773602095407|
+---------+------------------+



In [11]:
#Total  Revenue by category

total_revenue_by_category=flipkart_df_filled.groupBy("maincateg").sum("Rating")
total_revenue_by_category.show()

+---------+------------------+
|maincateg|       sum(Rating)|
+---------+------------------+
|      Men|27578.999999999967|
|    Women| 35543.49999999977|
+---------+------------------+



In [12]:
# Write the DataFrame to a CSV file
flipkart_df_transformed.write.csv("/content/flipkart_results.csv", header=True, mode="overwrite")
