In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, sum as _sum, avg
spark = SparkSession.builder.appName("ProductDataAnalysis").getOrCreate()

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ProductDataAnalysis").getOrCreate()

data = [
    (101, "Laptop", "Electronics", 55000, 10),
    (102, "Smartphone", "Electronics", 30000, 25),
    (103, "Chair", "Furniture", 2500, 50),
    (104, "Book", "Stationery", 400, 200),
    (105, "Headphones", "Electronics", 1500, 100),
    (106, "Table", "Furniture", 3200, 40),
    (107, "Pen", "Stationery", 20, 500),
    (108, "Monitor", "Electronics", 12000, 15),
    (109, "Notebook", "Stationery", 60, 300),
    (110, "Sofa", "Furniture", 45000, 5)
]

columns = ["product_id", "product_name", "category", "price", "quantity"]

df = spark.createDataFrame(data, columns)
df.write.csv("dbfs:/tmp/products_csv", header=True, mode="overwrite")
df.write.json("dbfs:/tmp/products_json", mode="overwrite")


# Read CSV into DataFrame and print schema

In [0]:
df_csv = spark.read.csv("dbfs:/tmp/products_csv/", header=True, inferSchema=True)
df_csv.printSchema()


root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



#Read JSON and Compare Schema

In [0]:
df_json = spark.read.json("dbfs:/tmp/products_json/")
df_json.printSchema()

print("Schemas match:", df_csv.schema == df_json.schema)


root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)

Schemas match: False


# Convert CSV to Parquet

In [0]:
df_csv.write.parquet("dbfs:/tmp/products_parquet", mode="overwrite")


#Compare File Sizes (CSV vs JSON vs Parquet)


In [0]:
import os

def get_size(path):
    return sum(os.path.getsize(os.path.join(dirpath, f))
               for dirpath, _, files in os.walk(path)
               for f in files)

csv_size = get_size("/dbfs/tmp/products_csv")
json_size = get_size("/dbfs/tmp/products_json")
parquet_size = get_size("/dbfs/tmp/products_parquet")

print(f"CSV Size: {csv_size} bytes")
print(f"JSON Size: {json_size} bytes")
print(f"Parquet Size: {parquet_size} bytes")


CSV Size: 0 bytes
JSON Size: 0 bytes
Parquet Size: 0 bytes


#Add total_revenue Column


In [0]:
from pyspark.sql.functions import col

df_revenue = df_csv.withColumn("total_revenue", col("price") * col("quantity"))
df_revenue.show()


+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
+----------+------------+-----------+-----+--------+-------------+



#Top 3 Products by Total Revenue


In [0]:
df_revenue.orderBy(col("total_revenue").desc()).show(3)


+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+
only showing top 3 rows


#Filter Furniture Products with Price > 3000


In [0]:
df_csv.filter((col("category") == "Furniture") & (col("price") > 3000)).show()


+----------+------------+---------+-----+--------+
|product_id|product_name| category|price|quantity|
+----------+------------+---------+-----+--------+
|       110|        Sofa|Furniture|45000|       5|
|       106|       Table|Furniture| 3200|      40|
+----------+------------+---------+-----+--------+



#Create price_band Column


In [0]:
from pyspark.sql.functions import when

df_band = df_csv.withColumn(
    "price_band",
    when(col("price") > 10000, "High")
    .when((col("price") > 3000) & (col("price") <= 10000), "Medium")
    .otherwise("Low")
)
df_band.show()


+----------+------------+-----------+-----+--------+----------+
|product_id|product_name|   category|price|quantity|price_band|
+----------+------------+-----------+-----+--------+----------+
|       103|       Chair|  Furniture| 2500|      50|       Low|
|       104|        Book| Stationery|  400|     200|       Low|
|       105|  Headphones|Electronics| 1500|     100|       Low|
|       108|     Monitor|Electronics|12000|      15|      High|
|       109|    Notebook| Stationery|   60|     300|       Low|
|       110|        Sofa|  Furniture|45000|       5|      High|
|       101|      Laptop|Electronics|55000|      10|      High|
|       102|  Smartphone|Electronics|30000|      25|      High|
|       106|       Table|  Furniture| 3200|      40|    Medium|
|       107|         Pen| Stationery|   20|     500|       Low|
+----------+------------+-----------+-----+--------+----------+



#Group by Category, Total Quantity


In [0]:
from pyspark.sql.functions import sum as _sum

df_csv.groupBy("category").agg(_sum("quantity").alias("total_quantity")).show()


+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
| Stationery|          1000|
|Electronics|           150|
|  Furniture|            95|
+-----------+--------------+



#Average Price by Category


In [0]:
from pyspark.sql.functions import avg

df_csv.groupBy("category").agg(avg("price").alias("avg_price")).show()


+-----------+---------+
|   category|avg_price|
+-----------+---------+
| Stationery|    160.0|
|Electronics|  24625.0|
|  Furniture|  16900.0|
+-----------+---------+



# Count Products by Price Band


In [0]:
df_band.groupBy("price_band").count().show()


+----------+-----+
|price_band|count|
+----------+-----+
|       Low|    5|
|      High|    4|
|    Medium|    1|
+----------+-----+



#Save Filtered Electronics to Parquet


In [0]:
df_csv.filter((col("category") == "Electronics") & (col("price") > 5000)) \
    .write.parquet("dbfs:/tmp/electronics_filtered", mode="overwrite")


#Save Stationery to JSON

In [0]:
df_csv.filter(col("category") == "Stationery") \
    .write.json("dbfs:/tmp/stationery_products", mode="overwrite")


#Category with Highest Total Revenue


In [0]:
df_parquet = spark.read.parquet("dbfs:/tmp/products_parquet")

df_parquet.withColumn("total_revenue", col("price") * col("quantity")) \
    .groupBy("category") \
    .agg(_sum("total_revenue").alias("total_revenue")) \
    .orderBy(col("total_revenue").desc()) \
    .show(1)


+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|      1630000|
+-----------+-------------+
only showing top 1 row


#Spark SQL: quantity > 100 and price < 1000


In [0]:
df_csv.createOrReplaceTempView("products")

spark.sql("""
    SELECT * FROM products
    WHERE quantity > 100 AND price < 1000
""").show()


+----------+------------+----------+-----+--------+
|product_id|product_name|  category|price|quantity|
+----------+------------+----------+-----+--------+
|       104|        Book|Stationery|  400|     200|
|       109|    Notebook|Stationery|   60|     300|
|       107|         Pen|Stationery|   20|     500|
+----------+------------+----------+-----+--------+

