In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.functions import col, explode, array_repeat
import json
spark = SparkSession.builder.getOrCreate()

In [20]:
data = {
    "order_id": 1001,
    "customers": {"id": 50, "name": "Riya", "location": "Bangalore"},
    "items": [
        {"sku": "A1", "qty": 2, "price": 100},
        {"sku": "B1", "qty": 1, "price": 200}
    ]
}

json_rdd = spark.sparkContext.parallelize([json.dumps(data)])
#df = spark.createDataFrame([data])
df = spark.read.json(json_rdd)
df.show(truncate=False)
df.printSchema()
df.explain()


+---------------------+----------------------------+--------+
|customers            |items                       |order_id|
+---------------------+----------------------------+--------+
|{50, Bangalore, Riya}|[{100, 2, A1}, {200, 1, B1}]|1001    |
+---------------------+----------------------------+--------+

root
 |-- customers: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- qty: long (nullable = true)
 |    |    |-- sku: string (nullable = true)
 |-- order_id: long (nullable = true)

== Physical Plan ==
*(1) Scan ExistingRDD[customers#188,items#189,order_id#190L]




In [21]:
df = df.select(
    col('order_id'),
    col('customers.id').alias('customer_id'),
    col('customers.name').alias('customer_name'),
    explode(col('items')).alias('item')
)
df.show(truncate=False)
df.printSchema()

+--------+-----------+-------------+------------+
|order_id|customer_id|customer_name|item        |
+--------+-----------+-------------+------------+
|1001    |50         |Riya         |{100, 2, A1}|
|1001    |50         |Riya         |{200, 1, B1}|
+--------+-----------+-------------+------------+

root
 |-- order_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- qty: long (nullable = true)
 |    |-- sku: string (nullable = true)



In [22]:
df= df.select(
    col('order_id'),
    col('customer_id'),
    col('customer_name'),
    col('item.sku').alias('item_sku'),
    col('item.qty').alias('item_qty'),
    col('item.price').alias('item_price')
)
df.show(truncate=False)
df.printSchema()

+--------+-----------+-------------+--------+--------+----------+
|order_id|customer_id|customer_name|item_sku|item_qty|item_price|
+--------+-----------+-------------+--------+--------+----------+
|1001    |50         |Riya         |A1      |2       |100       |
|1001    |50         |Riya         |B1      |1       |200       |
+--------+-----------+-------------+--------+--------+----------+

root
 |-- order_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- item_sku: string (nullable = true)
 |-- item_qty: long (nullable = true)
 |-- item_price: long (nullable = true)



In [23]:
df.createOrReplaceTempView("Orders")



In [24]:
data = [
    ("Monitor", 2, 20000),
    ("CPU", 3, 90000),
    ("Earphone", 4, 6000)
]
df = spark.createDataFrame(data, ["ItemName", "Quantity", "TotalPrice"])

df.show()

+--------+--------+----------+
|ItemName|Quantity|TotalPrice|
+--------+--------+----------+
| Monitor|       2|     20000|
|     CPU|       3|     90000|
|Earphone|       4|      6000|
+--------+--------+----------+



In [32]:
# from pyspark.sql.functions import col, explode, array_repeat
df_result = (df
             .withColumn("UnitPrice", col('TotalPrice')/col('Quantity'))
             .withColumn("dummy_column", explode(array_repeat(col("UnitPrice"), col("Quantity").cast("int"))))
             .select("ItemName", col('dummy_column').alias("UnitPrice"))
             .withColumn('Quantity', lit(1))
             .select("ItemName", "Quantity", "UnitPrice")
             
)
df_result.show()

+--------+--------+---------+
|ItemName|Quantity|UnitPrice|
+--------+--------+---------+
| Monitor|       1|  10000.0|
| Monitor|       1|  10000.0|
|     CPU|       1|  30000.0|
|     CPU|       1|  30000.0|
|     CPU|       1|  30000.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
+--------+--------+---------+



In [33]:
product_df = df_result.groupBy('ItemName').agg(
	sum(col('Quantity')).alias('Quantity'),
	sum(col('UnitPrice')).alias('Unit Price')
)

product_df.show()

+--------+--------+----------+
|ItemName|Quantity|Unit Price|
+--------+--------+----------+
| Monitor|       2|   20000.0|
|     CPU|       3|   90000.0|
|Earphone|       4|    6000.0|
+--------+--------+----------+

