In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.functions import col, explode, array_repeat
import json
spark = SparkSession.builder.getOrCreate()

In [4]:
data = {
    "order_id": 1001,
    "customers": {"id": 50, "name": "Riya", "location": "Bangalore"},
    "items": [
        {"sku": "A1", "qty": 2, "price": 100},
        {"sku": "B1", "qty": 1, "price": 200}
    ]
}

json_rdd = spark.sparkContext.parallelize([json.dumps(data)])
#df = spark.createDataFrame([data])
df = spark.read.json(json_rdd)
df.show(truncate=False)
df.printSchema()
df.explain()


+---------------------+----------------------------+--------+
|customers            |items                       |order_id|
+---------------------+----------------------------+--------+
|{50, Bangalore, Riya}|[{100, 2, A1}, {200, 1, B1}]|1001    |
+---------------------+----------------------------+--------+

root
 |-- customers: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- qty: long (nullable = true)
 |    |    |-- sku: string (nullable = true)
 |-- order_id: long (nullable = true)

== Physical Plan ==
*(1) Scan ExistingRDD[customers#10,items#11,order_id#12L]




In [5]:
df = df.select(
    col('order_id'),
    col('customers.id').alias('customer_id'),
    col('customers.name').alias('customer_name'),
    explode(col('items')).alias('item')
)
df.show(truncate=False)
df.printSchema()

+--------+-----------+-------------+------------+
|order_id|customer_id|customer_name|item        |
+--------+-----------+-------------+------------+
|1001    |50         |Riya         |{100, 2, A1}|
|1001    |50         |Riya         |{200, 1, B1}|
+--------+-----------+-------------+------------+

root
 |-- order_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- qty: long (nullable = true)
 |    |-- sku: string (nullable = true)



In [6]:
df= df.select(
    col('order_id'),
    col('customer_id'),
    col('customer_name'),
    col('item.sku').alias('item_sku'),
    col('item.qty').alias('item_qty'),
    col('item.price').alias('item_price')
)
df.show(truncate=False)
df.printSchema()

+--------+-----------+-------------+--------+--------+----------+
|order_id|customer_id|customer_name|item_sku|item_qty|item_price|
+--------+-----------+-------------+--------+--------+----------+
|1001    |50         |Riya         |A1      |2       |100       |
|1001    |50         |Riya         |B1      |1       |200       |
+--------+-----------+-------------+--------+--------+----------+

root
 |-- order_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- item_sku: string (nullable = true)
 |-- item_qty: long (nullable = true)
 |-- item_price: long (nullable = true)



In [7]:
df.createOrReplaceTempView("Orders")



In [8]:
data = [
    ("Monitor", 2, 20000),
    ("CPU", 3, 90000),
    ("Earphone", 4, 6000)
]
df = spark.createDataFrame(data, ["ItemName", "Quantity", "TotalPrice"])

df.show()

+--------+--------+----------+
|ItemName|Quantity|TotalPrice|
+--------+--------+----------+
| Monitor|       2|     20000|
|     CPU|       3|     90000|
|Earphone|       4|      6000|
+--------+--------+----------+



In [9]:
# from pyspark.sql.functions import col, explode, array_repeat
df_result = (df
             .withColumn("UnitPrice", col('TotalPrice')/col('Quantity'))
             .withColumn("dummy_column", 
                         explode(
                             array_repeat(
                                 col("UnitPrice"), 
                                 col("Quantity").cast("int")
                                )
                            )
                         )
             .select("ItemName", col('dummy_column').alias("UnitPrice"))
             .withColumn('Quantity', lit(1))
             .select("ItemName", "Quantity", "UnitPrice")            
)
df_result.show()

+--------+--------+---------+
|ItemName|Quantity|UnitPrice|
+--------+--------+---------+
| Monitor|       1|  10000.0|
| Monitor|       1|  10000.0|
|     CPU|       1|  30000.0|
|     CPU|       1|  30000.0|
|     CPU|       1|  30000.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
|Earphone|       1|   1500.0|
+--------+--------+---------+



In [10]:
product_df = df_result.groupBy('ItemName').agg(
	sum(col('Quantity')).alias('Quantity'),
	sum(col('UnitPrice')).alias('UnitPrice')
)

product_df.show()

+--------+--------+---------+
|ItemName|Quantity|UnitPrice|
+--------+--------+---------+
| Monitor|       2|  20000.0|
|     CPU|       3|  90000.0|
|Earphone|       4|   6000.0|
+--------+--------+---------+



### 1) Extract the date from filename

In [11]:
from pyspark.sql.functions import col, regexp_extract
data = [("passenger_20251125.csv",)] 
df = spark.createDataFrame(data, ["filename"])
df.show()

df_getFilename = df.select(
    col('filename'),
    regexp_extract(col('filename'), r'_(\d{8})\.csv$', 1).alias("date")    
)
df_getFilename.show()


+--------------------+
|            filename|
+--------------------+
|passenger_2025112...|
+--------------------+

+--------------------+--------+
|            filename|    date|
+--------------------+--------+
|passenger_2025112...|20251125|
+--------------------+--------+



### 2) Pivot sales to get Product columns per Region


|Region |Product Revenue
|East   |A       100
|East   |B       200
|West   |A       150
|West   |B       250
|to 
|Region |  A     B
|East   | 100   200   
|West   | 150   250

In [27]:
data = [('East','A',100),
        ('East','B',150),
        ('West','A',200),
        ('West','B',250),
        ('North','A',300),
        ('North','B',350)]

df = spark.createDataFrame(data, ["Region", "Category", "Sales"])
df.show()

+------+--------+-----+
|Region|Category|Sales|
+------+--------+-----+
|  East|       A|  100|
|  East|       B|  150|
|  West|       A|  200|
|  West|       B|  250|
| North|       A|  300|
| North|       B|  350|
+------+--------+-----+



In [28]:
new_df = df.groupBy('Region').pivot('Category').agg(sum('sales'))
new_df.show()

+------+---+---+
|Region|  A|  B|
+------+---+---+
|  East|100|150|
|  West|200|250|
| North|300|350|
+------+---+---+



### 3) Repeat each id N times (id repeated id times)

In [17]:
from pyspark.sql.functions import explode, array_repeat, col
data = [(1,), (2,), (3,), (4,), (5,)]
# output: 1,2,2,3,3,3,4,4,4,4,5,5,5,5,5
df = spark.createDataFrame(data, ['id'])
df_repeated = df.select(
    explode(
        array_repeat(
            col('id'), 
            col('id').cast('int')
        )
    ).alias("id_repeated")
    )
df_repeated.show()


+-----------+
|id_repeated|
+-----------+
|          1|
|          2|
|          2|
|          3|
|          3|
|          3|
|          4|
|          4|
|          4|
|          4|
|          5|
|          5|
|          5|
|          5|
|          5|
+-----------+



### 4) Generate unique team pair combinations (no duplicates, no self-pairs)

In [None]:
'''
team_id,  Name
('RCB', 'Royal challengers banglore'),
('SRH', 'Sunrisers hydrabad'),
('CSK', 'Chennai Super Kings'),
('KKR', 'Kolkata Knight Riders');
'''

data = [('RCB', 'Royal challengers banglore'),
        ('SRH', 'Sunrisers hydrabad'),      
        ('CSK', 'Chennai Super Kings'),
        ('KKR', 'Kolkata Knight Riders')] 
  
df = spark.createDataFrame(data, ['team_id', 'Name'])
df_alias_1 = df.alias('df_1').select(col('team_id').alias('team_a'))
df_alias_2 = df.alias('df_2').select(col('team_id').alias('team_b'))
df_alias_1.show()
df_alias_2.show()


+------+
|team_a|
+------+
|   RCB|
|   SRH|
|   CSK|
|   KKR|
+------+

+------+
|team_b|
+------+
|   RCB|
|   SRH|
|   CSK|
|   KKR|
+------+



In [26]:
cros_join_df = df_alias_1.join(
    df_alias_2,
    df_alias_1['team_a'] != df_alias_2['team_b'],
    'inner'
)
cros_join_df.show()

+------+------+
|team_a|team_b|
+------+------+
|   RCB|   SRH|
|   SRH|   RCB|
|   RCB|   CSK|
|   RCB|   KKR|
|   SRH|   CSK|
|   SRH|   KKR|
|   CSK|   RCB|
|   CSK|   SRH|
|   KKR|   RCB|
|   KKR|   SRH|
|   CSK|   KKR|
|   KKR|   CSK|
+------+------+

