#### PySpark_RealTime_Transforms_Practical.py
#### Practical PySpark script demonstrating a wide variety of DataFrame transformations
#### Includes: local dataset + schema, batch transforms, window functions, UDFs, joins, pivots, union, cache, repartition
#### Also includes a small Structured Streaming example (rate source) that joins streaming events to static reference data.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, ArrayType, MapType
from pyspark.sql.functions import col, expr, when, lit, udf, explode, split, concat_ws, row_number, dense_rank, lag, lead
from pyspark.sql.functions import sum as _sum, avg as _avg, count as _count, max as _max, min as _min
from pyspark.sql.window import Window
import pyspark.sql.functions as F

#### -----------------------------------------------------------------------------
#### Setup SparkSession (local)
#### -----------------------------------------------------------------------------

In [2]:
# Session-level settings for this demo, applied in the order listed.
_session_options = {
    # The demo data is tiny, so keep the number of shuffle partitions small.
    "spark.sql.shuffle.partitions": "2",
    # Do not reuse Python worker processes between tasks.
    "spark.python.worker.reuse": "false",
    # Turn on the faulthandler option for Python UDF workers.
    "spark.sql.execution.pyspark.udf.faulthandler.enabled": "true",
    # Adaptive query execution is switched off for this session.
    "spark.sql.adaptive.enabled": "false",
}

# Local session on a single worker thread ("local[1]").
_builder = SparkSession.builder.appName("PySpark_Transforms_Practical").master("local[1]")
for _option, _setting in _session_options.items():
    _builder = _builder.config(_option, _setting)

spark = _builder.getOrCreate()

# Only show warnings and errors in the driver log.
spark.sparkContext.setLogLevel("WARN")

#### -----------------------------------------------------------------------------
#### Create a local dataset and schema (static / reference data)
#### -----------------------------------------------------------------------------

In [3]:
# Static reference data: one row per customer.
# Only customer_id is declared non-nullable; the other columns may be NULL
# (customer 4 has no tier).
customers_schema = (
    StructType()
    .add("customer_id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=True)
    .add("city", StringType(), nullable=True)
    .add("tier", StringType(), nullable=True)
)

# Rows follow the schema's column order: (customer_id, name, city, tier).
customers_data = [
    (1, "Asha", "Bengaluru", "Gold"),
    (2, "Ravi", "Mumbai", "Silver"),
    (3, "Meera", "Delhi", "Gold"),
    (4, "Arun", "Chennai", None),
    (5, "Sita", "Kolkata", "Bronze"),
]

# Transactions dataset (batch example).
# Field specs are (column name, Spark type, nullable); only tx_id is required.
_transaction_fields = (
    ("tx_id", IntegerType(), False),
    ("customer_id", IntegerType(), True),
    ("amount", DoubleType(), True),
    ("items", ArrayType(StringType()), True),
    ("tx_time", TimestampType(), True),
)
transactions_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in _transaction_fields
])

# Rows follow the schema's column order: (tx_id, customer_id, amount, items, tx_time).
# tx_time is None for every row, and transaction 105 has no amount and no items.
transactions_data = [
    (101, 1, 250.0, ["pen", "notebook"], None),
    (102, 2, 1200.0, ["headphones"], None),
    (103, 1, 75.5, ["pencil"], None),
    (104, 3, 560.0, ["bag", "wallet"], None),
    (105, 5, None, [], None),
    (106, 4, 45.0, ["snacks"], None),
    (107, 2, 3000.0, ["laptop", "mouse"], None),
    (108, 3, 130.0, ["book"], None),
]

# Build both DataFrames from the in-memory rows using the explicit schemas,
# then preview up to five rows of each (transactions first, then customers).
transactions_df = spark.createDataFrame(data=transactions_data, schema=transactions_schema)
transactions_df.show(n=5)

customers_df = spark.createDataFrame(data=customers_data, schema=customers_schema)
customers_df.show(n=5)

+-----+-----------+------+---------------+-------+
|tx_id|customer_id|amount|          items|tx_time|
+-----+-----------+------+---------------+-------+
|  101|          1| 250.0|[pen, notebook]|   NULL|
|  102|          2|1200.0|   [headphones]|   NULL|
|  103|          1|  75.5|       [pencil]|   NULL|
|  104|          3| 560.0|  [bag, wallet]|   NULL|
|  105|          5|  NULL|             []|   NULL|
+-----+-----------+------+---------------+-------+
only showing top 5 rows

+-----------+-----+---------+------+
|customer_id| name|     city|  tier|
+-----------+-----+---------+------+
|          1| Asha|Bengaluru|  Gold|
|          2| Ravi|   Mumbai|Silver|
|          3|Meera|    Delhi|  Gold|
|          4| Arun|  Chennai|  NULL|
|          5| Sita|  Kolkata|Bronze|
+-----------+-----+---------+------+



#### Create the DataFrames from the raw Python rows (Spark stores Python `None` as NULL, e.g. in the timestamp and amount columns)

In [4]:
# Print the first five raw Python tuples of each dataset under its heading.
# The customers heading starts with a newline so a blank line separates the two listings.
for heading, rows in (
    ("Transactions Data:", transactions_data),
    ("\nCustomers Data:", customers_data),
):
    print(heading)
    for row in rows[:5]:
        print(row)

Transactions Data:
(101, 1, 250.0, ['pen', 'notebook'], None)
(102, 2, 1200.0, ['headphones'], None)
(103, 1, 75.5, ['pencil'], None)
(104, 3, 560.0, ['bag', 'wallet'], None)
(105, 5, None, [], None)

Customers Data:
(1, 'Asha', 'Bengaluru', 'Gold')
(2, 'Ravi', 'Mumbai', 'Silver')
(3, 'Meera', 'Delhi', 'Gold')
(4, 'Arun', 'Chennai', None)
(5, 'Sita', 'Kolkata', 'Bronze')


#### -----------------------------------------------------------------------------
#### Basic transformations (select, filter, where, withColumn, drop)
#### -----------------------------------------------------------------------------

In [6]:
# Print the DataFrame's columns, their types and nullability as a tree.
transactions_df.printSchema()

root
 |-- tx_id: integer (nullable = false)
 |-- customer_id: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tx_time: timestamp (nullable = true)



In [7]:
# Project a few columns and add a derived one: the amount after a 5% discount.
# A NULL amount yields a NULL discounted amount as well.
discounted_amount = (col("amount") * 0.95).alias("amount_after_discount")
selected = transactions_df.select("tx_id", "customer_id", col("amount"), discounted_amount)
selected.show(n=5)



+-----+-----------+------+---------------------+
|tx_id|customer_id|amount|amount_after_discount|
+-----+-----------+------+---------------------+
|  101|          1| 250.0|                237.5|
|  102|          2|1200.0|               1140.0|
|  103|          1|  75.5|               71.725|
|  104|          3| 560.0|                532.0|
|  105|          5|  NULL|                 NULL|
+-----+-----------+------+---------------------+
only showing top 5 rows



In [8]:
# filter / where: the two DataFrame methods are aliases, so either spelling works.
# Keep transactions whose amount is above 500; a NULL amount never satisfies
# the comparison, so such rows are left out.
is_high_value = col("amount") > 500
high_value_transaction = transactions_df.where(is_high_value)
high_value_transaction.show()




+-----+-----------+------+---------------+-------+
|tx_id|customer_id|amount|          items|tx_time|
+-----+-----------+------+---------------+-------+
|  102|          2|1200.0|   [headphones]|   NULL|
|  104|          3| 560.0|  [bag, wallet]|   NULL|
|  107|          2|3000.0|[laptop, mouse]|   NULL|
+-----+-----------+------+---------------+-------+

