# PySpark User Guide - Complete Reference

## Quick Start & Setup

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
from pyspark.sql.functions import col
# Build (or reuse, via getOrCreate) the notebook's SparkSession.
# The three config entries enable adaptive query execution together with its
# partition-coalescing and skew-join settings.
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    .getOrCreate()
)

## Sample Data for Testing

In [15]:
# Customer data
# Schema first: every column is nullable; only customer_id is an integer.
customers_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("customer_id", IntegerType()),
        ("name", StringType()),
        ("email", StringType()),
        ("country", StringType()),
        ("tier", StringType()),
    ]
])

# Rows follow the schema order: (customer_id, name, email, country, tier).
customers_data = [
    (1, "John Doe", "john@email.com", "US", "Premium"),
    (2, "Jane Smith", "jane@email.com", "CA", "Standard"),
    (3, "Bob Johnson", "bob@email.com", "UK", "Premium"),
    (4, "Alice Brown", "alice@email.com", "US", "Standard"),
    (5, "Charlie Wilson", "charlie@email.com", "FR", "Premium"),
]

customers = spark.createDataFrame(customers_data, customers_schema)

# Orders data
# Schema first: all columns nullable; order_date is kept as a plain string.
orders_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("order_id", IntegerType()),
        ("customer_id", IntegerType()),
        ("order_date", StringType()),
        ("amount", DoubleType()),
        ("category", StringType()),
    ]
])

# Rows follow the schema order: (order_id, customer_id, order_date, amount, category).
orders_data = [
    (101, 1, "2024-01-15", 150.0, "Electronics"),
    (102, 2, "2024-01-16", 89.5, "Books"),
    (103, 1, "2024-01-17", 200.0, "Electronics"),
    (104, 3, "2024-01-18", 45.0, "Books"),
    (105, 2, "2024-01-19", 310.0, "Clothing"),
    (106, 4, "2024-01-20", 75.0, "Electronics"),
    (107, 1, "2024-01-21", 125.0, "Clothing"),
    (108, 6, "2024-01-22", 95.0, "Books"),  # Orphaned order: customer_id 6 is not in the customer rows
]

orders = spark.createDataFrame(orders_data, orders_schema)

# Products catalog (small table for broadcast)
# One row per category; all columns nullable.
products_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("category", StringType()),
        ("commission_rate", DoubleType()),
        ("return_window_days", IntegerType()),
    ]
])

# Rows follow the schema order: (category, commission_rate, return_window_days).
products_data = [
    ("Electronics", 0.15, 30),
    ("Books", 0.05, 14),
    ("Clothing", 0.20, 7),
]

products = spark.createDataFrame(products_data, products_schema)



## Advanced Joins

### All Join Types with Examples

#### Inner Join - Only Matching Records


In [3]:

# Inner join on customer_id: keeps only customers that have at least one order
# (and only orders that belong to a known customer).
inner_result = customers.join(orders, on="customer_id", how="inner")
inner_count = inner_result.count()
print(f"Inner join: {inner_count} records")  # 7 records
inner_result.select("name", "order_id", "amount").show()

                                                                                

Inner join: 7 records
+-----------+--------+------+
|       name|order_id|amount|
+-----------+--------+------+
|   John Doe|     101| 150.0|
|   John Doe|     103| 200.0|
|   John Doe|     107| 125.0|
| Jane Smith|     102|  89.5|
| Jane Smith|     105| 310.0|
|Bob Johnson|     104|  45.0|
|Alice Brown|     106|  75.0|
+-----------+--------+------+




#### Left Join - All Customers


In [4]:
# Left join on customer_id: every customer is kept; order columns are NULL
# for customers without a matching order.
left_result = customers.join(orders, on="customer_id", how="left")
left_count = left_result.count()
print(f"Left join: {left_count} records")  # 8 records (Charlie has no orders)
left_result.select("name", "order_id", "amount").show()

Left join: 8 records




+--------------+--------+------+
|          name|order_id|amount|
+--------------+--------+------+
|      John Doe|     107| 125.0|
|      John Doe|     103| 200.0|
|      John Doe|     101| 150.0|
|    Jane Smith|     105| 310.0|
|    Jane Smith|     102|  89.5|
|   Bob Johnson|     104|  45.0|
|Charlie Wilson|    NULL|  NULL|
|   Alice Brown|     106|  75.0|
+--------------+--------+------+



                                                                                

In [9]:
# Enrich every order with its category's commission rate and return window.
orders.join(products, on="category", how="left").show()

                                                                                

+-----------+--------+-----------+----------+------+---------------+------------------+
|   category|order_id|customer_id|order_date|amount|commission_rate|return_window_days|
+-----------+--------+-----------+----------+------+---------------+------------------+
|Electronics|     101|          1|2024-01-15| 150.0|           0.15|                30|
|Electronics|     103|          1|2024-01-17| 200.0|           0.15|                30|
|      Books|     102|          2|2024-01-16|  89.5|           0.05|                14|
|      Books|     104|          3|2024-01-18|  45.0|           0.05|                14|
|Electronics|     106|          4|2024-01-20|  75.0|           0.15|                30|
|   Clothing|     105|          2|2024-01-19| 310.0|            0.2|                 7|
|   Clothing|     107|          1|2024-01-21| 125.0|            0.2|                 7|
|      Books|     108|          6|2024-01-22|  95.0|           0.05|                14|
+-----------+--------+----------


#### Anti Join - Customers Without Orders


In [5]:

# Anti join on customer_id: keeps only the customers that have no order at all.
# Only columns from the left side (customers) are available in the result.
anti_result = customers.join(orders, on="customer_id", how="anti")
anti_count = anti_result.count()
print(f"Anti join: {anti_count} records")  # 1 record (Charlie)
anti_result.select("name", "email").show()
        

                                                                                

Anti join: 1 records


                                                                                

+--------------+-----------------+
|          name|            email|
+--------------+-----------------+
|Charlie Wilson|charlie@email.com|
+--------------+-----------------+




### Broadcast Joins - Performance Optimization

#### When to Use Broadcast Joins
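- Small table that fits comfortably in executor memory: Spark broadcasts automatically below `spark.sql.autoBroadcastJoinThreshold` (10 MB by default, configurable); as a rule of thumb, tables up to roughly 200MB can still be broadcast with an explicit `broadcast()` hint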
- One table is much smaller than the other (size ratio greater than 10:1)
- Avoiding shuffle operations for better performance


In [10]:
from pyspark.sql.functions import broadcast

# Manual broadcast: wrap the small products table in a broadcast hint before
# joining it to orders on category.
products_hinted = broadcast(products)
broadcast_result = orders.join(products_hinted, on="category", how="left")

broadcast_columns = ["order_id", "category", "commission_rate", "amount"]
broadcast_result.select(*broadcast_columns).show()


+--------+-----------+---------------+------+
|order_id|   category|commission_rate|amount|
+--------+-----------+---------------+------+
|     101|Electronics|           0.15| 150.0|
|     102|      Books|           0.05|  89.5|
|     103|Electronics|           0.15| 200.0|
|     104|      Books|           0.05|  45.0|
|     105|   Clothing|            0.2| 310.0|
|     106|Electronics|           0.15|  75.0|
|     107|   Clothing|            0.2| 125.0|
|     108|      Books|           0.05|  95.0|
+--------+-----------+---------------+------+



In [None]:

# Calculate commission using broadcast join:
# commission = order amount * the commission rate of the order's category.
commission_expr = col("amount") * col("commission_rate")
commission_calc = (
    orders.join(broadcast(products), on="category", how="left")
    .withColumn("commission", commission_expr)
    .select("order_id", "amount", "commission_rate", "commission")
)
commission_calc.show()


+--------+------+---------------+------------------+
|order_id|amount|commission_rate|        commission|
+--------+------+---------------+------------------+
|     101| 150.0|           0.15|              22.5|
|     102|  89.5|           0.05|4.4750000000000005|
|     103| 200.0|           0.15|              30.0|
|     104|  45.0|           0.05|              2.25|
|     105| 310.0|            0.2|              62.0|
|     106|  75.0|           0.15|             11.25|
|     107| 125.0|            0.2|              25.0|
|     108|  95.0|           0.05|              4.75|
+--------+------+---------------+------------------+



#### Performance Comparison

In [17]:

# Build the same left join twice; the only difference is the broadcast hint.

# With the hint: products is explicitly marked for broadcasting.
with_broadcast = orders.join(broadcast(products), on="category", how="left")

# Without the hint: Spark picks the join strategy itself (expected to be a
# shuffle-based join here — confirm in the physical plan printed below).
no_broadcast = orders.join(products, on="category", how="left")

# Print the extended execution plan of the un-hinted join.
print("=== Without Broadcast ===")
no_broadcast.explain(extended=True)


=== Without Broadcast ===
== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [category])
:- LogicalRDD [order_id#183, customer_id#184, order_date#185, amount#186, category#187], false
+- LogicalRDD [category#188, commission_rate#189, return_window_days#190], false

== Analyzed Logical Plan ==
category: string, order_id: int, customer_id: int, order_date: string, amount: double, commission_rate: double, return_window_days: int
Project [category#187, order_id#183, customer_id#184, order_date#185, amount#186, commission_rate#189, return_window_days#190]
+- Join LeftOuter, (category#187 = category#188)
   :- LogicalRDD [order_id#183, customer_id#184, order_date#185, amount#186, category#187], false
   +- LogicalRDD [category#188, commission_rate#189, return_window_days#190], false

== Optimized Logical Plan ==
Project [category#187, order_id#183, customer_id#184, order_date#185, amount#186, commission_rate#189, return_window_days#190]
+- Join LeftOuter, (category#187 = category#188)
   :

In [18]:

# Print the extended execution plan of the join that carries the broadcast hint.
print("\n=== With Broadcast ===")
with_broadcast.explain(extended=True)




=== With Broadcast ===
== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [category])
:- LogicalRDD [order_id#183, customer_id#184, order_date#185, amount#186, category#187], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [category#188, commission_rate#189, return_window_days#190], false

== Analyzed Logical Plan ==
category: string, order_id: int, customer_id: int, order_date: string, amount: double, commission_rate: double, return_window_days: int
Project [category#187, order_id#183, customer_id#184, order_date#185, amount#186, commission_rate#189, return_window_days#190]
+- Join LeftOuter, (category#187 = category#188)
   :- LogicalRDD [order_id#183, customer_id#184, order_date#185, amount#186, category#187], false
   +- ResolvedHint (strategy=broadcast)
      +- LogicalRDD [category#188, commission_rate#189, return_window_days#190], false

== Optimized Logical Plan ==
Project [category#187, order_id#183, customer_id#184, order_date#185, amount#186, commission_rate#1

### Complex Join Conditions


In [19]:
# Join orders to customers with three predicates combined by AND:
# matching customer_id, customer tier "Premium", and order amount above 100.
same_customer = orders.customer_id == customers.customer_id
is_premium = customers.tier == "Premium"
is_large_order = orders.amount > 100

complex_join = orders.join(
    customers,
    same_customer & is_premium & is_large_order,
    "inner",
)
complex_join.select("name", "order_id", "amount", "tier").show()

[Stage 67:>                                                         (0 + 2) / 2]

+--------+--------+------+-------+
|    name|order_id|amount|   tier|
+--------+--------+------+-------+
|John Doe|     101| 150.0|Premium|
|John Doe|     103| 200.0|Premium|
|John Doe|     107| 125.0|Premium|
+--------+--------+------+-------+



                                                                                