### Create DataFrames

In [0]:
from pyspark.sql import functions as F, types as T

# ---------------------------------------------------------------------------
# Customers: (customer_id, name, country, vip)
# ---------------------------------------------------------------------------
rows_customers = [
    (1, "Asha", "IN", True),
    (2, "Bob", "US", False),
    (3, "Chen", "CN", True),
    (4, "Diana", "US", None),
    (None, "Ghost", "UK", False),  # NULL key to demo null join behavior
]

# Every column is declared nullable so the None values above are accepted.
schema_customers = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("customer_id", T.IntegerType()),
        ("name", T.StringType()),
        ("country", T.StringType()),
        ("vip", T.BooleanType()),
    )
])

df_customers = spark.createDataFrame(rows_customers, schema_customers)

# ---------------------------------------------------------------------------
# Orders: (order_id, customer_id, amount, country)
# ---------------------------------------------------------------------------
rows_orders = [
    (101, 1, 120.0, "IN"),
    (102, 1, 80.0, "IN"),
    (103, 2, 50.0, "US"),
    (104, 5, 30.0, "DE"),     # no matching customer_id
    (105, 3, 200.0, "CN"),
    (106, None, 15.0, "UK"),  # NULL key won't match
    (107, 3, 40.0, "CN"),
    (108, 2, 75.0, "US"),
]

# `country` deliberately reuses the customers' column name to show collisions.
schema_orders = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", T.IntegerType()),
        ("customer_id", T.IntegerType()),
        ("amount", T.DoubleType()),
        ("country", T.StringType()),
    )
])

df_orders = spark.createDataFrame(rows_orders, schema_orders)

# Render both input tables before any join is applied.
display(df_customers)
display(df_orders)

customer_id,name,country,vip
1.0,Asha,IN,True
2.0,Bob,US,False
3.0,Chen,CN,True
4.0,Diana,US,
,Ghost,UK,False


order_id,customer_id,amount,country
101,1.0,120.0,IN
102,1.0,80.0,IN
103,2.0,50.0,US
104,5.0,30.0,DE
105,3.0,200.0,CN
106,,15.0,UK
107,3.0,40.0,CN
108,2.0,75.0,US


### Inner Join

In [0]:
 # Give the orders DataFrame the alias "o" and print it as a text table.
 o = df_orders.alias("o")
 o.show(n=20, truncate=True)  # same values show() uses by default

+--------+-----------+------+-------+
|order_id|customer_id|amount|country|
+--------+-----------+------+-------+
|     101|          1| 120.0|     IN|
|     102|          1|  80.0|     IN|
|     103|          2|  50.0|     US|
|     104|          5|  30.0|     DE|
|     105|          3| 200.0|     CN|
|     106|       NULL|  15.0|     UK|
|     107|          3|  40.0|     CN|
|     108|          2|  75.0|     US|
+--------+-----------+------+-------+



In [0]:
# Alias each side so same-named columns can later be addressed as
# "o.<column>" (orders) and "c.<column>" (customers).
o = df_orders.alias("o")
c = df_customers.alias("c")

# Joining on the column *name* yields a single customer_id column, but both
# `country` columns survive, so the result carries a duplicate column name.
df_inner = o.join(c, "customer_id", "inner")
display(df_inner)

customer_id,order_id,amount,country,name,country.1,vip
1,101,120.0,IN,Asha,IN,True
1,102,80.0,IN,Asha,IN,True
2,103,50.0,US,Bob,US,False
3,105,200.0,CN,Chen,CN,True
3,107,40.0,CN,Chen,CN,True
2,108,75.0,US,Bob,US,False


### Disambiguate Columns

In [0]:
# Both inputs have a `country` column; select each one through its DataFrame
# alias and rename it so the output has no duplicate column names.
df_inner_clean = o.join(c, "customer_id", "inner").select(
    "order_id",
    "customer_id",
    "amount",
    F.col("o.country").alias("ship_country"),   # country from the orders side
    "name",
    F.col("c.country").alias("cust_country"),   # country from the customers side
    "vip",
)
display(df_inner_clean)

order_id,customer_id,amount,ship_country,name,cust_country,vip
101,1,120.0,IN,Asha,IN,True
102,1,80.0,IN,Asha,IN,True
103,2,50.0,US,Bob,US,False
105,3,200.0,CN,Chen,CN,True
107,3,40.0,CN,Chen,CN,True
108,2,75.0,US,Bob,US,False


### Other Joins: Left, Full, etc.

In [0]:
# Left join: every order is kept; customer columns are NULL for orders
# whose customer_id has no match (including the NULL key).
display(
    o.join(c, "customer_id", "left")
)

customer_id,order_id,amount,country,name,country.1,vip
1.0,101,120.0,IN,Asha,IN,True
1.0,102,80.0,IN,Asha,IN,True
2.0,103,50.0,US,Bob,US,False
5.0,104,30.0,DE,,,
3.0,105,200.0,CN,Chen,CN,True
,106,15.0,UK,,,
3.0,107,40.0,CN,Chen,CN,True
2.0,108,75.0,US,Bob,US,False


In [0]:
# Full join: unmatched rows from BOTH sides are kept, so customers without
# orders appear too. NULL keys never pair up with each other.
display(
    o.join(c, "customer_id", "full")
)

customer_id,order_id,amount,country,name,country.1,vip
1.0,101.0,120.0,IN,Asha,IN,True
1.0,102.0,80.0,IN,Asha,IN,True
2.0,103.0,50.0,US,Bob,US,False
5.0,104.0,30.0,DE,,,
3.0,105.0,200.0,CN,Chen,CN,True
,106.0,15.0,UK,,,
3.0,107.0,40.0,CN,Chen,CN,True
2.0,108.0,75.0,US,Bob,US,False
,,,,Ghost,UK,False
4.0,,,,Diana,US,


### Left Semi and Left Anti

In [0]:
# Left semi: orders with a known customer; only the orders' columns come back.
display(
    o.join(c, "customer_id", "left_semi")
)

customer_id,order_id,amount,country
1,101,120.0,IN
1,102,80.0,IN
2,103,50.0,US
3,105,200.0,CN
3,107,40.0,CN
2,108,75.0,US


In [0]:
# Left anti: orphan orders (no matching customer); the NULL-key order is
# included because a NULL key matches nothing.
display(
    o.join(c, "customer_id", "left_anti")
)

customer_id,order_id,amount,country
5.0,104,30.0,DE
,106,15.0,UK


### Multi-Key Join

In [0]:
# Join on both keys by name: a row matches only when customer_id AND country
# agree, and each key column appears once in the result.
join_keys_free_form = None  # placeholder removed below; see note
del join_keys_free_form
df_multi = o.join(
    c,
    ["customer_id", "country"],
    "inner",
)
display(df_multi)

customer_id,country,order_id,amount,name,vip
1,IN,101,120.0,Asha,True
1,IN,102,80.0,Asha,True
2,US,103,50.0,Bob,False
3,CN,105,200.0,Chen,True
3,CN,107,40.0,Chen,True
2,US,108,75.0,Bob,False
