In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct
# Join verification script: builds two small in-memory datasets keyed by
# ``match_id``, joins them with each join type, and reports how many distinct
# match ids survive each join.


def _distinct_match_count(df):
    """Return the number of distinct ``match_id`` values in *df*."""
    # A global aggregate yields exactly one row with one column.
    return df.select(countDistinct("match_id")).collect()[0][0]


spark = SparkSession.builder.appName("JoinVerification").getOrCreate()
try:
    # match_id 4 exists only in the commentary data and match_id 5 only in
    # the over data, so each join type keeps a different set of ids.
    commentary_data = [
        (1, "Match 1", "2023-01-01"),
        (2, "Match 2", "2023-01-02"),
        (3, "Match 3", "2023-01-03"),
        (4, "Match 4", "2023-01-04"),
    ]
    over_data = [
        (1, "Team A vs Team B"),
        (2, "Team C vs Team D"),
        (3, "Team E vs Team F"),
        (5, "Team G vs Team H"),
    ]

    commentary_df = spark.createDataFrame(
        commentary_data,
        ["match_id", "match_name", "date"],
    )
    over_df = spark.createDataFrame(
        over_data,
        ["match_id", "teams"],
    )

    # Joining on the column *name* keeps a single ``match_id`` column in the
    # result, so it can be referenced unambiguously afterwards.
    inner_join, left_join, right_join, full_join = (
        commentary_df.join(over_df, on="match_id", how=join_type)
        for join_type in ("inner", "left", "right", "full")
    )

    commentary_count = _distinct_match_count(commentary_df)
    over_count = _distinct_match_count(over_df)
    inner_count = _distinct_match_count(inner_join)
    left_count = _distinct_match_count(left_join)
    right_count = _distinct_match_count(right_join)
    full_count = _distinct_match_count(full_join)

    results = [
        ("Commentary dataset", commentary_count),
        ("Over dataset", over_count),
        ("INNER JOIN", inner_count),
        ("LEFT JOIN", left_count),
        ("RIGHT JOIN", right_count),
        ("FULL JOIN", full_count),
    ]
    results_df = spark.createDataFrame(
        results,
        ["join_type", "match_count"],
    )

    print("=== Match Count Verification ===")
    results_df.show(truncate=False)

    print("\n=== Sample INNER JOIN Results ===")
    inner_join.show()

    # Rows with no partner on the other side carry NULL in that side's columns.
    print("\n=== Sample LEFT JOIN Results (showing non-matches) ===")
    left_join.filter("teams IS NULL").show()

    print("\n=== Sample RIGHT JOIN Results (showing non-matches) ===")
    right_join.filter("match_name IS NULL").show()
finally:
    # Always release the session, even if a Spark action above fails.
    spark.stop()

=== Match Count Verification ===
+------------------+-----------+
|join_type         |match_count|
+------------------+-----------+
|Commentary dataset|4          |
|Over dataset      |4          |
|INNER JOIN        |3          |
|LEFT JOIN         |4          |
|RIGHT JOIN        |4          |
|FULL JOIN         |5          |
+------------------+-----------+


=== Sample INNER JOIN Results ===
+--------+----------+----------+----------------+
|match_id|match_name|      date|           teams|
+--------+----------+----------+----------------+
|       1|   Match 1|2023-01-01|Team A vs Team B|
|       2|   Match 2|2023-01-02|Team C vs Team D|
|       3|   Match 3|2023-01-03|Team E vs Team F|
+--------+----------+----------+----------------+


=== Sample LEFT JOIN Results (showing non-matches) ===
+--------+----------+----------+-----+
|match_id|match_name|      date|teams|
+--------+----------+----------+-----+
|       4|   Match 4|2023-01-04| NULL|
+--------+----------+----------+-----+