In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct
from pyspark.sql.types import *
# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Schema of a match record: an integer id followed by string-typed columns,
# listed in the same order as the values in each sample tuple below.
match_schema = StructType(
    [StructField("match_id", IntegerType())]
    + [
        StructField(column_name, StringType())
        for column_name in (
            "team1",
            "team2",
            "winner",
            "result",
            "tie_breaker",
            "match_status",
            "venue",
            "umpire",
        )
    ]
)

# Sample matches. None marks a missing value: tied matches carry no winner,
# and the abandoned match has no winner, result, or tie-breaker.
match_data = [
    (1, "Team A", "Team B", "Team A", "normal", None, "completed", "Stadium X", "Umpire 1"),
    (2, "Team C", "Team D", None, "tie", "super over", "completed", "Stadium Y", "Umpire 2"),
    (3, "Team A", "Team C", "Team C", "normal", None, "completed", "Stadium Z", "Umpire 3"),
    (4, "Team B", "Team D", None, "tie", "boundary count", "completed", "Stadium X", "Umpire 1"),
    (5, "Team A", "Team D", None, None, None, "abandoned", "Stadium Y", "Umpire 2"),
    (6, "Team B", "Team C", "Team B", "normal", None, "completed", "Stadium Z", "Umpire 3"),
    (7, "Team C", "Team D", None, "tie", "bowl-out", "completed", "Stadium X", "Umpire 1"),
]

# Materialize the sample rows as a DataFrame with the explicit schema.
matches_df = spark.createDataFrame(data=match_data, schema=match_schema)
print("1. Number of unique teams:")
# Stack the two team columns into a single column. union() matches columns
# by position, so the combined column keeps the first frame's name, "team1".
first_team_column = matches_df.select("team1")
second_team_column = matches_df.select("team2")
teams_df = first_team_column.union(second_team_column)
# A global aggregate yields one row with one value: the distinct team count.
team_count_rows = teams_df.agg(countDistinct(col("team1"))).collect()
unique_teams_count = team_count_rows[0][0]
print(f"Total unique teams: {unique_teams_count}\n")
print("2. Different tie-breaker techniques used for tied matches:")
# Keep only the tied matches, then reduce them to the distinct tie-breaker
# values and bring those rows back to the driver.
tied_matches = matches_df.where(col("result") == "tie")
tie_breakers = tied_matches.select("tie_breaker").distinct().collect()
print("Tie-breaker methods:")
# One bullet per distinct method, in whatever order Spark returned the rows.
for row in tie_breakers:
    print(f"- {row['tie_breaker']}")
print()
print("3. Filtering only completed matches:")
# Retain the rows whose match_status is exactly "completed".
completed_matches = matches_df.where(
    matches_df["match_status"] == "completed"
)
# Report the row counts before and after the filter.
print(f"Total matches: {matches_df.count()}")
print(f"Completed matches: {completed_matches.count()}\n")
print("4. Dropping unnecessary columns:")
# Columns retained in the final result; match_status, venue and umpire are
# the ones left out.
columns_to_keep = [
    "match_id",
    "team1",
    "team2",
    "winner",
    "result",
    "tie_breaker",
]
# select() accepts the list of column names directly.
final_df = completed_matches.select(columns_to_keep)
print("Final DataFrame schema:")
final_df.printSchema()
print("\nSample data from final DataFrame:")
final_df.show()

1. Number of unique teams:
Total unique teams: 4

2. Different tie-breaker techniques used for tied matches:
Tie-breaker methods:
- super over
- boundary count
- bowl-out

3. Filtering only completed matches:
Total matches: 7
Completed matches: 6

4. Dropping unnecessary columns:
Final DataFrame schema:
root
 |-- match_id: integer (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- result: string (nullable = true)
 |-- tie_breaker: string (nullable = true)


Sample data from final DataFrame:
+--------+------+------+------+------+--------------+
|match_id| team1| team2|winner|result|   tie_breaker|
+--------+------+------+------+------+--------------+
|       1|Team A|Team B|Team A|normal|          NULL|
|       2|Team C|Team D|  NULL|   tie|    super over|
|       3|Team A|Team C|Team C|normal|          NULL|
|       4|Team B|Team D|  NULL|   tie|boundary count|
|       6|Team B|Team C|Team B|normal|    