#### Reading Data from CSV

In [None]:
def _read_csv_with_header(path):
    """Read the CSV at ``path``, treating row one as the header and inferring column types."""
    return (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(path)
    )


# Source data for the three bucketed tables. matches also gets a derived
# completion_year column (the year of completion_date truncated to year),
# which is used as its partition column further down.
matchesBucketed = _read_csv_with_header("/home/iceberg/data/matches.csv").withColumn(
    "completion_year",
    year(expr("DATE_TRUNC('year', completion_date)")),
)
matchDetailsBucketed = _read_csv_with_header("/home/iceberg/data/match_details.csv")
medalsMatchesPlayersBucketed = _read_csv_with_header(
    "/home/iceberg/data/medals_matches_players.csv"
)

# Lookup data that is joined in with broadcast() later in the notebook.
maps = _read_csv_with_header("/home/iceberg/data/maps.csv")
medals = _read_csv_with_header("/home/iceberg/data/medals.csv")

#### Creating bucketed Tables
The matches table is partitioned by completion year instead of completion date, as that helped me resolve the Java OOM error.

In [None]:
%%sql
CREATE DATABASE IF NOT EXISTS bootcamp

In [None]:
# Iceberg table definitions. All three tables are bucketed into 16 buckets on
# match_id; matches is additionally partitioned by completion_year.
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     mapid STRING,
     completion_date TIMESTAMP,
     completion_year INTEGER
 )
 USING iceberg
 PARTITIONED BY (completion_year, bucket(16, match_id));
 """

bucketedDetailsDDL = """
 CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 """

bucketedMedalMatchesDDL = """
 CREATE TABLE IF NOT EXISTS bootcamp.medals_matches_players_bucketed (
     match_id STRING,
     player_gamertag STRING,
     medal_id BIGINT,
     count INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 """

# Run the statements in declaration order; IF NOT EXISTS makes re-running
# this cell a no-op for tables that are already present.
spark.sql(bucketedDDL)
spark.sql(bucketedDetailsDDL)
spark.sql(bucketedMedalMatchesDDL)

In [None]:
# Write each source DataFrame to its bootcamp table in overwrite mode,
# bucketing by match_id into 16 buckets.
# NOTE(review): overwrite + saveAsTable may replace the table definitions
# created by the DDL cell rather than write into them -- confirm the resulting
# table format and partition spec are the intended ones.

# matches: completion_date is cast to timestamp, and the table is also
# partitioned by completion_year.
(
    matchesBucketed
    .select(
        col("match_id"),
        col("is_team_game"),
        col("playlist_id"),
        col("mapid"),
        col("completion_date").cast("timestamp"),
        col("completion_year"),
    )
    .write
    .mode("overwrite")
    .partitionBy("completion_year")
    .bucketBy(16, "match_id")
    .saveAsTable("bootcamp.matches_bucketed")
)

# match_details: per-player kill/death totals for each match.
(
    matchDetailsBucketed
    .select(
        *[
            col(column_name)
            for column_name in (
                "match_id",
                "player_gamertag",
                "player_total_kills",
                "player_total_deaths",
            )
        ]
    )
    .write
    .mode("overwrite")
    .bucketBy(16, "match_id")
    .saveAsTable("bootcamp.match_details_bucketed")
)

# medals_matches_players: medal counts per player per match.
(
    medalsMatchesPlayersBucketed
    .select(
        *[
            col(column_name)
            for column_name in ("match_id", "player_gamertag", "medal_id", "count")
        ]
    )
    .write
    .mode("overwrite")
    .bucketBy(16, "match_id")
    .saveAsTable("bootcamp.medals_matches_players_bucketed")
)


In [None]:
# Switch the session's current database to bootcamp; the table reads that
# follow refer to the bucketed tables by unqualified name.
spark.sql("USE bootcamp")

In [None]:
# Load the three bucketed tables back as DataFrames. The names are
# unqualified, so they resolve against the session's current database.
matchesBucketedDF, matchDetailsBucketedDF, medalsMatchesPlayersBucketedDF = (
    spark.read.table(table_name)
    for table_name in (
        "matches_bucketed",
        "match_details_bucketed",
        "medals_matches_players_bucketed",
    )
)

#### Query 2: join the medals and maps tables with an explicitly specified broadcast join
#### Query 3: join the match_details, matches and medals_matches_players tables using a bucket join on match_id with 16 buckets

In [None]:
# Join predicates, written against the aliases assigned in the chain below:
#   mdb = match_details, mb = matches, mmb = medals_matches_players.
matches_to_details = col("mb.match_id") == col("mdb.match_id")
medals_to_match_and_player = (col("mb.match_id") == col("mmb.match_id")) & (
    col("mmb.player_gamertag") == col("mdb.player_gamertag")
)

# Join the three bucketed tables on match_id (medals are additionally matched
# on player_gamertag). The result keeps every matches column plus the
# per-player stats and medal counts.
bucketedJoinResults = (
    matchDetailsBucketedDF.alias("mdb")
    .join(matchesBucketedDF.alias("mb"), matches_to_details)
    .join(medalsMatchesPlayersBucketedDF.alias("mmb"), medals_to_match_and_player)
    .select(
        col("mb.*"),
        col("mdb.player_total_deaths"),
        col("mdb.player_total_kills"),
        col("mdb.player_gamertag"),
        col("mmb.count"),
        col("mmb.medal_id"),
    )
)


#### Explicit Broadcast Join

In [None]:
# Both lookup DataFrames are wrapped in broadcast() so the join is explicitly
# hinted as a broadcast join rather than left to the planner.
medals_lookup = broadcast(medals).alias("md")
maps_lookup = broadcast(maps).alias("mp")

# Enrich the joined match rows with the medal and map name/description,
# renamed so the two lookups' columns do not collide.
explicitBroadcast = (
    bucketedJoinResults.alias("bk")
    .join(medals_lookup, col("bk.medal_id") == col("md.medal_id"))
    .join(maps_lookup, col("bk.mapid") == col("mp.mapid"))
    .select(
        col("bk.*"),
        col("md.name").alias("medal_name"),
        col("md.description").alias("medal_description"),
        col("mp.name").alias("map_name"),
        col("mp.description").alias("map_description"),
    )
)


In [None]:
# Drop rows that are exact duplicates across every column of the enriched result.
finalDF = explicitBroadcast.dropDuplicates()