In [2]:
import org.apache.spark.sql.functions.{broadcast, split, lit}

import org.apache.spark.sql.functions.{broadcast, split, lit}


In [3]:
// Load the matches and match-details CSV exports as plain DataFrames.
// The first row supplies the column names and Spark infers each column's type.
// Despite the "Bucketed" suffix, nothing is bucketed at this point.
val matchesBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/matches.csv")
val matchDetailsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/match_details.csv")

matchesBucketed: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]


In [4]:
// Load the medals-per-match-per-player CSV and the medals CSV as plain DataFrames.
// The first row supplies the column names and Spark infers each column's type.
val medalsmatchesplayersBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/medals_matches_players.csv")
val medalsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/medals.csv")

medalsmatchesplayersBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 2 more fields]
medalsBucketed: org.apache.spark.sql.DataFrame = [medal_id: bigint, sprite_uri: string ... 10 more fields]


In [5]:
// Print the inferred column names and data types of both match DataFrames, in order.
Seq(matchesBucketed, matchDetailsBucketed).foreach(_.printSchema())

root
 |-- match_id: string (nullable = true)
 |-- mapid: string (nullable = true)
 |-- is_team_game: boolean (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- game_variant_id: string (nullable = true)
 |-- is_match_over: boolean (nullable = true)
 |-- completion_date: timestamp (nullable = true)
 |-- match_duration: string (nullable = true)
 |-- game_mode: string (nullable = true)
 |-- map_variant_id: string (nullable = true)

root
 |-- match_id: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- previous_spartan_rank: integer (nullable = true)
 |-- spartan_rank: integer (nullable = true)
 |-- previous_total_xp: integer (nullable = true)
 |-- total_xp: integer (nullable = true)
 |-- previous_csr_tier: integer (nullable = true)
 |-- previous_csr_designation: integer (nullable = true)
 |-- previous_csr: integer (nullable = true)
 |-- previous_csr_percent_to_next_tier: integer (nullable = true)
 |-- previous_csr_rank: integer (nullable = true)
 |-

In [6]:
// Print the inferred column names and data types of both medal DataFrames, in order.
Seq(medalsmatchesplayersBucketed, medalsBucketed).foreach(_.printSchema())

root
 |-- match_id: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- medal_id: long (nullable = true)
 |-- count: integer (nullable = true)

root
 |-- medal_id: long (nullable = true)
 |-- sprite_uri: string (nullable = true)
 |-- sprite_left: integer (nullable = true)
 |-- sprite_top: integer (nullable = true)
 |-- sprite_sheet_width: integer (nullable = true)
 |-- sprite_sheet_height: integer (nullable = true)
 |-- sprite_width: integer (nullable = true)
 |-- sprite_height: integer (nullable = true)
 |-- classification: string (nullable = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- difficulty: integer (nullable = true)



In [7]:
// A threshold of -1 turns off Spark's automatic broadcast joins for this session,
// so a table is only broadcast when it is explicitly wrapped in broadcast(...).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1L)

In [8]:
// Load the maps CSV as a plain DataFrame.
// The first row supplies the column names and Spark infers each column's type.
val mapsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/maps.csv")

mapsBucketed: org.apache.spark.sql.DataFrame = [mapid: string, name: string ... 1 more field]


In [9]:
// Print the inferred column names and data types of the maps DataFrame as a tree.
println(mapsBucketed.schema.treeString)

root
 |-- mapid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
// The medals and maps tables are small lookup tables with few rows.
// Broadcasting them avoids shuffling the large tables and keeps the joins efficient. Auto-broadcast was disabled globally above, so only these explicitly broadcast tables are broadcast.

In [10]:
// Attach an explicit broadcast hint to each of the two lookup DataFrames
// so that joins against them can use a broadcast join.
val mapsBroadcasted = broadcast(mapsBucketed)
val medalsBroadcasted = broadcast(medalsBucketed)

medalsBroadcasted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [medal_id: bigint, sprite_uri: string ... 10 more fields]
mapsBroadcasted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [mapid: string, name: string ... 1 more field]


In [11]:
// Attach the medal attributes to every medals/matches/players row.
// Left join: rows whose medal_id has no match in the medals table are kept.
val medalDetails = {
  val joinKeys = Seq("medal_id")
  medalsmatchesplayersBucketed.join(medalsBroadcasted, joinKeys, "left")
}

medalDetails: org.apache.spark.sql.DataFrame = [medal_id: bigint, match_id: string ... 13 more fields]


In [12]:
// Attach the map columns (name, description) to every match.
// Left join: matches whose mapid has no match in the maps table are kept.
val matchesWithMapName = {
  val joinKeys = Seq("mapid")
  matchesBucketed.join(mapsBroadcasted, joinKeys, "left")
}

matchesWithMapName: org.apache.spark.sql.DataFrame = [mapid: string, match_id: string ... 10 more fields]


In [None]:
// Bucket-join match_details, matches, and medals_matches_players on match_id using 16 buckets.
// Bucketing splits the data into a fixed number of buckets based on the column you choose, here match_id.
// When tables are written and then read bucketed on the same column, Spark can join them efficiently (avoiding shuffles).

In [None]:
// Steps to bucket the three tables:
// Write each table as a bucketed table on match_id (16 buckets).

In [None]:
// List the tables registered in the bootcamp database, e.g. to check
// which of the bucketed tables currently exist.
spark
  .sql("SHOW TABLES IN bootcamp")
  .show()


In [None]:
// Remove any previous versions of the three bucketed tables so they can be
// recreated from scratch; IF EXISTS makes each statement a no-op when the table is absent.
Seq(
  "DROP TABLE IF EXISTS bootcamp.matches_bucketed",
  "DROP TABLE IF EXISTS bootcamp.match_details_bucketed",
  "DROP TABLE IF EXISTS bootcamp.medals_matches_players_bucketed"
).foreach(statement => spark.sql(statement))

In [None]:
// DDL for the Iceberg table bootcamp.matches_bucketed (created only if it does not already exist).
// The table keeps a subset of the matches CSV columns: match_duration, game_mode and
// map_variant_id are not declared here.
// Partitioning combines completion_date with a 16-way bucket on match_id, the join key.
// NOTE(review): completion_date is a TIMESTAMP partitioned without a transform, which
// presumably yields one partition per distinct timestamp value. Consider days(completion_date),
// but confirm against the write that populates this table before changing the spec.
val matchesDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     mapid STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     game_variant_id STRING,
     is_match_over BOOLEAN,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id))
"""
// Run the DDL against the session catalog.
spark.sql(matchesDDL)