In [1]:
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, col, count, avg, desc, sum, countDistinct, coalesce
from pyspark.sql.window import Window


In [2]:
spark = (SparkSession.builder
         .appName("HomeWork3")
         .getOrCreate()
        )

24/12/13 20:11:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Disable automatic broadcast joins
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [4]:
folder = Path("/home/iceberg/data/")
match_details_path = f"{folder}/match_details.csv"
matches_path = f"{folder}/matches.csv"
medals_path = f"{folder}/medals.csv"
medals_matches_players_path = f"{folder}/medals_matches_players.csv"
maps_path = f"{folder}/maps.csv"

In [5]:
# Reading CSV files into DataFrames
match_details_df = spark.read.csv(match_details_path, header=True, inferSchema=True)
matches_df = spark.read.csv(matches_path, header=True, inferSchema=True)
medals_matches_players_df = spark.read.csv(medals_matches_players_path, header=True, inferSchema=True)
medals_df = spark.read.csv(medals_path, header=True, inferSchema=True)
maps_df = spark.read.csv(maps_path, header=True, inferSchema= True)

In [6]:
# Register DataFrames as temporary views or write them to Delta tables
match_details_df.createOrReplaceTempView("match_details")
matches_df.createOrReplaceTempView("matches")
medals_matches_players_df.createOrReplaceTempView("medals_matches_players")
medals_df.createOrReplaceTempView("medals")
maps_df.createOrReplaceTempView("maps")

24/12/13 20:11:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [7]:
match_details_df.show()

+--------------------+---------------+---------------------+------------+-----------------+--------+-----------------+------------------------+------------+---------------------------------+-----------------+----------------+-----------------------+-----------+--------------------------------+----------------+-------------------+---------------+-------------------+------------------+----------------------+--------------------------+-------------------------+------------------------+-------------------------+---------------------------+-------------------------------+--------------------------------+---------------------------+--------------------------------+-------------------------------+-------------------+--------------------+--------------------------+-------+-------+
|            match_id|player_gamertag|previous_spartan_rank|spartan_rank|previous_total_xp|total_xp|previous_csr_tier|previous_csr_designation|previous_csr|previous_csr_percent_to_next_tier|previous_csr_rank|current_

In [8]:
matches_df.show()

+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|            match_id|               mapid|is_team_game|         playlist_id|     game_variant_id|is_match_over|    completion_date|match_duration|game_mode|      map_variant_id|
+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|11de1a94-8d07-416...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|         true|2016-02-22 00:00:00|          NULL|     NULL|                NULL|
|d3643e71-3e51-43e...|cb914b9e-f206-11e...|       false|d0766624-dbd7-453...|257a305e-4dd3-41f...|         true|2016-02-14 00:00:00|          NULL|     NULL|                NULL|
|d78d2aae-36e4-48a...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|       

In [9]:
medals_matches_players_df.show()

+--------------------+---------------+----------+-----+
|            match_id|player_gamertag|  medal_id|count|
+--------------------+---------------+----------+-----+
|009fdac5-e15c-47c...|       EcZachly|3261908037|    7|
|009fdac5-e15c-47c...|       EcZachly| 824733727|    2|
|009fdac5-e15c-47c...|       EcZachly|2078758684|    2|
|009fdac5-e15c-47c...|       EcZachly|2782465081|    2|
|9169d1a3-955c-4ea...|       EcZachly|3001183151|    1|
|9169d1a3-955c-4ea...|       EcZachly|3565443938|    6|
|9169d1a3-955c-4ea...|       EcZachly|3491849182|    1|
|4a078b2f-65eb-4c6...|       EcZachly|3261908037|    8|
|9169d1a3-955c-4ea...|       EcZachly|2105198095|    6|
|9169d1a3-955c-4ea...|       EcZachly|2916014239|    3|
|9169d1a3-955c-4ea...|       EcZachly|3261908037|    6|
|9169d1a3-955c-4ea...|       EcZachly|1351381581|    2|
|9169d1a3-955c-4ea...|       EcZachly|2838259753|    1|
|9169d1a3-955c-4ea...|       EcZachly|3354395650|    1|
|9169d1a3-955c-4ea...|       EcZachly| 298813630

In [10]:
medals_df.show()

+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+-----------------+--------------------+--------------+----------+
|  medal_id|          sprite_uri|sprite_left|sprite_top|sprite_sheet_width|sprite_sheet_height|sprite_width|sprite_height|   classification|         description|          name|difficulty|
+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+-----------------+--------------------+--------------+----------+
|2315448068|                NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|             NULL|                NULL|          NULL|      NULL|
|3565441934|                NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|             NULL|                NULL|          NULL|      NULL|
|4162659350|https://content.h...|        750|       750|    

In [11]:
maps_df.show()

+--------------------+-------------------+--------------------+
|               mapid|               name|         description|
+--------------------+-------------------+--------------------+
|c93d708f-f206-11e...|              Urban|Andesia was the c...|
|cb251c51-f206-11e...|     Raid on Apex 7|This unbroken rin...|
|c854e54f-f206-11e...|March on Stormbreak|                NULL|
|c8d69870-f206-11e...| Escape from A.R.C.|Scientists flocke...|
|73ed1fd0-45e5-4bb...|             Osiris|                NULL|
|96c3e3dd-7703-408...|          Blue Team|                NULL|
|1c4f8e19-b046-4f7...|            Glassed|                NULL|
|825065cf-df57-42e...|        Unconfirmed|                NULL|
|9a188f67-1664-4d7...|           Alliance|                NULL|
|2702ea83-2c3e-4fd...|   Before the Storm|                NULL|
|82f8471c-a2ef-408...|            Genesis|                NULL|
|fcd7caa4-37c9-436...|       The Breaking|                NULL|
|7dc80b62-dd39-41d...|          Guardian

In [12]:
# check which df is bigger
matches_df.count(), maps_df.count()

(24025, 40)

In [13]:
matches_with_map = matches_df.join(broadcast(maps_df), matches_df["mapid"] == maps_df["mapid"], "inner")

In [14]:
matches_with_map.columns

['match_id',
 'mapid',
 'is_team_game',
 'playlist_id',
 'game_variant_id',
 'is_match_over',
 'completion_date',
 'match_duration',
 'game_mode',
 'map_variant_id',
 'mapid',
 'name',
 'description']

In [15]:
medals_df.count(), medals_matches_players_df.count()

(183, 755229)

In [16]:
medals_matches_with_medals = medals_matches_players_df.join(
                                    broadcast(medals_df), medals_matches_players_df["medal_id"] == medals_df["medal_id"],
                                    "inner")

In [17]:
medals_matches_with_medals.show()

+--------------------+---------------+----------+-----+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+-----------------+--------------------+-------------------+----------+
|            match_id|player_gamertag|  medal_id|count|  medal_id|          sprite_uri|sprite_left|sprite_top|sprite_sheet_width|sprite_sheet_height|sprite_width|sprite_height|   classification|         description|               name|difficulty|
+--------------------+---------------+----------+-----+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+-----------------+--------------------+-------------------+----------+
|009fdac5-e15c-47c...|       EcZachly|3261908037|    7|3261908037|https://content.h...|        375|       525|                74|                 74|        1125|          899|WeaponProficiency|Kill an opponent ...|           Headshot|        60|
|009fdac5-e1

In [18]:
#check the numbers
medals_matches_with_medals.count(), matches_with_map.count()

(755229, 24025)

In [19]:
medals_with_maps_joined = medals_matches_with_medals.join(
                            broadcast(matches_with_map), medals_matches_with_medals["match_id"] == matches_with_map["match_id"], "inner")

In [20]:
medals_with_maps_joined.count()

755229

## 2. Broadcast Join

In [21]:
# Save bucketed tables
match_details_df.write.format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.bucketed_match_details")

matches_df.write.format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.bucketed_matches")

medals_matches_players_df.write.format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.bucketed_medals_matches_players")

# Load bucketed tables (if needed for future queries)
bucketed_match_details = spark.table("bootcamp.bucketed_match_details")
bucketed_matches = spark.table("bootcamp.bucketed_matches")
bucketed_medals_matches_players = spark.table("bootcamp.bucketed_medals_matches_players")

# Perform the join on bucketed tables
joined_df = bucketed_match_details.join(
    bucketed_matches, "match_id", "inner"
).join(
    bucketed_medals_matches_players, ["match_id", "player_gamertag"], "inner"
)

## 3. Which player averages the most kills per game?

In [22]:
from pyspark.sql.functions import col, avg, desc, coalesce

# Ensure player_total_kills is numeric and replace nulls with 0
cleaned_df = joined_df.withColumn("player_total_kills", coalesce(col("player_total_kills"), col("player_total_kills").cast("int")).alias("player_total_kills"))

# Aggregate to calculate the average kills per game for each player
player_avg_kills_df = cleaned_df.groupBy("bucketed_match_details.player_gamertag") \
    .agg(avg(col("player_total_kills")).alias("avg_kills_per_game")) \
    .orderBy(desc("avg_kills_per_game"))

# Find the player with the maximum average kills per game
top_player = player_avg_kills_df.first()

print(f"Player with the most average kills per game: {top_player['player_gamertag']} ({top_player['avg_kills_per_game']} kills/game)")


Player with the most average kills per game: gimpinator14 (109.0 kills/game)


## 4. Which playlist gets played the most?

In [23]:
# Aggregate to calculate the count of matches per playlist
playlist_count_df = joined_df.groupBy(["playlist_id"]) \
    .agg(countDistinct("match_id").alias("num_distinct_matches")) \
    .orderBy(desc("num_distinct_matches"))

# Find the playlist with the maximum number of matches
top_playlist = playlist_count_df.first()

print(f"Playlist with the most plays: {top_playlist['playlist_id']} ({top_playlist['num_distinct_matches']} matches)")


Playlist with the most plays: f72e0ef0-7c4a-4307-af78-8e38dac3fdba (7640 matches)


## 5. Which maps gets played the most

In [24]:
# Aggregate to calculate the count of matches per map
map_count_df = joined_df.groupBy("mapid") \
    .agg(countDistinct("match_id").alias("num_matches")) \
    .orderBy(desc("num_matches"))

# Find the map with the maximum number of matches
top_map = map_count_df.first()

print(f"Map played the most: {top_map['mapid']} ({top_map['num_matches']} matches)")

Map played the most: c7edbf0f-f206-11e4-aa52-24be05e24f7e (7032 matches)


## 6. Which map do players get the most Killing Spree medals on?

In [31]:
# Medal ID corresponding to "Killing Spree"
killing_spree_medal_id = 2430242797

# Filter for Killing Spree medals
killing_spree_df = joined_df.filter(col("medal_id") == killing_spree_medal_id)

# Remove duplicate (mapid, match_id) pairs to avoid double counting matches
distinct_matches_df = killing_spree_df.select("mapid", "match_id", "count").distinct()

# Aggregate to find the sum of "count" per map
killing_spree_by_map = distinct_matches_df.groupBy("mapid") \
    .agg(sum("bucketed_medals_matches_players.count").alias("total_killing_spree_medals"))

# Join with maps_df to get map names
killing_spree_with_names = killing_spree_by_map.join(
    maps_df, killing_spree_by_map.mapid == maps_df.mapid, "inner"
).select(maps_df.name.alias("map_name"), "total_killing_spree_medals") \
 .orderBy(desc("total_killing_spree_medals"))

# Show the map with the most Killing Spree medals
killing_spree_with_names.show(1)

24/12/13 20:16:06 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression


+--------------+--------------------------+
|      map_name|total_killing_spree_medals|
+--------------+--------------------------+
|Breakout Arena|                      5159|
+--------------+--------------------------+
only showing top 1 row



In [26]:
# Sort by playlist_id within partitions and save
sorted_by_playlist = joined_df.sortWithinPartitions("playlist_id")
# Write the sorted DataFrames to storage and compare file sizes
sorted_by_playlist.write.mode("overwrite").saveAsTable("bootcamp.sorted_by_playlist")

# Sort by mapid within partitions and save
sorted_by_map = joined_df.sortWithinPartitions("mapid")
sorted_by_map.write.mode("overwrite").saveAsTable("bootcamp.sorted_by_map")

                                                                                

In [27]:
%%sql

SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'by_playlist' 
FROM bootcamp.sorted_by_playlist.files

UNION ALL
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'by_map' 
FROM bootcamp.sorted_by_map.files

24/12/13 20:12:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


size,num_files,by_playlist
18130999,13,by_playlist
18541324,13,by_map
