In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [5]:
spark = SparkSession.builder.appName("spark_hands_on").getOrCreate()

25/07/18 17:33:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
match_details = spark.read.format("csv")\
                        .option("header","true")\
                        .option("inferschema","true")\
                        .load("file:/home/iceberg/data/match_details.csv")

In [None]:
match_details.show(5)

In [None]:
matches = spark.read.format("csv")\
                .option("header", "true")\
                .option("inferschema", "true")\
                .load("file:/home/iceberg/data/matches.csv")

In [None]:
medals_matches_players = spark.read.format("csv")\
                .option("header", "true")\
                .option("inferschema", "true")\
                .load("file:/home/iceberg/data/medals_matches_players.csv")

In [None]:
medals = spark.read.format("csv")\
                .option("header", "true")\
                .option("inferschema", "true")\
                .load("file:/home/iceberg/data/medals.csv")

In [None]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [None]:
# spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

In [None]:
maps = spark.read.format("csv")\
                .option("header", "true")\
                .option("inferschema", "true")\
                .load("file:/home/iceberg/data/maps.csv")

In [None]:
f.broadcast(medals)
f.broadcast(maps)

In [None]:
medals_matches_players.printSchema()

In [None]:
medals.join(f.broadcast(maps), "mapid")


In [None]:
spark.sql("""
DROP TABLE IF EXISTS bootcamp.match_details_bucketed
""")

match_details_bucketed_DDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
        match_id string,
        player_gamertag string,
        previous_spartan_rank integer,
        spartan_rank integer,
        previous_total_xp integer,
        total_xp integer,
        previous_csr_tier integer,
        previous_csr_designation integer,
        previous_csr integer,
        previous_csr_percent_to_next_tier integer,
        previous_csr_rank integer,
        current_csr_tier integer,
        current_csr_designation integer,
        current_csr integer,
        current_csr_percent_to_next_tier integer,
        current_csr_rank integer,
        player_rank_on_team integer,
        player_finished boolean,
        player_average_life string,
        player_total_kills integer,
        player_total_headshots integer,
        player_total_weapon_damage double,
        player_total_shots_landed integer,
        player_total_melee_kills integer,
        player_total_melee_damage double,
        player_total_assassinations integer,
        player_total_ground_pound_kills integer,
        player_total_shoulder_bash_kills integer,
        player_total_grenade_damage double,
        player_total_power_weapon_damage double,
        player_total_power_weapon_grabs integer,
        player_total_deaths integer,
        player_total_assists integer,
        player_total_grenade_kills integer,
        did_win integer,
        team_id integer
)
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
"""

In [None]:
spark.sql(match_details_bucketed_DDL)

In [None]:
match_details.write.mode("append")\
                    .bucketBy(16, "match_id")\
                    .saveAsTable("bootcamp.match_details_bucketed")

In [None]:
# %%sql
# select * from bootcamp.match_details_bucketed.files

In [None]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""")

matches_bucketed_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
        match_id string ,
        mapid string ,
        is_team_game boolean ,
        playlist_id string ,
        game_variant_id string ,
        is_match_over boolean ,
        completion_date timestamp ,
        match_duration string ,
        game_mode string ,
        map_variant_id string
)
USING iceberg
PARTITIONED BY(bucket(16, match_id));
"""

In [None]:
spark.sql(matches_bucketed_ddl)

In [None]:
matches.write.mode("append")\
                .bucketBy(16, "match_id")\
                .saveAsTable("bootcamp.matches_bucketed")

In [None]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.medal_matches_players_bucketed""")

medal_matches_players_bucketed_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.medal_matches_players_bucketed (
        match_id string ,
        player_gamertag string ,
        medal_id long ,
        count integer 
)
USING iceberg
PARTITIONED BY(bucket(16, match_id));

"""

In [None]:
spark.sql(medal_matches_players_bucketed_ddl)

In [None]:
medals_matches_players.write.mode("append")\
                        .bucketBy(16, "match_id")\
                        .saveAsTable("bootcamp.medal_matches_players_bucketed")

In [6]:

# Read the bucketed tables
bucketed_md = spark.table("bootcamp.match_details_bucketed")
bucketed_m = spark.table("bootcamp.matches_bucketed")
bucketed_mmp = spark.table("bootcamp.medal_matches_players_bucketed")


In [7]:
joined_df = bucketed_m.join(bucketed_md, "match_id")\
                        .join(bucketed_mmp, "match_id")

In [None]:
joined_df.show(5)

In [None]:
#  Which player averages the most kills per game?
joined_df.groupby("match_details_bucketed.player_gamertag")\
    .agg(f.avg("match_details_bucketed.player_total_kills").alias("avg_kill"))\
    .select("match_details_bucketed.player_gamertag","avg_kill")\
    .sort("avg_kill", ascending=False)\
    .show(1)

In [None]:
#  Which playlist gets played the most?

joined_df.groupby("matches_bucketed.playlist_id")\
            .agg(f.count("matches_bucketed.playlist_id").alias("total_plays"))\
            .sort("total_plays", ascending = False)\
            .select("matches_bucketed.playlist_id", "total_plays")\
            .show(5)

In [None]:
#  Which map gets played the most?

bucketed_m.join(maps, "mapid")\
    .groupby("demo.bootcamp.matches_bucketed.mapid")\
    .agg(f.count("demo.bootcamp.matches_bucketed.mapid").alias("total_plays"))\
    .sort("total_plays", ascending = False)\
    .select("demo.bootcamp.matches_bucketed.mapid", "total_plays")\
    .show(5)

In [None]:
# Which map do players get the most Killing Spree medals on?

bucketed_mmp.join(medals, "medal_id")\
        .join(bucketed_m, "match_id")\
        .where(medals.name == "Killing Spree")\
        .groupby("mapid")\
        .agg(f.sum("count").alias("total_killing_spree_medals"))\
        .sort("total_killing_spree_medals", ascending = False)\
        .select("mapid","total_killing_spree_medals")\
        .show(5)


In [8]:
agg_df = joined_df\
            .groupby("match_id","mapid","playlist_id","match_details_bucketed.player_gamertag","medal_id")\
            .agg(f.sum("player_total_kills").alias("total_kills"), f.sum("count").alias("total_medals"))\
            .select("match_id","mapid","playlist_id","match_details_bucketed.player_gamertag","medal_id", "total_kills", "total_medals")
            

In [9]:
# Which player averages the most kills per game?
agg_df.groupby("player_gamertag").agg(f.avg("total_kills").alias("avg_kills")).sort("avg_kills", ascending = False).show(5)

25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:34:14 WARN RowBasedKeyValueBatch: Calling spill() on

+---------------+------------------+
|player_gamertag|         avg_kills|
+---------------+------------------+
|   gimpinator14| 421.1363636363636|
|  I Johann117 I|             384.0|
|BudgetLegendary| 375.4761904761905|
|   Sexy is Back| 295.3181818181818|
|PrimePromethean|289.82608695652175|
+---------------+------------------+
only showing top 5 rows



                                                                                

In [11]:
# Which playlist gets played the most?
agg_df.groupby("playlist_id").agg(f.count("match_id").alias("total_plays")).sort("total_plays", ascending = False)\
    .select("playlist_id","total_plays").show(5)

25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/07/18 17:37:27 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------------+-----------+
|         playlist_id|total_plays|
+--------------------+-----------+
|f72e0ef0-7c4a-430...|     818439|
|c98949ae-60a8-43d...|     311148|
|780cc101-005c-4fc...|     302572|
|0bcf2be1-3168-4e4...|     302052|
|892189e9-d712-4bd...|     288989|
+--------------------+-----------+
only showing top 5 rows



                                                                                

In [None]:
# Which map gets played the most?
agg_df.groupby("mapid").agg(f.count("match_id").alias("total_plays")).sort("total_plays", ascending = False)\
    .select("mapid","total_plays").show(5)

In [None]:
# Which map do players get the most Killing Spree medals on?
agg_df.join(f.broadcast(medals), "medal_id")\
    .where(medals.name == "Killing Spree")\
    .groupby("mapid").agg(f.count("match_id").alias("total_plays")).sort("total_plays", ascending = False)\
    .select("mapid","total_plays").show(5)

In [None]:
sorted_mapid = agg_df.repartition(10).sortWithinPartitions("mapid")
sorted_playlist_id = agg_df.repartition(10).sortWithinPartitions("playlist_id")
sorted_sorted_map_playlist_id = agg_df.repartition(10).sortWithinPartitions("mapid","playlist_id")

In [None]:
sorted_mapid.write.mode("overwrite").saveAsTable("bootcamp.sorted_mapid")
sorted_playlist_id.write.mode("overwrite").saveAsTable("bootcamp.sorted_playlist_id")
sorted_sorted_map_playlist_id.write.mode("overwrite").saveAsTable("bootcamp.sorted_sorted_map_playlist_id")

In [2]:
spark.sql("""
select sum(file_size_in_bytes), 'sorted_mapid' as tablename from bootcamp.sorted_mapid.files
union all
select sum(file_size_in_bytes), 'sorted_playlist_id' as tablename from bootcamp.sorted_playlist_id.files
union all
select sum(file_size_in_bytes), 'sorted_sorted_map_playlist_id' as tablename from bootcamp.sorted_sorted_map_playlist_id.files
""").show()

                                                                                

+-----------------------+--------------------+
|sum(file_size_in_bytes)|           tablename|
+-----------------------+--------------------+
|               25127687|        sorted_mapid|
|               25185934|  sorted_playlist_id|
|               24116516|sorted_sorted_map...|
+-----------------------+--------------------+



In [None]:
# stop the spark session
spark.stop()