In [1]:
import json

import polars as pl
from polars import col

In [2]:
# Initial data cleaning (with steam_games.parquet and steam_games_extra_features.parquet)
# steam_games_agg = steam_games.join(steam_game_features, on="appid", how="left")
# steam_games_agg = steam_games_agg.with_columns(
#     pl.col("review_score").fill_null(0),
#     pl.col("review_score_desc").fill_null("No reviews"),
#     pl.col("total_positive_reviews").fill_null(0),
#     pl.col("total_negative_reviews").fill_null(0),
#     pl.col("total_reviews").fill_null(0)
# )
# steam_games_agg

In [2]:
# read categories json into a table
with open("../data/categories.json") as f:
    categories = json.load(f)
cat_df = pl.DataFrame(categories).select(["categoryid", "name"]).rename({"name": "category"})
cat_df

categoryid,category
i64,str
2,"""Single-player"""
1,"""Multi-player"""
20,"""MMO"""
50,"""High-Quality Audio"""
49,"""PvP"""
…,…
44,"""Remote Play Together"""
61,"""HDR"""
62,"""Family Sharing"""
63,"""Steam Timeline"""


In [3]:
cat_df.write_parquet("../data/preprocessed/categories.parquet")

In [6]:
steam_apps = pl.read_parquet("../data/raw/games/steam_games_full.parquet").lazy()
steam_apps.limit(5).collect()

appid,name,type,required_age,is_free,minimum_pc_requirements,recommended_pc_requirements,controller_support,detailed_description,about_the_game,short_description,supported_languages,header_image,developers,publishers,price,category_ids,genres_list,windows_support,mac_support,linux_support,release_date,coming_soon,recommendations,dlc,review_score,review_score_desc,total_positive_reviews,total_negative_reviews,total_reviews
i64,str,str,i64,bool,str,str,str,str,str,str,list[str],str,list[str],list[str],f64,list[i64],list[str],bool,bool,bool,str,bool,i64,list[i64],i64,str,i64,i64,i64
2852260,"""Pandarunium""","""game""",0,False,"""Minimum:Requires a 64-bit proc…","""Recommended:Requires a 64-bit …",,"""<strong>Pandarunium</strong> i…","""<strong>Pandarunium</strong> i…","""Team up with friends in this t…","[""English""]","""https://shared.akamai.steamsta…","[""Tenax Studios""]","[""Tenax Studios""]",3.99,"[2, 1, … 62]","[""Casual"", ""Indie""]",True,False,False,"""20 Jun, 2024""",False,,[],0,"""2 user reviews""",2,0,2
1226222,"""Killer Gin Cats and Dogs DLC""","""dlc""",0,True,"""Minimum:OS *: Windows 7/8/8.1/…",,"""full""","""This DLC package allows you to…","""This DLC package allows you to…","""This DLC package allows you to…","[""English""]","""https://shared.akamai.steamsta…","[""The Killer Gin""]",[],,"[2, 21, … 62]","[""RPG""]",True,True,True,"""12 Mar, 2020""",False,,[],0,"""No reviews""",0,0,0
291550,"""Brawlhalla""","""game""",0,True,"""Minimum:Memory: 2 GB RAMStorag…","""Recommended:Memory: 4 GB RAMNe…",,"""<img class=""bb_img"" src=""https…","""<img class=""bb_img"" src=""https…","""An epic platform fighter for u…","[""English"", ""French"", … ""Turkish""]","""https://shared.akamai.steamsta…","[""Blue Mammoth Games""]","[""Ubisoft""]",,"[2, 1, … 44]","[""Action"", ""Indie"", ""Free To Play""]",True,True,False,"""17 Oct, 2017""",False,2890.0,"[298641, 2974360, … 3392630]",8,"""Very Positive""",1306,266,1572
2116250,"""Draft Day Sports: College Foot…","""game""",0,False,"""Minimum:Requires a 64-bit proc…","""Recommended:Requires a 64-bit …",,"""Prepare to lead your favorite …","""Prepare to lead your favorite …","""Draft Day Sports: College Foot…","[""English""]","""https://shared.akamai.steamsta…","[""Wolverine Studios""]","[""Wolverine Studios""]",53.0,"[2, 62]","[""Indie"", ""Simulation"", … ""Strategy""]",True,False,False,"""31 Aug, 2022""",False,,[],5,"""Mixed""",9,6,15
946610,"""Pocket Rogues""","""game""",0,False,"""Minimum:Requires a 64-bit proc…","""Recommended:Requires a 64-bit …","""full""","""<p class=""bb_paragraph""><stron…","""<p class=""bb_paragraph""><stron…","""Pocket Rogues is a dynamic 2D-…","[""English"", ""Russian"", … ""Portuguese - Brazil""]","""https://shared.akamai.steamsta…","[""EtherGaming""]","[""EtherGaming""]",23.0,"[2, 1, … 62]","[""Action"", ""Adventure"", … ""Early Access""]",True,False,False,"""25 Oct, 2018""",False,453.0,[],8,"""Very Positive""",95,21,116


In [7]:
steam_apps.group_by(col("type")).len().collect()

type,len
str,u32
"""mod""",89
"""game""",101291
"""advertising""",138
"""dlc""",33735
"""music""",544
"""demo""",5793
"""video""",473
"""series""",50


In [8]:
# Reduce the scope to only games
steam_games = steam_apps.filter(col("type") == "game")

In [9]:
# Found duplicates
steam_games.group_by("name").len().filter(col("len") > 1).sort(by="len", descending=True).collect()

name,len
str,u32
"""Shadow of the Tomb Raider: Def…",20
"""No Way Out""",6
"""Aurora""",6
"""Escape""",5
"""Tom Clancy's Rainbow Six® Sieg…",5
…,…
"""RAGE""",2
"""Science Girls""",2
"""The Cleaner""",2
"""Hotel Simulator""",2


In [10]:
games_table = steam_games.sort(["name", "total_reviews"], descending=True).unique(subset=["name"], keep="first").collect()

In [11]:
games_with_cat = games_table.explode("category_ids").rename({"category_ids": "categoryid"}).join(cat_df, on="categoryid", how="left") \
    .group_by("appid").agg(col("category").explode().alias("categories"))
games_table = games_with_cat.join(games_table, on="appid", how="inner").drop("category_ids")

In [15]:
games_table = games_table.rename({
    'appid': 'game_id',
    'name': 'game_name',
    'required_age': 'game_required_age',
    'is_free': 'game_is_free',
    'supported_languages': 'game_supported_languages',
    'categories': 'game_categories',
    'genres_list': 'game_genres',
    'total_positive_reviews': 'game_total_positive_reviews',
    'total_negative_reviews': 'game_total_negative_reviews',
    'total_reviews': 'game_total_reviews',
    'review_score': 'game_review_score',
    'short_description': 'game_short_description'
})
games_table.head()

game_id,game_categories,game_name,type,game_required_age,game_is_free,minimum_pc_requirements,recommended_pc_requirements,controller_support,detailed_description,about_the_game,short_description,game_supported_languages,header_image,developers,publishers,price,game_genres,windows_support,mac_support,linux_support,release_date,coming_soon,recommendations,dlc,game_review_score,review_score_desc,game_total_positive_reviews,game_total_negative_reviews,game_total_reviews
i64,list[str],str,str,i64,bool,str,str,str,str,str,str,list[str],str,list[str],list[str],f64,list[str],bool,bool,bool,str,bool,i64,list[i64],i64,str,i64,i64,i64
2639280,"[""Single-player"", ""Family Sharing""]","""𣸩""","""game""",0,False,"""Minimum:Storage: 1 GB availabl…",,,"""<img class=""bb_img"" src=""https…","""<img class=""bb_img"" src=""https…","""《袖珍小说游戏》系列的首部作品，讲述失去了想象能力的作家，和…","[""Simplified Chinese"", ""Traditional Chinese""]","""https://shared.akamai.steamsta…","[""箱崎奈绪(Hakozaki Nao)""]","["" 箱崎奈绪(Hakozaki Nao)""]",5.03,"[""Adventure"", ""Casual"", ""Indie""]",True,False,False,"""29 Dec, 2023""",False,,[],0,"""No reviews""",0,0,0
576960,"[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","""🧠 OUT OF THE BOX""","""game""",17,False,"""Minimum:Requires a 64-bit proc…","""Recommended:Requires a 64-bit …",,"""<img class=""bb_img"" src=""https…","""<img class=""bb_img"" src=""https…","""Dive into a wild story of gang…","[""English"", ""Spanish - Spain"", … ""German""]","""https://shared.akamai.steamsta…","[""Nuclear Tales""]","[""🚀 Raiser Games""]",39.5,"[""Adventure"", ""Casual"", … ""Strategy""]",True,True,True,"""19 Jul, 2018""",False,202.0,[],6,"""Mostly Positive""",41,11,52
965340,"[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","""🚀 Human Rocket Person""","""game""",0,False,"""Minimum:OS *: Windows 7, Windo…",,"""full""","""<img class=""bb_img"" src=""https…","""<img class=""bb_img"" src=""https…","""Human Rocket Person is an absu…","[""English"", ""German"", … ""Simplified Chinese""]","""https://shared.akamai.steamsta…","[""2nd Studio""]","[""2nd Studio""]",5.0,"[""Action"", ""Indie"", ""Simulation""]",True,False,False,"""14 Nov, 2018""",False,,[],7,"""Positive""",12,1,13
460250,"[""Single-player"", ""Steam Achievements"", ""Steam Cloud""]","""🔴 Circles""","""game""",0,True,"""Minimum:OS *: Windows XP, Vist…",,,"""Circles is a unique, intuitive…","""Circles is a unique, intuitive…","""Circles is an abstract puzzle …","[""English"", ""French"", … ""Swedish""]","""https://shared.akamai.steamsta…","[""Jeroen Wimmers""]","[""Jeroen Wimmers""]",,"[""Casual"", ""Indie""]",True,True,False,"""17 Feb, 2017""",False,,[],7,"""Positive""",24,2,26
2919560,"[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","""👑Idle Calibur：Zero💕（选王之剑：零）""","""game""",0,False,"""Minimum:Requires a 64-bit proc…","""Recommended:Requires a 64-bit …",,"""<h1>If you encounter performan…","""<p class=""bb_paragraph""><img c…","""A game that seamlessly integra…","[""Simplified Chinese"", ""English"", ""Japanese""]","""https://shared.akamai.steamsta…","[""绝汪""]","[""绝汪"", ""NPC Entertainment""]",24.0,"[""Indie"", ""RPG"", … ""Strategy""]",True,False,False,"""31 Oct, 2024""",False,405.0,[3114710],0,"""9 user reviews""",5,4,9


In [16]:
games_table.write_parquet("../data/preprocessed/games.parquet")

In [61]:
del games_table

In [17]:
steam_reviews = pl.read_parquet("../data/raw/reviews/steam_reviews_*.parquet")
steam_reviews

rec_id,author_id,appid,playtime_forever,playtime_last_two_weeks,playtime_at_review,num_games_owned,num_reviews,last_played,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,bool,i64,i64,f64,i64,bool,bool,bool,bool
190502887,76561198798149868,2852260,5,,5,78,1,1742257396,"""english""","""I like the part with the panda""",1742257432,1742257432,true,0,0,0.5,0,true,false,false,false
168888656,76561198140769382,2852260,121,,121,,1,1719510337,"""english""","""Solo play works, but friendly …",1720019968,1720019968,true,3,0,0.565217,0,true,false,false,false
196678928,76561199086310241,291550,2202,,2202,1,2,1607922050,"""english""","""Worst teammate matchmaking I'v…",1749353719,1749353719,true,0,0,0.5,0,true,false,false,false
192763979,76561198366424828,291550,75846,4123,62093,,1,1749375912,"""english""","""good way to spend time if you …",1744671161,1744671161,true,0,0,0.5,0,true,false,false,false
189643572,76561199731966768,291550,699,,232,,1,1745115147,"""english""","""This is an overall good game. …",1741398352,1741398352,true,0,0,0.5,0,true,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6418339,76561198065871127,222880,1554,,418,227,8,1535835751,"""english""","""good""",1367079935,1367079935,true,0,0,0.5,0,true,false,false,false
6544900,76561198068660723,222880,569,,58,,7,1555809438,"""english""","""If you like counter strike you…",1367040549,1367040549,true,0,0,0.5,0,true,false,false,false
2732737,76561198011795478,222880,5574,,778,,45,1652131712,"""english""","""It is a good game. i dont know…",1367024720,1367024720,true,1,0,0.5029,0,true,false,false,false
3791442,76561198027412704,222880,4428,,765,,9,1507844405,"""english""","""great game and its still in al…",1366978517,1366978517,true,0,0,0.5,2,true,false,false,false


In [18]:
deduplicated = steam_reviews.unique(subset=["author_id", "appid"], maintain_order=False)

In [19]:
deduplicated

rec_id,author_id,appid,playtime_forever,playtime_last_two_weeks,playtime_at_review,num_games_owned,num_reviews,last_played,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,bool,i64,i64,f64,i64,bool,bool,bool,bool
175171803,76561198952310075,334230,3979,,3979,206,16,1566678507,"""english""","""Years down the line and the de…",1726577490,1726577632,false,0,0,0.489766,0,true,false,false,false
139803844,76561199384275433,252490,15925,,4160,,1,1719537983,"""english""","""if you were in the bo2 lobbies…",1686347106,1686347106,true,0,0,0.5,0,true,false,false,false
137963264,76561199248692522,1811260,25884,,13356,,1,1742485782,"""english""","""shit as fuck""",1683486383,1683486383,false,0,0,0.5,0,true,false,false,false
67758342,76561198443803455,334230,1540,,1148,135,5,1628188157,"""english""","""fun""",1587478249,1587478249,true,0,0,0.5,0,true,false,false,false
18068501,76561197988603129,238370,918,,838,,5,1448141119,"""english""","""Well... Ill start with a simp…",1442483518,1442483554,true,51,4,0.724219,0,true,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
67280303,76561199016463599,261550,22329,,1265,,2,1707362466,"""english""","""Good Game""",1586852857,1586852857,true,0,0,0.5,0,true,false,true,false
182132835,76561199666725529,264710,6548,,3487,29,3,1740233804,"""english""","""good game""",1733569706,1733569706,true,0,0,0.5,0,true,false,false,false
56467931,76561198066936563,323190,3602,,1016,,36,1713581013,"""english""","""This game is the best if you l…",1572854691,1572854691,true,0,0,0.5,0,true,false,false,false
77744514,76561199099625239,1097150,403,,180,,1,1603362265,"""english""","""VERY GOOD""",1603002117,1603002117,true,0,0,0.5,0,true,false,false,false


In [20]:
user_counts = deduplicated.group_by("author_id").agg(pl.len().alias("num_reviews")).filter(col("num_reviews") >= 10).select("author_id")
game_counts = deduplicated.group_by("appid").agg(pl.len().alias("num_reviews")).filter(col("num_reviews") >= 10).select("appid")

In [21]:
reviews_table = deduplicated.join(user_counts, on="author_id", how="inner").join(game_counts, on="appid", how="inner")

In [29]:
reviews_table = reviews_table.rename({'author_id': 'user_id', 'appid': 'game_id'}).drop(["num_games_owned", "num_reviews"])

In [22]:
reviews_table.write_parquet("../data/preprocessed/reviews.parquet")

In [23]:
# users table. About half of num_games_owned is null, num_reviews has no nulls
users_table = reviews_table.select(["author_id", "num_games_owned", "num_reviews"]).rename({'author_id': 'user_id', 'num_games_owned': 'user_num_games_owned', 'num_reviews': 'user_num_reviews'}).sort(["user_id", "user_num_games_owned", "user_num_reviews"], descending=True).unique(subset=["user_id"], keep="first")

In [24]:
users_table.write_parquet("../data/preprocessed/users.parquet")

In [28]:
reviews_table

rec_id,author_id,appid,playtime_forever,playtime_last_two_weeks,playtime_at_review,num_games_owned,num_reviews,last_played,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,bool,i64,i64,f64,i64,bool,bool,bool,bool
175171803,76561198952310075,334230,3979,,3979,206,16,1566678507,"""english""","""Years down the line and the de…",1726577490,1726577632,false,0,0,0.489766,0,true,false,false,false
125238571,76561199125339110,1283410,1139,,637,689,258,1739074207,"""english""","""I could tell you about how muc…",1668069390,1668069390,true,0,0,0.472813,0,true,false,false,false
188899567,76561198134316405,404730,2521,,2143,,17,1740790177,"""english""","""Old School CRPG. Before Bethes…",1740701210,1740701210,true,0,0,0.5,0,true,false,false,false
59259475,76561198074815715,976730,5676,,405,,36,1709460892,"""english""","""ITS HALO""",1575519399,1575519399,true,0,0,0.5,0,true,false,false,false
179612982,76561198147768440,1643320,361,,361,,26,1732229544,"""english""","""Hop to it, modders.""",1732329095,1732329095,true,1,0,0.47619,0,true,false,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
15761629,76561198107779127,258220,295,,295,509,439,1431080211,"""english""","""good game""",1431201208,1431201208,true,0,0,0.435369,0,true,false,false,false
29411147,76561197965280169,383080,12,,12,1044,27,1484494110,"""english""","""Click girls, hear some weird m…",1485047336,1485047336,false,0,1,0.5,0,true,false,false,false
60640463,76561198170227470,70,1675,,589,162,89,1735416267,"""english""","""Just Yes.""",1577413918,1577413918,true,0,0,0.5,0,true,false,false,false
154521253,76561199098351797,1144200,1101,,175,88,34,1721204947,"""english""","""i am extremely inadequate inca…",1703663706,1703663706,true,3,0,0.562044,0,true,false,false,false


In [3]:
training_ds = pl.read_parquet("../data/preprocessed/training_dataset.parquet")
training_ds

rec_id,user_id,game_id,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played,review,timestamp_created,voted_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck,user_id_1,user_num_games_owned,user_num_reviews,game_id_1,game_name,game_required_age,game_is_free,game_short_description,game_supported_languages,game_categories,game_genres,game_total_positive_reviews,game_total_negative_reviews,game_total_reviews,game_review_score
i64,i64,i64,i64,i64,i64,i64,str,i64,bool,i64,f64,i64,bool,bool,bool,bool,i64,i64,i64,i64,str,i64,bool,str,list[str],list[str],list[str],i64,i64,i64,i64
59241263,76561198398321971,271590,23370,,1422,1685837176,"""Old but gold - Now that I'm up…",1575493573,false,0,0.502075,0,true,false,false,false,76561198398321971,95,53,271590,"""Grand Theft Auto V Legacy""",17,false,"""Grand Theft Auto V for PC offe…","[""English"", ""French"", … ""Spanish - Latin America""]","[""Single-player"", ""Multi-player"", … ""Remote Play on TV""]","[""Action"", ""Adventure""]",472621,91976,564597,8
114693825,76561198208763852,1794680,1202,,288,1693173236,"""this game is just insanely fun…",1651283112,true,0,0.5,0,true,false,true,false,76561198208763852,,27,1794680,"""Vampire Survivors""",0,false,"""Mow down thousands of night cr…","[""English"", ""French"", … ""Ukrainian""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Action"", ""Casual"", … ""RPG""]",119189,1696,120885,9
34851185,76561198314284388,359550,275703,417,43546,1748767572,"""Worth every penny, with long t…",1505009408,true,2,0.5,0,true,false,false,false,76561198314284388,327,37,359550,"""Tom Clancy's Rainbow Six® Sieg…",17,false,"""Tom Clancy's Rainbow Six® Sieg…","[""English"", ""French"", … ""Thai""]","[""Single-player"", ""Multi-player"", … ""Remote Play on Tablet""]","[""Action""]",508146,93393,601539,8
32971936,76561198046212755,379720,3720,,2460,1742375683,"""A mater class in first person …",1498709061,true,0,0.5,0,true,false,false,false,76561198046212755,,35,379720,"""DOOM""",17,false,"""Now includes all three premium…","[""English"", ""French"", … ""Traditional Chinese""]","[""Single-player"", ""Multi-player"", … ""Family Sharing""]","[""Action""]",75964,3449,79413,9
90719330,76561198044112962,1404850,1056,,772,1675061776,"""Activate Mao""",1619081132,true,3,0.5,0,true,false,true,false,76561198044112962,,21,1404850,"""Luck be a Landlord""",0,false,"""Luck be a Landlord is a roguel…","[""English"", ""French"", … ""Arabic""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Indie"", ""Simulation"", ""Strategy""]",4736,254,4990,8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
137125092,76561198398064619,2050650,1859,,1756,1682190714,"""shoot the lake 6 times and you…",1682185640,true,0,0.5,0,true,false,false,false,76561198398064619,,19,2050650,"""Resident Evil 4""",0,false,"""Survival is just the beginning…","[""English"", ""French"", … ""Spanish - Latin America""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Action"", ""Adventure""]",46797,1426,48223,9
12797231,76561197991988567,238210,116,,116,1596062483,"""Do you like Bioshock? well thi…",1415158478,true,0,0.521745,0,true,false,false,false,76561197991988567,141,34,238210,"""System Shock® 2 (Classic)""",17,false,"""&quot;Remember, it is my will …","[""English"", ""German""]","[""Single-player"", ""Multi-player"", … ""Family Sharing""]","[""Action"", ""RPG""]",4518,232,4750,9
129666288,76561198133077786,359550,545,,264,1673592128,"""Tom Clancy is crying in his gr…",1672380275,false,0,0.511628,0,true,false,false,false,76561198133077786,326,87,359550,"""Tom Clancy's Rainbow Six® Sieg…",17,false,"""Tom Clancy's Rainbow Six® Sieg…","[""English"", ""French"", … ""Thai""]","[""Single-player"", ""Multi-player"", … ""Remote Play on Tablet""]","[""Action""]",508146,93393,601539,8
186476457,76561198811578149,489830,12012,,11126,1742848409,"""GoAT""",1737915314,true,0,0.52381,0,true,false,false,false,76561198811578149,,13,489830,"""The Elder Scrolls V: Skyrim Sp…",17,false,"""Winner of more than 200 Game o…","[""English"", ""French"", … ""Japanese""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""RPG""]",109104,6158,115262,8
