# Experimenting rec approaches

In [2]:
import duckdb
import torch
import polars as pl
import torchrec

In [3]:
TRAIN_CUT_TIMESTAMP = '2017-01-01'  # Fetch reviews up to this day FOR TRAINING
VAL_CUT_TIMESTAMP = '2018-01-01'  # Fetch reviews up to this day FOR VALIDATION

In [4]:
duckdb_conn = duckdb.connect('../data/steam.duckdb', read_only=True)

### Train dataset

In [5]:
train_games_df = duckdb_conn.sql(f"SELECT * FROM game_features WHERE DATE(game_review_day) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

In [6]:
train_reviews_df = duckdb_conn.sql(f"SELECT * FROM fact_reviews WHERE DATE(timestamp_created) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
len(train_reviews_df)

3548588

In [8]:
dim_games_df = duckdb_conn.sql(f"SELECT * FROM dim_games WHERE DATE(game_prerelease_date) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

In [9]:
dim_users_df = duckdb_conn.sql(f"SELECT * FROM dim_users WHERE DATE(first_review_timestamp) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

In [10]:
# Get users and games that are new during validation period
new_users = duckdb_conn.sql(f"SELECT * FROM dim_users WHERE DATE(first_review_timestamp) > '{TRAIN_CUT_TIMESTAMP}' "
                            f"AND DATE(first_review_timestamp) <= '{VAL_CUT_TIMESTAMP}'").pl()
new_games = duckdb_conn.sql(f"SELECT * FROM dim_games WHERE DATE(game_prerelease_date) > '{TRAIN_CUT_TIMESTAMP}' "
                            f"AND DATE(game_prerelease_date) <= '{VAL_CUT_TIMESTAMP}'").pl()

In [11]:
new_games.sort("game_index")

game_index,game_id,game_name,game_is_free,game_developers,game_publishers,game_categories,game_genres,game_steam_release_date,game_release_date,game_prerelease_date,game_short_description,game_review_score,game_review_score_description
i64,i64,str,bool,list[str],list[str],list[str],list[str],datetime[μs],"datetime[μs, America/Lima]","datetime[μs, America/Lima]",str,i64,str
9098,562240,"""The Spirit Underneath""",false,"[""Acuze Interactives""]","[""Displacement Studios""]","[""Single-player"", ""Steam Trading Cards"", … ""Family Sharing""]","[""Violent"", ""Adventure"", ""Indie""]",2017-01-02 00:00:00,2017-01-02 00:00:00 -05,2017-01-02 00:00:00 -05,"""The Spirit Underneath is a fir…",5,"""Mixed"""
9099,540900,"""Nuclear Contingency""",false,"[""Amaterasu Software""]","[""Amaterasu Software""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Action"", ""Adventure"", ""Indie""]",2017-01-02 00:00:00,2017-01-02 00:00:00 -05,2017-01-02 00:00:00 -05,"""Nuclear Contingency is a top d…",6,"""Mostly Positive"""
9100,560930,"""Marimba VR""",false,"[""Ruby Games""]","[""Ruby Games""]","[""Single-player"", ""Tracked Motion Controller"", … ""Family Sharing""]","[""Casual"", ""Indie"", ""Simulation""]",2017-01-02 00:00:00,2017-01-02 00:00:00 -05,2017-01-02 00:00:00 -05,"""Marimba VR is a great way for …",0,"""1 user reviews"""
9101,520080,"""Your Star""",false,"[""natahem""]","[""natahem""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Adventure"", ""Indie"", ""RPG""]",2017-01-02 00:00:00,2017-01-02 00:00:00 -05,2017-01-02 00:00:00 -05,"""In the game, you control a you…",0,"""4 user reviews"""
9102,571550,"""Hawks Tactical""",false,"[""LYS""]","[""Indie""]","[""Single-player"", ""Family Sharing""]","[""Adventure"", ""Indie"", … ""Strategy""]",2017-01-02 00:00:00,2017-01-02 00:00:00 -05,2017-01-02 00:00:00 -05,"""Hawk and his friends have swor…",0,"""4 user reviews"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
14916,766700,"""PearsAndGrayWitch""",false,"[""YAOYICHEN""]","[""YAOYICHEN""]","[""Single-player"", ""Family Sharing""]","[""Action"", ""Adventure"", … ""RPG""]",2018-01-01 00:00:00,2018-01-01 00:00:00 -05,2018-01-01 00:00:00 -05,"""A boy who likes &quot;Apples&q…",5,"""Mixed"""
14917,723020,"""3571 The Game""",false,"[""FRAPPA STUDIO""]","[""FRAPPA STUDIO""]","[""Single-player"", ""Steam Leaderboards"", ""Family Sharing""]","[""Action"", ""Adventure"", … ""Early Access""]",2018-01-01 00:00:00,2018-01-01 00:00:00 -05,2018-01-01 00:00:00 -05,"""3571 The Game is a 3D procedur…",0,"""5 user reviews"""
14918,524350,"""TileDynasty FPS Arena""",false,"[""Purpl3Grape""]","[""Purpl3Grape""]","[""Single-player"", ""Multi-player"", … ""Family Sharing""]","[""Action"", ""Indie"", ""Early Access""]",2018-01-01 00:00:00,2018-01-01 00:00:00 -05,2018-01-01 00:00:00 -05,"""Looking to bring the old arcad…",0,"""2 user reviews"""
14919,770520,"""Boomer Rampage""",false,"[""Lunarhellgames""]","[""Lunarhellgames""]","[""Single-player"", ""Steam Achievements"", … ""Family Sharing""]","[""Action"", ""Casual"", ""Indie""]",2018-01-01 00:00:00,2018-01-01 00:00:00 -05,2018-01-01 00:00:00 -05,"""Boomer Rampage is a puzzle gam…",0,"""No reviews"""


In [12]:
# This may include completely new users and games on the validation data
val_reviews_df = duckdb_conn.sql(f"SELECT * FROM fact_reviews WHERE DATE(timestamp_created) > '{TRAIN_CUT_TIMESTAMP}' "
                                 f"AND DATE(timestamp_created) <= '{VAL_CUT_TIMESTAMP}'").pl()

In [13]:
len(val_reviews_df)

1407008

In [14]:
# Make sure all games and users on validation data are also present in training
val_reviews_df = val_reviews_df.filter((~pl.col("game_index").is_in(new_games["game_index"].implode())) & (~pl.col("user_index").is_in(new_users["user_index"].implode())))

In [15]:
len(val_reviews_df)

475615

In [16]:
entity_count = duckdb_conn.sql(f"SELECT * FROM mart_entities_cumulative WHERE review_day = '{TRAIN_CUT_TIMESTAMP}'").pl()

In [17]:
duckdb_conn.close()

In [18]:
num_users = int(entity_count["cumulative_user_count"].first())
num_games = int(entity_count["cumulative_game_count"].first())

In [19]:
num_games, num_users

(9098, 1840563)

### Example using torchrec EmbeddingCollections

In [19]:
ec = torchrec.EmbeddingCollection(
    device=torch.device("cpu"),
    tables=[
        torchrec.EmbeddingConfig(name="user_index", embedding_dim=128, num_embeddings=num_users + 1),
        torchrec.EmbeddingConfig(name="game_index", embedding_dim=128, num_embeddings=num_games + 1),
    ]
)

In [20]:
sample = train_reviews_df.sample(10)
sample

review_id,user_index,game_index,review,written_during_early_access,voted_up,weighted_vote_score,votes_up,comment_count,timestamp_created,scrape_date
i64,i64,i64,str,bool,bool,f64,i64,i64,"datetime[μs, America/Lima]",date
2145352,26162,1315,"""Bizarrely addictive. Good fun.…",False,True,0.42988,1,0,2013-01-30 07:43:16 -05,2025-06-12
19859019,214812,125,"""Classic game.""",False,True,0.5,0,0,2015-12-22 21:11:23 -05,2025-06-12
21742139,1344824,5762,"""I've had more fun than expecte…",False,True,0.479452,0,0,2016-03-10 01:10:24 -05,2025-06-12
8179044,215832,1087,"""This is the best game of all t…",False,True,0.5,0,0,2013-12-21 16:26:46 -05,2025-06-12
28772518,1828608,1602,"""I was one of those people who …",False,True,0.5029,2,0,2016-12-28 19:46:25 -05,2025-06-12
19831456,843359,1494,"""One of those true gems, must b…",False,True,0.5,0,0,2015-12-21 16:21:36 -05,2025-06-12
24568957,1537652,6914,"""If Dark Souls and Geometry Das…",False,True,0.482958,31,1,2016-07-28 00:19:51 -05,2025-06-12
13598925,739702,2688,""";-( <3""",False,True,0.5,0,0,2014-12-25 17:46:32 -05,2025-06-12
24908430,493167,1475,"""I can now sort of fly a helico…",False,True,0.5,0,0,2016-08-14 05:38:19 -05,2025-06-12
15880616,405523,898,"""Just as good as the name says,…",False,True,0.5,0,0,2015-05-17 20:54:19 -05,2025-06-12


In [21]:
features = torchrec.KeyedJaggedTensor.from_jt_dict(
    {
        "user_index": torchrec.JaggedTensor(values=torch.tensor(sample["user_index"]), lengths=torch.ones(10, dtype=torch.int32)),
        "game_index": torchrec.JaggedTensor(values=torch.tensor(sample["game_index"]), lengths=torch.ones(10, dtype=torch.int32)),
    }
)

In [22]:
out = ec(features)
print(out["user_index"].values().shape)

torch.Size([10, 128])


### Pytorch Dataset

In [31]:
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, df: pl.DataFrame):
        self.reviews = df
    def __len__(self):
        return len(self.reviews)
    def __getitem__(self, idx):
        review = self.reviews.row(idx, named=True)
        return {
            "user_id": torch.tensor(review["user_index"]),
            "game_id": torch.tensor(review["game_index"]),
            "voted_up": torch.tensor(review["voted_up"], dtype=torch.float)
        }


In [24]:
train_dataset = ReviewsDataset(train_reviews_df.filter(pl.col("voted_up") == 1))
val_dataset = ReviewsDataset(val_reviews_df.filter(pl.col("voted_up") == 1))

In [25]:
len(train_dataset), len(val_dataset)

(3042296, 375249)

### Simple PyTorch model

In [32]:
class SimpleRetrievalModel(torch.nn.Module):
    def __init__(self, embedding_dim: int, num_users: int = num_users, num_games: int = num_games):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.user_emb = torch.nn.Embedding(num_users + 1, embedding_dim)
        self.game_emb = torch.nn.Embedding(num_games + 1, embedding_dim)
    def forward(self, batch: dict[str, torch.Tensor]):
        users = batch["user_id"]
        games = batch["game_id"]
        users_embedding = self.user_emb(users)
        games_embedding = self.game_emb(games)
        logits = torch.matmul(users_embedding, games_embedding.t())
        labels = torch.arange(len(users), device=users.device)
        return torch.nn.functional.cross_entropy(logits, labels)

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
model = SimpleRetrievalModel(embedding_dim=128).to(device)

In [29]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True, pin_memory=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False, pin_memory=True)

### Training Loop

In [35]:
def train_one_epoch(model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer,
                    train_loader: torch.utils.data.DataLoader,
                    val_loader: torch.utils.data.DataLoader,
                    device: torch.device):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(batch)
        print(f"Processing batch {batch_idx + 1}/{len(train_loader)} - Loss: {loss.item():.4f}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(batch)
            print(f"Processing batch {batch_idx + 1}/{len(train_loader)} - Val_Loss: {loss.item():.4f}")
            val_loss += loss.item()
    return total_loss / len(train_loader), val_loss / len(val_loader)

In [36]:
def train(model: torch.nn.Module,
          train_loader: torch.utils.data.DataLoader,
          val_loader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          epochs: int,
          device: torch.device,
          scheduler=None):
    train_loss = float('inf')
    for epoch in range(epochs):
        train_loss, val_loss = train_one_epoch(model, optimizer, train_loader, val_loader, device)
        if scheduler is not None:
            scheduler.step()

        print(f"Epoch {epoch + 1}/{epochs} - Train loss: {train_loss:.4f} - Val loss: {val_loss:.4f}")
    return train_loss

In [33]:
train_loss = train(model, train_dataloader, val_dataloader, torch.optim.Adam(model.parameters(), lr=0.00001), epochs=5, device=torch.device("cuda"))

Processing batch 1/11884 - Loss: 30.7446
Processing batch 2/11884 - Loss: 30.1278
Processing batch 3/11884 - Loss: 30.7613
Processing batch 4/11884 - Loss: 30.9727
Processing batch 5/11884 - Loss: 28.8415
Processing batch 6/11884 - Loss: 29.3898
Processing batch 7/11884 - Loss: 30.7560
Processing batch 8/11884 - Loss: 32.0356
Processing batch 9/11884 - Loss: 29.4798
Processing batch 10/11884 - Loss: 31.8106
Processing batch 11/11884 - Loss: 32.1769
Processing batch 12/11884 - Loss: 31.0654
Processing batch 13/11884 - Loss: 29.3575
Processing batch 14/11884 - Loss: 31.7076
Processing batch 15/11884 - Loss: 30.1346
Processing batch 16/11884 - Loss: 29.9977
Processing batch 17/11884 - Loss: 29.4831
Processing batch 18/11884 - Loss: 30.6306
Processing batch 19/11884 - Loss: 30.8012
Processing batch 20/11884 - Loss: 31.2032
Processing batch 21/11884 - Loss: 29.8524
Processing batch 22/11884 - Loss: 30.6861
Processing batch 23/11884 - Loss: 31.5160
Processing batch 24/11884 - Loss: 31.1373
P

KeyboardInterrupt: 

In [21]:
games_review_count = train_reviews_df.group_by("game_index").len().filter(pl.col("len") > 100).sort("len", descending=True)
games_review_count

game_index,len
i64,u32
79,99574
1952,66802
1087,66699
1593,59376
1933,48873
…,…
2660,101
6099,101
1654,101
1881,101


In [24]:
train_rev_filtered = train_reviews_df.join(games_review_count, on="game_index", how="inner")

In [25]:
train_rev_filtered

review_id,user_index,game_index,review,written_during_early_access,voted_up,weighted_vote_score,votes_up,comment_count,timestamp_created,scrape_date,len
i64,i64,i64,str,bool,bool,f64,i64,i64,"datetime[μs, America/Lima]",date,u32
8387507,245760,1593,"""fun game""",false,true,0.5,0,2,2013-12-29 00:19:25 -05,2025-06-12,59376
12995168,245762,1567,"""Created by Daedalic Entertainm…",false,true,0.612418,11,0,2014-11-18 12:03:31 -05,2025-06-12,374
8387526,245763,1413,"""You will never know the joy of…",false,true,0.521739,1,0,2013-12-29 00:20:24 -05,2025-06-12,13682
8387531,245764,79,"""This is an awesome game to get…",false,true,0.522059,1,0,2013-12-29 00:20:41 -05,2025-06-12,99574
8387544,245767,402,"""Just fantastic of a game one o…",false,true,0.5,0,0,2013-12-29 00:21:07 -05,2025-06-12,3579
…,…,…,…,…,…,…,…,…,…,…,…
18782168,207738,882,"""Great Game, campaign needed be…",false,true,0.472441,0,0,2015-10-30 21:32:00 -05,2025-06-12,1952
16945878,207738,1087,"""Had a lot of fun. :)""",false,true,0.5,0,0,2015-07-08 12:52:31 -05,2025-06-12,66699
20592790,207738,2918,"""Yeah Its pretty good.""",false,true,0.472441,0,0,2016-01-17 15:46:49 -05,2025-06-12,347
19924464,207738,311,"""Lots of fun, Tribal age was th…",false,true,0.472441,0,0,2015-12-24 20:59:51 -05,2025-06-12,4159


In [23]:
users_reviews_count = train_reviews_df.group_by("user_index").len().filter(pl.col("len") > 10).sort("len", descending=True)
users_reviews_count

user_index,len
i64,u32
14466,401
638491,338
297975,307
1414,286
29917,282
…,…
379586,11
61082,11
882670,11
439495,11


In [27]:
train_rev_filtered2 = train_rev_filtered.join(users_reviews_count, on="user_index", how="inner")
train_rev_filtered2

review_id,user_index,game_index,review,written_during_early_access,voted_up,weighted_vote_score,votes_up,comment_count,timestamp_created,scrape_date,len,len_right
i64,i64,i64,str,bool,bool,f64,i64,i64,"datetime[μs, America/Lima]",date,u32,u32
9060036,245795,649,"""snorefest twink trainer""",false,false,0.426997,3,0,2014-02-11 21:25:38 -05,2025-06-12,10994,31
8614221,245918,176,"""was my first purchase on steam…",false,true,0.5,0,0,2014-01-07 21:43:10 -05,2025-06-12,4357,11
11951089,245996,130,"""amazing game, 10/10""",false,true,0.483912,0,0,2014-09-01 15:31:44 -05,2025-06-12,121,13
23877431,246073,5952,"""I am sure it is fine for some …",false,false,0.479514,4,1,2016-06-27 18:43:35 -05,2025-06-12,379,64
14714223,246094,1522,"""Inspired me to become a surgeo…",false,true,0.54205,10,0,2015-03-01 14:04:13 -05,2025-06-12,5429,20
…,…,…,…,…,…,…,…,…,…,…,…,…
18782168,207738,882,"""Great Game, campaign needed be…",false,true,0.472441,0,0,2015-10-30 21:32:00 -05,2025-06-12,1952,51
16945878,207738,1087,"""Had a lot of fun. :)""",false,true,0.5,0,0,2015-07-08 12:52:31 -05,2025-06-12,66699,51
20592790,207738,2918,"""Yeah Its pretty good.""",false,true,0.472441,0,0,2016-01-17 15:46:49 -05,2025-06-12,347,51
19924464,207738,311,"""Lots of fun, Tribal age was th…",false,true,0.472441,0,0,2015-12-24 20:59:51 -05,2025-06-12,4159,51


In [28]:
val_rev_filtered = val_reviews_df.join(games_review_count, on="game_index", how="inner").join(users_reviews_count, on="user_index", how="inner")

In [29]:
val_rev_filtered

review_id,user_index,game_index,review,written_during_early_access,voted_up,weighted_vote_score,votes_up,comment_count,timestamp_created,scrape_date,len,len_right
i64,i64,i64,str,bool,bool,f64,i64,i64,"datetime[μs, America/Lima]",date,u32,u32
30499009,247547,1253,"""I don't like hack'n'slash game…",false,true,0.502488,1,0,2017-03-13 18:31:48 -05,2025-06-12,412,16
34397007,247574,5133,"""Love the games by this develop…",false,true,0.5,0,0,2017-08-21 10:12:20 -05,2025-06-12,212,16
29404151,245984,2163,"""10/10 Great Co-Op experience.""",false,true,0.445056,0,0,2017-01-21 13:23:17 -05,2025-06-12,1801,11
29125057,246094,7912,"""Animated Wallpapers are cool, …",true,true,0.533582,2,2,2017-01-08 14:44:54 -05,2025-06-12,365,20
35625841,246895,1087,"""This review was formerly a thu…",false,false,0.498881,9,2,2017-10-10 15:50:04 -05,2025-06-12,66699,16
…,…,…,…,…,…,…,…,…,…,…,…,…
32937157,170404,235,"""5/10 still waiting on Katamari…",false,true,0.289843,2,0,2017-06-28 09:10:50 -05,2025-06-12,325,70
38547436,207738,710,"""Pretty fun game, however the G…",false,false,0.605005,12,0,2017-12-22 14:49:42 -05,2025-06-12,167,51
38750584,207738,1758,"""An amazing experience from sta…",false,true,0.5,2,0,2017-12-27 14:33:37 -05,2025-06-12,16319,51
38062243,207738,1512,"""Its just so dull.""",false,false,0.475921,4,0,2017-12-04 13:06:53 -05,2025-06-12,833,51


In [37]:
filtered_train = ReviewsDataset(train_rev_filtered2)
filtered_val = ReviewsDataset(val_rev_filtered)

In [38]:
filtered_train_loader = torch.utils.data.DataLoader(filtered_train, batch_size=256, shuffle=True, pin_memory=True)
filtered_val_loader = torch.utils.data.DataLoader(filtered_val, batch_size=256, shuffle=True, pin_memory=True)

In [46]:
model = SimpleRetrievalModel(embedding_dim=64).to(device)

In [47]:
train_loss = train(model, filtered_train_loader, filtered_val_loader, torch.optim.Adam(model.parameters(), lr=0.001), epochs=20, device=torch.device("cuda"))

Processing batch 1/1863 - Loss: 22.8077
Processing batch 2/1863 - Loss: 22.0271
Processing batch 3/1863 - Loss: 22.6050
Processing batch 4/1863 - Loss: 22.4268
Processing batch 5/1863 - Loss: 23.0112
Processing batch 6/1863 - Loss: 21.9440
Processing batch 7/1863 - Loss: 21.9081
Processing batch 8/1863 - Loss: 22.0075
Processing batch 9/1863 - Loss: 22.2608
Processing batch 10/1863 - Loss: 22.2519
Processing batch 11/1863 - Loss: 22.5610
Processing batch 12/1863 - Loss: 21.6746
Processing batch 13/1863 - Loss: 22.6561
Processing batch 14/1863 - Loss: 22.9144
Processing batch 15/1863 - Loss: 22.2682
Processing batch 16/1863 - Loss: 21.3811
Processing batch 17/1863 - Loss: 23.1097
Processing batch 18/1863 - Loss: 22.1073
Processing batch 19/1863 - Loss: 21.6227
Processing batch 20/1863 - Loss: 21.8905
Processing batch 21/1863 - Loss: 22.7935
Processing batch 22/1863 - Loss: 22.3241
Processing batch 23/1863 - Loss: 22.0870
Processing batch 24/1863 - Loss: 22.6202
Processing batch 25/1863 

KeyboardInterrupt: 