# Experimenting with recommendation approaches

In [1]:
from datetime import datetime, timedelta

import duckdb
import polars as pl
import torch
import torchrec

In [None]:
TRAIN_CUT_TIMESTAMP = '2017-01-01'  # Fetch reviews up to this day FOR TRAINING
VAL_CUT_TIMESTAMP = '2018-01-01'  # Fetch reviews up to this day FOR VALIDATION

In [None]:
duckdb_conn = duckdb.connect('../data/steam.duckdb', read_only=True)

### Train dataset

In [None]:
train_games_df = duckdb_conn.sql(f"SELECT * FROM game_features WHERE DATE(game_review_day) <= '{TRAIN_CUT_TIMESTAMP}'")

In [None]:
train_reviews_df = duckdb_conn.sql(
    f"SELECT * FROM fact_reviews WHERE DATE(timestamp_created) <= '{TRAIN_CUT_TIMESTAMP}'")

In [None]:
train_reviews_df.show(max_width=10)

In [None]:
type(train_reviews_df)

In [None]:
dim_games_df = duckdb_conn.sql(
    f"SELECT * FROM dim_games WHERE DATE(game_prerelease_date) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

In [None]:
dim_users_df = duckdb_conn.sql(
    f"SELECT * FROM dim_users WHERE DATE(first_review_timestamp) <= '{TRAIN_CUT_TIMESTAMP}'").pl()

In [None]:
# Get users and games that are new during validation period
new_users = duckdb_conn.sql(f"SELECT * FROM dim_users WHERE DATE(first_review_timestamp) > '{TRAIN_CUT_TIMESTAMP}' "
                            f"AND DATE(first_review_timestamp) <= '{VAL_CUT_TIMESTAMP}'").pl()
new_games = duckdb_conn.sql(f"SELECT * FROM dim_games WHERE DATE(game_prerelease_date) > '{TRAIN_CUT_TIMESTAMP}' "
                            f"AND DATE(game_prerelease_date) <= '{VAL_CUT_TIMESTAMP}'").pl()

In [None]:
new_games.sort("game_index")

In [None]:
# This may include completely new users and games on the validation data
val_reviews_df = duckdb_conn.sql(f"SELECT * FROM fact_reviews WHERE DATE(timestamp_created) > '{TRAIN_CUT_TIMESTAMP}' "
                                 f"AND DATE(timestamp_created) <= '{VAL_CUT_TIMESTAMP}'").pl()

In [None]:
len(val_reviews_df)

In [None]:
# Make sure all games and users on validation data are also present in training
val_reviews_df = val_reviews_df.filter((~pl.col("game_index").is_in(new_games["game_index"].implode())) & (
    ~pl.col("user_index").is_in(new_users["user_index"].implode())))

In [None]:
len(val_reviews_df)

In [None]:
entity_count = duckdb_conn.sql(
    f"SELECT * FROM mart_entities_cumulative WHERE review_day = '{TRAIN_CUT_TIMESTAMP}'").pl()

In [None]:
duckdb_conn.close()

In [None]:
num_users = int(entity_count["cumulative_user_count"].first())
num_games = int(entity_count["cumulative_game_count"].first())

In [None]:
num_games, num_users

### Example using torchrec EmbeddingCollections

In [None]:
ec = torchrec.EmbeddingCollection(
    device=torch.device("cpu"),
    tables=[
        torchrec.EmbeddingConfig(name="user_index", embedding_dim=128, num_embeddings=num_users + 1),
        torchrec.EmbeddingConfig(name="game_index", embedding_dim=128, num_embeddings=num_games + 1),
    ]
)

In [None]:
sample = train_reviews_df.sample(10)
sample

In [None]:
features = torchrec.KeyedJaggedTensor.from_jt_dict(
    {
        "user_index": torchrec.JaggedTensor(values=torch.tensor(sample["user_index"]),
                                            lengths=torch.ones(10, dtype=torch.int32)),
        "game_index": torchrec.JaggedTensor(values=torch.tensor(sample["game_index"]),
                                            lengths=torch.ones(10, dtype=torch.int32)),
    }
)

In [None]:
out = ec(features)
print(out["user_index"].values().shape)

### PyTorch Dataset

In [None]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one review row per index as a dict of tensors."""

    def __init__(self, df: pl.DataFrame):
        # The frame is kept as-is; rows are converted to tensors lazily in __getitem__.
        self.reviews = df

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the `idx`-th review as {"user_id", "game_id", "voted_up"} tensors."""
        row = self.reviews.row(idx, named=True)
        # Output key -> source column for the two id fields (dtype inferred by torch).
        sample = {
            out_key: torch.tensor(row[column])
            for out_key, column in (("user_id", "user_index"), ("game_id", "game_index"))
        }
        # The label is explicitly stored as a float tensor.
        sample["voted_up"] = torch.tensor(row["voted_up"], dtype=torch.float)
        return sample


In [None]:
train_dataset = ReviewsDataset(train_reviews_df.filter(pl.col("voted_up") == 1))
val_dataset = ReviewsDataset(val_reviews_df.filter(pl.col("voted_up") == 1))

In [None]:
len(train_dataset), len(val_dataset)

### Simple PyTorch model

In [None]:
class SimpleRetrievalModel(torch.nn.Module):
    """Two-table embedding model trained with an in-batch softmax loss.

    `forward` returns the loss directly: every user in the batch is scored against
    every game in the batch, and the game on the same row is treated as the target.
    """

    def __init__(self, embedding_dim: int, num_users: int = num_users, num_games: int = num_games):
        super().__init__()
        self.embedding_dim = embedding_dim
        # Each table gets one row more than the entity count
        # (presumably so that an index equal to the count is still valid — TODO confirm).
        self.user_emb = torch.nn.Embedding(num_users + 1, embedding_dim)
        self.game_emb = torch.nn.Embedding(num_games + 1, embedding_dim)

    def forward(self, batch: dict[str, torch.Tensor]):
        """Compute the cross-entropy loss for a batch with "user_id" and "game_id" entries."""
        user_ids, game_ids = batch["user_id"], batch["game_id"]
        user_vecs = self.user_emb(user_ids)
        game_vecs = self.game_emb(game_ids)
        # Row i holds user i's score against every game in the batch.
        scores = user_vecs @ game_vecs.t()
        # The matching game for row i sits at column i.
        targets = torch.arange(len(user_ids), device=user_ids.device)
        return torch.nn.functional.cross_entropy(scores, targets)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = SimpleRetrievalModel(embedding_dim=128).to(device)

In [None]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True, pin_memory=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False, pin_memory=True)

### Training Loop

In [None]:
def train_one_epoch(model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer,
                    train_loader: torch.utils.data.DataLoader,
                    val_loader: torch.utils.data.DataLoader,
                    device: torch.device):
    """Run one optimisation pass over `train_loader`, then score `val_loader`.

    The model is expected to return the loss when called with a batch, where a
    batch is a dict of tensors (each tensor is moved to `device` first).

    Args:
        model: module whose forward pass returns a scalar loss tensor.
        optimizer: optimiser stepping the model parameters.
        train_loader: batches used for the gradient updates.
        val_loader: batches scored without gradients after training.
        device: device every batch tensor is moved to.

    Returns:
        Tuple `(mean_train_loss, mean_val_loss)` averaged over batches. A loader
        without batches yields `nan` for its mean instead of dividing by zero.
    """
    model.train()
    num_train_batches = len(train_loader)
    total_loss = 0.0
    for batch_idx, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(batch)
        # Read the scalar once; it is used for both logging and the running total.
        loss_value = loss.item()
        print(f"Processing batch {batch_idx + 1}/{num_train_batches} - Loss: {loss_value:.4f}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss_value

    model.eval()
    num_val_batches = len(val_loader)
    val_loss = 0.0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss_value = model(batch).item()
            # Progress is reported against the validation loader's own batch count.
            print(f"Processing batch {batch_idx + 1}/{num_val_batches} - Val_Loss: {loss_value:.4f}")
            val_loss += loss_value

    mean_train_loss = total_loss / num_train_batches if num_train_batches else float("nan")
    mean_val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
    return mean_train_loss, mean_val_loss

In [None]:
def train(model: torch.nn.Module,
          train_loader: torch.utils.data.DataLoader,
          val_loader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          epochs: int,
          device: torch.device,
          scheduler=None):
    """Train `model` for `epochs` epochs and return the last epoch's mean train loss.

    Each epoch delegates to `train_one_epoch`, steps `scheduler` once if one is
    given, and prints the epoch's train and validation losses. When `epochs` is
    zero no epoch runs and `float('inf')` is returned.
    """
    # Sentinel returned unchanged when the loop body never executes.
    train_loss = float('inf')
    for epoch in range(epochs):
        epoch_losses = train_one_epoch(model, optimizer, train_loader, val_loader, device)
        train_loss, val_loss = epoch_losses
        # The scheduler is optional and advances once per epoch.
        if scheduler is not None:
            scheduler.step()
        print(f"Epoch {epoch + 1}/{epochs} - Train loss: {train_loss:.4f} - Val loss: {val_loss:.4f}")
    return train_loss

In [None]:
# Train on the `device` selected earlier (CUDA when available, otherwise CPU): the model was
# moved to that device, so hard-coding "cuda" here fails on CPU-only machines.
train_loss = train(model, train_dataloader, val_dataloader, torch.optim.Adam(model.parameters(), lr=0.00001), epochs=5,
                   device=device)

In [None]:
games_review_count = train_reviews_df.group_by("game_index").len().filter(pl.col("len") > 100).sort("len",
                                                                                                    descending=True)
games_review_count

In [None]:
train_rev_filtered = train_reviews_df.join(games_review_count, on="game_index", how="inner")

In [None]:
train_rev_filtered

In [None]:
users_reviews_count = train_reviews_df.group_by("user_index").len().filter(pl.col("len") > 10).sort("len",
                                                                                                    descending=True)
users_reviews_count

In [None]:
train_rev_filtered2 = train_rev_filtered.join(users_reviews_count, on="user_index", how="inner")
train_rev_filtered2

In [None]:
val_rev_filtered = val_reviews_df.join(games_review_count, on="game_index", how="inner").join(users_reviews_count,
                                                                                              on="user_index",
                                                                                              how="inner")

In [None]:
val_rev_filtered

In [None]:
filtered_train = ReviewsDataset(train_rev_filtered2)
filtered_val = ReviewsDataset(val_rev_filtered)

In [None]:
filtered_train_loader = torch.utils.data.DataLoader(filtered_train, batch_size=256, shuffle=True, pin_memory=True)
# Validation batches are not shuffled (same as `val_dataloader`): the model's loss uses the other
# rows of the batch as negatives, so a fixed batch composition keeps the validation loss
# comparable from one epoch to the next.
filtered_val_loader = torch.utils.data.DataLoader(filtered_val, batch_size=256, shuffle=False, pin_memory=True)

In [None]:
model = SimpleRetrievalModel(embedding_dim=64).to(device)

In [None]:
# Train on the `device` selected earlier (CUDA when available, otherwise CPU): the model was
# moved to that device, so hard-coding "cuda" here fails on CPU-only machines.
train_loss = train(model, filtered_train_loader, filtered_val_loader, torch.optim.Adam(model.parameters(), lr=0.001),
                   epochs=20, device=device)

### MLflow test

In [None]:
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Smoke test of the MLflow setup: train a tiny classifier, log it and register it.

# Optional if you didn't export MLFLOW_TRACKING_URI
mlflow.set_tracking_uri("http://localhost:5000")

mlflow.set_experiment("test")

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

with mlflow.start_run() as run:
    print(mlflow.get_artifact_uri())
    # Deliberately not called `model`: that name holds the PyTorch retrieval model in this
    # notebook, and rebinding it here would break re-running the training cells.
    iris_model = LogisticRegression(max_iter=200, random_state=42).fit(X_train, y_train)
    preds = iris_model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_param("max_iter", 200)
    mlflow.log_metric("accuracy", acc)

    # Upload model artifacts to object storage and register the model.
    # `name=` matches the `log_model` call used for the baseline model in this notebook.
    mlflow.sklearn.log_model(
        sk_model=iris_model,
        name="model",
        registered_model_name="IrisClassifier"
    )

print("Done. Open the MLflow UI and check the Registered Models and the run artifacts.")


In [None]:
!curl -i http://localhost:5000/api/2.0/mlflow/logged-models/list

In [2]:
duckdb_conn = duckdb.connect('../data/steam.duckdb', read_only=True)

In [3]:
duckdb_conn.sql(f"""SET s3_region='us-east-1';
                SET s3_url_style='path';
                SET s3_use_ssl=false;
                SET s3_endpoint='localhost:9000';
                SET s3_access_key_id='';
                SET s3_secret_access_key='';""")

In [None]:
recommended_games = duckdb_conn.sql(
    "SELECT game_id, game_name, game_review_month, game_num_reviews, game_num_positive_reviews,"
    "game_num_negative_reviews, game_weighted_score FROM game_features").pl()

In [None]:
training_data = duckdb_conn.sql("""
                                SELECT user_id, game_id, timestamp_created
                                FROM training_features
                                WHERE current_month::VARCHAR LIKE '2025-%'
                                """).pl()

In [4]:
ground_truth_users = duckdb_conn.sql("""
                                     WITH anchors AS (
  SELECT DISTINCT user_id, current_month
  FROM training_features
  WHERE voted_up = true
    AND strftime(current_month, '%Y') = '2025'  -- prefer date funcs over LIKE
)
SELECT
  a.user_id,
  a.current_month,
  (
    SELECT array_agg(f.game_id ORDER BY f.first_month, f.game_id)
    FROM (
      SELECT tf.game_id, MIN(tf.current_month) AS first_month
      FROM training_features AS tf
      WHERE tf.user_id = a.user_id
        AND tf.voted_up = true
        AND tf.current_month >= a.current_month
      GROUP BY tf.game_id
    ) AS f
  ) AS future_game_ids
FROM anchors AS a
ORDER BY a.user_id, a.current_month
;
                                     """).pl()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
ground_truth_users

In [None]:
# One row per user with every game id found in any of that user's monthly ground-truth lists.
# The source column is "future_game_ids" (the alias given in the SQL query); the output keeps the
# "array_agg(game_id)" name because the evaluation cells read the result under that name.
# `unique()` drops the repeats created by flattening overlapping monthly lists, so the
# ground-truth size used as the AP denominator is not inflated.
all_time_ground_truth = ground_truth_users.group_by("user_id").agg(
    pl.col("future_game_ids").flatten().unique().alias("array_agg(game_id)")).sort("user_id")

In [None]:
duckdb_conn.close()

## Recap
### HitRate @ K
* Recommended: [A, X, B, ... (K items)]
* Relevant: [A, B, C] -> Hit = 1 if at least one recommended item is relevant, else Hit = 0. HitRate @ K is the average of Hit over all users in the test set.
### Precision @ K
 Out of the first K items recommended to the user, how many were relevant (part of the ground truth)?
* Recommended = [Terminator, James Bond, Love Actually] (K = 3)
* Relevant (Ground Truth): [Terminator, James Bond, Iron Man, ... (can be more)]

Precision @ K = (# of relevant items in top K recommendations) / K = 2 / 3 = 0.67

### Recall @ K
 Out of all the items the user found relevant, how many were recommended?
 Same Example
* Recommended = [Terminator, James Bond, Love Actually] (K = 3)
* Relevant (Ground Truth): [Terminator, James Bond, Iron Man, 3 more relevant movies] (total = 6)

Recall @ K = (# of relevant items in top K recommendations) / (# of relevant items) = 2 / 6 = 0.33

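### Average Precision @ K
Sum of Precision @ i over the positions i <= K that hold a relevant item, normalised by min(K, number of relevant items):

$$\mathrm{AP@K} = \frac{1}{\min(K, R)} \sum_{i=1}^{K} \mathrm{P@}i \cdot \mathrm{rel}(i)$$

where $R$ is the number of relevant items and $\mathrm{rel}(i) = 1$ if the item at position $i$ is relevant, else $0$.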

### Mean Average Precision @ K
Mean over all users of the average precision

In [None]:
class PopularityModel:
    """Popularity baseline driven by per-month positive-review counts.

    Lookups always use the month *before* the month of the given timestamp
    (see `get_month_date`).
    """

    def __init__(self):
        # Per-month game statistics ordered by month (newest first) and, within a
        # month, by positive-review count (highest first). None until `train` runs.
        self.games_score: pl.DataFrame | None = None

    def train(self, games_df: pl.DataFrame):
        """Store `games_df` ranked by month and popularity."""
        self.games_score = self._compute_popular_games(games_df)

    @staticmethod
    def _compute_popular_games(games: pl.DataFrame):
        """Return `games` sorted by month (descending), then positive reviews (descending)."""
        # Sort on both keys in one call. Ranking inside each month and then sorting on the
        # month alone does not guarantee the within-month order survives, because
        # `DataFrame.sort` does not maintain the order of equal keys by default.
        return games.sort(
            ["game_review_month", "game_num_positive_reviews"],
            descending=[True, True],
        )

    @staticmethod
    def get_month_date(dt: datetime):
        """Return the first day of the month preceding `dt`.

        For `datetime` inputs the time of day is reset to midnight, so a full
        timestamp maps to the same key as a month-start value. Other date-like
        inputs supporting `replace(day=...)` are returned in their own type.
        """
        first_of_month = dt.replace(day=1)
        if isinstance(dt, datetime):
            first_of_month = first_of_month.replace(hour=0, minute=0, second=0, microsecond=0)
        # Stepping back one day lands in the previous month; snap to its first day.
        return (first_of_month - timedelta(days=1)).replace(day=1)

    def forward(self, x):
        """Return the previous-month positive-review count of `x["game_id"]`.

        `x` must provide "timestamp_created" and "game_id". The result is whatever
        `Series.first()` yields for the matching rows of `games_score`.
        """
        month_date = self.get_month_date(x["timestamp_created"])
        game_id = x["game_id"]
        score = self.games_score.filter((pl.col("game_id") == game_id) & (pl.col("game_review_month") == month_date))
        return score["game_num_positive_reviews"].first()

    def recommend(self, x, k: int):
        """Return up to `k` dicts with "game_id" and "game_name" for the month before `x["current_month"]`.

        Fewer than `k` entries (possibly none) are returned when that month has
        fewer rows in `games_score`.
        """
        month_date = self.get_month_date(x["current_month"])
        month_games = self.games_score.filter(pl.col("game_review_month") == month_date)
        return month_games.limit(k).select("game_id", "game_name").to_dicts()

In [None]:
class RandomRecModel:
    """Baseline that ignores its input and recommends randomly sampled games."""

    def __init__(self):
        # Catalogue to sample from; None until `train` is called.
        self.games_score: pl.DataFrame | None = None

    def train(self, games_df: pl.DataFrame):
        """Keep `games_df` as the catalogue; nothing is fitted."""
        self.games_score = games_df

    def recommend(self, x, k: int):
        """Return `k` sampled rows as dicts with "game_id" and "game_name" (`x` is unused)."""
        picks = self.games_score.sample(k)
        return picks.select("game_id", "game_name").to_dicts()

In [None]:
class RecommenderMetrics:
    @staticmethod
    def hitrate_at_k(ground_truth: list, recommended: list):
        hits = 0
        for i in range(len(ground_truth)):
            if set(ground_truth[i]) & set(recommended[i]):
                hits += 1
        return hits / len(ground_truth)

    @staticmethod
    def map_at_k(self, ground_truth: list, recommended: list, k: int):
        return sum([self.ap_at_k(actual, predicted, k) for actual, predicted in zip(ground_truth, recommended)]) / len(ground_truth)

    @staticmethod
    def ap_at_k(actual: list, predicted: list, k: int):
        sum_precisions = 0
        hits = 0
        for i in range(k):
            new_pred = predicted[i]
            if new_pred in actual and new_pred not in predicted[:i]:
                hits += 1
                sum_precisions += hits / (i + 1)
        return sum_precisions / min(k, len(actual))

In [None]:
baseline = PopularityModel()
baseline.train(recommended_games)

In [None]:
random_baseline = RandomRecModel()
random_baseline.train(recommended_games.unique("game_id"))

In [None]:
random_baseline.games_score

### Metrics for evaluation: HitRate @ K, MAP @ K, NDCG @ K

In [None]:
actual = [1, 450, 345, 923, 345, 90, 988, 3456, 892]
predicted = [3, 450, 90, 34, 56, 90, 908, 345, 56, 234]
# By hand, for K = 1..10 (a repeated prediction only counts the first time it appears):
# Precision @ 1 = 0 / 1 = 0
# Precision @ 2 = 1 / 2 = 0.5
# Precision @ 3 = 2 / 3 = 0.67
# Precision @ 4 = 2 / 4 = 0.5
# Precision @ 5 = 2 / 5 = 0.4
# Precision @ 6 = 2 / 6 = 0.33
# Precision @ 7 = 2 / 7 = 0.29
# Precision @ 8 = 3 / 8 = 0.375
# Precision @ 9 = 3 / 9 = 0.33
# Precision @ 10 = 3 / 10 = 0.3
# Average Precision @ K = (1 / min(R, K)) * sum(P@i for every position i <= K that holds a relevant item)
# with R = len(actual) = 9 (the duplicated 345 is counted twice)
# Average Precision @ 10 = (1 / 9) * (0.5 + 0.67 + 0.375) = 0.1713

In [None]:
metrics = RecommenderMetrics()

In [None]:
metrics.ap_at_k([1, 450, 345, 923, 345, 90, 988, 3456, 892],
                [3, 450, 90, 34, 56, 90, 908, 345, 56, 234],
                k=10)

### Compute Baseline Metrics

In [None]:
ground_truth_users.filter((pl.col("user_id") == 76561197960268417) & (pl.col("current_month") == datetime(2025, 3, 1)))["future_game_ids"].first()

In [None]:
ground_truth_users

In [None]:
uids = ground_truth_users["user_id"].to_list()
months = ground_truth_users["current_month"].to_list()
futures = ground_truth_users["future_game_ids"].to_list()
future_map = dict(zip(zip(uids, months), futures))

In [None]:
# MAP@10 of the popularity baseline, scored against each (user, month) row's own future games.
total_ap = 0.0
for i, elem in enumerate(ground_truth_users.iter_rows(named=True)):
    recs = baseline.recommend(elem, k=10)
    game_ids = [rec["game_id"] for rec in recs]
    anchor_key = (elem["user_id"], elem["current_month"])
    actual = future_map.get(anchor_key)
    ap_k = metrics.ap_at_k(actual, game_ids, k=10)
    total_ap += ap_k
    # Periodic progress report with the running mean.
    if i % 100_000 == 0:
        print(f"Processed {i} users")
        print(f"mean Average Precision @ 10: {total_ap / (i + 1)}")
print(f"mean Average Precision @ 10: {total_ap / len(ground_truth_users)}")

In [None]:
# MAP@10 of the popularity baseline, scored against each user's all-time ground truth.
# The user -> games lookup is built once: filtering the whole frame for every row makes the
# loop quadratic in the number of rows.
all_time_by_user = dict(zip(all_time_ground_truth["user_id"].to_list(),
                            all_time_ground_truth["array_agg(game_id)"].to_list()))
total_ap = 0.0
for i, elem in enumerate(ground_truth_users.iter_rows(named=True)):
    recs = baseline.recommend(elem, k=10)
    game_ids = [rec["game_id"] for rec in recs]
    actual = all_time_by_user.get(elem["user_id"])
    ap_k = metrics.ap_at_k(actual, game_ids, k=10)
    total_ap = total_ap + ap_k
    if i % 100_000 == 0:
        print(f"Processed {i} users")
        print(f"mean Average Precision @ 10: {total_ap / (i + 1)}")
print(f"mean Average Precision @ 10: {total_ap / len(ground_truth_users)}")

In [None]:
# MAP@10 of the random baseline, scored against each user's all-time ground truth.
# The user -> games lookup is built once: filtering the whole frame for every row makes the
# loop quadratic in the number of rows.
all_time_by_user = dict(zip(all_time_ground_truth["user_id"].to_list(),
                            all_time_ground_truth["array_agg(game_id)"].to_list()))
total_ap = 0.0
for i, elem in enumerate(ground_truth_users.iter_rows(named=True)):
    recs = random_baseline.recommend(elem, k=10)
    game_ids = [rec["game_id"] for rec in recs]
    actual = all_time_by_user.get(elem["user_id"])
    ap_k = metrics.ap_at_k(actual, game_ids, k=10)
    total_ap = total_ap + ap_k
    if i % 100_000 == 0:
        print(f"Processed {i} users")
        print(f"mean Average Precision @ 10: {total_ap / (i + 1)}")
print(f"mean Average Precision @ 10: {total_ap / len(ground_truth_users)}")

### MLflow + Baseline Model

In [None]:
%%writefile baseline.py

from datetime import timedelta, datetime
import polars as pl
from mlflow.pyfunc import PythonModel
from mlflow.models import set_model

class PopularityPyFunc(PythonModel):
    """MLflow pyfunc model returning, per input row, the top-`k` game ids of the previous month.

    The ranking is read from the "games_score" parquet artifact and ordered by
    `game_num_positive_reviews` within each `game_review_month`.
    """

    def __init__(self, k: int = 10):
        self.k = k
        # month start (as a `date`) -> list of the k top game ids; filled by `load_context`.
        self._month_to_recs = None
        # polars dtype of one prediction (a list of game ids); filled by `load_context`.
        self._recs_dtype = None

    def load_context(self, context):
        """Build the month -> top-k lookup from the "games_score" artifact."""
        df = pl.read_parquet(context.artifacts["games_score"]).sort(
            ["game_review_month", "game_num_positive_reviews"],
            descending=[False, True],
        )
        grouped = df.group_by("game_review_month").agg(
            pl.col("game_id").head(self.k).alias("top_game_ids")
        )
        self._recs_dtype = pl.List(df.schema["game_id"])
        # Keys are normalised to plain dates so that lookups match regardless of whether the
        # month column (or the incoming timestamp) is a date or a datetime.
        self._month_to_recs = {
            self._as_date(row["game_review_month"]): row["top_game_ids"]
            for row in grouped.iter_rows(named=True)
        }

    @staticmethod
    def _as_date(value):
        """Return `value` without its time-of-day part when it is a `datetime`."""
        return value.date() if isinstance(value, datetime) else value

    @staticmethod
    def _month_start_prev(dt: datetime):
        """Return the first day of the month preceding `dt` (time of day, if any, is kept)."""
        return (dt.replace(day=1) - timedelta(days=1)).replace(day=1)

    def predict(self, context, model_input: pl.DataFrame):
        """Return a Series with one list of recommended game ids per row of `model_input`.

        Rows whose previous month has no entry in the lookup get an empty list.
        """
        # assumes model_input["current_month"] is datetime-like
        def recs_for(dt):
            key = self._as_date(self._month_start_prev(dt))
            return self._month_to_recs.get(key, [])

        # An explicit return dtype avoids relying on inference from the first returned list.
        return model_input["current_month"].map_elements(recs_for, return_dtype=self._recs_dtype)

set_model(PopularityPyFunc(k=10))


In [None]:
import mlflow
import polars as pl

In [None]:
class PopularityTrainer:
    """Produces the ranked per-month game table consumed by the popularity model."""

    def fit(self, games_df: pl.DataFrame) -> pl.DataFrame:
        """Return `games_df` sorted by month (newest first), then positive reviews (highest first).

        Both keys are sorted in a single call: ranking inside each month and then
        sorting on the month alone does not guarantee the within-month order,
        since `DataFrame.sort` does not maintain the order of equal keys by default.
        """
        return games_df.sort(
            ["game_review_month", "game_num_positive_reviews"],
            descending=[True, True],
        )

In [None]:
trainer = PopularityTrainer()
games_score = trainer.fit(recommended_games)

In [None]:
games_score

In [None]:
import os
import tempfile

# Point the client at the tracking server *before* selecting the experiment, so the
# experiment is looked up / created on that server rather than on the default store.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("baseline")
with mlflow.start_run(run_name="test-baseline"), tempfile.TemporaryDirectory() as tmp:
    # The ranked table is written to a temporary parquet file and attached as the
    # "games_score" artifact that the model code in baseline.py reads on load.
    path = os.path.join(tmp, "games_score.parquet")
    games_score.write_parquet(path)
    model_info = mlflow.pyfunc.log_model(
        python_model="baseline.py",
        artifacts={"games_score": path},
        name="baseline-model",
        registered_model_name="baseline-model"
    )

In [None]:
model_info.model_uri

In [None]:
my_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [5]:
df = ground_truth_users

In [None]:
import mlflow

In [None]:
# Evaluate precomputed predictions with MLflow's default evaluator for the "retriever" model type.
# NOTE(review): `df` is `ground_truth_users`, whose visible columns are user_id, current_month and
# future_game_ids; no cell shown here adds the "predictions" column named below, so it presumably
# has to be attached first (e.g. from `my_model.predict(df)`) — TODO confirm.
# NOTE(review): `df` is a polars DataFrame; check whether `mlflow.models.evaluate` accepts it
# directly or needs a pandas DataFrame.
mlflow.models.evaluate(
    data=df,
    model_type="retriever",
    targets="future_game_ids",
    predictions="predictions",
    evaluators="default"
)

In [None]:
from datetime import date

In [None]:
my_model._month_to_recs

In [None]:
mlflow.metrics.precision_at_k()

In [None]:
import sys

import gamerec

# Show which interpreter is running and where the `gamerec` package was imported from.
# A module's location is stored in `__file__`; `gamerec.file` is not a module attribute.
print(sys.executable, gamerec.__file__)

In [None]:
from gamerec.models.pop_model_baseline import PopularityPyFunc

In [None]:
PopularityPyFunc()

In [6]:
import mlflow
mlflow.set_tracking_uri('http://localhost:5000')

In [7]:
my_model = mlflow.pyfunc.load_model('models:/m-d7a6fd196be64abab05c29e7c5fb9296')

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]



In [9]:
mini = df.sample(10, seed=1)

In [8]:
my_model.predict(df)

current_month
list[i64]
"[3241660, 2246340, … 413150]"
"[2694490, 2527500, … 252490]"
"[730, 3527290, … 322170]"
"[3164500, 3241660, … 1086940]"
"[1771300, 730, … 550]"
…
"[3527290, 730, … 431960]"
"[3527290, 730, … 431960]"
"[3527290, 730, … 431960]"
"[3527290, 730, … 431960]"
