Embedding builder for Video Game Recommender

This notebook builds a sentence embedding for each game in the dataset.

Inputs: 
- data/game_overview/game_overview_final_vol2.json

Outputs: 
- embeddings.npy ; a matrix of L2-normalized sentence embeddings of shape (n_games, embedding_dim)
- games_df.pkl ; a pickled pandas DataFrame with all the features needed for downstream analysis


In [None]:
# Imports and configuration
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import re

In [None]:
# Load dataset
# Read the game overview JSON and replace whatever index it carries with a
# fresh 0..n-1 range.
games_json_path = "data/game_overview/game_overview_final_vol2.json"
df = pd.read_json(games_json_path).reset_index(drop=True)

# Report the dataset size, then show the first rows as the cell output.
print("Number of games:", len(df))
df.head()

In [None]:
#Build the text to embed 
def make_text(row):
    """Concatenate a game's descriptive fields into one string to embed.

    The fields 'name', 'summary', 'genres', 'platforms', 'companies' and
    'keywords' are visited in that order. A field contributes when it is a
    non-blank string, or a list/tuple containing non-blank strings (each such
    item is added individually). Missing fields, NaN/None values and any
    other types are skipped.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``col in row`` and ``row[col]``, e.g. a DataFrame
        row passed by ``df.apply(..., axis=1)`` or a plain dict.

    Returns
    -------
    str
        The collected pieces joined by single spaces; an empty string when
        nothing usable is found.
    """
    parts = []

    for col in ['name', 'summary', 'genres', 'platforms', 'companies', 'keywords']:
        if col not in row:
            continue

        val = row[col]
        if isinstance(val, str):
            candidates = [val]
        elif isinstance(val, (list, tuple)):
            # Multi-valued fields may arrive as lists rather than strings;
            # flatten them instead of silently dropping the whole field.
            candidates = val
        else:
            # NaN, None, numbers, dicts, ... carry no text to embed.
            continue

        parts.extend(
            item for item in candidates
            if isinstance(item, str) and item.strip()
        )

    return " ".join(parts)

# Build the embedding text for every game, one row at a time.
df['text'] = df.apply(make_text, axis=1)

# Quick sanity check: preview the identifying columns next to the new text,
# then report the overall frame dimensions.
preview_columns = ['game_id', 'name', 'text']
print(df[preview_columns].head())
print(df.shape)

In [None]:

# Convert the release date column to datetimes, then map each date to a
# recency score between 0 and 1: 0 is the oldest game in the dataset and 1 is
# the most recent. Values that cannot be parsed become NaT (errors="coerce").
# NOTE(review): the raw format/unit of first_release_date is not visible here;
# confirm pd.to_datetime interprets it as intended.
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

min_date = df["first_release_date"].min()
max_date = df["first_release_date"].max()

# Whole days between the oldest and the newest release in the dataset.
date_range_days = (max_date - min_date).days

def compute_recency(d):
    """Return the recency score of release date ``d``.

    Parameters
    ----------
    d : datetime-like or NaT
        A value from the converted ``first_release_date`` column.

    Returns
    -------
    float
        ``(d - min_date).days / date_range_days``, i.e. the position of ``d``
        between the oldest and newest release in whole days. Returns the
        neutral value 0.5 when ``d`` is missing, or when all known release
        dates coincide (zero-day range), which would otherwise raise
        ZeroDivisionError.
    """
    if pd.isna(d) or date_range_days == 0:
        return 0.5

    return (d - min_date).days / date_range_days

df["recency"] = df["first_release_date"].apply(compute_recency)

In [None]:
# Load the sentence-embedding model. It runs on the CPU here; pass
# device="cuda" instead to run the SentenceTransformer on a GPU.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Encode every game's text into one embedding row (NumPy output).
texts = df["text"].tolist()
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Report the matrix shape, then scale each row to unit L2 norm.
print(embeddings.shape)
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

# Show the row labelled 1 as a spot check of the frame contents.
print(df.loc[1])

In [None]:
# Persist both artifacts so later steps can reload them: the embedding matrix
# as a NumPy .npy file and the DataFrame as a pickle.
embeddings_path = "embeddings.npy"
games_df_path = "games_df.pkl"

np.save(embeddings_path, embeddings)
df.to_pickle(games_df_path)