# 3.1 Embedding Builder 

This notebook is used to build sentence embeddings for each game in the dataset

Inputs:

data/json/game_overview_final_vol2.json
Outputs:

embeddings_FINAL.npy ; a NumPy matrix of shape (n_games, embedding_dim)
games_df_FINAL.pkl ; a pandas DataFrame with all the necessary features for posterior analysis

In [None]:
# Imports 
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Load dataset

In [2]:
# Load the raw game overview dataset (the path is relative to the notebook's working directory)
df = pd.read_json("../data/json/game_overview_final_vol2.json")
# Replace whatever index the JSON produced with a fresh 0..n-1 index
df = df.reset_index(drop=True)

print("Number of games:", len(df))
df.head()

FileNotFoundError: File ../data/json/game_overview_final_vol2.json does not exist

## Build text to embed

In [3]:
def make_text(row):
    """Concatenate a game's non-empty string fields into one text.

    The columns are visited in a fixed order. A column contributes only when
    it is present in ``row`` and holds a string with some non-whitespace
    content; the kept values are joined, unmodified, with single spaces.
    """
    text_columns = ('name', 'summary', 'genres', 'platforms', 'companies', 'keywords')
    present_values = (row[column] for column in text_columns if column in row)
    return " ".join(
        value
        for value in present_values
        if isinstance(value, str) and value.strip()
    )

# Build, row by row, the text that will be embedded
df['text'] = df.apply(make_text, axis=1)

# Quick look at the result
print(df[['game_id', 'name', 'text']].head())
print(df.shape)

NameError: name 'df' is not defined

In [None]:

# Convert the release date column to datetimes and derive a "recency" score in [0, 1]:
# 0 is the oldest game in the dataset and 1 is the most recent.
# Values that cannot be parsed become NaT (errors="coerce").
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

min_date = df["first_release_date"].min()
max_date = df["first_release_date"].max()

# Width of the observed release window, in whole days
date_range_days = (max_date - min_date).days


def compute_recency(d):
    """Map a release date to a recency score in [0, 1].

    Returns the neutral value 0.5 when the date is missing, and also when the
    dataset spans zero days (every dated game shares one release date), where
    the linear scaling below would otherwise divide by zero.
    """
    if pd.isna(d) or not date_range_days:
        return 0.5

    return (d - min_date).days / date_range_days


df["recency"] = df["first_release_date"].apply(compute_recency)

## Generate the embeddings and save them

In [None]:
# Load the embedding model (change device to "cuda" if a GPU should run the SentenceTransformer)
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Encode every game's text into an embedding vector
texts = df["text"].tolist()
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Print the matrix shape, then scale every row to unit L2 norm
print(embeddings.shape)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
# Spot check: show the full row stored for the game with index label 1
print(df.loc[1])

In [None]:
# Save the normalized embeddings and the DataFrame (including the "text" and
# "recency" columns) so the following notebooks can load them
np.save("../data/embeddings_FINAL.npy", embeddings)
df.to_pickle("../data/games_df_FINAL.pkl")

# LSH with embeddings 

This notebook builds a simple LSH index on top of precomputed embeddings and provides a recommendation function

It assumes you already have run the Embedding Builder notebook and generated:

games_df_FINAL.pkl; a DataFrame with game metadata
embeddings_FINAL.npy; numpy array of shape (n_games, embedding_dim)
Steps taken:

Load the embeddings and game data
Generate random hyperplanes and build LSH index
Implement LSH query to get candidate neighbours
Rank candidates using cosine similarity + rating + recency
Show example recommendations for a given game index

## Get the imports and embeddings

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# In the original script we used timing/tracemalloc for profiling.
# In this notebook we focus on the logic and interactivity, so we skip that.

# Load the artifacts written by the Embedding Builder notebook.
# NOTE: unpickling can execute arbitrary code, so only load a .pkl file you produced yourself.
df = pd.read_pickle("../data/games_df_FINAL.pkl")
embeddings = np.load("../data/embeddings_FINAL.npy")
# The two counts are expected to match (one embedding row per game)
print("Embeddings shape:", embeddings.shape)
print("Number of games:", len(df))

## generate random hyperplanes

In [None]:
# Several sets of random hyperplanes are created, each corresponding to one LSH table, and each hyperplane used to generate one bit of the hash key
def generate_hyperplanes(num_tables, num_planes, dim, seed=0):
    """Draw the random hyperplanes used by every LSH table.

    For each table, a (num_planes, dim) array of Gaussian normal vectors is
    drawn from a single ``RandomState(seed)`` stream, and every row is divided
    by its L2 norm plus 1e-9. The arrays are returned as a list of length
    ``num_tables``.
    """
    rng = np.random.RandomState(seed)

    def _draw_table():
        normals = rng.randn(num_planes, dim)
        lengths = np.linalg.norm(normals, axis=1, keepdims=True)
        normals /= lengths + 1e-9
        return normals

    return [_draw_table() for _ in range(num_tables)]

# Check it: build the hyperplanes for the loaded embeddings
dim = embeddings.shape[1]
num_tables = 10
num_planes = 16

hyperplanes = generate_hyperplanes(num_tables, num_planes, dim, seed=33)
# Summarise rather than printing the full contents of every hyperplane array
print(f"{len(hyperplanes)} hyperplane sets, each of shape {hyperplanes[0].shape}")

## Hash each embedding into buckets 

In [None]:
def hash_v(v, planes):
    """Return the bucket key of vector ``v`` for one set of hyperplanes.

    Every hyperplane contributes one character: '1' when the projection of
    ``v`` onto it is strictly positive, '0' otherwise (a projection of exactly
    zero therefore maps to '0').
    """
    positive_side = (planes @ v) > 0
    return ''.join('01'[int(flag)] for flag in positive_side)

## Get LSH index, which is tables with buckets

In [None]:
# We build one hash table per hyperplane set. Each table maps a bitstring key to a bucket of game indices whose embedding fall into that region

def build_lsh_idx(embeddings, hyperplanes):
    """Build one bucket table per hyperplane set.

    Returns a list with one ``defaultdict(set)`` per entry of ``hyperplanes``.
    Table ``t`` maps the key ``hash_v(v, hyperplanes[t])`` to the set of row
    indices of ``embeddings`` whose vector produced that key.
    """
    # Unpacking two values keeps the requirement that `embeddings` is 2-D.
    _n_rows, _n_dims = embeddings.shape

    tables = [defaultdict(set) for _ in range(len(hyperplanes))]
    for table, planes in zip(tables, hyperplanes):
        for row_idx, vector in enumerate(embeddings):
            table[hash_v(vector, planes)].add(row_idx)
    return tables

# Check it: index every embedding with the hyperplanes generated above
tables = build_lsh_idx(embeddings, hyperplanes)

## query LSH; get candidate neighbours for a game

In [None]:
# Given a game idx, we compute its hash key in each table, collect all indices in the corresponding bucket, union them into a candidate set
def lsh_query(idx, embeddings, hyperplanes, tables):
    """Collect the candidate neighbours of the game at row ``idx``.

    The query vector is hashed with every hyperplane set, the bucket stored
    under that key in the corresponding table is looked up (an absent key
    contributes nothing), and all buckets are unioned. The query index itself
    is removed from the returned set.
    """
    query = embeddings[idx]
    matching_buckets = (
        tables[t].get(hash_v(query, planes), set())
        for t, planes in enumerate(hyperplanes)
    )
    neighbours = set().union(*matching_buckets)
    neighbours.discard(idx)
    return neighbours

# Test: list the candidate indices for game 0, then the name of each candidate
cand = lsh_query(0, embeddings, hyperplanes, tables)
print("Candidates for game 0:", cand)
for candidate in cand:
    print(df.loc[candidate, 'name'])

## Recommender

In [None]:
# Using the candidates for a given game, the cosine similarity between the candidates and the game is computed and normalized to [0,1]. 
# Combining similarity, rating and recency, a final score is obtained for each candidate (0-1) and the results are sorted to print the top recommendations based on this score.
# The weights to similarity, rating and recency can be changed to emphasize one or the other. 
def recommend_lsh_embedding(idx, embeddings, hyperplanes, tables, df, top=5,
                            w_sim=0.7, w_rating=0.2, w_date=0.1):
    """Print and return the best-scoring LSH candidates for the game at ``idx``.

    Candidates come from ``lsh_query``. For each one the cosine similarity to
    the query embedding is rescaled from [-1, 1] to [0, 1] and combined with
    the candidate's ``steam_rating`` and ``recency`` columns:

        score = w_sim * cos_norm + w_rating * rating + w_date * recency

    NOTE(review): the score only stays within [0, 1] if ``steam_rating`` is
    already scaled to [0, 1] and the weights sum to 1 -- confirm against the
    dataset.

    Parameters
    ----------
    idx : row position of the query game in ``embeddings``, also used as its
        label in ``df``.
    embeddings, hyperplanes, tables : the embedding matrix and the LSH
        structures built from it.
    df : DataFrame providing ``name``, ``steam_rating`` and ``recency``.
    top : maximum number of recommendations to return.
    w_sim, w_rating, w_date : weights of similarity, rating and recency.

    Returns
    -------
    list of ``(index, cos_norm, rating, recency, score)`` tuples, best first;
    an empty list when the query has no candidates.
    """
    v = embeddings[idx]
    cand = lsh_query(idx, embeddings, hyperplanes, tables)

    if not cand:
        print(f"No candidates for {df.loc[idx, 'name']}")
        return []

    # Sorted so that ties are broken the same way on every run
    cand_indices = np.array(sorted(cand), dtype=int)

    sim = cosine_similarity(
        v.reshape(1, -1),
        embeddings[cand_indices]
    )  # shape: (1, n_candidates)

    cos_norm = (sim[0] + 1.0) / 2.0

    ratings = df.loc[cand_indices, 'steam_rating'].astype(float).to_numpy()
    recency = df.loc[cand_indices, 'recency'].astype(float).to_numpy()

    final_score = (w_sim*cos_norm + w_rating*ratings + w_date*recency)

    # argsort always places NaN last. Sorting the negated scores therefore
    # gives a descending order in which candidates with a missing rating
    # (NaN score) end up at the bottom; reversing an ascending argsort would
    # have put them at the top instead.
    order = np.argsort(-final_score, kind="stable")[:top]

    results = list(zip(
        cand_indices[order],
        cos_norm[order],
        ratings[order],
        recency[order],
        final_score[order]))

    print(f"Query: {df.loc[idx, 'name']}")
    print("Recommendations (cosine_norm, rating, recency, score):")

    for j, cos_s, r, rec, score in results:
        print(
            f" --> {df.loc[j, 'name']} "
            f"(cos={cos_s:.3f}, rating={r:.3f}, recency = {rec:.3f}, score={score:.3f})"
        )

    return results


# Test: print (and return) the three best recommendations for game 0
recommend_lsh_embedding(0, embeddings, hyperplanes, tables, df, top=3)

# LSH with tokens 
This notebook implements an LSH-based recommender using token-shingles and MinHash signatures

It:

Loads the raw game dataset
builds a text field per game
Creates word shingles
Computes MinHash signatures
Builds LSH using banding
Finds candidate similar games and ranks them using: Jaccard similarity, rating and recency.

In [None]:
# Load the necessary packages
import pandas as pd
import mmh3
from  collections import defaultdict
import re

## Get the text from the dataset and build the text to shingle

In [None]:
# Load the raw dataset and give it a fresh 0..n-1 index.
# The helpers defined next normalize the text so that no "weird" shingles are formed later.
df = pd.read_json("../data/json/game_overview_final_vol2.json")
df = df.reset_index(drop=True)

def normalize_text(text: str) -> str:
    """Lower-case ``text`` and reduce it to space-separated a-z/0-9 tokens.

    Semicolons and every run of characters other than a-z, 0-9 and whitespace
    become a space; whitespace runs are then collapsed to a single space and
    the ends are trimmed. Accented and other non-ASCII letters are therefore
    dropped as well.
    """
    cleaned = text.lower().replace(";", " ")
    cleaned = re.sub(r"[^a-z0-9\s]+", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

def make_text(row):
    """Build the raw text of one game from its descriptive columns.

    Only columns that exist in ``row`` and contain a string with at least one
    non-whitespace character are used. Values are taken in the fixed column
    order below and joined with a single space, without further cleaning.
    """
    wanted = ['name', 'summary', 'genres', 'platforms', 'companies', 'keywords']

    def _usable(column):
        # A missing column, a non-string value or a blank string is skipped
        if column not in row:
            return False
        value = row[column]
        return isinstance(value, str) and bool(value.strip())

    return " ".join(row[column] for column in wanted if _usable(column))

# Build the per-game text that will be shingled
df['text'] = df.apply(make_text, axis = 1)

# Quick look at the result; the cell displays the DataFrame shape
print(df[["game_id", "name", "text"]].head())
df.shape

## Compute a normalized recency score [0, 1] from the release date

In [None]:
# Parse the release dates; values that cannot be parsed become NaT (errors="coerce")
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

min_date = df["first_release_date"].min()
max_date = df["first_release_date"].max()

# Width of the observed release window, in whole days
date_range_days = (max_date - min_date).days


def compute_recency(d):
    """Map a release date to a recency score in [0, 1] (0 = oldest, 1 = newest).

    Returns the neutral value 0.5 when the date is missing, and also when the
    dataset spans zero days (every dated game shares one release date), where
    the linear scaling below would otherwise divide by zero.
    """
    if pd.isna(d) or not date_range_days:
        return 0.5

    return (d - min_date).days / date_range_days


df["recency"] = df["first_release_date"].apply(compute_recency)

## q - shingles 

In [None]:
# We turn each game's text into a set of word shingles. These shingles are the input to the MinHash function.
def shingle(text: str, q: int = 2):
    """Return the set of q-word shingles of ``text``.

    The text is first passed through ``normalize_text`` and split on
    whitespace; every window of ``q`` consecutive words becomes one
    space-joined shingle.

    A non-empty text with fewer than ``q`` words cannot form a window. It is
    returned as a single shingle holding all of its words, so that short texts
    still get a signature of their own instead of an empty shingle set. A text
    with no words at all yields an empty set.
    """
    words = normalize_text(text).split()

    if 0 < len(words) < q:
        return {" ".join(words)}

    return {" ".join(words[i:i + q]) for i in range(len(words) - q + 1)}

# MinHash / banding configuration.
# BANDS * ROWS_PER_BAND has to equal NUM_HASHES: the index builder asserts
# that every signature has exactly bands * row_band entries.
NUM_HASHES = 100
BANDS = 50
ROWS_PER_BAND = 2

# Checking it works
print(df.loc[1, "text"])
print(shingle(df.loc[1, "text"], q=2))

## Compute a MinHash signature of length "num_hashes"

In [None]:
def minhash_sign(shingles, num_hashes = NUM_HASHES):
    """Compute the MinHash signature of a collection of shingles.

    Entry ``seed`` of the signature is the smallest unsigned ``mmh3`` hash of
    any shingle under that seed, for seeds 0 .. num_hashes - 1. When there are
    no shingles, every entry is ``None``.
    """
    return [
        min(
            (mmh3.hash(sh, seed, signed=False) for sh in shingles),
            default=None,
        )
        for seed in range(num_hashes)
    ]

## get the signature for each game

In [None]:
def compute_sign(df, q=2, num_hashes = NUM_HASHES):
    """Shingle and MinHash every row of ``df``.

    Reads the "text" column and returns two dicts keyed by the DataFrame index
    label: the MinHash signatures and the shingle sets they were computed from.
    """
    shingle_cache = {
        idx: shingle(text, q=q)
        for idx, text in df["text"].items()
    }
    signatures = {
        idx: minhash_sign(doc_shingles, num_hashes=num_hashes)
        for idx, doc_shingles in shingle_cache.items()
    }
    return signatures, shingle_cache

# Signatures and shingle sets for every game, keyed by DataFrame index label
signatures, shingle_cache = compute_sign(df, q= 2, num_hashes= NUM_HASHES)

## Get LSH bands and buckets

In [None]:
def build_lsh_idx(signatures, bands=BANDS, row_band=ROWS_PER_BAND):
    """Bucket documents by the bands of their MinHash signature.

    Each signature is cut into ``bands`` consecutive slices of ``row_band``
    values. The returned ``defaultdict(set)`` maps ``(band number, hash of the
    slice)`` to the ids of all documents sharing that slice.

    Signatures that contain ``None`` are left out of the index. Such a
    signature means no hash value was available for the document (an empty
    shingle set), and indexing it would place every such document in the same
    bucket of every band, making unrelated empty documents mutual candidates.

    Raises AssertionError if a signature's length is not ``bands * row_band``.
    """
    buckets = defaultdict(set)
    for doc_id, sig in signatures.items():
        assert len(sig) == bands * row_band, (
            f"signature of {doc_id!r} has {len(sig)} entries, "
            f"expected {bands * row_band}"
        )
        if any(value is None for value in sig):
            continue
        for b in range(bands):
            start = b * row_band
            band_slice = tuple(sig[start:start + row_band])
            # Keyed by hash(band_slice) because the candidate lookup uses the same key
            buckets[(b, hash(band_slice))].add(doc_id)
    return buckets

# Index all signatures with the banding configuration defined above
buckets = build_lsh_idx(signatures, bands=BANDS, row_band= ROWS_PER_BAND)

## Get candidate neighbours for a game

In [None]:
# we first retrieve candidates that share at least one band with the query game
def lsh_candidates(doc_id, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND):
    """Return the ids of all documents sharing at least one band with ``doc_id``.

    The signature of ``doc_id`` is cut into ``bands`` slices of ``row_band``
    values. For each slice, the bucket stored under ``(band number, hash of
    the slice)`` is merged into the result. ``doc_id`` itself is never part of
    the returned set.
    """
    sig = signatures[doc_id]
    matches = set()
    for band in range(bands):
        offset = band * row_band
        key = (band, hash(tuple(sig[offset:offset + row_band])))
        matches.update(buckets.get(key, ()))
    matches.discard(doc_id)
    return matches


# Check it on one game: candidates sharing at least one band with game 0
doc = 0
candidates = lsh_candidates(doc, signatures, buckets)
print(f"Candidates for game 0: {candidates}")

## Estimate the Jaccard similarity from the MinHash signatures

In [None]:
def jaccard_sim(sig1, sig2):
    """Estimate the Jaccard similarity of two documents from their signatures.

    Returns the fraction of positions of ``sig1`` at which both signatures
    hold the same hash value. Positions holding ``None`` (no hash value
    available, as for an empty shingle set) never count as a match, so two
    empty documents score 0.0 instead of 1.0. An empty ``sig1`` also yields
    0.0 instead of raising ZeroDivisionError.
    """
    if not sig1:
        return 0.0

    total = sum(1 for a, b in zip(sig1, sig2) if a is not None and a == b)
    return total / len(sig1)

# Recommender

In [None]:
# For a given game idx:
# - Get the candidates
# - Compute similarity of signatures
# - Filter by a minimum similarity threshold
# - Combine similarity, rating and recency into a final score (0-1)
# Sort the  final score and print the top results. The weights or threshold can be changed into what wants to be emphasized. 
def similar_games(doc_id,
                  df,
                  signatures,
                  buckets,
                  bands=BANDS,
                  row_band=ROWS_PER_BAND,
                  thresh = 0.6,
                  w_sim = 0.7,
                  w_rating = 0.2,
                  w_date = 0.1,
                  rating_col = 'steam_rating',
                  recency_col = 'recency'):
    """Rank the LSH candidates of ``doc_id`` by a combined score.

    Every candidate returned by ``lsh_candidates`` is compared with the query
    through ``jaccard_sim`` on the signatures. Candidates whose similarity is
    below ``thresh`` are dropped; the others are scored as

        score = w_sim * sim + w_rating * rating + w_date * recency

    with ``rating`` and ``recency`` read from ``df`` at the candidate's label.

    Returns a list of ``(doc_id, sim, rating, recency, score)`` tuples sorted
    by descending score. Entries whose score is missing (NaN, for instance
    because the rating is missing) are placed after all scored entries. NaN
    compares false against everything, so leaving them in the sort would make
    the order of the whole list arbitrary.
    """
    sig0 = signatures[doc_id]
    cand = lsh_candidates(doc_id, signatures, buckets, bands, row_band)

    scored = []
    unscored = []
    for other in cand:
        sim = jaccard_sim(sig0, signatures[other])

        if sim < thresh:
            continue

        rating = df.loc[other, rating_col]
        recency = df.loc[other, recency_col]

        score = (
            w_sim*sim + w_rating*rating + w_date*recency
        )

        entry = (other, sim, rating, recency, score)
        (unscored if pd.isna(score) else scored).append(entry)

    scored.sort(key=lambda x: x[4], reverse=True)
    return scored + unscored


# Test it on one case: query game 0 with a lower similarity threshold than the default
idx = 0
res = similar_games(idx, df, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND, thresh=0.2)
print("Query:", df.loc[idx, "name"])
print("Found", len(res), "similar games")

# Show at most the first ten entries of the ranked result
for other, sim, rating, recency, score in res[:10]:
    print(
        " -->", df.loc[other, "name"],
        f"(Jaccard={sim:.2f}, rating={rating:.2f}, recency = {recency:.2f}, score={score:.2f})"
    )