# 3.1 Embedding Builder 

This notebook is used to sentence embeddings for each game in the dataset

Inputs:

data/json/game_overview_final_vol2.json
Outputs:

embeddings_FINAL.py ; a matrix of shape (n_games, embedding_dim)
games_df_FINAL.pkl ; a pandas DataFrame with all the necessary features for posterior analysis

In [3]:
# Imports 
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

## Load dataset

In [4]:
# Load dataset
df = pd.read_json("../data/json/game_overview_final_vol2.json")
df = df.reset_index(drop=True)

print("Number of games:", len(df))
df.head()

Number of games: 10476


Unnamed: 0,game_id,name,summary,first_release_date,genres,platforms,companies,keywords,steam_rating
0,340494,Hunter Hitman,Hunt or be hunted. Execute stealth kills and s...,2025-01-02,Strategy,"Mac, PC (Microsoft Windows)",,,0.707317
1,327987,Slender: Reborn,Rediscover terror in this reimagined classic s...,2022-02-10,"Adventure, Indie, Simulator",PC (Microsoft Windows),"CreativeForge Games, Jeff Winner","creepypasta, dark, fangame, psychological horr...",0.648148
2,237944,Train Sim World 3: Amtrak's Acela,America's Fastest Train! Race along Northeast ...,2023-02-21,Simulator,PC (Microsoft Windows),,,0.716981
3,325912,DCS World: NS 430 Navigation System for Mi-8MT...,The NS 430 for the Mi-8MTV2 module integrates ...,2021-11-06,Simulator,PC (Microsoft Windows),Eagle Dynamics,,0.444444
4,301417,Temple of Shadows,Embark on an epic adventure as Sabastian Harri...,2024-11-27,"Adventure, Indie, Puzzle",PC (Microsoft Windows),DRB Studios,"archeology, physics puzzles, suspense",0.75


## Build text to embed

In [5]:
def make_text(row):
    parts = []
    
    for col in ['name', 'summary', 'genres', 'platforms', 'companies', 'keywords']:
        if col in row:
            val = row[col]
            if isinstance(val, str) and val.strip():
                parts.append(val)

    return " ".join(parts)

df['text'] = df.apply(make_text, axis=1)

print(df[['game_id', 'name', 'text']].head())
print(df.shape)

   game_id                                               name  \
0   340494                                      Hunter Hitman   
1   327987                                    Slender: Reborn   
2   237944                  Train Sim World 3: Amtrak's Acela   
3   325912  DCS World: NS 430 Navigation System for Mi-8MT...   
4   301417                                  Temple of Shadows   

                                                text  
0  Hunter Hitman Hunt or be hunted. Execute steal...  
1  Slender: Reborn Rediscover terror in this reim...  
2  Train Sim World 3: Amtrak's Acela America's Fa...  
3  DCS World: NS 430 Navigation System for Mi-8MT...  
4  Temple of Shadows Embark on an epic adventure ...  
(10476, 10)


In [6]:

# Convert the release date column to an actual date time with value 0-1, 0 being the oldest game in the dataset, and 1 being the most recent 
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

min_date = df["first_release_date"].min()
max_date = df["first_release_date"].max()

date_range_days = (max_date - min_date).days

def compute_recency(d):
    if pd.isna(d):
        return 0.5
    
    return (d - min_date).days/date_range_days
df["recency"] = df["first_release_date"].apply(compute_recency)

## Generate the embeddings and save themm

In [7]:
#Load embedding model (change device to 'cuda if GPU is desired to run  the SentenceTransformer)
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

#Encode to embedding
texts = df["text"].tolist()
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Print and normalize embeddings
print(embeddings.shape)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
print(df.loc[1])

Batches: 100%|██████████| 328/328 [02:13<00:00,  2.45it/s]

(10476, 384)
game_id                                                          327987
name                                                    Slender: Reborn
summary               Rediscover terror in this reimagined classic s...
first_release_date                                  2022-02-10 00:00:00
genres                                      Adventure, Indie, Simulator
platforms                                        PC (Microsoft Windows)
companies                              CreativeForge Games, Jeff Winner
keywords              creepypasta, dark, fangame, psychological horr...
steam_rating                                                   0.648148
text                  Slender: Reborn Rediscover terror in this reim...
recency                                                        0.929315
Name: 1, dtype: object





In [8]:
# Save embeddings and the dataset to use in the next methods
np.save("../data/embeddings_FINAL.npy", embeddings)
df.to_pickle("../data/games_df_FINAL.pkl") 

# 3.2.2 LSH with embeddings 

This notebook builds a simple LSH index on top of precomputed embeddings and provides a recommendation function

It assumes you already have run the Embedding Builder notebook and generated:

games_df.pkl; a DataFrame with game metadata
embeddings.npy; numpy array of shape (n_games, embedding_dim)
Steps taken:

Load the embeddings and game data
Generate random hyperplanes and build LSH index
Implement LSH query to get candidate neighbours
Rank candidates using cosine similarity + rating + recency
Show example recommendations for a given game index

## Get the imports and embeddings

In [9]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# In the original script we used timing/tracemalloc for profiling.
# In this notebook we focus on the logic and interactivity, so we skip that.

df = pd.read_pickle("../data/games_df_FINAL.pkl")
embeddings = np.load("../data/embeddings_FINAL.npy")
print("Embeddings shape:", embeddings.shape)
print("Number of games:", len(df))

Embeddings shape: (10476, 384)
Number of games: 10476


## generate random hyperplanes

In [10]:
# Several sets of random hyperplanes are created, each corresponding to one LSH table, and each hyperplane used to generate one bit of the hash key
def generate_hyperplanes(num_tables, num_planes, dim, seed= 0):
    rng = np.random.RandomState(seed)
    hyperplanes = []
    for _ in range(num_tables):
        planes = rng.randn(num_planes, dim)
        planes /= np.linalg.norm(planes, axis = 1, keepdims = True) +1e-9
        hyperplanes.append(planes)
    
    return hyperplanes

# check it
dim = embeddings.shape[1]
num_tables = 10
num_planes = 16

hyperplanes = generate_hyperplanes(num_tables, num_planes, dim, seed=33)
print(hyperplanes)

[array([[-0.01636433, -0.0822688 , -0.07879106, ..., -0.03037444,
         0.02857754,  0.05055213],
       [ 0.04226748,  0.01131865, -0.00684458, ..., -0.00090081,
        -0.04132469,  0.03906224],
       [ 0.00036863, -0.00614149, -0.0029053 , ..., -0.02039806,
        -0.06660375, -0.00144424],
       ...,
       [-0.02160066,  0.00627249,  0.05156927, ...,  0.04815518,
         0.03370179, -0.03259623],
       [-0.16762874,  0.01323529,  0.035048  , ...,  0.07726289,
         0.00270281,  0.01887253],
       [ 0.00026104,  0.00025794,  0.05709854, ..., -0.07947919,
         0.00487741,  0.00843518]], shape=(16, 384)), array([[ 0.01588397,  0.08047979, -0.00158552, ...,  0.00705402,
         0.04997843,  0.02041587],
       [ 0.01394603, -0.08109805, -0.02365737, ..., -0.12642717,
        -0.02261045,  0.05341474],
       [-0.05057007,  0.06450797,  0.05145814, ..., -0.04600032,
         0.03888609, -0.01980033],
       ...,
       [-0.02351672, -0.00966409, -0.03954437, ...,  0.0

## Hash each embedding into buckets 

In [11]:
def hash_v(v, planes):
    projections = planes @ v
    bits = projections > 0
    key = ''.join('1' if b else '0' for b in bits)
    return key

## Get LSH index, which is tables with buckets

In [12]:
# We build one hash table per hyperplane set. Each table maps a bitstring key to a bucket of game indices whose embedding fall into that region

def build_lsh_idx(embeddings, hyperplanes):
    n, dim = embeddings.shape
    num_tables = len(hyperplanes)

    tables = [defaultdict(set) for _ in range(num_tables)]


    for idx, v in enumerate(embeddings):
        for t, planes in enumerate(hyperplanes):
            key = hash_v(v, planes)
            tables[t][key].add(idx)
    
    return tables

# check it
tables = build_lsh_idx(embeddings, hyperplanes)

## query LSH; get candidate neighbours for a game

In [13]:
# Given a game idx, we compute its hash key in each table, collect all indices in the corresponding bucket, union them into a candidate set
def lsh_query(idx, embeddings, hyperplanes, tables):
    v = embeddings[idx]
    candidates = set()

    for t, planes in enumerate(hyperplanes):
        key = hash_v(v, planes)
        bucket = tables[t].get(key, set())
        candidates |= bucket

    candidates.discard(idx)
    return candidates

#test
cand = lsh_query(0, embeddings, hyperplanes, tables)
print("Candidates for game 0:", cand)
for candidate in cand:
    print(df.loc[candidate, 'name'])

Candidates for game 0: {2050, 10441, 7566, 848, 7442, 4948, 10265, 8476, 2208, 10144, 5221, 6376, 3116, 8428, 3886, 6323, 1141, 9205, 2551, 5368, 4921, 6907, 1598}
Trine 2: Goblin Menace
Rake Remastered
Total War: Shogun 2 - Fall of the Samurai: The Saga Faction Pack
DNF Duel: Season Pass
Lords of the Fallen: The Foundation Boost
Tri.Defender
Injustice 2: Reverse Flash
Deep Space Scoundrel
Quake III: Team Arena
Running into the Cyberpunk
Slime VS. Female Hero Party
Scratch Man
Renegade Ops: Reinforcement Pack
National Zombie Park
Electric Zombies
Ballistic Protection
Astronite
Freebooter of Splorr!!
Darkest Dungeon: The Musketeer
Naruto x Boruto: Ultimate Ninja Storm Connection - DLC Pack 2
TROUBLESHOOTER: Abandoned Children - Giselle's Costume Set
Sweet Berry Crush
Auto Dungeon


## Recommender

In [14]:
# Using the candidates for a given game, the cosine similarity between the candidates and the game is computed and normalized to [0,1]. 
# Combining similarity, rating and recency, a final score is obtained for each candidate (0-1) and the results are sorted to print the top recommendations based on this score.
# The weights to similarity, rating and recency can be changed to emphasize one or the other. 
def recommend_lsh_embedding(idx, embeddings, hyperplanes, tables, df, top=5):
    v = embeddings[idx]
    cand = lsh_query(idx, embeddings, hyperplanes, tables)

    if not cand:
        print(f"No candidates for {df.loc[idx, 'name']}")
        return []

    cand_indices = np.array(list(cand), dtype=int)

    sim = cosine_similarity(
        v.reshape(1, -1),
        embeddings[cand_indices]
    )  # shape: (1, n_candidates)

    sim_flat = sim[0]  
    cos_norm = (sim_flat + 1.0) / 2.0


    ratings = df.loc[cand_indices, 'steam_rating'].astype(float).to_numpy()
    recency = df.loc[cand_indices, 'recency'].astype(float).to_numpy()


    w_sim = 0.7
    w_rating = 0.2
    w_date = 0.1

    final_score = (w_sim*cos_norm + w_rating*ratings + w_date*recency)
    
    order = np.argsort(final_score)[::-1]
    ordered_indices = cand_indices[order]
    ordered_cos_norm = cos_norm[order]    
    ordered_rating = ratings[order]
    ordered_recency = recency[order]
    ordered_score = final_score[order]

    results = list(zip(
        ordered_indices[:top], 
        ordered_cos_norm[:top],
        ordered_rating[:top],
        ordered_recency[:top],
        ordered_score[:top]))

    print(f"Query: {df.loc[idx, 'name']}")
    print("Recommendations (cosine_norm, rating, recency, score):")

    for j, cos_s, r, rec, score in results:
        print(
            f" --> {df.loc[j, 'name']} "
            f"(cos={cos_s:.3f}, rating={r:.3f}, recency = {rec:.3f}, score={score:.3f})"
        )

    return results


#test
recommend_lsh_embedding(0, embeddings, hyperplanes, tables, df, top=3)

Query: Hunter Hitman
Recommendations (cosine_norm, rating, recency, score):
 --> Running into the Cyberpunk (cos=0.677, rating=1.000, recency = 0.941, score=0.768)
 --> Scratch Man (cos=0.681, rating=0.923, recency = 0.955, score=0.757)
 --> Freebooter of Splorr!! (cos=0.654, rating=1.000, recency = 0.919, score=0.750)


[(np.int64(10144),
  np.float32(0.6766021),
  np.float64(1.0),
  np.float64(0.9410124943999204),
  np.float64(0.7677227370574846)),
 (np.int64(6376),
  np.float32(0.6806127),
  np.float64(0.9230769231),
  np.float64(0.9554482552640748),
  np.float64(0.7565890765328211)),
 (np.int64(9205),
  np.float32(0.6544188),
  np.float64(1.0),
  np.float64(0.9187117327890886),
  np.float64(0.7499643396302271))]

# 3.2.1 LSH with tokens 
This notebook implements an LSH-based recommender using token-shingles and MinHash signatures

It:

Loads the raw game dataset
builds a text field per game
Creates word shingles
Computes MinHash signatures
Builds LSH using banding
Finds candidate similar games and ranks them using: Jaccard similarity, rating and recency.

In [15]:
# Load the necessary packages
import pandas as pd
import mmh3
from  collections import defaultdict
import re

## Get the text from the dataset and build the text to shingle

In [16]:
# Normalize the text so that no "weird" shingles are formed later
df = pd.read_json("../data/json/game_overview_final_vol2.json")
df = df.reset_index(drop=True)

def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.replace(";", " ")         
    text = re.sub(r"[^a-z0-9\s]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def make_text(row):
    parts =[]

    for col in ['name', 'summary', 'genres', 'platforms', 'companies', 'keywords']:
        if col in row:
            val = row[col]
            if isinstance(val, str) and val.strip():
                parts.append(val)
    
    return " ".join(parts)

df['text'] = df.apply(make_text, axis = 1)

print(df[["game_id", "name", "text"]].head())
df.shape

   game_id                                               name  \
0   340494                                      Hunter Hitman   
1   327987                                    Slender: Reborn   
2   237944                  Train Sim World 3: Amtrak's Acela   
3   325912  DCS World: NS 430 Navigation System for Mi-8MT...   
4   301417                                  Temple of Shadows   

                                                text  
0  Hunter Hitman Hunt or be hunted. Execute steal...  
1  Slender: Reborn Rediscover terror in this reim...  
2  Train Sim World 3: Amtrak's Acela America's Fa...  
3  DCS World: NS 430 Navigation System for Mi-8MT...  
4  Temple of Shadows Embark on an epic adventure ...  


(10476, 10)

## Compute a normalized recency score [0, 1] from the release date

In [17]:
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

min_date = df["first_release_date"].min()
max_date = df["first_release_date"].max()

date_range_days = (max_date - min_date).days

def compute_recency(d):
    if pd.isna(d):
        return 0.5
    
    return (d - min_date).days/date_range_days
df["recency"] = df["first_release_date"].apply(compute_recency)

## q - shingles 

In [18]:
# We turn each game's text into a set of word shingles. These shingles are the input to the MinHash function.
def shingle(text: str, q: int = 2):
    text = normalize_text(text)
    words = text.split()
    shingles = set()
    for i in range(len(words) - q + 1):
        sh = " ".join(words[i:i+q])
        shingles.add(sh)
    return shingles

NUM_HASHES = 100
BANDS = 50
ROWS_PER_BAND = 2

# Checking it works
print(df.loc[1, "text"])
print(shingle(df.loc[1, "text"], q=2))

Slender: Reborn Rediscover terror in this reimagined classic survival horror. Collect eight pages and evade the relentless Slenderman in a hauntingly realistic world rebuilt with updated graphics, adaptive AI, and immersive sound design. Prepare for unpredictable scares and a chilling atmosphere that redefines fear Adventure, Indie, Simulator PC (Microsoft Windows) CreativeForge Games, Jeff Winner creepypasta, dark, fangame, psychological horror, simulation, slender, slender man, unofficial remake
{'design prepare', 'collect eight', 'adventure indie', 'fear adventure', 'dark fangame', 'adaptive ai', 'windows creativeforge', 'horror simulation', 'the relentless', 'microsoft windows', 'updated graphics', 'pc microsoft', 'prepare for', 'jeff winner', 'winner creepypasta', 'psychological horror', 'reimagined classic', 'simulator pc', 'a chilling', 'rebuilt with', 'a hauntingly', 'with updated', 'creepypasta dark', 'slender slender', 'unpredictable scares', 'unofficial remake', 'for unpredi

## Compute a Minhash signature of length "num_hases"

In [19]:
def minhash_sign(shingles, num_hashes = NUM_HASHES):
    signature = []
    for seed in range(num_hashes):
        min_val = None
        for sh in shingles:
            h = mmh3.hash(sh, seed, signed=False)
            if (min_val is None) or (h < min_val):
                min_val = h
        signature.append(min_val)
    return signature

## get the signature for each game

In [20]:
def compute_sign(df, q=2, num_hashes = NUM_HASHES):
    signatures = {}
    shingle_cache = {}

    for idx, row in df.iterrows():
        text = row["text"]
        shingles = shingle(text, q = q)
        shingle_cache[idx] = shingles
        sig = minhash_sign(shingles, num_hashes= num_hashes)
        signatures[idx] = sig

    return signatures, shingle_cache

signatures, shingle_cache = compute_sign(df, q= 2, num_hashes= NUM_HASHES)

## Get LSH bands and buckets

In [21]:
def build_lsh_idx(signatures, bands=BANDS, row_band=ROWS_PER_BAND):
    buckets = defaultdict(set)
    for doc_id, sig in signatures.items():
        assert len(sig) == bands * row_band
        for b in range(bands):
            start = b * row_band
            end = start + row_band
            band_slice = tuple(sig[start:end])
            band_hash = hash(band_slice)
            buckets[(b, band_hash)].add(doc_id)
    return buckets

buckets = build_lsh_idx(signatures, bands=BANDS, row_band= ROWS_PER_BAND)

## Get candidate neighbours for a game

In [22]:
# we first retrieve candidates that share at least one band with the query game
def lsh_candidates(doc_id, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND):
    sig = signatures[doc_id]
    candidates = set()
    for b in range(bands):
        start = b * row_band
        end = start + row_band
        band_slice = tuple(sig[start:end])
        band_hash = hash(band_slice)
        bucket_docs = buckets.get((b, band_hash), set())

        for other in bucket_docs:
            if other != doc_id:
                candidates.add(other)
    return candidates


# Check it on one game
doc = 0
candidates = lsh_candidates(doc, signatures, buckets)
print(f"Candidates for game 0: {candidates}")

Candidates for game 0: {2050, 4098, 6, 10247, 2056, 4105, 10252, 10253, 2067, 8211, 6165, 22, 10259, 8217, 4125, 2078, 30, 8224, 10274, 8228, 10276, 10279, 2088, 4137, 4138, 4142, 53, 8246, 57, 6201, 6202, 60, 2112, 8258, 2117, 6213, 6215, 10313, 75, 10316, 2124, 10319, 4175, 6225, 83, 6228, 86, 6231, 6232, 2134, 6234, 8283, 10332, 4192, 10337, 2146, 6242, 4196, 8290, 10340, 6247, 10349, 111, 112, 2164, 10365, 125, 6269, 8317, 2180, 4228, 134, 10374, 6280, 2186, 2188, 2189, 6287, 8337, 6292, 6293, 4248, 8344, 4250, 2203, 6303, 10399, 8354, 2214, 4263, 2221, 2222, 6320, 10436, 8390, 10443, 4316, 4318, 4319, 2272, 2273, 226, 8414, 6372, 229, 2283, 8427, 10475, 240, 241, 4336, 6392, 2298, 6396, 4351, 8454, 2322, 6418, 6419, 6421, 8470, 281, 283, 8479, 6436, 8486, 8488, 300, 4402, 6451, 2357, 4407, 4410, 6461, 2367, 320, 326, 328, 4427, 6476, 4435, 4437, 2390, 2391, 4445, 8543, 2401, 8545, 8552, 6506, 4460, 369, 6515, 2421, 376, 2430, 4481, 2434, 8579, 8581, 4487, 392, 2441, 4490, 8592, 40

## Calculate the Jaccard Similarity 

In [23]:
def jaccard_sim(sig1, sig2):
    total = sum(1 for a, b in zip(sig1, sig2) if a == b)
    return total/len(sig1)

# Recommender

In [24]:
# For a given game idx:
# - Get the candidates
# - Compute similarity of signatures
# - Filter by a minimum similarity threshold
# - Combine similarity, rating and recency into a final score (0-1)
# Sort the  final score and print the top results. The weights or threshold can be changed into what wants to be emphasized. 
def similar_games(doc_id, 
                  df, 
                  signatures,
                  buckets, 
                  bands=BANDS, 
                  row_band=ROWS_PER_BAND, 
                  thresh = 0.6,
                  w_sim = 0.7,
                  w_rating = 0.2,
                  w_date = 0.1,
                  rating_col = 'steam_rating',
                  recency_col = 'recency'):
    sig0 = signatures[doc_id]
    cand = lsh_candidates(doc_id, signatures, buckets, bands, row_band)

    results = []
    for other in cand:
        sim = jaccard_sim(sig0, signatures[other])

        if sim < thresh:
            continue
        
        rating = df.loc[other, rating_col]
        recency = df.loc[other, recency_col]
        
        score = (
            w_sim*sim + w_rating*rating + w_date*recency
        )

        results.append((other, sim, rating, recency, score))

    results.sort(key=lambda x:x[4], reverse=True)
    return results


#test it in one case:
idx = 0
res = similar_games(idx, df, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND, thresh=0.2)
print("Query:", df.loc[idx, "name"])
print("Found", len(res), "similar games")

for other, sim, rating, recency, score in res[:10]:
    print(
        " -->", df.loc[other, "name"],
        f"(Jaccard={sim:.2f}, rating={rating:.2f}, recency = {recency:.2f}, score={score:.2f})"
    )

Query: Hunter Hitman
Found 2 similar games
 --> Chill Seekers (Jaccard=0.23, rating=1.00, recency = 0.97, score=0.46)
 --> Beast Collector (Jaccard=0.22, rating=0.55, recency = 0.96, score=0.36)
