LSH with embeddings for a Video Game Recommender

This notebook builds a simple LSH (locality-sensitive hashing) index on top of precomputed embeddings and provides a recommendation function.

It assumes you have already run the Embedding Builder notebook and generated:
- games_df.pkl: a DataFrame with game metadata
- embeddings.npy: a numpy array of shape (n_games, embedding_dim)

Steps taken:
1. Load the embeddings and game data
2. Generate random hyperplanes and build LSH index 
3. Implement LSH query to get candidate neighbours
4. Rank candidates using cosine similarity + rating + recency
5. Show example recommendations for a given game index


In [None]:
# Get the imports and embeddings

import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# In the original script we used timing/tracemalloc for profiling.
# In this notebook we focus on the logic and interactivity, so we skip that.

# Load the artefacts produced by the Embedding Builder notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# games_df.pkl that you generated yourself.
df = pd.read_pickle("games_df.pkl")
embeddings = np.load("embeddings.npy")
# Sanity check: the two numbers printed below are expected to agree
# (presumably one embedding row per game; verify against the builder notebook).
print("Embeddings shape:", embeddings.shape)
print("Number of games:", len(df))


In [None]:
# generate random hyperplanes
# Several sets of random hyperplanes are created, one set per LSH table; each hyperplane in a set is used to generate one bit of the hash key.
def generate_hyperplanes(num_tables, num_planes, dim, seed=0):
    """Draw unit-length random hyperplanes for every LSH table.

    Parameters
    ----------
    num_tables : int
        Number of independent plane sets to create (one per hash table).
    num_planes : int
        Number of planes in each set.
    dim : int
        Dimensionality of each plane's normal vector.
    seed : int, default 0
        Seed of the ``RandomState`` used for sampling, so results are
        reproducible.

    Returns
    -------
    list of numpy.ndarray
        ``num_tables`` arrays of shape ``(num_planes, dim)`` whose rows are
        Gaussian samples rescaled to (approximately) unit norm.
    """
    rng = np.random.RandomState(seed)

    def _draw_unit_planes():
        raw = rng.randn(num_planes, dim)
        lengths = np.linalg.norm(raw, axis=1, keepdims=True)
        # The small epsilon guards against dividing by a zero-length row.
        return raw / (lengths + 1e-9)

    # One draw per table, in order, so the random stream is consumed
    # table by table.
    return [_draw_unit_planes() for _ in range(num_tables)]

# Build the hyperplane sets used by the rest of the notebook.
num_tables, num_planes = 10, 16  # number of LSH tables / planes per table
dim = embeddings.shape[1]        # planes must match the embedding width
hyperplanes = generate_hyperplanes(num_tables, num_planes, dim, seed=33)
print(hyperplanes)

In [None]:
# Hash each embedding into buckets 
def hash_v(v, planes):
    """Return the bitstring bucket key of vector ``v`` for one plane set.

    The key has one character per row of ``planes``: ``'1'`` when that row's
    dot product with ``v`` is strictly positive and ``'0'`` otherwise (so a
    projection of exactly zero maps to ``'0'``).
    """
    side_of_plane = (planes @ v) > 0
    symbols = ('0', '1')
    return ''.join(symbols[int(positive)] for positive in side_of_plane)

In [None]:
# Get LSH index, which is tables with buckets
# We build one hash table per hyperplane set. Each table maps a bitstring key to a bucket of game indices whose embeddings fall into that region.

def build_lsh_idx(embeddings, hyperplanes):
    """Create one bucket table per hyperplane set and fill it with row indices.

    Parameters
    ----------
    embeddings : numpy.ndarray
        Two-dimensional array; each row is hashed once per table.
    hyperplanes : list
        Plane sets, one per table, passed to ``hash_v`` together with each row.

    Returns
    -------
    list of collections.defaultdict
        ``tables[t]`` maps a hash key to the set of row indices of
        ``embeddings`` that produced that key under ``hyperplanes[t]``.
    """
    # Unpacking doubles as a check that `embeddings` is two-dimensional.
    _num_rows, _num_dims = embeddings.shape

    tables = []
    for planes in hyperplanes:
        buckets = defaultdict(set)
        for row, vector in enumerate(embeddings):
            buckets[hash_v(vector, planes)].add(row)
        tables.append(buckets)
    return tables

# Build the index once for all embeddings; `tables` is passed to the query
# and recommendation calls further down.
tables = build_lsh_idx(embeddings, hyperplanes)

In [None]:
# query LSH; get candidate neighbours for a game
# Given a game idx, we compute its hash key in each table, collect all indices in the corresponding bucket, union them into a candidate set
def lsh_query(idx, embeddings, hyperplanes, tables):
    """Collect the indices that share a bucket with row ``idx`` in any table.

    The vector ``embeddings[idx]`` is hashed with every plane set; the
    buckets found under those keys (an empty set when a key is absent) are
    unioned together.

    Returns
    -------
    set
        Candidate row indices, with ``idx`` itself removed.
    """
    query_vec = embeddings[idx]
    matching_buckets = (
        tables[table_no].get(hash_v(query_vec, planes), set())
        for table_no, planes in enumerate(hyperplanes)
    )
    candidates = set().union(*matching_buckets)
    # A game should never be recommended to itself.
    candidates.discard(idx)
    return candidates

# Smoke test: fetch the LSH candidates for the game at row 0 and list their names.
cand = lsh_query(0, embeddings, hyperplanes, tables)
print("Candidates for game 0:", cand)
# `cand` holds row positions of `embeddings`; df.loc treats them as index labels
# (assumes df has a default 0..n-1 index aligned with the embeddings; TODO confirm).
for candidate in cand:
    print(df.loc[candidate, 'name'])

In [None]:
# Rank candidates by a weighted score of cosine similarity, rating and recency
# Using the candidates for a given game, the cosine similarity between the candidates and the game is computed and normalized to [0,1]. 
# Combining similarity, rating and recency, a final score is obtained for each candidate (0-1) and the results are sorted to print the top recommendations based on this score.
# The weights to similarity, rating and recency can be changed to emphasize one or the other. 
def recommend_lsh_embedding(idx, embeddings, hyperplanes, tables, df, top=5,
                            *, w_sim=0.7, w_rating=0.2, w_date=0.1):
    """Print and return the best-scoring LSH candidates for one game.

    Candidates come from ``lsh_query``. Each candidate gets the score
    ``w_sim * cos_norm + w_rating * rating + w_date * recency``, where
    ``cos_norm`` is the cosine similarity to the query rescaled from
    [-1, 1] to [0, 1]. Candidates are sorted by descending score.

    Parameters
    ----------
    idx : int
        Row of ``embeddings`` (and label in ``df``) of the query game.
    embeddings : numpy.ndarray
        Embedding matrix, one row per game.
    hyperplanes, tables
        The plane sets and bucket tables forwarded to ``lsh_query``.
    df : pandas.DataFrame
        Must provide the columns ``'name'``, ``'steam_rating'`` and
        ``'recency'``. Rows are looked up with ``df.loc`` using embedding
        row numbers (assumes a default 0..n-1 index; TODO confirm).
        ``steam_rating`` and ``recency`` are presumably pre-scaled to
        [0, 1]; verify against the Embedding Builder notebook.
    top : int, default 5
        Maximum number of recommendations to print and return.
    w_sim, w_rating, w_date : float, keyword-only
        Weights of similarity, rating and recency in the final score.
        The defaults (0.7 / 0.2 / 0.1) reproduce the original hard-coded
        weighting.

    Returns
    -------
    list of tuple
        ``(index, cos_norm, rating, recency, score)`` for up to ``top``
        candidates, best first; an empty list when there are no candidates.
    """
    query_vec = embeddings[idx]
    cand = lsh_query(idx, embeddings, hyperplanes, tables)

    if not cand:
        print(f"No candidates for {df.loc[idx, 'name']}")
        return []

    cand_indices = np.array(list(cand), dtype=int)

    sim = cosine_similarity(
        query_vec.reshape(1, -1),
        embeddings[cand_indices]
    )  # shape: (1, n_candidates)

    # Map cosine similarity from [-1, 1] onto [0, 1].
    cos_norm = (sim[0] + 1.0) / 2.0

    ratings = df.loc[cand_indices, 'steam_rating'].astype(float).to_numpy()
    recency = df.loc[cand_indices, 'recency'].astype(float).to_numpy()

    # A NaN rating/recency would make the whole score NaN. np.argsort places
    # NaN last, so the reversed order below would rank such candidates FIRST.
    # Missing values therefore contribute 0 to the score; the raw values are
    # still reported in the results.
    rating_term = np.where(np.isnan(ratings), 0.0, ratings)
    recency_term = np.where(np.isnan(recency), 0.0, recency)

    final_score = w_sim * cos_norm + w_rating * rating_term + w_date * recency_term

    # Descending order, truncated to the requested number of results.
    best = np.argsort(final_score)[::-1][:top]

    results = list(zip(
        cand_indices[best],
        cos_norm[best],
        ratings[best],
        recency[best],
        final_score[best]))

    print(f"Query: {df.loc[idx, 'name']}")
    print("Recommendations (cosine_norm, rating, recency, score):")

    for j, cos_s, r, rec, score in results:
        print(
            f" --> {df.loc[j, 'name']} "
            f"(cos={cos_s:.3f}, rating={r:.3f}, recency = {rec:.3f}, score={score:.3f})"
        )

    return results


# Smoke test: print (and return) up to 3 recommendations for the game at row 0.
recommend_lsh_embedding(0, embeddings, hyperplanes, tables, df, top=3)
