LSH with tokens for a Video Game Recommender
This notebook implements an LSH-based recommender using token-shingles and MinHash signatures

It:
1. Loads the raw game dataset
2. Builds a text field per game
3. Creates word shingles
4. Computes MinHash signatures 
5. Builds LSH using banding
6. Finds candidate similar games and ranks them by a weighted combination of estimated Jaccard similarity (from the MinHash signatures), rating and recency.

In [1]:
# Load the necessary packages
import pandas as pd
import mmh3
from  collections import defaultdict
import re
pd.set_option('display.max_colwidth', None)


In [None]:
# Load the raw game dataset and build, per game, the text that will be shingled.
# The text is normalized (see normalize_text) when it is shingled, so that no
# "weird" shingles are formed later.
# reset_index(drop=True) replaces the loaded index with a fresh 0..n-1 range.
df = pd.read_json("../data/json/game_overview_final_vol2.json").reset_index(drop=True)

def normalize_text(text: str) -> str:
    """Return *text* lower-cased and reduced to space-separated [a-z0-9] tokens.

    Semicolons and every run of characters other than a-z, 0-9 or whitespace
    are replaced by a single space; whitespace runs are then collapsed and
    the result is stripped at both ends.
    """
    lowered = text.lower().replace(";", " ")
    # Anything that is not a lowercase letter, digit or whitespace becomes a separator
    separated = re.sub(r"[^a-z0-9\s]+", " ", lowered)
    return re.sub(r"\s+", " ", separated).strip()

def make_text(row):
    """Join a game's descriptive string fields into one space-separated text.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting ``in`` and
            ``[]`` lookups by column name.

    Returns:
        The values of name, summary, genres, platforms, companies and
        keywords, in that order, joined by single spaces. A field is skipped
        when it is absent, not a ``str``, or contains only whitespace.
    """
    text_columns = ('name', 'summary', 'genres', 'platforms', 'companies', 'keywords')
    present_values = (row[col] for col in text_columns if col in row)
    return " ".join(
        value
        for value in present_values
        if isinstance(value, str) and value.strip()
    )

# Combine the descriptive columns of every game into a single "text" column
df["text"] = df.apply(make_text, axis=1)

# Preview the first rows; the trailing bare expression is presumably shown
# as the notebook cell's output
print(df[["game_id", "name", "text"]].head(n=5))
df.shape


In [None]:
# Compute a normalized recency score [0, 1] from the release date.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
df["first_release_date"] = pd.to_datetime(df["first_release_date"], errors="coerce")

# Oldest and newest release in the dataset, and the span between them in whole days
min_date, max_date = df["first_release_date"].min(), df["first_release_date"].max()
date_range_days = (max_date - min_date).days

def compute_recency(d):
    """Map a release date onto [0, 1] relative to the dataset's date span.

    Reads the module-level ``min_date`` and ``date_range_days``.

    Args:
        d: Release timestamp of one game, or a missing value (NaT/NaN/None).

    Returns:
        ``(d - min_date).days / date_range_days``, or the neutral value 0.5
        when the date is missing or when the dataset spans zero days (every
        dated game released on the same day), which would otherwise raise
        ZeroDivisionError.
    """
    # The missing-date check comes first so the globals are not needed for NaT
    if pd.isna(d) or date_range_days == 0:
        return 0.5

    return (d - min_date).days / date_range_days
# Store the recency score of every game (compute_recency gives 0.5 for missing dates)
df["recency"] = df["first_release_date"].apply(compute_recency)

In [None]:
# q - shingles 
# We turn each game's text into a set of word shingles. These shingles are the input to the MinHash function.
def shingle(text: str, q: int = 2):
    """Return the set of q-word shingles of *text*.

    The text is first passed through ``normalize_text`` and split on
    whitespace; every run of ``q`` consecutive words, joined by single
    spaces, becomes one shingle.

    Args:
        text: Raw text of one game.
        q: Number of words per shingle; must be at least 1.

    Returns:
        A set of shingle strings. A non-empty text with fewer than ``q``
        words yields one shingle holding the whole text, so short texts
        keep a distinct, non-empty shingle set. An empty text yields an
        empty set.

    Raises:
        ValueError: If ``q`` is smaller than 1.
    """
    if q < 1:
        raise ValueError(f"q must be >= 1, got {q}")

    words = normalize_text(text).split()

    # Without this fallback every text shorter than q words would produce an
    # empty set, and all such games would end up with identical signatures.
    if 0 < len(words) < q:
        return {" ".join(words)}

    return {" ".join(words[i:i + q]) for i in range(len(words) - q + 1)}

# LSH configuration. build_lsh_idx requires every signature to have exactly
# bands * row_band entries, so the signature length is derived from the banding.
BANDS = 50
ROWS_PER_BAND = 2
NUM_HASHES = BANDS * ROWS_PER_BAND

# Sanity check on one game: print its raw text, then its 2-word shingles
sample_text = df.loc[1, "text"]
print(sample_text)
print(shingle(sample_text, q=2))

In [None]:
# Compute a MinHash signature of length "num_hashes"
def minhash_sign(shingles, num_hashes = NUM_HASHES):
    """Build the MinHash signature of a shingle collection.

    Args:
        shingles: Iterable of shingle strings.
        num_hashes: Number of signature entries; entry ``i`` uses ``i`` as
            the seed passed to ``mmh3.hash``.

    Returns:
        A list of ``num_hashes`` values. Each value is the smallest
        ``mmh3.hash(sh, seed, signed=False)`` over all shingles for that
        seed, or ``None`` when ``shingles`` is empty.
    """
    return [
        min((mmh3.hash(sh, seed, signed=False) for sh in shingles), default=None)
        for seed in range(num_hashes)
    ]

In [5]:
# get the signature for each game
def compute_sign(df, q=2, num_hashes = NUM_HASHES):
    """Compute shingle sets and MinHash signatures for every row of *df*.

    Args:
        df: DataFrame with a "text" column.
        q: Words per shingle, forwarded to ``shingle``.
        num_hashes: Signature length, forwarded to ``minhash_sign``.

    Returns:
        A tuple ``(signatures, shingle_cache)``; both are dicts keyed by the
        DataFrame index label, holding the signature list and the shingle
        set of that row respectively.
    """
    # First pass: one shingle set per row, keyed by index label
    shingle_cache = {
        idx: shingle(text, q=q)
        for idx, text in df["text"].items()
    }
    # Second pass: hash each cached shingle set into its signature
    signatures = {
        idx: minhash_sign(row_shingles, num_hashes=num_hashes)
        for idx, row_shingles in shingle_cache.items()
    }
    return signatures, shingle_cache


signatures, shingle_cache = compute_sign(df, q=2, num_hashes=NUM_HASHES)

In [6]:
#Get LSH bands and buckets
def build_lsh_idx(signatures, bands=BANDS, row_band=ROWS_PER_BAND):
    """Group documents into LSH buckets by banding their signatures.

    Each signature is cut into ``bands`` consecutive slices of ``row_band``
    values. Documents whose slice for a given band is identical land in the
    same bucket.

    Args:
        signatures: Dict mapping a document id to its signature sequence.
        bands: Number of bands.
        row_band: Number of signature values per band.

    Returns:
        A ``defaultdict(set)`` mapping ``(band_index, hash(band_tuple))`` to
        the set of document ids in that bucket. ``lsh_candidates`` looks up
        the same key format.

    Raises:
        ValueError: If a signature does not contain exactly
            ``bands * row_band`` values.
    """
    expected_len = bands * row_band
    buckets = defaultdict(set)

    for doc_id, sig in signatures.items():
        # Explicit check rather than assert, which is removed under `python -O`
        if len(sig) != expected_len:
            raise ValueError(
                f"signature of document {doc_id!r} has length {len(sig)}, "
                f"expected bands * row_band = {expected_len}"
            )
        for b in range(bands):
            start = b * row_band
            band_slice = tuple(sig[start:start + row_band])
            buckets[(b, hash(band_slice))].add(doc_id)

    return buckets


buckets = build_lsh_idx(signatures, bands=BANDS, row_band=ROWS_PER_BAND)



In [None]:
# get candidate neighbours for a game
# we first retrieve candidates that share at least one band with the query game
def lsh_candidates(doc_id, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND):
    """Return the ids of all documents sharing at least one LSH bucket with *doc_id*.

    Args:
        doc_id: Id of the query document; must be a key of ``signatures``.
        signatures: Dict mapping document ids to signature sequences.
        buckets: Mapping from ``(band_index, hash(band_tuple))`` to sets of
            document ids, as returned by ``build_lsh_idx``.
        bands: Number of bands.
        row_band: Number of signature values per band.

    Returns:
        A set of document ids, never containing ``doc_id`` itself.
    """
    sig = signatures[doc_id]
    found = set()
    for b in range(bands):
        lo = b * row_band
        key = (b, hash(tuple(sig[lo:lo + row_band])))
        # .get() keeps a defaultdict from growing on a miss
        found.update(buckets.get(key, ()))

    # The query document always shares its own buckets; drop it once at the end
    found.discard(doc_id)
    return found


# Check it on one game
doc = 0
candidates = lsh_candidates(doc, signatures, buckets)
print(f"Candidates for game 0: {candidates}")




In [8]:
# Estimate the Jaccard similarity from two MinHash signatures
def jaccard_sim(sig1, sig2):
    """Return the fraction of positions at which two signatures agree.

    Args:
        sig1: First signature sequence.
        sig2: Second signature sequence, of the same length.

    Returns:
        A float in [0, 1]. Empty signatures give 0.0. A ``None`` entry, which
        ``minhash_sign`` produces for an empty shingle set, never counts as a
        match, so two documents without shingles are not reported as identical.

    Raises:
        ValueError: If the signatures differ in length.
    """
    if len(sig1) != len(sig2):
        raise ValueError(
            f"signature lengths differ: {len(sig1)} != {len(sig2)}"
        )
    if len(sig1) == 0:
        return 0.0

    matches = sum(1 for a, b in zip(sig1, sig2) if a is not None and a == b)
    return matches / len(sig1)

In [None]:
# For a given game idx:
# - Get the candidates
# - Compute similarity of signatures
# - Filter by a minimum similarity threshold
# - Combine similarity, rating and recency into a final score (0-1)
# Sort by the final score (highest first) and return the ranked results; the top ones are printed below. The weights and the threshold can be changed depending on what should be emphasized.
def similar_games(doc_id,
                  df,
                  signatures,
                  buckets,
                  bands=BANDS,
                  row_band=ROWS_PER_BAND,
                  thresh=0.6,
                  w_sim=0.7,
                  w_rating=0.2,
                  w_date=0.1,
                  rating_col='steam_rating',
                  recency_col='recency'):
    """Rank the LSH candidates of one game by a weighted score.

    Args:
        doc_id: Index label of the query game.
        df: DataFrame holding ``rating_col`` and ``recency_col``, indexed by
            the same labels used as keys of ``signatures``.
        signatures: Dict mapping index labels to MinHash signatures.
        buckets: LSH buckets as returned by ``build_lsh_idx``.
        bands: Number of LSH bands.
        row_band: Number of signature values per band.
        thresh: Candidates with a signature similarity below this are dropped.
        w_sim: Weight of the signature similarity.
        w_rating: Weight of the rating.
        w_date: Weight of the recency.
        rating_col: Column of ``df`` holding the rating.
        recency_col: Column of ``df`` holding the recency score.

    Returns:
        A list of ``(other_id, sim, rating, recency, score)`` tuples sorted
        by ``score`` in descending order.
    """
    query_sig = signatures[doc_id]
    neighbours = lsh_candidates(doc_id, signatures, buckets, bands, row_band)

    ranked = []
    for cand_id in neighbours:
        sim = jaccard_sim(query_sig, signatures[cand_id])
        if sim < thresh:
            continue

        rating, recency = df.loc[cand_id, rating_col], df.loc[cand_id, recency_col]

        # NOTE(review): the score only stays within 0-1 if the rating column is
        # on a 0-1 scale, and a NaN rating would give a NaN score — confirm
        # against the dataset.
        score = w_sim*sim + w_rating*rating + w_date*recency
        ranked.append((cand_id, sim, rating, recency, score))

    # Best score first
    return sorted(ranked, key=lambda entry: entry[4], reverse=True)


# Test it on one game, with a lower threshold than the default
idx = 0
res = similar_games(idx, df, signatures, buckets, bands=BANDS, row_band=ROWS_PER_BAND, thresh=0.2)
print("Query:", df.loc[idx, "name"])
print("Found", len(res), "similar games")

# Show at most the ten best-scoring matches
for other, sim, rating, recency, score in res[:10]:
    details = f"(Jaccard={sim:.2f}, rating={rating:.2f}, recency = {recency:.2f}, score={score:.2f})"
    print(" -->", df.loc[other, "name"], details)