# Min Hashing

In [6]:
#!pip install kagglehub
import kagglehub
import os
import shutil
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

from datasketch import MinHash, MinHashLSH


In [9]:
def load_data():
    # Contains movieId, title, genres   
    ml_movies = pd.read_csv('data/ml-32m/movies.csv')
    
    # Contains userId, movieId, rating, timestamp
    ml_ratings = pd.read_csv('data/ml-32m/ratings.csv')
    
    # Contains userId, movieId, tag, timestamp
    ml_tags = pd.read_csv('data/ml-32m/tags.csv')

    # Contains movieId, imdbId, tmdbId
    ml_links = pd.read_csv('data/ml-32m/links.csv')

    # IMDB Dataset:
    imdb = pd.read_csv('data/imdb/movies.csv')

    return ml_movies, ml_ratings, ml_tags, ml_links, imdb

ml_movies, ml_ratings, ml_tags, ml_links, imdb = load_data()

  imdb = pd.read_csv('data/imdb/movies.csv')


In [5]:
# Preprocessing IMDB:

#drop if description is "Add a plot" because we have no description then
imdb_descriptions = imdb[imdb['description'] != "Add a Plot"]

imdb_descriptions['id'] = imdb_descriptions['id'].str[2:]
imdb_descriptions['id'] = pd.to_numeric(imdb_descriptions['id'], errors='coerce')

#keep what we need
imdb_descriptions = imdb_descriptions[['id', 'description']]

#merge them, so we have a li nk between the two datasets
merged_links = ml_links.merge(imdb_descriptions, left_on='imdbId', right_on='id', how='inner').dropna(subset=['id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_descriptions['id'] = imdb_descriptions['id'].str[2:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_descriptions['id'] = pd.to_numeric(imdb_descriptions['id'], errors='coerce')


In [11]:
#merge the movies with the descriptions if that is what we want
ml_movies_description = ml_movies.merge(merged_links, left_on='movieId', right_on='movieId', how='inner')

ml_movies_description

NameError: name 'merged_links' is not defined

### Min-hashing

In [7]:
num_perm = 128

In [8]:
def run_or_read_minhash(run: bool, df=None, num_perm=num_perm):
# Sample DataFrame setup
# df = pd.DataFrame({'description': [...]})
    if run:
        # Define a function to min-hash a description
        def minhash_description(text, num_perm=num_perm):
            m = MinHash(num_perm=num_perm)
            for word in text.split():
                m.update(word.encode('utf8'))
            return m

        # Apply min-hashing on the 'description' column
        df['minhash'] = df['description'].apply(minhash_description)

        # (Optional) Convert min-hash signatures to a comparable format
        df['minhash_signature'] = df['minhash'].apply(lambda x: x.digest())

        df.to_pickle('data/df_min_hash.pkl')
        print('Minhashing done and saved to df_min_hash.pkl')
    else:
        df = pd.read_pickle('data/df_min_hash.pkl')
        print('Minhashing loaded from df_min_hash.pkl')
    return df



In [9]:
df_small = None # ml_movies_description
df_minhash = run_or_read_minhash(False, df=df_small, num_perm=num_perm)

Minhashing loaded from df_min_hash.pkl


### LSH 

In [13]:
threshold = 0.4

In [14]:
def run_or_read_lsh(run: bool, df_minhash, threshold, num_perm=num_perm):

    if run:
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        # Insert each min-hash signature into the LSH
        for idx, minhash in enumerate(df_minhash['minhash']):
            lsh.insert(idx, minhash)  # Use the DataFrame index as the unique ID

        # Function to find similar items using LSH
        def find_similar(description, lsh, num_perm=128):
            m = MinHash(num_perm=num_perm)
            for word in description.split():
                m.update(word.encode('utf8'))
            return lsh.query(m)

        # save similaritities to dict
        bands = {}
        for idx in tqdm(range(len(df_minhash))):
            bands[idx] = find_similar(df_minhash['description'][idx], lsh)

        # save similarities to file as pickle
        with open(f'data/lsh_groups_{threshold}.pkl', 'wb') as f:
            pickle.dump(bands, f)
        print('LSH done and saved to lsh_groups.pkl')
    else:
        with open(f'data/lsh_groups_{threshold}.pkl', 'rb') as f:
            bands = pickle.load(f)
        print('LSH loaded from lsh_groups.pkl')
    return bands



In [15]:
dict_lsh = run_or_read_lsh(False, df_minhash, threshold=threshold, num_perm=num_perm)


LSH loaded from lsh_groups.pkl


In [20]:
# calculate jaccard for second band (df_lsh[1])

for idx in dict_lsh[1]:
    jaccard_score = df_minhash['minhash'][1].jaccard(df_minhash['minhash'][idx])
    print(f'Jaccard similarity between 1 and {idx}: {jaccard_score}')

Jaccard similarity between 1 and 1: 1.0
Jaccard similarity between 1 and 28483: 0.09375
Jaccard similarity between 1 and 22974: 0.1015625
Jaccard similarity between 1 and 24229: 0.171875
Jaccard similarity between 1 and 33893: 0.09375
Jaccard similarity between 1 and 49630: 0.140625
Jaccard similarity between 1 and 43146: 0.1328125
Jaccard similarity between 1 and 30795: 0.0859375
Jaccard similarity between 1 and 1229: 0.15625
Jaccard similarity between 1 and 24110: 0.109375
Jaccard similarity between 1 and 50605: 0.125
Jaccard similarity between 1 and 20656: 0.0625
Jaccard similarity between 1 and 39312: 0.109375
Jaccard similarity between 1 and 27185: 0.1328125
Jaccard similarity between 1 and 9847: 0.1015625
Jaccard similarity between 1 and 43994: 0.1015625
Jaccard similarity between 1 and 25980: 0.125
Jaccard similarity between 1 and 45502: 0.0546875


Lidt mærkeligt at der er similarity scores på under threshold. Noget med at den minimerer false negatives og false positives.

*(The Jaccard similarity threshold between 0.0 and1.0. The initialized MinHash LSH will be optimized for the threshold by minizing the false positive and false negative.)*

In [21]:
def run_or_read_jaccard(run: bool, df_minhash, dict_lsh, threshold):
    if run:
        jaccard_similarities = {}

        for idx, similar_indices in tqdm(dict_lsh.items()):
            for similar_idx in similar_indices:
                if idx < similar_idx:  # Avoid duplicate pairs and self-comparison
                    jaccard_score = df_minhash['minhash'][idx].jaccard(df_minhash['minhash'][similar_idx])
                    jaccard_similarities[(idx, similar_idx)] = jaccard_score

        with open(f'data/jaccard_similarities_{threshold}.pkl', 'wb') as f:
            pickle.dump(jaccard_similarities, f)
        print("Jaccard similarities saved to 'data/jaccard_similarities.pkl'")
    else:
        with open(f'data/jaccard_similarities_{threshold}.pkl', 'rb') as f:
            jaccard_similarities = pickle.load(f)
        print("Jaccard similarities loaded from 'data/jaccard_similarities.pkl'")
    return jaccard_similarities


jaccard_similarities = run_or_read_jaccard(False, df_minhash, dict_lsh, threshold)


Jaccard similarities loaded from 'data/jaccard_similarities.pkl'


## Make recommendation logic

In [22]:
jaccard_similarities

{(0, 2128): 0.140625,
 (0, 20812): 0.078125,
 (1, 28483): 0.09375,
 (1, 22974): 0.1015625,
 (1, 24229): 0.171875,
 (1, 33893): 0.09375,
 (1, 49630): 0.140625,
 (1, 43146): 0.1328125,
 (1, 30795): 0.0859375,
 (1, 1229): 0.15625,
 (1, 24110): 0.109375,
 (1, 50605): 0.125,
 (1, 20656): 0.0625,
 (1, 39312): 0.109375,
 (1, 27185): 0.1328125,
 (1, 9847): 0.1015625,
 (1, 43994): 0.1015625,
 (1, 25980): 0.125,
 (1, 45502): 0.0546875,
 (2, 35338): 0.1796875,
 (2, 20621): 0.1171875,
 (2, 3342): 0.1796875,
 (2, 7439): 0.1484375,
 (2, 10004): 0.1484375,
 (2, 21141): 0.1328125,
 (2, 22549): 0.15625,
 (2, 13594): 0.203125,
 (2, 6941): 0.171875,
 (2, 33824): 0.203125,
 (2, 46241): 0.1328125,
 (2, 46497): 0.1640625,
 (2, 45348): 0.1328125,
 (2, 27823): 0.2265625,
 (2, 1843): 0.1875,
 (2, 436): 0.09375,
 (2, 21812): 0.1484375,
 (2, 36918): 0.15625,
 (2, 50230): 0.2109375,
 (2, 51253): 0.125,
 (2, 13757): 0.21875,
 (2, 40005): 0.1171875,
 (2, 50761): 0.171875,
 (2, 7246): 0.15625,
 (2, 26323): 0.140625,

In [None]:

# make dict to convert from index to movieId
index_to_id = dict(zip(df_minhash.index, df_minhash.movieId))
# and reverse
id_to_index = dict(zip(df_minhash.movieId, df_minhash.index))


def movie_recommendation_min_hash(movie_id, id_to_index=id_to_index, index_to_id=index_to_id, dict_lsh=dict_lsh, df_minhash=df_minhash):
    index = id_to_index[movie_id]
    similar_movies = []
    for idx in dict_lsh[index]:
        jaccard_score = df_minhash['minhash'][index].jaccard(df_minhash['minhash'][idx])
        similar_movies.append((index_to_id[idx], jaccard_score))
    
    # sort by jaccard score
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    # remove the movie itself
    similar_movies = [movie for movie in similar_movies if movie[0] != movie_id]
    
    return similar_movies



movie_recommendation_min_hash(1, id_to_index=id_to_index, index_to_id=index_to_id, dict_lsh=dict_lsh, df_minhash=df_minhash)

# make function to 

[(3270, 0.140625), (140016, 0.078125)]