# Min Hashing

In [None]:
#!pip install kagglehub
import kagglehub
import os
import shutil
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

from datasketch import MinHash, MinHashLSH


In [2]:
def load_data():
    # Contains movieId, title, genres   
    ml_movies = pd.read_csv('data/ml-32m/movies.csv')
    
    # Contains userId, movieId, rating, timestamp
    ml_ratings = pd.read_csv('data/ml-32m/ratings.csv')
    
    # Contains userId, movieId, tag, timestamp
    ml_tags = pd.read_csv('data/ml-32m/tags.csv')

    # Contains movieId, imdbId, tmdbId
    ml_links = pd.read_csv('data/ml-32m/links.csv')

    # IMDB Dataset:
    imdb = pd.read_csv('data/imdb/movies.csv')

    return ml_movies, ml_ratings, ml_tags, ml_links, imdb

ml_movies, ml_ratings, ml_tags, ml_links, imdb = load_data()

  imdb = pd.read_csv('data/imdb/movies.csv')


In [3]:
# Preprocessing IMDB:

#drop if description is "Add a plot" because we have no description then
imdb_descriptions = imdb[imdb['description'] != "Add a Plot"]

imdb_descriptions['id'] = imdb_descriptions['id'].str[2:]
imdb_descriptions['id'] = pd.to_numeric(imdb_descriptions['id'], errors='coerce')

#keep what we need
imdb_descriptions = imdb_descriptions[['id', 'description']]

#merge them, so we have a li nk between the two datasets
merged_links = ml_links.merge(imdb_descriptions, left_on='imdbId', right_on='id', how='inner').dropna(subset=['id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_descriptions['id'] = imdb_descriptions['id'].str[2:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_descriptions['id'] = pd.to_numeric(imdb_descriptions['id'], errors='coerce')


In [4]:
#merge the movies with the descriptions if that is what we want
ml_movies_description = ml_movies.merge(merged_links, left_on='movieId', right_on='movieId', how='inner')

ml_movies_description

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,A cowboy doll is profoundly threatened and jea...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,113497,When two kids find and play a magical board ga...
2,6,Heat (1995),Action|Crime|Thriller,113277,949.0,113277,A group of high-end professional thieves start...
3,8,Tom and Huck (1995),Adventure|Children,112302,45325.0,112302,Two best friends witness a murder and embark o...
4,9,Sudden Death (1995),Action,114576,9091.0,114576,A former fireman takes on a group of terrorist...
...,...,...,...,...,...,...,...
51855,292541,The Settlers (2023),Drama|Western,10370812,989589.0,10370812,"A mixed-race Chilean, rides south on an expedi..."
51856,292585,Night Crawlers (2009),Comedy|Horror,985060,147230.0,985060,Blood is thicker than water in this tiny Texas...
51857,292605,Our River... Our Sky (2023),Drama|War,10676126,855800.0,10676126,Baghdad. The last week of 2006. All over the c...
51858,292613,Freelance (2023),Action|Comedy,15744298,897087.0,15744298,An ex special forces operator takes a job to p...


### Min-hashing

In [51]:
num_perm = 128

In [52]:
def run_or_read_minhash(run: bool, df, num_perm=num_perm):
# Sample DataFrame setup
# df = pd.DataFrame({'description': [...]})
    if run:
        # Define a function to min-hash a description
        def minhash_description(text, num_perm=num_perm):
            m = MinHash(num_perm=num_perm)
            for word in text.split():
                m.update(word.encode('utf8'))
            return m

        # Apply min-hashing on the 'description' column
        df['minhash'] = df['description'].apply(minhash_description)

        # (Optional) Convert min-hash signatures to a comparable format
        df['minhash_signature'] = df['minhash'].apply(lambda x: x.digest())

        df.to_pickle('data/df_min_hash.pkl')
        print('Minhashing done and saved to df_min_hash.pkl')
    else:
        df = pd.read_pickle('data/df_min_hash.pkl')
        print('Minhashing loaded from df_min_hash.pkl')
    return df



In [53]:
df_small = ml_movies_description
df_minhash = run_or_read_minhash(False, df_small, num_perm=num_perm)

Minhashing loaded from df_min_hash.pkl


### LSH 

In [57]:
threshold = 0.4

In [58]:
def run_or_read_lsh(run: bool, df_minhash, threshold, num_perm=num_perm):

    if run:
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        # Insert each min-hash signature into the LSH
        for idx, minhash in enumerate(df_minhash['minhash']):
            lsh.insert(idx, minhash)  # Use the DataFrame index as the unique ID

        # Function to find similar items using LSH
        def find_similar(description, lsh, num_perm=128):
            m = MinHash(num_perm=num_perm)
            for word in description.split():
                m.update(word.encode('utf8'))
            return lsh.query(m)

        # save similaritities to dict
        bands = {}
        for idx in tqdm(range(len(df_minhash))):
            bands[idx] = find_similar(df_minhash['description'][idx], lsh)

        # save similarities to file as pickle
        with open(f'data/lsh_groups_{threshold}.pkl', 'wb') as f:
            pickle.dump(bands, f)
        print('LSH done and saved to lsh_groups.pkl')
    else:
        with open(f'data/lsh_groups_{threshold}.pkl', 'rb') as f:
            bands = pickle.load(f)
        print('LSH loaded from lsh_groups.pkl')
    return bands



In [59]:
dict_lsh = run_or_read_lsh(True, df_minhash, threshold=threshold, num_perm=num_perm)


100%|██████████| 51860/51860 [00:57<00:00, 902.58it/s]


LSH done and saved to lsh_groups.pkl


In [49]:
# calculate jaccard for second band (df_lsh[1])

def jaccard_similarity(s1, s2):
    return len(s1.intersection(s2)) / len(s1.union(s2))

for idx in dict_lsh[1]:
    jaccard_score = df_minhash['minhash'][1].jaccard(df_minhash['minhash'][idx])
    print(f'Jaccard similarity between 1 and {idx}: {jaccard_score}')

Jaccard similarity between 1 and 1: 1.0
Jaccard similarity between 1 and 32775: 0.0859375
Jaccard similarity between 1 and 11: 0.078125
Jaccard similarity between 1 and 13: 0.078125
Jaccard similarity between 1 and 32788: 0.0390625
Jaccard similarity between 1 and 20: 0.125
Jaccard similarity between 1 and 32790: 0.046875
Jaccard similarity between 1 and 25: 0.0859375
Jaccard similarity between 1 and 32797: 0.0703125
Jaccard similarity between 1 and 32802: 0.0625
Jaccard similarity between 1 and 32806: 0.1015625
Jaccard similarity between 1 and 32825: 0.0859375
Jaccard similarity between 1 and 32829: 0.125
Jaccard similarity between 1 and 32830: 0.109375
Jaccard similarity between 1 and 32833: 0.078125
Jaccard similarity between 1 and 32837: 0.0859375
Jaccard similarity between 1 and 32840: 0.0703125
Jaccard similarity between 1 and 79: 0.1015625
Jaccard similarity between 1 and 81: 0.0625
Jaccard similarity between 1 and 83: 0.0625
Jaccard similarity between 1 and 86: 0.109375
Jaccard

Lidt mærkeligt at der er similarity scores på under threshold. Noget med at den minimerer false negatives og false positives.

*(The Jaccard similarity threshold between 0.0 and1.0. The initialized MinHash LSH will be optimized for the threshold by minizing the false positive and false negative.)*

In [None]:
def run_or_read_jaccard(run: bool, df_minhash, dict_lsh, threshold):
    if run:
        jaccard_similarities = {}

        for idx, similar_indices in tqdm(dict_lsh.items()):
            for similar_idx in similar_indices:
                if idx < similar_idx:  # Avoid duplicate pairs and self-comparison
                    jaccard_score = df_minhash['minhash'][idx].jaccard(df_minhash['minhash'][similar_idx])
                    jaccard_similarities[(idx, similar_idx)] = jaccard_score

        with open(f'data/jaccard_similarities_{threshold}.pkl', 'wb') as f:
            pickle.dump(jaccard_similarities, f)
        print("Jaccard similarities saved to 'data/jaccard_similarities.pkl'")
    else:
        with open(f'data/jaccard_similarities.pkl_{threshold}', 'rb') as f:
            jaccard_similarities = pickle.load(f)
        print("Jaccard similarities loaded from 'data/jaccard_similarities.pkl'")
    return jaccard_similarities


jaccard_similarities = run_or_read_jaccard(True, df_minhash, dict_lsh, threshold)


100%|██████████| 51860/51860 [00:10<00:00, 5055.31it/s] 


Jaccard similarities saved to 'data/jaccard_similarities.pkl'
