## 1. Dependencies

In [1]:
import os
import os.path as osp
import sys
import time

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasketch import MinHashLSHForest, MinHash

## 2. Data Loading and Preprocessing

### 2.1 Data Loading

In [2]:
## Directory that holds the raw rating files
data_dir = 'archive'

## The data is split across four files: combined_data_1.txt ... combined_data_4.txt
file_names = list(map('combined_data_{}.txt'.format, range(1, 5)))

## Load only the first file (combined_data_1.txt), keeping its first two columns
first_file = osp.join(data_dir, file_names[0])
df = pd.read_csv(
    first_file,
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
)

In [3]:
## Preview the first rows of the frame as loaded (before any preprocessing)
df.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


### 2.2 Data Preprocessing

Convert the raw dataframe into one with the columns 'Cust_Id', 'Rating' and 'Movie_Id'.

In [4]:
## Rows with a missing Rating are the "<movie_id>:" header lines that open each
## movie's block of ratings.
is_movie_header = df['Rating'].isna()

## Guard: the running count below is only a valid movie id when the frame starts
## with a header row. This also stops an accidental re-run of this cell (after the
## header rows were already dropped) from silently labelling every row as movie 0.
if df.empty or not is_movie_header.iloc[0]:
    raise ValueError("df must start with a '<movie_id>:' header row; "
                     "reload the raw file before re-running this cell")

## One row per header line, kept for the shape printout (number of movies, 2)
df_nan = is_movie_header[is_movie_header].to_frame().reset_index()
print(df_nan.shape)

## Every header row bumps the counter, so the cumulative count of headers seen so
## far is the sequential (1-based) movie id of each row. This replaces the
## per-movie np.append loop, which re-copied the growing array on every iteration.
movie_id_per_row = is_movie_header.cumsum()

## Drop the header rows and attach the ids in one step. .assign builds a new frame,
## which avoids assigning columns on a filtered slice (SettingWithCopyWarning).
is_rating = ~is_movie_header
df = df.loc[is_rating].assign(
    Movie_Id=movie_id_per_row[is_rating].astype(int),
    Cust_Id=lambda ratings: ratings['Cust_Id'].astype(int),
)

print('-Dataset examples-')
df.head()

(4499, 2)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


-Dataset examples-


Unnamed: 0,Cust_Id,Rating,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


Create two new series by grouping on 'Cust_Id' and on 'Movie_Id' separately: the movies each user rated, and the users who rated each movie.

In [5]:
## Collect, for every customer, the list of movie ids they rated
movies_by_user = df.groupby('Cust_Id')['Movie_Id']
df_user = movies_by_user.apply(list)

print(df_user.shape)
df_user

(470758,)


Cust_Id
6          [30, 157, 173, 175, 191, 197, 241, 295, 299, 3...
7          [8, 28, 30, 83, 175, 185, 191, 257, 273, 283, ...
8          [1144, 1202, 1428, 1518, 1719, 1799, 1843, 190...
10         [175, 191, 197, 285, 299, 468, 473, 483, 571, ...
25                                    [178, 761, 3427, 4432]
                                 ...                        
2649404    [78, 299, 550, 1145, 1406, 1428, 2152, 2174, 2...
2649409    [191, 197, 334, 457, 1073, 1220, 1918, 2372, 2...
2649421                                   [1877, 2152, 3078]
2649426    [17, 28, 30, 111, 197, 216, 252, 273, 357, 406...
2649429    [30, 143, 199, 213, 452, 457, 483, 533, 571, 6...
Name: Movie_Id, Length: 470758, dtype: object

In [6]:
## Collect, for every movie, the list of customer ids that rated it
users_by_movie = df.groupby('Movie_Id')['Cust_Id']
df_movie = users_by_movie.apply(list)

print(df_movie.shape)
df_movie

(4499,)


Movie_Id
1       [1488844, 822109, 885013, 30878, 823519, 89398...
2       [2059652, 1666394, 1759415, 1959936, 998862, 2...
3       [1025579, 712664, 1331154, 2632461, 44937, 656...
4       [1065039, 1544320, 410199, 732556, 1201419, 66...
5       [1745265, 885013, 1997470, 30878, 840543, 2477...
                              ...                        
4495    [885635, 1436640, 2625420, 2343417, 1800117, 7...
4496    [1917746, 364518, 1392773, 1527030, 1990901, 5...
4497    [1224344, 1395430, 2630287, 306466, 466862, 32...
4498    [539735, 1645486, 247794, 2454205, 497196, 103...
4499    [66358, 1916420, 327122, 769, 1684516, 567279,...
Name: Cust_Id, Length: 4499, dtype: object

## 3. MinHash 

Helper function: calculate the exact Jaccard similarity between two collections of items.

In [11]:
def jaccard(list1, list2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Both inputs are converted to sets first, so repeated elements count once.
    (Sizing the union from the raw list lengths would inflate the denominator
    whenever an input contains duplicates.)

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        Similarity in [0, 1]. Two empty collections are treated as identical
        and yield 1.0 instead of raising ZeroDivisionError.
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        # Both inputs are empty: define the similarity as 1.0 (identical sets)
        return 1.0
    return len(set1 & set2) / union
la = [0,1,2]
lb = [1,5]
print(jaccard(la, lb))

0.25


### 3.1 User-User Similarity: Find similar users based on movies they have watched

Build LSH

In [None]:
random_seed = 100
## Number of permutations per MinHash signature; the forest and every MinHash
## added to it are created with this same value.
num_perm = 128

# Initialize the LSH
start_time = time.time()
forest = MinHashLSHForest(num_perm=num_perm)

## user id -> MinHash signature of the set of movies that user rated
M = {}

for user_id, movie_ids in tqdm(df_user.items(), total=len(df_user)):
    signature = MinHash(seed=random_seed, num_perm=num_perm)

    ## Each movie id is hashed as its UTF-8 encoded decimal string
    for movie in movie_ids:
        signature.update(str(movie).encode('utf8'))

    M[user_id] = signature
    forest.add(user_id, signature)

## The forest must be indexed before it can be queried, so indexing is
## included in the reported build time.
forest.index()

print("--- %s seconds ---" % (time.time() - start_time))


Query from LSH. Given a user id, find similar users.

In [None]:
## Query from lsh: Given a user id, find similar users

## user id
q = df_user.index[0]
print('User ID:', q, '\n')
## num of neighbors to request (the query user is in the forest too, so it may
## appear among the results)
N = 11

result = forest.query(M[q], N)
print('Neighbor ids:', result)

jaccard_real = []
jaccard_predict = []

## The query user's movie list and signature do not change inside the loop
query_movies = df_user[q]
query_signature = M[q]

## For each returned neighbor, compare the exact Jaccard similarity with the
## MinHash estimate; each value is computed once and then printed and stored.
for neighbor_id in result:
    exact = jaccard(query_movies, df_user[neighbor_id])
    estimated = query_signature.jaccard(M[neighbor_id])

    print('Actually jaccard similarity: ', exact)
    jaccard_real.append(exact)
    print('Predicted Similarity:', estimated, '\n')
    jaccard_predict.append(estimated)

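Find the true nearest neighbors of this user by computing the exact Jaccard similarity against every other user (brute force), for comparison with the LSH result.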

In [13]:
## Query user: the first user in the grouped series
q = df_user.index[0]
query_movies = df_user[q]

## Exact Jaccard similarity between the query user and every other user
all_users = zip(df_user.index.to_numpy(), df_user.to_numpy())
neighbors = [
    (u_id, jaccard(query_movies, movies))
    for u_id, movies in tqdm(all_users, total=len(df_user))
    if u_id != q
]

## Most similar first; a stable reverse sort keeps ties in their original order
neighbors.sort(key=lambda pair: pair[1], reverse=True)

print(neighbors[:10])

HBox(children=(FloatProgress(value=0.0, max=470758.0), HTML(value='')))


[(14936, 0.36199095022624433), (347549, 0.3452914798206278), (2570388, 0.34177215189873417), (11676, 0.33766233766233766), (1502513, 0.3346938775510204), (1170278, 0.3346774193548387), (115668, 0.3333333333333333), (248362, 0.3320610687022901), (2267168, 0.33191489361702126), (1120757, 0.32857142857142857)]
