Maximum inner product search using Asymmetric Locality Sensitive Hashing

In [86]:
import os

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.utils import shuffle

This notebook is currently executed on the smaller dataset (`ml-latest-small`).


In [87]:
# Run this only after downloading the small or large dataset, and make sure
# the path below points at it.

# Small dataset
dataset = os.path.join("datasets", "ml-latest-small")

# Large dataset (uncomment to use it instead of the small one)
# dataset = os.path.join("datasets", "ml-latest")

# Name of the csv file
name = "ratings.csv"

# Read the csv file. header=0 consumes the file's own header row, and `names`
# replaces it with the column identifiers used in the rest of the notebook.
ratings_df = pd.read_csv(
    os.path.join(dataset, name),
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    header=0,
)

# Ensure every column is numeric. The default errors='raise' is used on purpose:
# errors='ignore' is deprecated in recent pandas and would silently leave an
# unparseable column as strings, which only fails later in the numeric steps.
ratings_df = ratings_df.apply(pd.to_numeric)

In [88]:
# Preview the first five rows of the freshly loaded ratings frame
ratings_df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [89]:
# Randomly shuffle the rows. A fixed random_state makes the permutation (and
# therefore the data/query split taken from it) reproducible across runs.
ratings_df = shuffle(ratings_df, random_state=42)

# print(...) with a single argument behaves the same on Python 2 and Python 3
print(ratings_df.shape)

(100004, 4)


In [90]:
# Take the first 80% of the shuffled rows as the actual data and the
# remaining rows as query data.
split_index = int(0.8 * len(ratings_df))

# Positional slicing: rows [0, split_index) and [split_index, end)
ratings_data_df = ratings_df.iloc[:split_index]
ratings_query_df = ratings_df.iloc[split_index:]

# print(...) with a single argument behaves the same on Python 2 and Python 3
print(ratings_data_df.shape)
print(ratings_query_df.shape)

(80003, 4)
(20001, 4)


In [91]:
# Build the rating matrix: one row per UserID, one column per MovieID, each
# cell holding the Rating. User/movie pairs without a rating become 0.
R_df = (
    ratings_data_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head(n=5)

MovieID,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The section below computes the SVD of the data portion, after the dataset has been split 80:20 into data and query sets.

In [92]:
# Convert the rating table to a plain 2-D numpy array.
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values is available in both old and new versions.
R = R_df.values

# Mean of each row (one row per UserID in R_df). The zero-filled missing
# ratings are included in this mean.
user_ratings_mean = np.mean(R, axis=1)

# Subtract each row's mean from that row; reshape(-1, 1) turns the means into
# a column vector so the subtraction broadcasts across the columns.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [93]:
# Truncated SVD of the de-meaned rating matrix, keeping k = 50 components.
# svds is imported in the import cell at the top of the notebook.
# At this point sigma is the 1-D array returned by svds.
U, sigma, Vt = svds(R_demeaned, k=50)

In [94]:
# Expand the 1-D array returned by svds into a diagonal matrix so it can be
# used in matrix products. np.diag applied to a 2-D array extracts the
# diagonal instead, so guard on ndim to keep this cell safe to re-run.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)