In [1]:
# Built on python2
import numpy as np

## Setting constants

In [2]:
DATA_PATH = './ml-100k/u.data' # ml-100k data set has 100k ratings, 943 users and 1682 items

# Dataset Parameters
DATA_TYPE =  0              # 0: CSR format, 1: 2D matrix format
DELIMITER = "\t"            # tab separated or comma separated data format
N_RATINGS = 100000          # expected number of ratings; used to verify the read
USERS = 943
ITEMS = 1682
FIRST_INDEX = 1             # index starts from 0 or 1

# Hardcoding values
UNOBSERVED = -1             # marker for "no rating" in the dense matrices
GET_PRODUCT_FAIL_RETURN = UNOBSERVED    #TODO: This hardcoding can be removed in future
OFFSET = USERS + 10     # added to item ids so they never collide with user ids; keep it >= #USERS
TRAIN_TEST_SPLIT = 0.2  # fraction of ALL ratings held out as the test set ; value in between 0 and 1
AVG_RATING = 3.         # fallback rating used when we dont have a predicted rating

# HyperParameters
C1 = 0.2                # fraction of the training ratings that goes to M1
C2 = 0.3                # fraction of the training ratings that goes to M2
C3 = 1 - C1 - C2        # remaining fraction, goes to M3 (must be > 0)
RADIUS = 0              # radius of neighborhood, r = 1 implies movies, which were rated by 1 hop friend users...
THRESHOLD =  USERS      # distance/similarity threshold for users, who are used in avg weighted estimate computation

In [3]:
# Sanity check on the hyperparameters set in the previous cell:
# C3 is whatever is left after C1 and C2, so it has to be strictly positive.
if 0 >= C3:
    print('ERROR: Please set the values of C1 and C2, s.t, C1+C2 < 1')

## Functions for data handling

(TODO: these functions still need to handle boundary cases)

In [4]:
def csr_to_matrix(data_csr, symmetry=False):
    ''' Function to get data in matrix format for given data in CSR format

    data_csr : 2D integer array-like whose rows start with (row_index, col_index, value).
    symmetry : if True, the matrix is made square (side = larger of the two
               dimensions) and every value is also written at the mirrored
               position [col_index, row_index].

    Returns a dense int matrix in which every cell not mentioned in data_csr
    holds UNOBSERVED. An empty data_csr gives an empty (0, 0) matrix.
    If the same cell occurs more than once, the last occurrence wins.
    '''
    data_csr = np.asarray(data_csr)
    if data_csr.size == 0:
        # nothing to place; avoids calling max() on an empty column
        return np.full((0, 0), UNOBSERVED, dtype=int)

    # matrix must be large enough to hold the biggest index seen
    users = int(data_csr[:, 0].max()) + 1
    items = int(data_csr[:, 1].max()) + 1
    if symmetry:
        users = items = max(users, items)

    data_matrix = np.full((users, items), UNOBSERVED, dtype=int)
    for row, col, value in data_csr[:, :3]:
        data_matrix[row, col] = value
        if symmetry:
            data_matrix[col, row] = value

    return data_matrix

def matrix_to_csr(data_matrix):
    ''' Function to get data in CSR format for given data in matrix format

    data_matrix : 2D array-like of ratings; only entries > 0 are treated as
                  observed (the same criterion the rest of this notebook uses).

    Returns an array of shape (n_observed, 3) with rows (row_index, col_index, value),
    listed in row-major order. If nothing is observed the shape is (0, 3).
    '''
    data_matrix = np.asarray(data_matrix)
    rows, cols = np.nonzero(data_matrix > 0)    # positions of the observed entries
    return np.column_stack((rows, cols, data_matrix[rows, cols]))

def read_data_csr(fname=DATA_PATH, delimiter=DELIMITER, dtype=int):
    ''' Function to read data file, given in CSR format; Assuming 1st 3 values as: user_id, item_id, rating

    fname     : path of the delimited text file to read.
    delimiter : field separator used in the file.
    dtype     : numpy dtype every field is parsed as.

    Returns a 2D array with columns (user_id, item_id, rating), where the two
    id columns have been shifted by FIRST_INDEX so that they start from 0.
    '''
    # ndmin=2 keeps the result 2D even when the file holds a single row,
    # so the column slicing below always works
    data_csr = np.loadtxt(fname=fname, delimiter=delimiter, dtype=dtype, ndmin=2)
    data_csr = data_csr[:, :3]                              # Extracting 1st 3 columns: 0,1,2
    data_csr[:, :2] = data_csr[:, :2] - FIRST_INDEX         # Making sure index starts from 0
    return data_csr

# to be removed in future
import csv
def read_csr_data_as_matrix(users=USERS, items=ITEMS, symmetry=False, fname=DATA_PATH, delimiter=DELIMITER):
    ''' Function to read CSR data file in matrix format

    users, items : number of rows / columns of the returned matrix.
    symmetry     : if True the matrix is made square (side = max(users, items)).
                   NOTE: unlike csr_to_matrix, the mirrored entry is NOT written.
    fname        : path of the data file (defaults to DATA_PATH).
    delimiter    : field separator of the data file (defaults to DELIMITER).

    Returns a dense int matrix holding the rating of every (user, item) pair
    found in the file and UNOBSERVED everywhere else.
    '''
    if symmetry:
        users = items = max(users, items)
    data_matrix = np.full((users, items), UNOBSERVED, dtype=int)
    with open(fname) as data_file:
        for line in csv.reader(data_file, delimiter=delimiter):
            if not line:                # csv.reader yields [] for a blank line
                continue
            # only the first three fields are needed: user_id, item_id, rating
            user_id, item_id, rating = [int(field) for field in line[:3]]
            data_matrix[user_id - FIRST_INDEX][item_id - FIRST_INDEX] = rating

    return data_matrix


# TODO: not implemented yet
def read_data_matrix():
    ''' Function to read data file, given in matrix format

    Placeholder: currently reads nothing and just hands back an empty
    (0, 0) integer matrix.
    '''
    return np.empty((0, 0), dtype=int)


## Read dataset

In [5]:
# Load all ratings as rows of (user, item, rating)
data_csr = read_data_csr()

# Verification: number of rows read must match the expected number of ratings
print('done' if len(data_csr) == N_RATINGS else 'fail')
#print( '# of missing ratings: ' + str(N_RATINGS - data_csr.shape[0]))

done


## Creating a train/test split for non negative ratings

In [6]:
# we use data_csr as it is easy to only shuffle it and accordingly create train and test set
# A fixed seed makes the split identical on every Restart & Run All;
# a private RandomState is used so that the global numpy RNG is left untouched.
SPLIT_SEED = 0
np.random.RandomState(SPLIT_SEED).shuffle(data_csr) # inplace shuffle of the rows

# the first (1 - TRAIN_TEST_SPLIT) share of the shuffled rows is the train set, the rest is the test set
train_sz = int((1 - TRAIN_TEST_SPLIT) * data_csr.shape[0])

train_data_csr = data_csr[: train_sz ,:]
test_data_csr = data_csr[train_sz : ,:]

# sanity check: the two parts together must cover every rating exactly once
if train_data_csr.shape[0]+test_data_csr.shape[0] == data_csr.shape[0]:
    print('done')
else:
    print('fail')

done


## Explore/Modify Dataset

In [7]:
# some data analysis nos and plots here

## Algorithm Begins from here

# Step 1: Sample Splitting

Split the matrix edges into 3 parts, M1, M2, M3

In [8]:
# Again, the CSR form is the easiest one to cut into 3 parts.
# train_data_csr is already shuffled, so consecutive slices are random samples
# and no further shuffling is needed here.

m1_sz = int(C1 * train_data_csr.shape[0])
m2_sz = int(C2 * train_data_csr.shape[0])

# cut points: [0, m1_sz) -> M1, [m1_sz, m1_sz + m2_sz) -> M2, the remainder -> M3
m1_csr, m2_csr, m3_csr = np.split(train_data_csr, [m1_sz, m1_sz + m2_sz])

# sanity check: the three parts must add up to the whole training set
parts_total = sum(part.shape[0] for part in (m1_csr, m2_csr, m3_csr))
print('done' if parts_total == train_data_csr.shape[0] else 'fail')

done


# Step 2: Expanding the neighborhood

use M1 to build neighborhood based on radius *r*

Normalize the product of ratings by the total number of final movies at the boundary

Build a BFS tree rooted at each vertex, such that:
- each node in a path from user to boundary item is unique
- shortest path (#path edges) between user and boundary item
- in case of multiple paths (or trees) choose any one path (i.e. any one tree) at random

In [76]:
#m1_csr = read_data_csr(fname='./very_small_graph.txt')   ##### REMOVE THIS CELL
#OFFSET = 4 + 6

## Step 2.1: Create adjacency list: dictionary of sets

In [77]:
'''Step 2.1.1 Convert M1 from csr to matrix format'''
# Item ids are shifted by OFFSET so that no item id equals a user id.
# Users and items can then live in one undirected graph, which is what
# gives us an edge from item back to user.

new_m1_csr = m1_csr.copy()          # work on a copy: m1_csr keeps the raw item ids
#new_m1_csr = new_m1_csr + 1                                ##### REMOVE THIS LINE
new_m1_csr[:, 1] += OFFSET
new_m1_matrix = csr_to_matrix(new_m1_csr, symmetry=True)

In [78]:
'''Step 2.1.2 Create an (unweighted) adjacency list for the graph'''
# we still have the 2D matrix for the weights

# graph[node] = set of column indices holding a positive entry in that node's row.
# The positive entries of a whole row are located in one vectorised call
# instead of testing every cell in a Python loop.
graph = {
    node: set(np.flatnonzero(row > 0).tolist())
    for node, row in enumerate(new_m1_matrix)
}

print('done')  # TODO : put a check for fail

done


In [79]:
#if 0 in graph:                                              ##### REMOVE THIS CELL
#    del graph[0]
    
#graph

{0: set(),
 1: {11, 12},
 2: {11},
 3: {11, 13},
 4: {12, 13},
 5: set(),
 6: set(),
 7: set(),
 8: set(),
 9: set(),
 10: set(),
 11: {1, 2, 3},
 12: {1, 4},
 13: {3, 4}}

## Step 2.2: For all users: Get paths for all movies

(maybe only unrated movies: TODO)

In [112]:
# This implementation for this step is inspired from:
# http://eddmann.com/posts/depth-first-search-and-breadth-first-search-in-python/

import random

def bfs_paths(graph, start, goal):
    '''Step 2.2.1: Function gives all possible path from 'start' vertex to 'goal' vertex, inclusive of both

    graph : dict mapping every vertex to the set of its neighbours.
    start : vertex the paths begin at.
    goal  : vertex the paths end at.

    Generator: yields each path as a list of vertices [start, ..., goal].
    No vertex is repeated within a path, and a path is not extended beyond
    'goal'. Nothing is yielded when start == goal.
    Raises KeyError if a visited vertex is missing from graph.
    '''
    from collections import deque   # local import keeps this cell self-contained

    queue = deque([(start, [start])])   # deque gives O(1) pops from the front
    while queue:
        vertex, path = queue.popleft()
        # only step to neighbours that are not already on this path
        for neighbour in graph[vertex] - set(path):
            if neighbour == goal:
                yield path + [neighbour]
            else:
                queue.append((neighbour, path + [neighbour]))

def get_r_hop_path(path, radius=RADIUS):
    '''Step 2.2.2: Function to get paths from user to item at r-hop distance

    path   : list of paths, each path being a list of vertices.
    radius : a path is kept when (number of vertices // 2) - 1 == radius, i.e.
             a [user, item] path has radius 0 and a
             [user, item, user, item] path has radius 1.

    Returns the list of matching paths, in their original order.
    '''
    # '//' keeps the integer division the same on python2 and python3
    return [p for p in path if (len(p) // 2) - 1 == radius]

def get_product(path):
    '''Step 2.2.3: Function to get product for r-hop path from user to item
       Choose any path at random if #paths > 1

    path : list of candidate paths (each a sequence of vertices).

    Returns GET_PRODUCT_FAIL_RETURN when there is no candidate path, otherwise
    the product of the new_m1_matrix entries along the edges of one randomly
    chosen path.
    '''
    if len(path) == 0:
        return GET_PRODUCT_FAIL_RETURN

    # several paths may lead to the same item: pick any one of them at random
    chosen = path[random.randint(0, len(path) - 1)]

    # multiply the weights of consecutive (source, target) vertex pairs
    product = 1
    for src, dst in zip(chosen[:-1], chosen[1:]):
        product = product * new_m1_matrix[src, dst]

    return product

def find_and_replace(data, find_value, replace_value):
    '''Function to find and replace some values in a numpy array

    data          : numpy array, modified IN PLACE (any number of dimensions).
    find_value    : every element equal to this value is overwritten.
    replace_value : value written in place of find_value.

    Returns the same array object that was passed in.
    '''
    # boolean-mask assignment does the whole replacement in one vectorised step
    data[data == find_value] = replace_value
    return data


In [113]:
# Distinct user ids and distinct (OFFSET-shifted) item ids occurring in M1
user_list = np.unique(new_m1_csr[:, 0])
item_list = np.unique(new_m1_csr[:, 1])

# product_matrix is addressed by user id and by un-shifted item id, so it has to
# be sized by the largest id (+1) and not by the number of distinct ids.
n_user_rows = int(user_list.max()) + 1 if user_list.size else 0
n_item_cols = int(item_list.max()) - OFFSET + 1 if item_list.size else 0
product_matrix = np.full((n_user_rows, n_item_cols), UNOBSERVED, dtype=int)

for user in user_list:
    for item in item_list:
        path = list(bfs_paths(graph, user, item))
        # NOTE(review): radius is hard-coded to 1 here instead of using RADIUS - confirm intent
        path = get_r_hop_path(path, radius=1)
        # item carries the OFFSET shift (graph vertex id); remove it to get the column
        product_matrix[user, item - OFFSET] = get_product(path)

# Step 3: Computing the distances

use M2 to get distance (or similarity) between two users based on neighbourhood from previous step

In [115]:
# Turn every UNOBSERVED entry of product_matrix into zero;
# this (probably) simplifies the pearson similarity computation.
# find_and_replace works in place and hands the same array back.

product_matrix = find_and_replace(product_matrix, UNOBSERVED, 0)

In [116]:
# Display the product matrix for inspection
product_matrix

array([[  0,   0,  20],
       [  0,  20, 100],
       [  0,  32,   0],
       [ 40,   0,   0]])

In [36]:
# Currently using simple pearson similarity:

from scipy import stats

# user_sim_matrix is addressed by user id, so it is sized by the largest
# user id (+1) rather than by the number of distinct users.
n_sim_users = int(np.max(user_list)) + 1 if len(user_list) else 0
user_sim_matrix = np.full((n_sim_users, n_sim_users), UNOBSERVED, dtype=float)
for user1 in user_list:
    for user2 in user_list:
        if user1 >= user2:      # similarity is symmetric: compute each pair only once
            sim, _ = stats.pearsonr(product_matrix[user1], product_matrix[user2])
            if np.isnan(sim):
                # pearsonr is undefined (nan) when one of the rows is constant;
                # treat that as "no similarity" so nan does not spread further
                sim = 0.0
            user_sim_matrix[user1,user2] = user_sim_matrix[user2,user1] = sim #abs(sim) TODO, check

            # NOTE: pearson correlation lies in [-1, 1], not in [0, 1];
            # negative values act as negative weights when used for prediction

In [37]:
# Display the user-user similarity matrix for inspection
user_sim_matrix

array([[ 1.        ,  0.98198051,  0.5       ,  0.5       ],
       [ 0.98198051,  1.        ,  0.32732684,  0.65465367],
       [ 0.5       ,  0.32732684,  1.        ,  0.5       ],
       [ 0.5       ,  0.65465367,  0.5       ,  1.        ]])

# Step 4: Averaging datapts to get final estimates

use M3 to perform a weighted avg using similarity computed in previous step ; account for threshold n_n

In [39]:
# Dense matrix form of M1, built from m1_csr (item ids here carry no OFFSET shift);
# cells without a rating hold UNOBSERVED. The bare name below displays it.
m1_matrix = csr_to_matrix(m1_csr)
m1_matrix

array([[0, 3, 0],
       [4, 0, 0],
       [4, 0, 3],
       [0, 3, 1]])

In [None]:
# making all unobserved entries in m1_matrix as zero
# makes it simpler for weighted dot product for rating prediction
# NOTE: after this cell an unrated cell of m1_matrix holds 0, not UNOBSERVED,
# so downstream code must treat 0 as "unrated" for m1_matrix

m1_matrix = find_and_replace(data=m1_matrix, find_value=UNOBSERVED, replace_value=0)

In [59]:
# predicted_matrix is addressed by user id and (un-shifted) item id, and is later
# looked up with ids from the test set, so it covers the full id range.
predicted_matrix = np.full((USERS, ITEMS), UNOBSERVED, dtype=float)

# Ratings with every unobserved cell forced to 0, so that unrated cells add
# nothing to the dot product whether m1_matrix marks them with UNOBSERVED or 0.
observed_ratings = np.where(m1_matrix > 0, m1_matrix, 0)

for user in user_list:
    # assumes user_sim_matrix has one column per row of m1_matrix - TODO confirm
    user_sims = user_sim_matrix[user]
    for item in item_list:
        item_col = item - OFFSET        # item_list holds OFFSET-shifted ids; m1_matrix does not
        if m1_matrix[user, item_col] > 0:
            continue                    # already rated in M1, nothing to predict
        # NOTE(review): this is a similarity-weighted SUM; it is not divided by the
        # total weight, so it is not yet a weighted average - confirm intent
        predicted_rating = user_sims.dot(observed_ratings[:, item_col])
        if predicted_rating > 5:
            predicted_rating = 5
        elif predicted_rating < 0:     # TODO: check: can we have predicted_rating<0, if sim>0 & user_ratings>0
            predicted_rating = 100     # sentinel for "negative prediction", handled in a later cell
        predicted_matrix[user, item_col] = predicted_rating

5


In [None]:
# making all predicted ratings < 0 (set as 100 above) as 0

# making all zero(0) entries in predicted_matrix as UNOBSERVED
# makes it simpler to identify unpredicted rating in evaluation

#dont change the order of steps below
# (zeros are turned into UNOBSERVED first and only then the sentinel 100 becomes 0;
#  in the opposite order the former-100 entries would end up as UNOBSERVED too)
predicted_matrix = find_and_replace(data=predicted_matrix, find_value=0, replace_value=UNOBSERVED)
predicted_matrix = find_and_replace(data=predicted_matrix, find_value=100, replace_value=0)

In [60]:
# Display the predicted rating matrix for inspection
predicted_matrix

array([[  0,   0,  20],
       [  0,  20, 100],
       [  0,  32,   0],
       [ 40,   0,   0]])

## Evaluation

We evaluate using RMSE for now, will account for other advanced metrics in future

In [None]:
# for all the available ratings in testset
# and for all the predicted rating for those available rating
# put them in two separate vectors
# get rmse using scikit libraries

y_actual  = np.full((len(test_data_csr)), UNOBSERVED, dtype=float)
y_predict = np.full((len(test_data_csr)), UNOBSERVED, dtype=float)

n_pred_users, n_pred_items = predicted_matrix.shape
for idx, testpt in enumerate(test_data_csr):    # testpt = (user, item, rating)
    user, item = testpt[0], testpt[1]
    y_actual[idx] = testpt[2]

    predicted = UNOBSERVED
    if user < n_pred_users and item < n_pred_items:     # ids outside the matrix have no prediction
        predicted = predicted_matrix[user, item]
    if predicted == UNOBSERVED or np.isnan(predicted):
        # i.e. we could not generate a rating for this test user item pair
        predicted = AVG_RATING
    y_predict[idx] = predicted


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# RMSE between the true test ratings and our predictions
# (the prediction vector built above is named y_predict)
rms = sqrt(mean_squared_error(y_actual, y_predict))
rms

In [61]:
# Display the held-out test ratings; each row is (user, item, rating)
test_data_csr

array([[ 62, 180,   3],
       [385, 120,   3],
       [683, 721,   2],
       ..., 
       [226,  13,   4],
       [322, 287,   3],
       [478, 107,   4]])

In [None]:
# Read the full dataset as a dense matrix with 0 for every unobserved cell.
# The zeros are written into the result here instead of re-assigning the global
# UNOBSERVED constant, so earlier cells keep working when they are re-run.
data_matrix = read_csr_data_as_matrix()
data_matrix[data_matrix == UNOBSERVED] = 0

n_ratings_read = np.count_nonzero(data_matrix > 0)  # gives total no of ratings read; useful for verification
if n_ratings_read == N_RATINGS:
    print('Reading data_matrix: success')
else:
    print('Reading data_matrix: FAILED')
    print( '# of missing ratings: ' + str(N_RATINGS - n_ratings_read))

In [None]:
# Display the top-left 10 x 10 corner of data_matrix for inspection
data_matrix[:10,:10]