In [1]:
# Built and testing on python2
import numpy as np
from tqdm import *

## Setting constants

In [2]:
DATA_PATH = './ml-100k/u.data' # ml-100k data set has 100k ratings, 943 users and 1682 items
#DATA_PATH = './filmtrust/ratings.txt' # filmtrust data set has 35497 ratings, 1508 users and 2071 items

# Dataset Parameters
DATA_TYPE =  0              # 0: CSR format, 1: 2D matrix format  # TODO: use it
DELIMITER = "\t"            # tab separated or comma separated data format
FIRST_INDEX = 1
N_RATINGS = 100000
USERS = 943
ITEMS = 1682

# Hardcoding values
UNOBSERVED = -1
GET_PRODUCT_FAIL_RETURN = UNOBSERVED    #TODO: This hardcoding can be removed in future
TRAIN_TEST_SPLIT = 0.2  # %age of test ratings wrt train rating ; value in between 0 and 1
AVG_RATING = 3          # ratings for which we dont have predicted rating

# HyperParameters
C1 = 0.2                # probability of edges in training set going to E1
C2 = 0.3                # probability of edges in training set going to E2
C3 = 1 - C1 - C2
RADIUS = 3              # radius of neighborhood, radius = # edges between start and end vertex
THRESHOLD = 943

In [3]:
#checks on parameters set in previous cell
if C3 <= 0:
    print('ERROR: Please set the values of C1 and C2, s.t, C1+C2 < 1')

## Functions for data handling

(need to handle for boundary cases)

In [4]:
''' Function to get data in matrix format for given data in CSR format '''
def csr_to_matrix(data_csr, symmetry=False):
    
    if symmetry:                                 ### TODO: Implement this better
        users = max(data_csr[:,0]) + 1
        items = max(data_csr[:,1]) + 1
        users = items = max(users,items)
    else:
        users = USERS
        items = ITEMS
        
    data_matrix = np.full((users, items), UNOBSERVED, dtype=int)
    for line in data_csr:
        data_matrix[line[0]][line[1]] = line[2]
        if symmetry:
            data_matrix[line[1]][line[0]] = line[2]
            
    return data_matrix

''' Function to get data in CSR format for given data in matrix format '''
def matrix_to_csr(data_matrix):             # TODO: Check if it works
    data_matrix = np.empty([0,0], dtype=int)
    data_csr = np.array([ [i,j,data_matrix[i,j]]\
                          for j in range(len(temp[i]))\
                              for i in range(len(temp))\
                                  if temp[i,j] != UNOBSERVED])
    return data_csr

''' Function to read data file, given in CSR format
    Assuming 1st 3 values of a row as: user_id, item_id, rating '''
def read_data_csr(fname, delimiter, dtype=int):
    data_csr = np.loadtxt(fname=fname, delimiter=delimiter, dtype=dtype) # Reading data to array
    data_csr = data_csr[:, :3]                                           # Extracting 1st 3 columns: 0,1,2
    data_csr[:,:2] = data_csr[:,0:2] - FIRST_INDEX                       # Making sure index starts from 0
    return data_csr

''' Function to read data file, given in matrix format '''
# TODO
def read_data_matrix():
    data_matrix = np.empty([0,0], dtype=int)
    return data_matrix


## Read dataset

In [5]:
data_csr = read_data_csr(fname=DATA_PATH, delimiter=DELIMITER)

if data_csr.shape[0] == N_RATINGS:  # gives total no of ratings read; useful for verification
    print('done')
else:
    print('fail')
    #print( '# of missing ratings: ' + str(N_RATINGS - data_csr.shape[0]))

done


In [6]:
# setting some constants
unique_users = len(np.array(list(set(data_csr[:,0]))))
unique_items = len(np.array(list(set(data_csr[:,1]))))

USERS = max(data_csr[:,0]) + 1
ITEMS = max(data_csr[:,1]) + 1

OFFSET = USERS + 10     # TODO: MAKE IT HARDCODING; keep it >= #USERS
THRESHOLD =  USERS      # TODO: MAKE IT HARDCODING; distance/similarity threshold for users, who are used in avg weighted estimate computation

In [7]:
print('USERS: ' + str(USERS))
print('ITEMS: ' + str(ITEMS))

if USERS != unique_users:
    print('No of users with no ratings: ' + str(USERS - unique_users))
if ITEMS != unique_items:
    print('No of items with no ratings: ' + str(items - unique_items))
if USERS == unique_users and ITEMS == unique_items:
    print('All users and items have at least one rating! Good!')

USERS: 943
ITEMS: 1682
All users and items have at least one rating! Good!


## Creating a train/test split for non negative ratings

In [8]:
# we use data_csr as it is easy to only shuffle it and accordingly create train and test set
np.random.shuffle(data_csr) # inplace shuffle

train_sz = int((1 - TRAIN_TEST_SPLIT) * data_csr.shape[0])

train_data_csr = data_csr[: train_sz ,:]
test_data_csr = data_csr[train_sz : ,:]

if train_data_csr.shape[0]+test_data_csr.shape[0] == data_csr.shape[0]:
    print('done')
else:
    print('fail')

done


## Explore/Modify Dataset

In [9]:
# some data analysis nos and plots here

## Algorithm Begins from here

# Step 1: Sample Splitting

Split the matrix edges into 3 parts, M1, M2, M3

In [10]:
# Again using data_csr is easier to split sample into 3 parts

#reshuffling training data can be avoided as it was obtained shuffled already
#np.random.shuffle(train_data_csr) # inplace shuffle

m1_sz = int(C1 * train_data_csr.shape[0])
m2_sz = int(C2 * train_data_csr.shape[0])

m1_csr = train_data_csr[              : m1_sz         ,:]
m2_csr = train_data_csr[        m1_sz : m1_sz + m2_sz ,:]
m3_csr = train_data_csr[m1_sz + m2_sz :               ,:]

if m1_csr.shape[0]+m2_csr.shape[0]+m3_csr.shape[0] == train_data_csr.shape[0]:
    print('done')
else:
    print('fail')

done


In [11]:
#m1_csr = np.copy(train_data_csr)                                   ##### REMOVE THIS CELL
#m2_csr = np.copy(train_data_csr)
#m3_csr = np.copy(train_data_csr)

# Step 2: Expanding the neighborhood

use M1 to build neighborhood based on radius *r*

Normalize the product of ratings by total no of final movies at the boundary

Building BFS tree rooted at each vertex, s/t
- each node in a path from user to boundary item is unique
- shortest path (#path edges) between user and boundary item
- in case of multiple paths (or trees) choose any one path (i.e. any one tree) at random

In [12]:
#import numpy as np
#from tqdm import *
#FIRST_INDEX = 1
#OFFSET = 4 + 6
#UNOBSERVED = -1
#
#m1_csr = read_data_csr(fname='./very_small_graph.txt', delimiter="\t")   ##### REMOVE THIS CELL

## Step 2.1: Create adjacency list: dictionary of sets

In [13]:
'''Step 2.1.1 Convert M1 from csr to matrix format'''
# item_ids += OFFSET
# so that user_ids != item_ids
# and we can create an undirected graph (important to get an edge from item to user)

new_m1_csr = np.copy(m1_csr)
new_m1_csr[:,1] = new_m1_csr[:,1] + OFFSET
new_m1_matrix = csr_to_matrix(new_m1_csr, symmetry=True)

print('done')  # TODO : put a check for fail

done


In [14]:
'''Step 2.1.2 Create an (unweighted) adjacency list for the graph'''
# we still have the 2D matrix for the weights

graph = dict()
for i in tqdm(range(len(new_m1_matrix))):
    temp_set = set()
    for j in range(len(new_m1_matrix[i])):
        if new_m1_matrix[i,j] > 0:
            temp_set.add(j)
    graph[i] = temp_set

print('done')  # TODO : put a check for fail

100%|██████████| 2632/2632 [00:05<00:00, 478.83it/s]

done





In [15]:
#graph

## Step 2.2: For all users: Get path products for all movies

(maybe only unrated movies: TODO)

In [16]:
# This implementation for this step is inspired from:
# http://eddmann.com/posts/depth-first-search-and-breadth-first-search-in-python/

import random

'''Step 2.2.1: Function gives all possible path from 'start' vertex to 'goal' vertex, inclusive of both '''
# radius = # edges between start and end vertex
def bfs_paths(graph, start, radius):
    queue = [(start, [start])]
    while queue:
        (vertex, path) = queue.pop(0)
        for next in graph[vertex] - set(path):
            depth = len(path + [next]) - 1
            if depth == radius:
                yield path + [next]
            else:
                queue.append((next, path + [next]))

'''Step 2.2.2: Function which returns a dictionary for a given user
   where each item represents the key in the dictionary
   and it returns a list of lists(paths) from user to item r-hop distance apart'''
# help from 
# https://www.kumari.net/index.php/programming/programmingcat/22-python-making-a-dictionary-of-lists-a-hash-of-arrays
def create_item_dict(all_path):
    dict_path = dict()
    for path in all_path:
        r_hop_item = path[-1]
        dict_path.setdefault(r_hop_item,[]).append(path) 
    return dict_path

'''Step 2.2.3: Function to get product for r-hop path from user to item
   Choose any path at random if #paths > 1'''
def get_product(path):
    if len(path) < 1:
        return GET_PRODUCT_FAIL_RETURN
    idx = random.randint(0, len(path)-1)    # in case of multiple paths to same item
    p = path[idx]                           # choose any one path at random

    product = 1
    for i in range(len(p)-1):
        product = product * new_m1_matrix[p[i],p[i+1]]
    return product

'''Function to find and replace some values
   for only 1d and 2d numpy arrays'''
def find_and_replace(data, find_value, replace_value):
    if len(data.shape) == 1:
        for i in range(len(data)):
            if data[i] == find_value:
                data[i] = replace_value
    elif len(data.shape) == 2:
        for i in range(len(data)):
            for j in range(len(data[i])):
                if data[i,j] == find_value:
                    data[i,j] = replace_value
    return data


In [17]:
user_list = np.array(list(set(data_csr[:,0])))                 ##### UNCOMMENT THIS CELL
item_list = np.array(list(set(data_csr[:,1])))

In [18]:
#user_list = np.array(list(set(m1_csr[:,0])))                  ##### REMOVE THIS CELL
#item_list = np.array(list(set(m1_csr[:,1])))
#RADIUS = 3

In [19]:
product_matrix = np.full((len(user_list), len(item_list)), UNOBSERVED, dtype=int)

for user in tqdm(user_list):
    all_path = list(bfs_paths(graph, user, radius=RADIUS))  # get a list of all r-hop paths from given user
    dict_path = create_item_dict(all_path)                  # create dict of paths from user to individual items
    for item in dict_path:
        paths = dict_path[item]                             # get the set of user-item paths
        product = get_product(paths)                        # get product for a unique user-item path (at random)
        product_matrix[user, (item - OFFSET)] = product

print('done')

100%|██████████| 943/943 [00:55<00:00, 17.10it/s]

done





# Step 3: Computing the distances

use M2 to get distance (or similarity) between two users based on neighbourhood from previous step

In [20]:
# making all unobserved entries in product_matrix as zero
# makes it simpler for pearson similarity calculation, probably..

product_matrix = find_and_replace(data=product_matrix, find_value=UNOBSERVED, replace_value=0)

In [21]:
# Currently using simple pearson similarity:

from scipy import stats

user_sim_matrix = np.full((len(user_list), len(user_list)), UNOBSERVED, dtype=float)
for user1 in tqdm(user_list):
    for user2 in user_list:
        if user1 >= user2:
            [sim, p_value] = stats.pearsonr(product_matrix[user1], product_matrix[user2])
            if np.isnan(sim):                       # TODO: check if this is valid to do?
                sim = 0
            user_sim_matrix[user1,user2] = user_sim_matrix[user2,user1] = sim
            # similarity is between 0 and 1
            # therefore, these can be directly used as weights on users' rating for prediction

#del product_matrix  #no need in future

  r = r_num / r_den
100%|██████████| 943/943 [00:54<00:00, 17.35it/s]


# Step 4: Averaging datapts to get final estimates

use M3 to perform a weighted avg using similarity computed in previous step ; account for threshold n_n

In [22]:
m3_matrix = csr_to_matrix(m3_csr)

In [23]:
predicted_matrix = np.full((len(user_list), len(item_list)), UNOBSERVED, dtype=float)
for user in tqdm(user_list):
    for item in item_list:
        if m3_matrix[user,item] == UNOBSERVED:
            # we basically do a dot product but avoid taking UNOBSERVED user similarities or item ratings
            predicted_rating = user_sim_matrix[user, m3_matrix[:,item] != UNOBSERVED      ]\
                                .dot(m3_matrix[      m3_matrix[:,item] != UNOBSERVED, item])
                
            #if np.isnan(predicted_rating):
            #    print(user,item)
            
            if predicted_rating > 5:
                predicted_rating = 5
            elif predicted_rating < 1:
                predicted_rating = 1
            predicted_matrix[user,item] = predicted_rating

#del user_sim_matrix
print('done')  # TODO : put a check for fail

100%|██████████| 943/943 [00:50<00:00, 18.68it/s]

done





## Evaluation

We evaluate using RMSE for now, will account for other advanced metrics in future

In [24]:
# for all the available ratings in testset
# and for all the predicted rating for those available rating
# put them in two separate vectors
# get rmse using scikit libraries

y_actual  = np.full((len(test_data_csr)), UNOBSERVED, dtype=float)
y_predict = np.full((len(test_data_csr)), UNOBSERVED, dtype=float)

for i in tqdm(range(len(test_data_csr))):
    testpt = test_data_csr[i]
    y_actual[i]  = testpt[2]
    y_predict[i] = predicted_matrix[testpt[0], testpt[1]]
    if y_predict[i] == UNOBSERVED:       # i.e. we could not generate a rating for this test user item pair
        y_predict[i] = AVG_RATING

#del predicted_matrix
print('done')  # TODO : put a check for fail

100%|██████████| 20000/20000 [00:00<00:00, 239108.17it/s]

done





In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_actual, y_predict))
rms

1.8278700378337203

Rough work: functions to be used for debugging, or no longer neede

In [26]:
#Function to read CSR data file in matrix format
import csv
def read_csr_data_as_matrix(users=USERS, items=ITEMS, symmetry=False):
    if symmetry:
        users = items = max(users, items)
    data_matrix = np.full((users, items), UNOBSERVED, dtype=int)
    with open(DATA_PATH) as tsv:
        for line in csv.reader(tsv, delimiter="\t"):    # line is a list containing all tab separated items
            line = [int(i) for i in line]               # convert the list of str to list of int
            data_matrix[line[0] - FIRST_INDEX][line[1] - FIRST_INDEX] = line[2]
    return data_matrix
'''
data_matrix = read_csr_data_as_matrix()
if data_matrix[data_matrix > 0].size == N_RATINGS:  # gives total no of ratings read; useful for verification
    print('Reading data_matrix: success')
else:
    print('Reading data_matrix: FAILED')
    print( '# of missing ratings: ' + str(N_RATINGS - data_matrix[data_matrix > 0].size))
'''

"\ndata_matrix = read_csr_data_as_matrix()\nif data_matrix[data_matrix > 0].size == N_RATINGS:  # gives total no of ratings read; useful for verification\n    print('Reading data_matrix: success')\nelse:\n    print('Reading data_matrix: FAILED')\n    print( '# of missing ratings: ' + str(N_RATINGS - data_matrix[data_matrix > 0].size))\n"

In [27]:
# Function gives all possible path from 'start' vertex to 'goal' vertex, inclusive of both
def old_bfs_paths(graph, start, goal):
    queue = [(start, [start])]
    while queue:
        (vertex, path) = queue.pop(0)
        for next in graph[vertex] - set(path):
            if next == goal:
                yield path + [next]
            else:
                queue.append((next, path + [next]))


# Function to get paths from user to item at r-hop distance
# assuming self rated movies are at radius = 0 and direct friends' movie at radius = 2
def get_r_hop_path(path, radius=RADIUS):
    exact_path = []
    for p in path:
        if ((len(p)/2) - 1) == radius:
            exact_path.append(p)
    return exact_path


In [28]:
#Step 2.2.1: Function gives all possible path from 'start' vertex to 'goal' vertex, inclusive of both
def new_bfs_paths(graph, start, radius):
    print('Start: ' + str(start))
    print(" ")
    queue = [(start, [start])]
    while queue:
        print("Begin: " + str(queue))
        (vertex, path) = queue.pop(0)
        print('Mid: ' + str(queue))
        print('Vertex: ' + str(vertex) + ' ; path: ' + str(path))
        for next in graph[vertex] - set(path):
            print('Next: ' + str(next))
            depth = len(path + [next]) - 1
            if depth == radius:
                print('Max depth reached: ' + str(depth))
                yield path + [next]
            #if next == goal:
            #    print('Goal reached, final path: ' + str(path + [next]))
            else:
                print('Appended: ' + str(path + [next]))
                queue.append((next, path + [next]))
            print('Depth: ' + str(depth))
        print(" ")


#path = list(new_bfs_paths(graph, 2, 3))