In [24]:
import numpy as np
import pandas as pd

from math import sqrt
from sklearn.metrics  import mean_squared_error
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

In [25]:
dataset = pd.read_excel('jester-data-1.xls', header=None)

In [26]:
uid = np.arange(1, len(dataset.index)+1)
dataset.insert(loc=1, column="userId", value=uid) 
dataset = dataset.drop([0], axis=1)

dataset = dataset.melt(id_vars="userId", var_name="jokeId", value_name='rating')
dataset.head()

Unnamed: 0,userId,jokeId,rating
0,1,1,-7.82
1,2,1,4.08
2,3,1,99.0
3,4,1,99.0
4,5,1,8.5


In [27]:
missing_pivot = dataset.pivot_table(values='rating', index='userId', columns='jokeId')

rate = {}
rows_indexes = {}

for i, row in missing_pivot.iterrows():
    rows = [x for x in range(0,len(missing_pivot.columns))]
    combine = list(zip(row.index, row.values, rows))
    rated = [(x,z) for x,y,z in combine if str(y) != 'nan']
    index = [i[1] for i in rated]
    row_names = [i[0] for i in rated]
    rows_indexes[i] = index
    rate[i] = row_names

missing_pivot.head()

jokeId,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
2,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
3,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
4,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
5,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [28]:
pivot_table = dataset.pivot_table(values= 'rating', index= 'userId', columns= 'jokeId').fillna(0)
pivot_table = pivot_table.apply(np.sign)


notrated = {}
notrated_indexes = {}

for i,row in pivot_table.iterrows():
    rows = [x for x in range(0, len(missing_pivot.columns))]
    combine = list(zip(row.index, row.values, row))
    idx_row = [(idx,col) for idx, val, col  in combine if not val > 0]
    indices = [i[1] for i in idx_row]
    row_names = [i[0] for i in idx_row]
    notrated[i] = row_names
    notrated_indexes[i] = indices

pivot_table.head()

jokeId,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0
2,1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,...,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0,-1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
n = 5
nn = NearestNeighbors(n_neighbors=n, algorithm='brute', metric= 'cosine')
item_nn_fit = nn.fit(pivot_table.T.values)
item_distance, item_indices = item_nn_fit.kneighbors(pivot_table.T.values)

In [30]:
item_dic = {}
for i in range(len(pivot_table.T.index)):
    item_idx = item_indices[i]
    col_names = pivot_table.T.index[item_idx].tolist()
    item_dic[pivot_table.T.index[i]] = col_names

In [31]:
topRecs = {}
for k,v in rows_indexes.items():
    item_idx = [j for i in item_indices[v] for j in  i]
    item_dist = [j for i in item_distance[v] for j in i]
    combine = list(zip(item_dist, item_idx))
    diction = {1:d for d,i in  combine if i not in v}
    zipped = list(zip(diction.keys(), diction.values()))
    sort = sorted(zipped, key=lambda x:x[1])
    recommendations = [(pivot_table.columns[i], d) for i,d in sort]
    topRecs[k] = recommendations

In [34]:
def getrecommendations(user, number_of_recs = 30):
    if user > len(pivot_table.index):
        print('Out of Range, there are only {} users, try number below!'.format(len(pivot_table.index)))
    else:
        print("You watched these movies: \n\n{}".format('\n'.join(rate[user])))
        print()
        print('Recommended Movies:\n')
    for k,v in topRecs.items():
        if user == k:
            for i in v[:number_of_recs]:
                print('{} with similarity: {:.4f}'.format(i[0], 1 - i[1]))

In [None]:
getrecommendations(601)

In [17]:
item_distance = 1 - item_distance
predictions = item_distance.T.dot(pivot_table.T.values)/np.array([np.abs(item_distance.T).sum(axis=1)]).T
ground_truth = pivot_table.T.values[item_distance.argsort()[0]]

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

error_rate = rmse(predictions, ground_truth)
print("Accuracy: {:.3f}".format(100-error_rate))
print("RMSE: {:.5f}".format(error_rate))

Accuracy: 99.081
RMSE: 0.91921
