# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [2]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

#943 users, 1664 movies

(943, 1664)
(943, 1664)


In [3]:
user_ratings_train.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,5.0,,,3.0,4.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,4.0,,,...,,,,4.0,,,,,,


In [4]:
user_ratings_test.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,,,,,,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,,,,...,,,,,,,,,4.0,


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [18]:
# your code
from sklearn.neighbors import NearestNeighbors

df = user_ratings_train
df = df.fillna(0)

knn = NearestNeighbors(metric='correlation')
knn.fit(df.values)
distances, indices = knn.kneighbors(df.values, n_neighbors=6)

array([[0.00000000e+00, 6.47544752e-01, 6.60925605e-01, 6.87676851e-01,
        6.91347417e-01, 6.93769310e-01],
       [0.00000000e+00, 5.33992742e-01, 5.57585434e-01, 5.59853814e-01,
        5.64183989e-01, 5.71700215e-01],
       [2.22044605e-16, 6.01102428e-01, 6.35592747e-01, 6.50536175e-01,
        6.55089862e-01, 6.61778133e-01],
       ...,
       [0.00000000e+00, 5.06089531e-01, 5.44251320e-01, 5.85830872e-01,
        6.03092022e-01, 6.04274489e-01],
       [0.00000000e+00, 6.27357479e-01, 6.42508716e-01, 6.68873411e-01,
        6.82356423e-01, 6.83230542e-01],
       [0.00000000e+00, 5.13492714e-01, 5.39363404e-01, 5.52326991e-01,
        5.56149507e-01, 5.58960736e-01]])

In [20]:
distances[0]

array([0.        , 0.64754475, 0.66092561, 0.68767685, 0.69134742,
       0.69376931])

In [23]:
indices[0]

array([  0, 822, 513, 863, 912, 520], dtype=int64)

In [30]:
train_users_avg = user_ratings_train.mean(axis=1).values
len(train_users_avg)

943

In [82]:
train_data = user_ratings_train.copy()
display(train_data)

train_data = train_data.fillna(0)
train_data = train_data.values

test_data = user_ratings_test.copy().fillna(0).values

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,5.0,,,3.0,4.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,4.0,,,...,,,,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,3.0,,3.0,...,,,,,,,,,,


In [83]:
pred_arr = []
real_arr = []

for i in range(0, len(train_data)): #row
    for j in range(0, len(train_data[i])): #column
        if (train_data[i][j] == 0 and test_data[i][j] != 0):
            toAdd = 0
            for k in range(1, 6): #nearest neighbors to i
                if train_data[indices[i][k]][j] != 0:
                    toAdd += (distances[i][k] * (train_data[indices[i][k]][j] - train_users_avg[indices[i][k]])) / sum(distances[i][1:])
                else:
                    toAdd += distances[i][k] / sum(distances[i][1:])
            pred = train_users_avg[i] + toAdd
            train_data[i][j] = pred
            pred_arr.append(pred)
            real_arr.append(test_data[i][j])

In [84]:
train_data[0][0:6]

array([0.        , 0.        , 4.30085625, 5.        , 0.        ,
       0.        ])

In [85]:
test_data[0][0:6]

array([0., 0., 2., 0., 0., 0.])

In [88]:
from sklearn.metrics import mean_absolute_error
print('MAE: {}'.format(mean_absolute_error(pred_arr, real_arr)))

MAE: 1.0001660736596059


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [93]:
# your code
df2 = user_ratings_train
df2 = df2.fillna(0)

knn2 = NearestNeighbors(metric='cosine')
knn2.fit(df2.values)
distances2, indices2 = knn2.kneighbors(df2.values, n_neighbors=6)

#user_ratings_train.corr(method='pearson')

print(distances2, indices2)

[[1.11022302e-16 5.91773448e-01 6.02749742e-01 6.15857667e-01
  6.31609840e-01 6.32849658e-01]
 [0.00000000e+00 5.25538749e-01 5.45552880e-01 5.47289564e-01
  5.52654706e-01 5.64547173e-01]
 [0.00000000e+00 5.93191585e-01 6.21828886e-01 6.38877268e-01
  6.43475069e-01 6.45874741e-01]
 ...
 [0.00000000e+00 4.98309711e-01 5.35760013e-01 5.76765086e-01
  5.91794047e-01 5.96183301e-01]
 [2.22044605e-16 5.84281432e-01 6.15994820e-01 6.18793685e-01
  6.41684895e-01 6.42957319e-01]
 [0.00000000e+00 4.50233090e-01 4.90613702e-01 4.98535447e-01
  5.06781071e-01 5.08027674e-01]] [[  0 822 513 863 591 605]
 [  1 519 734 677 700 265]
 [  2 655 751 610 783 586]
 ...
 [940 688 816 729 581 741]
 [941 453 473 779 487 715]
 [942 681 932 550 708 585]]


In [94]:
train_data2 = user_ratings_train.copy().fillna(0).values
test_data2 = user_ratings_test.copy().fillna(0).values

pred_arr2 = []
real_arr2 = []

for i in range(0, len(train_data2)): #row
    for j in range(0, len(train_data2[i])): #column
        if (train_data2[i][j] == 0 and test_data2[i][j] != 0):
            toAdd = 0
            for k in range(1, 6): #nearest neighbors to i
                if train_data2[indices2[i][k]][j] != 0:
                    toAdd += (distances2[i][k] * (train_data2[indices2[i][k]][j] - train_users_avg[indices2[i][k]])) / sum(distances2[i][1:])
                else:
                    toAdd += distances2[i][k] / sum(distances2[i][1:])
            pred2 = train_users_avg[i] + toAdd
            train_data2[i][j] = pred
            pred_arr2.append(pred2)
            real_arr2.append(test_data2[i][j])

In [95]:
print('MAE: {}'.format(mean_absolute_error(pred_arr2, real_arr2)))

MAE: 0.9858113952503229
