# Recommender sytems
### Loading the libaries and reading the data

In [1]:
from surprise import KNNBasic, SVD, BaselineOnly
from surprise import Dataset,accuracy
from surprise import Reader, AlgoBase
import os
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import NMF
import numpy as np

In [2]:
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)


In [3]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fd86a8053d0>

### Spliting the data and training the model

In [4]:
from surprise.model_selection import train_test_split

In [5]:
trainset, testset = train_test_split(data, test_size=.25)

In [6]:
# We'll use the famous SVD algorithm.
algo = SVD()

In [7]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

In [8]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9407


0.940676612767661

### Cross validation

In [9]:
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9306  0.9363  0.9341  0.9476  0.9294  0.9356  0.0065  
MAE (testset)     0.7306  0.7394  0.7373  0.7455  0.7351  0.7376  0.0049  
Fit time          26.45   30.37   27.82   28.54   25.82   27.80   1.61    
Test time         1.60    1.16    1.42    1.97    1.28    1.49    0.29    


{'test_rmse': array([0.9305993 , 0.93627061, 0.93414112, 0.94758108, 0.92942359]),
 'test_mae': array([0.73056134, 0.73940869, 0.73734321, 0.74545279, 0.73512622]),
 'fit_time': (26.447699546813965,
  30.369539976119995,
  27.81802988052368,
  28.536977291107178,
  25.815666675567627),
 'test_time': (1.598282814025879,
  1.155954360961914,
  1.4189350605010986,
  1.9739329814910889,
  1.27901029586792)}

### KNNBasic algorithm

In [10]:
# Retrieve the trainset.
trainset = data.build_full_trainset()


In [11]:
# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f5fd7789310>

In [12]:
pred = algo.test(testset)

In [13]:
 # Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9407


0.940676612767661

In [14]:
# Compute mean absolute error
accuracy.mae(predictions, verbose=True)

MAE:  0.7401


0.7401460437863023

### Tuning hyperparameters

In [15]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

In [16]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [17]:
gs.fit(data)

In [18]:
# best RMSE score
print(gs.best_score['rmse'])

0.9640258480175135


In [19]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [20]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f5fd5f31990>

### Cosine similarity

In [6]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }


In [7]:
algo = KNNBasic(sim_options=sim_options)

In [8]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fd868837f10>

In [9]:
predictions = algo.test(testset)

In [10]:
cosine_rmse = accuracy.rmse(predictions, verbose=True)
cosine_rmse

RMSE: 1.0362


1.0361966176161543

In [11]:
cosine_mae = accuracy.mae(predictions, verbose=True)
cosine_mae

MAE:  0.8233


0.823342215047417

### Pearson similarity

In [12]:
sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0  # no shrinkage
               }

In [13]:
algo = KNNBasic(sim_options=sim_options)


In [14]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fd868837ad0>

In [15]:
predictions = algo.test(testset)

In [16]:
pearson_rmse = accuracy.rmse(predictions, verbose=True)

RMSE: 1.0147


In [17]:
pearson_mae = accuracy.mae(predictions, verbose=True)

MAE:  0.8051


## Baseline estimates
### Altenating least square

In [18]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }

Using ALS


In [19]:
algo = BaselineOnly(bsl_options=bsl_options)

In [20]:
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fd868864b50>

In [21]:
predictions = algo.test(testset)

In [22]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9467


0.9467357076898029

In [23]:
accuracy.mae(predictions, verbose=True)

MAE:  0.7521


0.7521304501702972

### Stochastic gradient descent

In [24]:
bsl_options = {'method': 'als',
               'n_epochs': 20,
               }
sim_options = {'name': 'pearson_baseline'}


In [25]:
algo = KNNBasic(bsl_options=bsl_options, sim_options=sim_options)

In [26]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fd867179390>

In [27]:
predictions = algo.test(testset)

In [28]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.0051


1.0051244636409176

## Triangle similarity

In [29]:
class triangle(AlgoBase):

    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        triangle =  (1 - (u - i)/(u+i))
        return triangle


In [30]:
algo = triangle()

In [33]:
algo.fit(trainset)

<__main__.triangle at 0x7fd8671788d0>

In [34]:
predictions = algo.test(testset)

TypeError: unsupported operand type(s) for -: 'int' and 'str'

In [51]:
triangle_rmse = accuracy.rmse(predictions, verbose=True)
triangle_rmse

RMSE: 1.0051


1.0051244636409176

In [52]:
triangle_mae = accuracy.mae(predictions, verbose=True)
triangle_mae

MAE:  0.7971


0.7970969229061583

In [53]:
accuracy.mse(predictions, verbose=True)

MSE: 1.0103


1.0102751874094422

#### Fit method

In [79]:
class triangle_sim(AlgoBase):

    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def fit(self, trainset):

        # Here again: call base method before doing anything.
        AlgoBase.fit(self, trainset)

        # Compute the average rating. We might as well use the
        # trainset.global_mean attribute ;)
        self.the_mean = np.mean([r for (_, _, r) in
                                 self.trainset.all_ratings()])

        return self

    def estimate(self, u, i):

        return self.the_mean

In [80]:
algo = triangle_sim()

In [81]:
algo.fit(trainset)

<__main__.triangle_sim at 0x7fd83bfd5290>

In [82]:
predictions = algo.test(testset)

In [83]:
triangle_sim_rmse = accuracy.rmse(predictions, verbose=True)

RMSE: 0.0000


In [84]:
triangle_sim_mae = accuracy.mae(predictions, verbose=True)
triangle_sim_mae

MAE:  0.0000


0.0

In [85]:
accuracy.mse(predictions, verbose=True)

MSE: 0.0000


0.0

## Triangle multiplying Jaccard

In [86]:
class tmj_sim(AlgoBase):

    def __init__(self, sim_options={}, bsl_options={}):

        AlgoBase.__init__(self, sim_options=sim_options,
                          bsl_options=bsl_options)

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)

        # Compute baselines and similarities
        self.bu, self.bi = self.compute_baselines()
        self.sim = self.compute_similarities()

        return self

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # Compute similarities between u and v, where v describes all other
        # users that have also rated item i.
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort these neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
        triangle =  (1 - (u - i)/(u+i))
        intersection = u&i
        jaccard = ((intersection)/(u + i- intersection))
        tmj = triangle * jaccard
        print('The 3 nearest neighbors of user', str(u), 'are:')
        for v, sim_uv in neighbors[:3]:
            print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))

        # ... Aaaaand return the baseline estimate anyway ;)

In [87]:
algo = tmj_sim()

In [88]:
algo.fit(trainset)

<__main__.tmj_sim at 0x7fd83c0716d0>

In [89]:
predictions = algo.test(testset)

The 3 nearest neighbors of user 0 are:
user 56 with sim 0.39
user 393 with sim 0.35
user 335 with sim 0.33


TypeError: '<' not supported between instances of 'NoneType' and 'int'

In [90]:
tmj_rmse = accuracy.rmse(predictions, verbose=True)
tmj_rmse

RMSE: 0.0000


0.0

In [91]:
tmj_mae = accuracy.mae(predictions, verbose=True)
tmj_mae

MAE:  0.0000


0.0

In [92]:
accuracy.mse(predictions, verbose=True)

MSE: 0.0000


0.0

## Getting top n recommendations for each user

In [93]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [94]:
algo = tmj_sim()

In [95]:
algo.fit(trainset)

<__main__.tmj_sim at 0x7fd83bb5e4d0>

In [96]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

In [97]:
predictions = algo.test(testset)

The 3 nearest neighbors of user 0 are:
user 56 with sim 0.39
user 393 with sim 0.35
user 335 with sim 0.33


TypeError: '<' not supported between instances of 'NoneType' and 'int'

In [98]:
top_n = get_top_n(predictions, n=10)

In [99]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

551 ['805', '436', '132', '1063', '523', '175', '463', '184', '116', '190']
184 ['1304', '436', '628', '1063', '31', '85', '451', '790', '732', '463']
28 ['1304', '805', '79', '70', '628', '117', '132', '1063', '85', '451']
562 ['1304', '805', '436', '70', '628', '117', '132', '1063', '31', '85']
608 ['1304', '805', '436', '628', '117', '31', '85', '451', '790', '523']
761 ['1304', '805', '436', '79', '70', '132', '1063', '31', '85', '451']
682 ['1304', '805', '436', '132', '1063', '31', '523', '463', '116', '190']
175 ['1304', '805', '436', '79', '70', '628', '117', '1063', '85', '451']
303 ['1304', '628', '132', '1063', '85', '523', '732', '175', '463', '116']
336 ['1304', '805', '436', '79', '628', '132', '1063', '31', '790', '523']
246 ['1304', '805', '436', '79', '70', '1063', '31', '85', '523', '732']
586 ['1304', '805', '70', '132', '1063', '451', '523', '732', '175', '463']
660 ['1304', '805', '436', '79', '70', '628', '132', '1063', '31', '85']
92 ['1304', '805', '70', '628', 

776 ['1304', '805', '79', '70', '628', '117', '1063', '31', '85', '451']
494 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
76 ['1304', '805', '436', '79', '117', '132', '1063', '31', '85', '451']
308 ['1304', '70', '1063', '85', '451', '790', '523', '463', '190', '281']
158 ['1304', '805', '436', '628', '132', '1063', '31', '451', '790', '523']
928 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
708 ['1304', '805', '436', '79', '70', '132', '1063', '31', '85', '451']
54 ['1304', '805', '436', '79', '70', '628', '132', '1063', '31', '85']
466 ['1304', '805', '436', '70', '628', '132', '1063', '31', '85', '451']
913 ['1304', '805', '79', '70', '628', '1063', '31', '85', '451', '790']
533 ['1304', '805', '436', '79', '628', '1063', '85', '790', '523', '732']
267 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
443 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
905 ['1304', '805', '436', '79', '70', '

131 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
575 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
515 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
386 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
765 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
914 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
220 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
147 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']
742 ['1304', '805', '436', '79', '70', '628', '117', '132', '1063', '31']


## Comparison between different similarity measures

In [100]:
import matplotlib.pyplot as plt

In [101]:
print(cosine_mae, pearson_mae, triangle_mae, tmj_mae)

0.823342215047417 0.8050681852575561 0.7970969229061583 0.0


In [102]:
print(cosine_rmse, pearson_rmse, triangle_rmse, tmj_rmse)

1.0361966176161543 1.0147192389796589 1.0051244636409176 0.0


In [103]:
tmj_rmse

0.0

In [104]:
type(tmj_mae)

numpy.float64