# Recommender systems
### Loading the libraries and reading the data

In [None]:
pip install surprise

In [None]:
import os

import numpy as np
from surprise import KNNBasic, SVD, BaselineOnly
from surprise import Dataset,accuracy
from surprise import Reader, AlgoBase
from surprise import NMF
from surprise import PredictionImpossible
from surprise.model_selection import cross_validate, GridSearchCV

In [None]:
# Load Surprise's built-in 'ml-100k' rating dataset.
data = Dataset.load_builtin('ml-100k')

### Splitting the data and training the model

In [None]:
from surprise.model_selection import train_test_split

In [None]:
# Hold out 25% of the ratings as the test set; the rest forms the trainset.
trainset, testset = train_test_split(data, test_size=.25)

In [None]:
# We'll use the famous SVD algorithm.
algo = SVD()

In [None]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

In [None]:
# Then compute RMSE
accuracy.rmse(predictions)

### Cross validation

In [None]:
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### KNNBasic algorithm

In [None]:
# Retrieve the trainset.
# NOTE(review): this rebinds `trainset` to one built from ALL of `data`, while
# `testset` still comes from the earlier train_test_split of the same `data`.
# Models fitted on this trainset and then scored with algo.test(testset) have
# presumably already seen the test ratings -- confirm this is intended.
trainset = data.build_full_trainset()


In [None]:
# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

In [None]:
pred = algo.test(testset)

In [None]:
 # Compute and print Root Mean Squared Error
# Score the KNNBasic predictions (`pred`), not the earlier SVD `predictions`.
accuracy.rmse(pred, verbose=True)

In [None]:
# Compute mean absolute error of the KNNBasic predictions (`pred`),
# not of the earlier SVD `predictions`.
accuracy.mae(pred, verbose=True)

### Tuning hyperparameters

In [None]:
# Candidate hyperparameter values explored by the SVD grid search below:
# every combination of the listed values is evaluated (2 x 2 x 2 = 8).
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

In [None]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


In [None]:
gs.fit(data)

In [None]:
# best RMSE score
print(gs.best_score['rmse'])

In [None]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

### Cosine similarity

In [None]:
# Similarity configuration passed to KNNBasic below: cosine similarity,
# computed item-to-item rather than user-to-user.
sim_options = {'name': 'cosine',
               'user_based': False  # compute similarities between items
               }


In [None]:
algo = KNNBasic(sim_options=sim_options)

In [None]:
algo.fit(trainset)

In [None]:
predictions = algo.test(testset)

In [None]:
cosine_rmse = accuracy.rmse(predictions, verbose=True)
cosine_rmse

In [None]:
cosine_mae = accuracy.mae(predictions, verbose=True)
cosine_mae

Splitting the data into 5 folds and evaluating the performance with cross-validation

In [None]:
# Run 5 k-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Hyperparameter tuning

In [None]:
# Search grid for the KNNBasic grid search below: neighborhood size `k` plus
# nested option dicts whose list values are the candidates to try.
# NOTE(review): `bsl_options` presumably only matters for baseline-based
# algorithms/similarities; if KNNBasic with 'msd'/'cosine' ignores it, its
# four method/reg combinations just repeat identical fits -- confirm.
param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg': [1, 2]},
              'k': [2, 3],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [False]}
              }

In [None]:
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)


In [None]:
gs.fit(data)


In [None]:
# Best RMSE score found by the grid search. Print the tuned value that was
# just stored, not the untuned `cosine_rmse` computed earlier.
tuned_cosine_rmse = gs.best_score['rmse']
print(tuned_cosine_rmse)

In [None]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
tuned_cosine_mae = gs.best_score['mae']
tuned_cosine_mae

In [None]:
# combination of parameters that gave the best MAE score
print(gs.best_params['mae'])

In [None]:
import pandas as pd
results_df = pd.DataFrame.from_dict(gs.cv_results)


In [None]:
results_df.plot()

### Pearson similarity

In [None]:
# Similarity configuration passed to KNNBasic below: the 'pearson_baseline'
# measure with its shrinkage parameter set to 0.
sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0  # no shrinkage
               }

In [None]:
algo = KNNBasic(sim_options=sim_options)


In [None]:
algo.fit(trainset)

In [None]:
predictions = algo.test(testset)

In [None]:
pearson_rmse = accuracy.rmse(predictions, verbose=True)

In [None]:
pearson_mae = accuracy.mae(predictions, verbose=True)

In [None]:
# Run 5 k-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

**Jaccard Similarity**

In [None]:
class jaccard(AlgoBase):
    """Item-based rating predictor weighted by Jaccard similarity.

    Two items are compared through the sets of users who rated them:
    ``|A & B| / |A | B|``. The rating of user ``u`` for item ``i`` is
    estimated as the similarity-weighted mean of the ratings ``u`` gave to
    other items.

    The previous ``estimate`` applied the Jaccard formula to the inner ids
    ``u`` and ``i`` themselves (bitwise AND of two integers). That is not a
    similarity, it divided by zero for ids (0, 0), and it failed with a
    TypeError on unknown (string) ids.
    """

    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def fit(self, trainset):
        """Record, for every item, the set of users who rated it.

        Args:
            trainset: The Surprise trainset to learn from.

        Returns:
            This fitted algorithm instance.
        """
        AlgoBase.fit(self, trainset)

        # trainset.ir maps an inner item id to its (inner user id, rating)
        # pairs; only the user ids are needed to compare items.
        self.raters = {
            iid: {uid for uid, _ in item_ratings}
            for iid, item_ratings in trainset.ir.items()
        }
        return self

    def estimate(self, u, i):
        """Estimate the rating of inner user ``u`` for inner item ``i``.

        Args:
            u: Inner user id.
            i: Inner item id.

        Returns:
            The Jaccard-weighted mean of the user's ratings on other items.

        Raises:
            PredictionImpossible: If the user or item is unknown, or if no
                item rated by the user shares any rater with item ``i``.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        target_raters = self.raters[i]
        weighted_sum = 0.0
        weight_total = 0.0
        for j, rating in self.trainset.ur[u]:
            if j == i:
                # Never let the target item's own rating leak into its
                # prediction.
                continue
            other_raters = self.raters[j]
            shared = len(target_raters & other_raters)
            if not shared:
                continue
            # |A | B| = |A| + |B| - |A & B|; non-zero because shared > 0.
            similarity = shared / (
                len(target_raters) + len(other_raters) - shared)
            weighted_sum += similarity * rating
            weight_total += similarity

        if not weight_total:
            raise PredictionImpossible('No similar rated items.')

        return weighted_sum / weight_total
# Baseline-only model configured to use SGD with a small learning rate.
# NOTE(review): `algo` is rebound to jaccard() in the next cell before any
# fit/test call, so this BaselineOnly instance appears to be unused.
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
algo = BaselineOnly(bsl_options=bsl_options)


In [None]:
algo = jaccard()

In [None]:
algo.fit(trainset)

In [None]:
predictions = algo.test(testset, verbose=False)

In [None]:
jaccard_rmse = accuracy.rmse(predictions, verbose=True)

In [None]:
jaccard_mae = accuracy.mae(predictions, verbose=True)

## Triangle multiplying Jaccard

In [None]:
class tmj_sim(AlgoBase):
    """Baseline predictor that can also report each user's nearest neighbors.

    ``fit`` computes baselines and a similarity matrix through the
    ``AlgoBase`` helpers; ``estimate`` returns the baseline estimate
    ``global_mean + bu[u] + bi[i]``.

    NOTE(review): despite the name, no triangle or Jaccard similarity is
    computed here. The earlier code applied those formulas to the inner ids
    ``u`` and ``i`` themselves, never used the result, and divided by zero
    when both ids were 0, so that arithmetic was removed. A real TMJ measure
    would have to be derived from the rating vectors.

    Args:
        sim_options (dict | None): Similarity options forwarded to AlgoBase.
            ``None`` means an empty dict.
        bsl_options (dict | None): Baseline options forwarded to AlgoBase.
            ``None`` means an empty dict.
        verbose (bool): When True (the default, matching the previous
            behavior) every call to ``estimate`` prints the 3 nearest
            neighbors of the user.
    """

    def __init__(self, sim_options=None, bsl_options=None, verbose=True):

        # Build fresh dicts per instance so that instances never share (and
        # cannot mutate) one default dict object.
        AlgoBase.__init__(
            self,
            sim_options={} if sim_options is None else sim_options,
            bsl_options={} if bsl_options is None else bsl_options)
        self.verbose = verbose

    def fit(self, trainset):
        """Fit on ``trainset``: compute baselines and similarities.

        Returns:
            This fitted algorithm instance.
        """
        AlgoBase.fit(self, trainset)

        # Compute baselines and similarities
        self.bu, self.bi = self.compute_baselines()
        self.sim = self.compute_similarities()

        return self

    def estimate(self, u, i):
        """Return the baseline rating estimate for inner ids ``u`` and ``i``.

        Raises:
            PredictionImpossible: If the user or the item is not part of the
                trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        if self.verbose:
            # Compute similarities between u and v, where v describes all
            # other users that have also rated item i.
            # NOTE(review): indexing self.sim by two user ids assumes the
            # similarity matrix is user-based -- confirm sim_options.
            neighbors = [(v, self.sim[u, v])
                         for (v, r) in self.trainset.ir[i]]
            # Sort these neighbors by similarity
            neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)

            print('The 3 nearest neighbors of user', str(u), 'are:')
            for v, sim_uv in neighbors[:3]:
                print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))

        # ... Aaaaand return the baseline estimate anyway ;)
        # The original fell off the end here and returned None, which is not
        # a usable rating estimate.
        return self.trainset.global_mean + self.bu[u] + self.bi[i]
# Baseline-only model configured to use ALS (5 epochs, reg_u=12, reg_i=5).
# NOTE(review): the following cells fit and score this BaselineOnly instance,
# so the values stored as tmj_rmse / tmj_mae come from it rather than from
# the tmj_sim class defined above.
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)

In [None]:
algo.fit(trainset)

In [None]:
predictions = algo.test(testset)

In [None]:
tmj_rmse = accuracy.rmse(predictions, verbose=True)
tmj_rmse

In [None]:
tmj_mae = accuracy.mae(predictions, verbose=True)
tmj_mae

In [None]:
accuracy.mse(predictions, verbose=True)

## Getting top n recommendations for each user

In [None]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Build the n best-estimated items per user from a list of predictions.

    Args:
        predictions: Iterable of 5-field prediction records in the order
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
        A defaultdict(list) keyed by user (raw) id. Each value is a list of
        at most n ``(raw item id, rating estimation)`` tuples, ordered from
        the highest estimate to the lowest.
    '''

    # Group the (item, estimate) pairs under the user they belong to.
    by_user = defaultdict(list)
    for user_id, item_id, _actual, estimate, _details in predictions:
        by_user[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate, best first, and keep the first n.
    for user_id in by_user:
        ranked = sorted(by_user[user_id],
                        key=lambda pair: pair[1],
                        reverse=True)
        by_user[user_id] = ranked[:n]

    return by_user


In [None]:
algo = tmj_sim()

In [None]:
algo.fit(trainset)

In [None]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

In [None]:
predictions = algo.test(testset)

In [None]:
top_n = get_top_n(predictions, n=10)

In [None]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

## Comparison between different similarity measures

In [None]:
import matplotlib.pyplot as plt

In [None]:
print(cosine_mae, pearson_mae, tmj_mae)

In [None]:
print(cosine_rmse, pearson_rmse, tmj_rmse)

In [None]:
tmj_rmse

In [None]:
# `rmse` was never defined, so plotting it raised a NameError. Collect the
# per-measure RMSE values first, mirroring how `mae` is built before its plot.
rmse = (cosine_rmse, pearson_rmse, tmj_rmse)
plt.plot(rmse)

In [None]:
mae = (cosine_mae, pearson_mae,  tmj_mae)

In [None]:
plt.plot(mae)