# Revisiting Recommender Systems

The goal of this notebook is to take a different, more thoughtful approach to building a recommender system (henceforth **recsys**). This time we will leverage more of the Yelp data in order to build more meaningful attributes as a basis for recommendation. 

## Notebook Guide

# Surprise Implementation

## Step 1: Getting and examining the Data from disparate datasets

In [7]:
# Imports 
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

from surprise import SVD
from surprise.model_selection import KFold, cross_validate
from surprise import Dataset
from collections import defaultdict

Let's make sure we understand how this works with a short practice run on a built-in dataset.

# Practice

## Getting the Top-n recommendations for each user

In [8]:
def get_top_n(predictions, n=10):
    '''
    Builds, for every user, the list of their n best-rated predicted items.
    -------------------------------------------------------------------------
    Parameters:
    - predictions(iterable of 5-tuples): Each entry unpacks as
    (user id, item id, true rating, estimated rating, details), e.g. the
    Prediction objects returned by the test method of an algorithm.
    - n(int): How many recommendations to keep per user. Default is 10.
    -------------------------------------------------------------------------
    Returns:
    A defaultdict(list) keyed by user (raw) id whose values are lists of
    tuples [(raw item id, rating estimation), ...], ordered from the highest
    estimation to the lowest and holding at most n entries.

    '''

    # Bucket every (item, estimation) pair under the user it belongs to.
    top_n = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        top_n[uid].append((iid, est))

    # Rank each user's bucket by estimation (best first) and keep the first n.
    # Only existing keys are reassigned, so the dict size is stable while
    # iterating.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

In [9]:
# First train an SVD algorithm on the built-in MovieLens 100k dataset.
# NOTE: the first time this runs, load_builtin asks interactively whether the
# dataset should be downloaded.
data = Dataset.load_builtin('ml-100k')

# Train on every available rating: predictions are later requested only for
# (user, item) pairs that are absent from this training set.
trainset = data.build_full_trainset()

# SVD starts from randomly initialised factors; seeding it keeps the fitted
# model (and therefore the recommendations below) identical across
# Restart & Run All.
algo = SVD(random_state=42)
algo.fit(trainset)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/schubert/.surprise_data/ml-100k


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fca4ec5fa58>

In [10]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# The anti-testset is built from the training set itself, so every prediction
# below is for an item the user has not rated in the training data.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep only the 10 highest estimated ratings per user.
top_n = get_top_n(predictions, n=10)

In [11]:
# For every user, show the raw ids of their recommended items; the estimated
# ratings stored next to each id are dropped from the printout.
for uid, user_ratings in top_n.items():
    print(uid, [item_id for item_id, _estimate in user_ratings])

196 ['64', '50', '169', '515', '408', '603', '480', '178', '479', '963']
186 ['313', '427', '408', '513', '498', '178', '661', '1142', '199', '496']
22 ['56', '98', '357', '169', '302', '318', '493', '100', '22', '150']
244 ['14', '178', '483', '98', '137', '12', '134', '493', '474', '203']
166 ['318', '64', '408', '515', '483', '496', '174', '169', '50', '12']
298 ['313', '513', '480', '64', '408', '272', '515', '191', '169', '963']
115 ['179', '134', '603', '169', '189', '474', '135', '709', '408', '1449']
253 ['178', '170', '174', '114', '498', '251', '423', '520', '511', '480']
305 ['513', '124', '613', '647', '661', '524', '137', '641', '489', '1021']
6 ['114', '603', '607', '654', '198', '963', '663', '652', '657', '313']
62 ['603', '408', '175', '315', '192', '169', '488', '923', '657', '480']
286 ['98', '318', '494', '663', '12', '480', '64', '648', '1063', '659']
200 ['316', '12', '190', '64', '520', '272', '479', '136', '251', '963']
210 ['511', '100', '272', '474', '169', '4

# Computing `Precision` @ K and `Recall` @ K 

$$ Precision @ K = \frac{|Recommended\ Items\ that\ are\ Relevant|}{|Recommended\ Items|} $$

&nbsp;


$$ Recall @ K = \frac{|Recommended\ Items\ that\ are\ Relevant|}{|Relevant\ Items|} $$

In [2]:
# Load the review data into a DataFrame.
# The path is relative, so the file must sit in the notebook's working
# directory. NOTE(review): the file name has no extension and its contents are
# not visible here (presumably Yelp food reviews in CSV format) - confirm.
sc_reviews = pd.read_csv("food_reviews_sc")
