In [73]:
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [74]:
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')

In [75]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [76]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [77]:
movies_with_ratings[movies_with_ratings.userId == 8.0].title.unique()

array(['Jumanji (1995)', 'GoldenEye (1995)',
       'American President, The (1995)', 'Get Shorty (1995)',
       'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Babe (1995)',
       'Clueless (1995)', 'Seven (a.k.a. Se7en) (1995)',
       'Usual Suspects, The (1995)', 'Braveheart (1995)',
       'Birdcage, The (1996)', 'Apollo 13 (1995)',
       'Batman Forever (1995)', 'Net, The (1995)', 'Nine Months (1995)',
       'Waterworld (1995)', 'Dumb & Dumber (Dumb and Dumber) (1994)',
       'Ed Wood (1994)', 'French Kiss (1995)', 'I.Q. (1994)',
       'Interview with the Vampire: The Vampire Chronicles (1994)',
       'Nell (1994)', 'Outbreak (1995)', 'Pulp Fiction (1994)',
       'Santa Clause, The (1994)', 'Shawshank Redemption, The (1994)',
       'While You Were Sleeping (1995)', 'Forrest Gump (1994)',
       'Four Weddings and a Funeral (1994)', 'Lion King, The (1994)',
       'Mask, The (1994)', 'Speed (1994)', 'True Lies (1994)',
       "City Slickers II: The Legend of Curly's Gold (1

In [78]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [79]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [80]:
ratings.rating.min()

0.5

In [81]:
ratings.rating.max()

5.0

In [82]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [83]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [115]:
algo = KNNWithZScore(k=50, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f5d27b526d8>

In [116]:
test_pred = algo.test(testset)

In [117]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8976


0.8975681874093613

In [118]:
algo.predict(uid=2, iid='Casino (1995)')

Prediction(uid=2, iid='Casino (1995)', r_ui=None, est=4.328928529933083, details={'actual_k': 50, 'was_impossible': False})

In [122]:
b = []
for i in movies_with_ratings.title.unique():
    a = algo.predict(uid=2, iid=i)
    b.append(a)

In [128]:
def byest_key(person):
    return person.est

In [129]:
sorted(b, key = byest_key, reverse = True)

[Prediction(uid=2, iid='Denise Calls Up (1995)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Babes in Toyland (1934)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='My Man Godfrey (1957)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='On the Beach (1959)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Bossa Nova (2000)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Man with the Golden Arm, The (1955)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Light Years (Gandahar) (1988)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Two Family House (2000)', r_ui=None, est=5.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='Hop