In [1]:
import pandas as pd
import numpy as np

# --- Ratings -----------------------------------------------------------------
# Tab-separated file; the column names are supplied here via `names=`.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

# --- Movie metadata ----------------------------------------------------------
# Pipe-separated, latin-1 encoded file: five descriptive columns followed by
# one column per genre name.
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL'] + genre_columns
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')

# Only the id -> title mapping is needed from the metadata.
movie_names = movies[['item_id', 'movie title']]

# Attach a title to every rating row, then keep just user, title and rating.
ratings_with_titles = pd.merge(df, movie_names, on='item_id')
combined_movies_data = ratings_with_titles[['user_id', 'movie title', 'rating']]
combined_movies_data.head()

Unnamed: 0,user_id,movie title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5
3,154,Kolya (1996),3
4,306,Kolya (1996),5


In [3]:
# My own ratings; in this file they are stored under user_id 1001.
my_ratings = pd.read_csv(filepath_or_buffer='my_movies_rating.csv')
my_ratings

Unnamed: 0,user_id,movie title,rating
0,1001,Aladdin (1992),1.0
1,1001,Braveheart (1995),5.0
2,1001,"Clockwork Orange, A (1971)",2.0
3,1001,Dances with Wolves (1990),3.5
4,1001,"English Patient, The (1996)",2.0
5,1001,Face/Off (1997),3.5
6,1001,Forrest Gump (1994),4.0
7,1001,"Game, The (1997)",3.5
8,1001,"Godfather, The (1972)",5.0
9,1001,Jurassic Park (1993),3.5


In [4]:
# A movie is kept only when it has MORE than this many ratings.
MIN_REVIEWS = 25

# Append my ratings to the MovieLens ratings. ignore_index=True gives the result
# a fresh 0..n-1 index instead of repeating the row labels of both inputs.
# NOTE: combined_movies_data is rebound at the end of this cell, so running the
# cell a second time without re-running the cells above would append my_ratings
# to an already-processed frame.
all_ratings = pd.concat([combined_movies_data, my_ratings], axis=0, ignore_index=True)

# Rename the columns to userID, itemID and rating (itemID holds the movie title).
all_ratings.columns = ['userID', 'itemID', 'rating']

# Number of ratings each movie (itemID) received, repeated on every row of that
# movie so it can be used directly as a row filter.
reviews_per_movie = all_ratings.groupby('itemID')['rating'].transform('count')

# Keep only the rows of movies with more than MIN_REVIEWS ratings.
combined_movies_data = all_ratings.loc[reviews_per_movie > MIN_REVIEWS, ['userID', 'itemID', 'rating']]

In [5]:
from surprise import NMF, SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset

In [6]:
# Loading from a DataFrame still needs a Reader, but only its rating_scale
# parameter is required.
reader = Reader(rating_scale=(1, 5))

# The frame passed here has the columns userID, itemID, rating (in that order).
data = Dataset.load_from_df(df=combined_movies_data, reader=reader)

In [7]:
# Movies already rated by user 1001.
rated_by_1001 = combined_movies_data['userID'] == 1001
iids1001 = combined_movies_data.loc[rated_by_1001, 'itemID']

# Every distinct movie id that survived the review-count filter.
unique_ids = combined_movies_data['itemID'].unique()

# Candidates for recommendation: all movies minus those user 1001 has rated.
movies_to_predict = np.setdiff1d(unique_ids, iids1001)

## Recommender using NMF

In [8]:
# NMF starts from randomly initialised factors. Seeding NumPy's global RNG
# (the approach described in the Surprise FAQ on reproducible experiments)
# makes the ranking below repeatable from run to run.
np.random.seed(0)

# Fit on every available rating (no hold-out split).
algo = NMF()
algo.fit(data.build_full_trainset())

# Estimated rating of user 1001 for each movie they have not rated yet.
my_recs = [(iid, algo.predict(uid=1001, iid=iid).est) for iid in movies_to_predict]

# Ten highest estimated ratings.
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
141,Casablanca (1942),4.04932
820,"Wrong Trousers, The (1993)",3.962472
585,Priest (1994),3.947852
608,Rear Window (1954),3.86969
401,Jean de Florette (1986),3.799231
787,Vertigo (1958),3.729461
536,North by Northwest (1959),3.679385
331,"Grand Day Out, A (1992)",3.665212
784,"Usual Suspects, The (1995)",3.663828
580,"Postino, Il (1994)",3.641549


## Recommender using SVD

In [9]:
# SVD starts from randomly initialised factors. Seeding NumPy's global RNG
# (the approach described in the Surprise FAQ on reproducible experiments)
# makes the ranking below repeatable from run to run.
np.random.seed(0)

# Fit on every available rating (no hold-out split).
algo = SVD()
algo.fit(data.build_full_trainset())

# Estimated rating of user 1001 for each movie they have not rated yet.
my_recs = [(iid, algo.predict(uid=1001, iid=iid).est) for iid in movies_to_predict]

# Ten highest estimated ratings.
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
641,Schindler's List (1993),4.066133
168,"Close Shave, A (1995)",3.975784
608,Rear Window (1954),3.962249
549,One Flew Over the Cuckoo's Nest (1975),3.920732
330,"Graduate, The (1967)",3.882316
324,"Godfather: Part II, The (1974)",3.870759
661,"Shawshank Redemption, The (1994)",3.867759
796,Wallace & Gromit: The Best of Aardman Animatio...,3.838512
731,Taxi Driver (1976),3.802969
740,"Third Man, The (1949)",3.777051


## Recommender using SVD++

In [10]:
# SVD++ starts from randomly initialised factors. Seeding NumPy's global RNG
# (the approach described in the Surprise FAQ on reproducible experiments)
# makes the ranking below repeatable from run to run.
np.random.seed(0)

# Fit on every available rating (no hold-out split).
algo = SVDpp()
algo.fit(data.build_full_trainset())

# Estimated rating of user 1001 for each movie they have not rated yet.
my_recs = [(iid, algo.predict(uid=1001, iid=iid).est) for iid in movies_to_predict]

# Ten highest estimated ratings.
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
641,Schindler's List (1993),3.943526
740,"Third Man, The (1949)",3.741685
549,One Flew Over the Cuckoo's Nest (1975),3.730276
354,Henry V (1989),3.676126
1,12 Angry Men (1957),3.67045
766,"Treasure of the Sierra Madre, The (1948)",3.662882
110,"Boot, Das (1981)",3.657877
608,Rear Window (1954),3.649203
820,"Wrong Trousers, The (1993)",3.636048
661,"Shawshank Redemption, The (1994)",3.629555


## Recommender using KNN (KNNWithZScore)

In [12]:
# Fit the KNN model on every available rating (no hold-out split).
algo = KNNWithZScore()
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

# Estimated rating of user 1001 for each movie they have not rated yet.
my_recs = [(iid, algo.predict(uid=1001, iid=iid).est) for iid in movies_to_predict]

# Ten highest estimated ratings.
ranked_recs = pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False)
ranked_recs.head(10)

Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,iid,predictions
425,L.A. Confidential (1997),4.284275
661,"Shawshank Redemption, The (1994)",4.214673
168,"Close Shave, A (1995)",4.177236
820,"Wrong Trousers, The (1993)",4.161339
608,Rear Window (1954),4.118845
55,As Good As It Gets (1997),4.10819
1,12 Angry Men (1957),4.096164
549,One Flew Over the Cuckoo's Nest (1975),4.088914
784,"Usual Suspects, The (1995)",4.086552
647,Secrets & Lies (1996),4.082041


## Recommender using co-clustering

In [13]:
# Co-clustering starts from a random initialisation. Seeding NumPy's global RNG
# (the approach described in the Surprise FAQ on reproducible experiments)
# makes the ranking below repeatable from run to run.
np.random.seed(0)

# Fit on every available rating (no hold-out split).
algo = CoClustering()
algo.fit(data.build_full_trainset())

# Estimated rating of user 1001 for each movie they have not rated yet.
my_recs = [(iid, algo.predict(uid=1001, iid=iid).est) for iid in movies_to_predict]

# Ten highest estimated ratings.
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
168,"Close Shave, A (1995)",3.995159
141,Casablanca (1942),3.960877
796,Wallace & Gromit: The Best of Aardman Animatio...,3.951848
740,"Third Man, The (1949)",3.837421
158,Citizen Kane (1941),3.797017
549,One Flew Over the Cuckoo's Nest (1975),3.795754
647,Secrets & Lies (1996),3.769519
658,Shall We Dance? (1996),3.764957
463,"Manchurian Candidate, The (1962)",3.763629
231,Dr. Strangelove or: How I Learned to Stop Worr...,3.756665


## Evaluating

In [14]:
# The cross-validation folds and several of the models are randomly
# initialised. Seeding NumPy's global RNG (the approach described in the
# Surprise FAQ on reproducible experiments) makes the RMSE table repeatable.
np.random.seed(0)

cv = []
# Iterate over all recommender system algorithms
for recsys in [NMF(), SVD(), SVDpp(), KNNWithZScore(), CoClustering()]:
    # 3-fold cross-validation, scored by RMSE on each held-out fold.
    scores = cross_validate(recsys, data, measures=['RMSE'], cv=3, verbose=False)
    # Label the row with the algorithm's class name (e.g. 'SVD') taken from the
    # class itself rather than parsed out of the object's string representation.
    cv.append((type(recsys).__name__, scores['test_rmse'].mean()))

# One row per algorithm with its mean test RMSE across the folds.
pd.DataFrame(cv, columns=['RecSys', 'RMSE'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,RecSys,RMSE
0,NMF,0.958624
1,SVD,0.933278
2,SVDpp,0.915737
3,KNNWithZScore,0.942214
4,CoClustering,0.956883
