In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('uucf.csv')
data=data.fillna('')
data.head()

Unnamed: 0,UserId,11: Star Wars: Episode IV - A New Hope (1977),12: Finding Nemo (2003),13: Forrest Gump (1994),14: American Beauty (1999),22: Pirates of the Caribbean: The Curse of the Black Pearl (2003),24: Kill Bill: Vol. 1 (2003),38: Eternal Sunshine of the Spotless Mind (2004),63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995),77: Memento (2000),...,8467: Dumb & Dumber (1994),8587: The Lion King (1994),9331: Clear and Present Danger (1994),9741: Unbreakable (2000),9802: The Rock (1996),9806: The Incredibles (2004),10020: Beauty and the Beast (1991),36657: X-Men (2000),36658: X2: X-Men United (2003),36955: True Lies (1994)
0,1648,,,,,4.0,3.0,,,,...,,4.0,,,5.0,3.5,3.0,,3.5,
1,5136,4.5,5.0,5.0,4.0,5.0,5.0,5.0,3.0,,...,1.0,5.0,,,,5.0,5.0,4.5,4.0,
2,918,5.0,5.0,4.5,,3.0,,5.0,,5.0,...,,5.0,,,,3.5,,,,
3,2824,4.5,,5.0,,4.5,4.0,,,5.0,...,,3.5,,,,,,,,
4,3867,4.0,4.0,4.5,,4.0,3.0,,,,...,1.0,4.0,,,,3.0,4.0,4.0,3.5,3.0


# Convert given data into a format which can be used for Collaborative Filtering Algorithms.

In [33]:
flat_data = pd.DataFrame(columns=['user','item','rating'])

cols = data.columns
users = data['UserId']
for u in users:
    for c in cols:
        if c =='UserId':
            continue
        i = c.split(':')[0]
        r = data[data['UserId']==u][c].tolist()[0]
        
        #print ('Adding user (%s), item (%s), rating (%s)'%(u,i,r))
        
        #if there are no ratings for a given user and movie then ignore that record
        if r=='':
            continue
        else:
            #put records in seperate dataframe    
            flat_data = flat_data.append({'user': u,'item':i,'rating':r}, ignore_index=True)
        
print (flat_data.shape)        
flat_data = flat_data.sample(frac=1).reset_index(drop=True)
print (flat_data.shape)        

(1581, 3)
(1581, 3)


# Collaboraive Filtering

    - I have used Surprise library for implementing CF see https://surprise.readthedocs.io/en/stable/index.html
    - For a list of supported algorithms see https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html
    
    - After looking at the results from both SVD and KNNWithMeans two observations come out
        - Both algorithms tend to recommend similar Top 5 items; although actual ranking within Top 5 may vary
        - However, SVD gives much better Prediction (of rating) as compared to SVD

## UUCF Using SVD - Model Based Approach

    - Model-based Collaborative Filtering: Singular Value Decomposition
    
    - load the required surprise library classes
    - Run a grid search to identify the best hyper parameters for SVD algorithm

In [74]:
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(1, 5.0))

# The columns must correspond to user id, item id and ratings (in that order).
data_srp_cf = Dataset.load_from_df(flat_data[['user', 'item', 'rating']], reader)

param_grid = {'n_epochs': [20,40,60,80], 'lr_all': [0.0001, 0.0005, 0.001, 0.002, 0.003],
              'reg_all': [0.01,0.02,0.03,0.04, 0.05]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(data_srp_cf)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# best RMSE score
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['mae'])

trainset = data_srp_cf.build_full_trainset()

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

0.9386502932821387
{'n_epochs': 80, 'lr_all': 0.002, 'reg_all': 0.04}


0.7193261114738474
{'n_epochs': 80, 'lr_all': 0.003, 'reg_all': 0.05}


## Create final SVD model using the identified best parameters

    - SVD is a type of Matrix Factorization Algorithm, https://surprise.readthedocs.io/en/stable/matrix_factorization.html

In [75]:
svd_algo = SVD(n_epochs=80, lr_all=0.002, reg_all= 0.04)
svd_algo.fit(trainset)
predictions_svd = svd_algo.test(testset)

## Make recommendations

    - Here we do two things
    - 1. Predict the ratings which a user would make for a particular movie
    - 2. Recommend movies which the user may want to see basis their Predicted ratings

In [76]:
def predictions_and_recommendations(preds,user,num_recommendation):
    recommended_item = []
    recommended_item_rating = []
    
    # scan through the predictions set and extract items which can be recommended to this user.
    for pred in preds:
        if pred[0]==user:
            recommended_item.append(pred[1])
            recommended_item_rating.append(pred[3])
               
    # create a DF
    recommendations_df = pd.DataFrame(data=list(zip(recommended_item, recommended_item_rating)),columns=['Items','Ratings'])
    recommendations_df = recommendations_df.sort_values(by=['Ratings'], ascending=False)[0:num_recommendation]
    return recommendations_df

In [77]:
pred_recommendations_svd = predictions_and_recommendations(predictions_svd, 89,5)
print ( pred_recommendations_svd)

   Items   Ratings
7     77  4.824305
3     85  4.735729
53   807  4.714744
14   274  4.615752
26   424  4.569902


## UUCF Using KNNWithMeans - Memory Based Approach

In [69]:
sim_options = {
    "name": ["msd", "cosine","pearson"],
    "min_support": [1,2,3, 4, 5,7,9,11,3,15],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(data_srp_cf)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

print(gs.best_score["mae"])
print(gs.best_params["mae"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

In [66]:
from surprise import KNNWithMeans

# here instead of cosine we could also choose msd, pearson. 
# See https://surprise.readthedocs.io/en/stable/similarities.html for options
knn_algo = KNNWithMeans(sim={'name': 'msd', 'min_support': 5, 'user_based': False})

knn_algo.fit(trainset)
predictions_knn = knn_algo.test(testset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [67]:
pred_recommendations_knn = predictions_and_recommendations(predictions_knn, 89,5)
print ( pred_recommendations_knn)

   Items   Ratings
3     85  4.941163
7     77  4.886317
26   424  4.839573
53   807  4.822805
49   568  4.789277


In [89]:
print ('Prediction from KNN',knn_algo.predict(89,'607'))
print ('Prediction from SVD',svd_algo.predict(89, '607'))

Prediction from KNN user: 89         item: 607        r_ui = None   est = 4.07   {'actual_k': 24, 'was_impossible': False}
Prediction from SVD user: 89         item: 607        r_ui = None   est = 3.95   {'was_impossible': False}


In [84]:
flat_data[flat_data['user']==89]

Unnamed: 0,user,item,rating
72,89,2501,4.0
93,89,602,4.0
124,89,603,4.5
294,89,604,3.5
302,89,453,5.0
322,89,862,4.5
389,89,607,3.0
468,89,38,5.0
499,89,194,5.0
555,89,278,4.5
