In [2]:
import numpy as np
import pandas as pd
import io
from zipfile import ZipFile
import urllib

#Download the zipfile. I have mentioned this step because if you would like
#to download the dataset directly from the source and unzip

#Uncomment the next line to execute
##tmpFile = urllib.request.urlopen('https://www.librec.net/datasets/filmtrust.zip')

#Now you need to unzip the file
#Uncomment the next line to execute
##tmpFile = zipfile.ZipFile(io.BytesIO(tmpFile.read()))

#open the desired file as pandas dataframe, close zipfile
#Uncomment the next 2 lines to execute
##dataset = pd.read_table(io.BytesIO(tmpFile.read('ratings.txt')), sep=' ', names=['uid', 'iid', 'rating'])
##tmpFile.close()



# the otherwise step would be download the same file from the source separately and unzip it
# through the python utility in the code
filename = "../data/filmtrust.zip"
with ZipFile(filename, 'r') as zip:
    dataset = zip.read('ratings.txt')

dataset = pd.read_table(io.BytesIO(dataset), sep = ' ', names= ['uid', 'iid', 'rating'])

dataset.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [4]:
#for fitting the model, the package is used surprise and we are to load the data into package
#using the Reader class
import surprise
lower_rating = dataset['rating'].min()
upper_rating = dataset['rating'].max()
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

Review range: 0.5 to 4.0


In [5]:
#Review range default for surprise package is 1-5
reader = surprise.Reader(rating_scale=(0.5, 4.0))
data = surprise.Dataset.load_from_df(dataset, reader)

# the method of Single Value Decomposition SVD++ will be used for the recommender system
alg = surprise.SVDpp()
output = alg.fit(data.build_full_trainset())

#predicting the model with a specific value
pred = alg.predict(uid='50', iid='52')
score = pred.est
print(score)

3.0028030537791928


In [6]:
#making recommendations

# get the list of all movie ids
iids = dataset['iid'].unique()
# get the list iids that uid 50 has rated
iids50 = dataset.loc[dataset['uid'] == 50, 'iid']
#remove the iids that uid has rated from the list of all movie ids
iids_to_pred = np.setdiff1d(iids, iids50)

#predicting the score of the each of the movie ids that user 50 didn't rate
testset = [[50, iid, 4.] for iid in iids_to_pred]
predictions = alg.test(testset)
predictions[0]

Prediction(uid=50, iid=14, r_ui=4.0, est=3.0783304988599336, details={'was_impossible': False})

In [7]:
#to find the best
pred_ratings = np.array([pred.est for pred in predictions])
#Find the index of the maximum predicted ratings
i_max = pred_ratings.argmax()
#use this to find the corresponding iid to recommend
iid = iids_to_pred[i_max]
print('Top item for user 50 has iid {0} with predicted rating {1}'.format(iid, pred_ratings[i_max]))

Top item for user 50 has iid 189 with predicted rating 4.0


In [8]:
#evaluation of the model
param_grid = {'lr_all' : [.001, .01], 'reg_all' : [.1, .5]}
gs = surprise.model_selection.GridSearchCV(surprise.SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

#printing the combination of the parameters that gave the best score
print(gs.best_params['rmse'])

{'lr_all': 0.01, 'reg_all': 0.1}


In [9]:
#cross validation
alg = surprise.SVDpp(lr_all=.001)
output = surprise.model_selection.cross_validate(alg, data, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8237  0.8317  0.8263  0.8365  0.8242  0.8285  0.0049  
MAE (testset)     0.6521  0.6527  0.6544  0.6642  0.6547  0.6556  0.0044  
Fit time          32.46   33.00   35.97   42.95   45.41   37.96   5.28    
Test time         0.69    0.70    1.20    0.86    0.77    0.84    0.19    
