In [1]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 15.0 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626454 sha256=1b1ac7ea6882ae25bada6368a6896f96d15095ee5d650147689deebf9b4effc4
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import pandas as pd
import numpy as np
import surprise

In [3]:
ratings = pd.read_csv("ratings.txt",sep=' ',names = ['uid','iid','rating'])
ratings.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [4]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between 0.5 and 4.0


Converting the data into **surprise** format:

In [5]:
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

surprise.dataset.DatasetAutoFolds

**Similarity options:**

In case of Item based filtering, 'user_based' value will be `False`

In [6]:
similarity_options = {'name': 'cosine', 'user_based': True}

Default K = 40

In [7]:
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


The above `.fit()` calculates expected rating for all the users

Say we want expected rating of user=50 for item 217

In [None]:
pred = algo.predict(uid='50',iid='217')
score = pred.est
print(score)

All the users

In [None]:
iids = ratings['iid'].unique()
iids

Consider ratings given by user=50

In [None]:
rec_50 = ratings[ratings['uid'] == 50 ]
iids50 = rec_50['iid']
print("List of iid that uid={0} has rated:".format(50))
print(iids50)

In [None]:
iids_to_predict = np.setdiff1d(iids,iids50)
print("List of iid which uid={0} did not rate(in all {1}) :".format(50,len(iids_to_predict)))
print(iids_to_predict)

Create a testset for getting the expected rating

In [None]:
testset = [[50,iid,0.] for iid in iids_to_predict]
testset

Generate predictions on testset

In [16]:
predictions = algo.test(testset)
predictions[5]

Prediction(uid=50, iid=20, r_ui=0.0, est=2.502007451844843, details={'actual_k': 7, 'was_impossible': False})

In [17]:
type(predictions)

list

In [18]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([1.02491128, 2.3010819 , 3.36565625, ..., 2.5       , 3.        ,
       3.        ])

In [19]:
iids_to_predict

array([  14,   15,   16, ..., 2069, 2070, 2071])

Finding the index of maximum predicted rating

In [20]:
i_max = pred_ratings.argmax()
i_max

20

In [21]:
iids_to_predict[i_max] 

35

Recommending the item with maximum predicted rating

In [22]:
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(50,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 50 is 35 with predicted rating as 4.0


Getting top 10 items to be recommended for uid = 50

In [None]:
import heapq
i_sorted_10 = heapq.nlargest(10,  range(len(pred_ratings)), pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

[ 35  54  68  97 107 111 118 136 162 228]


Tuning with different K

In [None]:
np.arange(30,110,10)

array([ 30,  40,  50,  60,  70,  80,  90, 100])

In [None]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'k': np.arange(30,110,10)}
kfold = KFold(n_splits=5, random_state=2021, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid, measures=['rmse', 'mae'], cv=kfold)

Running the Grid Search CV

In [None]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

**Best Score**

In [None]:
print(gs.best_score['rmse'])

0.8641237962182551


**Best Param**

In [None]:
print(gs.best_params['rmse'])

{'k': 60}
