In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163497 sha256=a0493bb9bf6d223bab194699325a55a0c18fd11d08c1f9014858554a677ce215
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import pandas as pd
import numpy as np
import surprise
import os

In [3]:
os.chdir("/content/drive/MyDrive/Cases/ml-100k")
ratings = pd.read_csv("u.data",sep='\t',names = ['uid','iid','rating','ts'])
ratings.drop('ts', axis=1, inplace=True)
ratings.head()

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))

Ratings range between 1 and 5


In [5]:
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [40]:
similarity_options = {'name': 'cosine', 'user_based': True}

Tuning for best K

In [41]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

In [42]:
param_grid = {'k': np.arange(20,80,10)}
param_grid

{'k': array([20, 30, 40, 50, 60, 70])}

In [43]:
kfold = KFold(n_splits=5, random_state=2023, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid,
                  measures=['rmse', 'mae'], cv=kfold)

In [44]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Best Score:

In [45]:
print(gs.best_score['rmse'])

0.9764110833622416


Best Parameter:

In [46]:
print(gs.best_params['rmse'])

{'k': 20}


We can now use the algorithm that yields the best rmse:

In [47]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7a230ddef160>

Total Items:

In [48]:
iids = ratings['iid'].unique()
print(iids)

[ 242  302  377 ... 1637 1630 1641]


The list of items rated by user 100:

In [49]:
u_iid = ratings[ratings['uid']==100]['iid'].unique()
print(u_iid)

[ 344  354  268  321  355  750  266  288  302  340  689  905  289  691
  316 1236  342  990  333  752  323  348  313  292 1238  879  300  328
 1235 1237  678  286  908  690  874  880  349  310  347 1234  270 1233
  326  269  258  900  886  294  272  881  895  892  887  885  346  751
  271  898  315]


List of the items not rated by user 100:

In [50]:
iids_to_predict = np.setdiff1d(iids, u_iid)
print(iids_to_predict)

[   1    2    3 ... 1680 1681 1682]


Extracting the estimated rating from iids_to_predict

In [51]:
testset = [[100,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)

In [52]:
predictions[0]

Prediction(uid=100, iid=1, r_ui=0.0, est=4.666666666666667, details={'actual_k': 20, 'was_impossible': False})

Getting items with top ratings

In [None]:
pred_ratings = [ (predictions[i].iid,predictions[i].est) for i in range(0,len(predictions)) ]
predicted_rating = pd.DataFrame(pred_ratings, columns=['Item_ID','est_rating'])
exp_ratings = predicted_rating.sort_values(by='est_rating', ascending=False)
sorted_exp = exp_ratings.sort_values(by=['est_rating','Item_ID'], ascending=[False, True])
sorted_exp.head(10)

Movie Information:

In [26]:
movies = pd.read_csv("u.item", sep="|",encoding='latin-1', header=None)
movies = movies.iloc[:,:3]
movies.columns = ['Item_ID','movie','release_date']
movies.head()

Unnamed: 0,Item_ID,movie,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


In [55]:
sorted_exp = sorted_exp.merge(movies,on='Item_ID')
sorted_exp.head(10)

Unnamed: 0,Item_ID,est_rating,movie,release_date
0,814,5.0,"Great Day in Harlem, A (1994)",01-Jan-1994
1,1122,5.0,They Made Me a Criminal (1939),01-Jan-1939
2,1189,5.0,Prefontaine (1997),24-Jan-1997
3,1201,5.0,Marlene Dietrich: Shadow and Light (1996),02-Apr-1996
4,1293,5.0,Star Kid (1997),16-Jan-1998
5,1536,5.0,Aiqing wansui (1994),22-Jul-1996
6,1599,5.0,Someone Else's America (1995),10-May-1996
7,1653,5.0,Entertaining Angels: The Dorothy Day Story (1996),27-Sep-1996
8,1467,5.0,"Saint of Fort Washington, The (1993)",01-Jan-1993
9,1500,5.0,Santa with Muscles (1996),08-Nov-1996


## Item Based Collaborative Filtering

Tuning for best number of Items

In [30]:
param_grid = {'k': np.arange(20,80,10),
               'sim_options': {'name': ['cosine'],
                              'user_based': [False]}}
param_grid

{'k': array([20, 30, 40, 50, 60, 70]),
 'sim_options': {'name': ['cosine'], 'user_based': [False]}}

In [31]:
kfold = KFold(n_splits=5, random_state=23, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid,
                  measures=['rmse', 'mae'], cv=kfold)
gs.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

Best Score:

In [32]:
print(gs.best_score['rmse'])

1.0240756703702425


Best Parameter:

In [33]:
print(gs.best_params['rmse'])

{'k': 60, 'sim_options': {'name': 'cosine', 'user_based': False}}


We can now use the algorithm that yields the best rmse:

In [None]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f8c0932a890>

Extracting the estimated rating from iids_to_predict

In [None]:
testset = [[100,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)

In [None]:
predictions[0]

Prediction(uid=100, iid=1, r_ui=0.0, est=3.0669535757554143, details={'actual_k': 56, 'was_impossible': False})

In [None]:
pred_ratings = np.array([pred.est for pred in predictions])

Getting the item with highest expected rating

In [None]:
i_max = np.argmax(pred_ratings)
print("Item:",iids_to_predict[i_max],"is the item with highest expected rating as",pred_ratings[i_max])

Item: 1619 is the item with highest expected rating as 4.333333333333333


In [None]:
exp_ratings = pd.DataFrame({'Item_ID': iids_to_predict, 'Exp_Rating':pred_ratings})
sorted_exp = exp_ratings.sort_values(by=['Exp_Rating','Item_ID'], ascending=[False, True])
sorted_exp.head(10)

Movie Information:

In [None]:
movies = pd.read_csv("u.item", sep="|",encoding='latin-1', header=None)
movies = movies.iloc[:,:3]
movies.columns = ['Item_ID','movie','release_date']
movies.head()

In [None]:
sorted_exp = sorted_exp.merge(movies,on='Item_ID')
sorted_exp.head(10)

Unnamed: 0,Item_ID,Exp_Rating,movie,release_date
0,1619,4.333333,All Things Fair (1996),08-Mar-1996
1,1556,4.0,Condition Red (1995),01-Jan-1995
2,1674,4.0,Mamma Roma (1962),01-Jan-1962
3,1661,3.923077,"New Age, The (1994)",01-Jan-1994
4,1306,3.905969,Delta of Venus (1994),01-Jan-1994
5,1616,3.888889,Desert Winds (1995),01-Jan-1995
6,1307,3.82118,Carmen Miranda: Bananas Is My Business (1994),01-Jan-1994
7,1627,3.8,"Wife, The (1995)",26-Jul-1996
8,1308,3.763161,Babyfever (1994),01-Jan-1994
9,1546,3.75,Shadows (Cienie) (1988),01-Jan-1988
