## User-based Collaborative Filtering

In [26]:
## Example of concepts of User-based Collaborative Filtering

In [3]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp311-cp311-macosx_11_0_arm64.whl size=1100032 sha256=52405665aed0e6551b901ab7c4944c33136001b8410f9f12d34a67caef926ee5
  Stored in directory: /Users/minseokoh/Library/Caches/pip/wheels/f4/2b/26/e2a5eae55d3b7688995e66abe7f40473aac6c95ddd8ee174a8
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [4]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split, cross_validate

In [7]:
ratings_dict = {
    "user_id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "movie_id": [1, 2, 3, 1, 2, 3, 1, 2, 3],
    "rating": [4, 5, 3, 5, 3, 2, 3, 4, 4]
}

In [9]:
rating_df = pd.DataFrame(ratings_dict)
rating_df

Unnamed: 0,user_id,movie_id,rating
0,1,1,4
1,1,2,5
2,1,3,3
3,2,1,5
4,2,2,3
5,2,3,2
6,3,1,3
7,3,2,4
8,3,3,4


In [15]:
reader = Reader(rating_scale=(1,5))
reader
data = Dataset.load_from_df(rating_df[['user_id', 'movie_id', 'rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x169a19250>

In [16]:
sim_options = {
    "name": "cosine",
    "user_based": True,  # User-based CF
}

In [17]:
model = KNNBasic(sim_options=sim_options)

In [18]:
trainset, testset = train_test_split(data, test_size=0.25)

In [19]:
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x16997f110>

In [21]:
user_id = 1
movie_id = 3
predicted_rating = model.predict(user_id, movie_id)
print(predicted_rating)

user: 1          item: 3          r_ui = None   est = 3.49   {'actual_k': 2, 'was_impossible': False}


In [25]:
predictions = model.test(testset)
predictions

## uid = user, iid= movie_id, r_ui= the actual rating given by the user to the item in the test set.
## est = the rating predicted by the model for the user-item pair.
## actual_k = the number of similar users considered when making the rating prediction.
## was_impossible =  If True, prediction was not possible (often due to lack of data). 
## If False,the prediction was successfully made.

[Prediction(uid=2, iid=2, r_ui=3.0, est=4.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid=3, r_ui=2.0, est=2.491253569948684, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=1, iid=1, r_ui=4.0, est=3.9884056990928314, details={'actual_k': 3, 'was_impossible': False})]

In [24]:
cross_validate(model, data, measures=['RMSE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.5546  0.8660  1.1902  1.2036  0.2813  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.55456318, 0.8660254 , 1.19023807]),
 'fit_time': (0.0005269050598144531,
  4.124641418457031e-05,
  2.3126602172851562e-05),
 'test_time': (0.00010395050048828125,
  2.47955322265625e-05,
  1.7881393432617188e-05)}