In [1]:
import os

import numpy as np
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split

In [2]:
# Step 1: Load dataset
# The ratings CSV location defaults to the original local path, but can be
# overridden with the RATINGS_CSV environment variable so the notebook can run
# on another machine without editing this cell. An empty value falls back to
# the default as well.
RATINGS_CSV = (
    os.environ.get("RATINGS_CSV")
    or "/home/steve/Documents/Projects/Projects/ML Projects/Data/ratings.csv"
)
ratings = pd.read_csv(RATINGS_CSV)

# Surprise needs a Reader that declares the rating scale; only the three
# columns selected below (user, item, rating) are handed to Surprise.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [3]:
# Step 2: Split the ratings into a training set and a held-out test set
trainset, testset = train_test_split(
    data,
    test_size=0.2,    # fraction of ratings reserved for testing
    random_state=42,  # fixed seed so the split is repeatable
)

In [4]:
# Step 3: Item-based collaborative filtering with KNNBasic
# NOTE: "user_based" is False below, so similarities are computed between
# items (movies), not between users. Set it to True for user-based CF.
sim_options = {
    "name": "cosine",     # similarity metric: 'cosine', 'pearson', 'msd'
    "user_based": False,  # False = Item-based CF, True = User-based CF
}
algo = KNNBasic(sim_options=sim_options)

# Train the model on the training split (this computes the similarity matrix)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7be4c3ae3510>

In [5]:
# Step 4: Predict and evaluate on the held-out test set
predictions = algo.test(testset)

# accuracy.rmse / accuracy.mae print the metric and also return it. Keep both
# values in named variables so later cells can reuse them, and so the cell does
# not echo a bare, unlabeled float as its output.
test_rmse = accuracy.rmse(predictions)
test_mae = accuracy.mae(predictions)

RMSE: 0.9800
MAE:  0.7615


0.7614697475628528

In [None]:
# Collect the test-set predictions into a DataFrame for inspection: one row per
# prediction with the user id, item id, true rating and estimated rating.
pred_df = pd.DataFrame(
    [(pred.uid, pred.iid, pred.r_ui, pred.est) for pred in predictions],
    columns=['userId', 'movieId', 'actual_rating', 'predicted_rating'],
)

# A bare last expression lets the notebook render the frame as a table
# instead of the plain-text dump that print() produces.
pred_df.head()

   userId  movieId  actual_rating  predicted_rating
0     140     6765            3.5          3.425000
1     603      290            4.0          3.550000
2     438     5055            4.0          3.162500
3     433   164179            5.0          3.608478
4     474     5114            4.0          3.503229
