In [None]:
!pip install surprise
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection.split import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Fetch the "train" split of the ratings dataset, shuffle it with a fixed seed
# and keep the first 200,000 rows as the working sample.
ratings_split = load_dataset("nbtpj/movielens-1m-ratings")["train"]
data = ratings_split.shuffle(seed=10).select(range(200000))

# Materialise the sample as a DataFrame restricted to the three columns used
# downstream, in (user, item, rating) order.
movielens_df = pd.DataFrame(data)[["user_id", "movie_id", "user_rating"]]

In [None]:
# Wrap the rating triples in a Surprise dataset; ratings are declared to lie
# on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(movielens_df, reader)

# Hold out 20% of the ratings for evaluation, with a fixed seed so the split
# is repeatable.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Train a KNNBasic model (library-default settings) on the training split.
algo = KNNBasic()
algo.fit(train_data)

# Predict the held-out ratings and measure the root-mean-squared error.
predictions = algo.test(test_data)
rmse = accuracy.rmse(predictions)

# Bare expression so the notebook displays the RMSE as the cell output.
rmse

In [None]:
# Compare the distribution of the true held-out ratings with the model's
# estimates rounded to whole stars.
actual_ratings = [pred.r_ui for pred in predictions]
predicted_ratings = [round(pred.est) for pred in predictions]

# One bin per star value, centred on the integers 1..5 (edges 0.5, 1.5, ..., 5.5).
# Passing the same explicit edges to both histograms keeps the bars aligned;
# with `bins=5` matplotlib derives the edges from each series' own min/max, so
# the bars are not centred on the star values and the two series can end up
# with different bins when their ranges differ.
rating_bins = np.arange(0.5, 6.0, 1.0)

plt.figure(figsize=(10, 5))
plt.hist(predicted_ratings, bins=rating_bins, alpha=0.5, label="Predicted", color="#fc1c49")
plt.hist(actual_ratings, bins=rating_bins, alpha=0.5, label="Actual", color="#00a67d")
plt.title("Predicted vs. Actual Rating Distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.xticks(range(1, 6))  # label only the valid star values
plt.legend()
plt.show()