In [22]:
import pandas as pd
from surprise import Reader

In [23]:
file_path2 = '/Users/rogerlbcn/Library/CloudStorage/OneDrive-Personal/Python_Projects/Movies_RecSys/Movies_RecSys/ml-25m/ratings.csv'
# ratings_df_s = pd.read_csv(file_path2, nrows=1000000) # NOTE: The kernel doesn't crush with 1M observations
ratings_df_s = pd.read_csv(file_path2, nrows=5000000) # Using 5 Million observations

## Inspecting dataset

In [24]:
ratings_df_s

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
4999995,32564,54259,3.5,1337991854
4999996,32564,56367,3.5,1337991737
4999997,32564,64614,4.5,1337991044
4999998,32564,64620,3.5,1337992055


In [25]:
rating_count = ratings_df_s.count()
rating_count


userId       5000000
movieId      5000000
rating       5000000
timestamp    5000000
dtype: int64

In [26]:
ratings_df_s.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [27]:
ratings_df_s.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,5000000.0,5000000.0,5000000.0,5000000.0
mean,16532.77,21624.2,3.531443,1216921000.0
std,9412.794,39525.61,1.063327,227614500.0
min,1.0,1.0,0.5,789652000.0
25%,8455.0,1197.0,3.0,1013909000.0
50%,16658.0,2951.0,3.5,1196567000.0
75%,24737.0,8641.0,4.0,1449384000.0
max,32564.0,209163.0,5.0,1574328000.0


In [28]:
ratings_df_s = ratings_df_s.drop(columns=['timestamp'])  # Features excluding 'movieId'

In [29]:
ratings_df_s

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
4999995,32564,54259,3.5
4999996,32564,56367,3.5
4999997,32564,64614,4.5
4999998,32564,64620,3.5


In [30]:
ratings_df_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 114.4 MB


In [31]:
ratings_df_s['rating'].unique()

array([5. , 3.5, 4. , 2.5, 4.5, 3. , 0.5, 2. , 1. , 1.5])

In [32]:
for i in ratings_df_s:
    print(i)

userId
movieId
rating


In [33]:
from surprise import Dataset
from surprise import Reader

# Define a Reader object specifying the rating scale
reader = Reader(rating_scale=(0, 5))  # Assuming your ratings range from 1 to 5

data = Dataset.load_from_df(ratings_df_s[['userId', 'movieId', 'rating']], reader)

In [34]:
data

<surprise.dataset.DatasetAutoFolds at 0x30acbf790>

In [35]:
len(ratings_df_s)

5000000

In [36]:
from surprise.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)



## Creating and fitting the model (`KNNBasic`)

This model `KNNBasic` with 5M overvations took 13 min 47 sec to run

In [37]:
from surprise import KNNBasic

# Initialize the model (you can use any other algorithm as well)
model = KNNBasic()

# Train the model on the training set
model.fit(train_set)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x30accfd10>

## Creating predictions on the test set

It took a bit over 16 min to get the predictions

In [38]:
# Creating some predicitons on the test set:

predictions = model.test(test_set)
predictions[:5] # showing the first 5

[Prediction(uid=23433, iid=595, r_ui=3.0, est=3.9375, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=19718, iid=164909, r_ui=2.5, est=4.289689069233612, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=31776, iid=413, r_ui=3.0, est=2.6866965042390167, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=30197, iid=31658, r_ui=3.0, est=4.086720237723111, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=15526, iid=780, r_ui=5.0, est=3.93428502405989, details={'actual_k': 40, 'was_impossible': False})]

In [39]:
### Using the ur method ###
# Checking user 890's ratings 

train_set.ur[890]

[(189, 3.0),
 (2706, 4.0),
 (12, 5.0),
 (524, 5.0),
 (7897, 2.0),
 (871, 4.0),
 (922, 4.0),
 (3459, 4.0),
 (5325, 5.0),
 (3192, 4.0),
 (2199, 5.0),
 (1493, 4.0),
 (34, 3.0),
 (3297, 4.0),
 (663, 4.0),
 (1829, 5.0),
 (4845, 2.0),
 (604, 4.0),
 (1551, 3.0),
 (937, 3.0),
 (784, 5.0),
 (1818, 4.0),
 (4518, 4.0),
 (2725, 4.0),
 (1642, 5.0),
 (2275, 3.0),
 (3434, 3.0),
 (2097, 5.0),
 (773, 3.0),
 (798, 4.0),
 (1500, 4.0),
 (10708, 3.0),
 (10577, 4.0),
 (1631, 5.0),
 (5113, 3.0),
 (931, 3.0),
 (6444, 5.0),
 (1024, 3.0),
 (1866, 3.0),
 (470, 5.0),
 (4439, 4.0),
 (792, 4.0),
 (3075, 4.0),
 (1938, 3.0),
 (778, 4.0),
 (1405, 3.0),
 (5084, 4.0),
 (1680, 4.0),
 (131, 5.0),
 (2638, 3.0),
 (4285, 4.0),
 (605, 5.0),
 (818, 4.0),
 (1957, 5.0),
 (4497, 4.0),
 (1612, 4.0),
 (2370, 5.0),
 (1045, 3.0),
 (607, 4.0),
 (138, 4.0),
 (2052, 2.0),
 (133, 3.0),
 (3040, 3.0),
 (343, 4.0),
 (195, 5.0),
 (4060, 4.0),
 (596, 5.0),
 (3176, 4.0),
 (416, 5.0),
 (7210, 4.0),
 (2568, 4.0),
 (320, 3.0),
 (1900, 4.0),
 (124

In [40]:
# Calculating the RMSE of the RecSys Model:
from surprise import accuracy

accuracy.rmse(predictions)

RMSE: 0.9000


0.9000257907774956

In [43]:
import matplotlib.pyplot as plt

# List of K values for K-Nearest Neighbors
k_values = [5, 10, 15, 20, 25]
rmse_scores = []

for k in k_values:
    model = KNNBasic(k=k, sim_options={'name': 'cosine', 'user_based': True})
    model.fit(train_set)
    predictions = model.test(test_set)
    rmse = accuracy.rmse(predictions)
    rmse_scores.append(rmse)

# Plotting
plt.figure(figsize=(8, 4))
plt.plot(k_values, rmse_scores, marker='o', linestyle='-', color='b')
plt.title('RMSE vs. Number of Neighbors (K)')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('RMSE')
plt.grid(True)
plt.show()


In [41]:
from surprise import SVD
from surprise import accuracy

In [42]:
svd_recommender = SVD()
svd_recommender.fit(train_set)

svd_predictions = svd_recommender.test(test_set)

accuracy.rmse(svd_predictions)

RMSE: 0.8005


0.8005130364986649