In [1]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

--2023-09-23 13:20:17--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-09-23 13:20:17 (21.4 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [3]:
import pandas as pd


data_dir = '/content/ml-100k/'

# Load the user ratings data into a Pandas DataFrame
ratings_file = data_dir + 'u.data'  # This file contains user ratings
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(ratings_file, sep='\t', names=ratings_cols)

# Load movie information (if available)
movies_file = data_dir + 'u.item'  # This file contains movie information
movies_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies_df = pd.read_csv(movies_file, sep='|', names=movies_cols, encoding='latin-1')

# Preprocessing

# Drop unnecessary columns (e.g., timestamp, video_release_date, imdb_url)
ratings_df = ratings_df[['user_id', 'movie_id', 'rating']]

# Handle missing values (if any)
ratings_df.dropna(inplace=True)

# Convert categorical variables to numerical representations (e.g., user_id and movie_id)
ratings_df['user_id'] = ratings_df['user_id'].astype('category').cat.codes
ratings_df['movie_id'] = ratings_df['movie_id'].astype('category').cat.codes

# Optionally, you can merge movie information into the ratings DataFrame
# For example, to add movie titles to the ratings DataFrame:
ratings_df = pd.merge(ratings_df, movies_df[['movie_id', 'title']], on='movie_id')

# Now, you have a preprocessed DataFrame ready for building your recommendation system
print(ratings_df.head())

   user_id  movie_id  rating  title
0      307         0       4      0
1      307         0       4      0
2      307         0       4      0
3      307         0       4      0
4      307         0       4      0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['user_id'] = ratings_df['user_id'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['movie_id'] = ratings_df['movie_id'].astype('category').cat.codes


In [5]:
ratings_df.to_csv('preprocessed_ratings.csv', index=False)

In [4]:
!pip install scikit-surprise -qqq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m419.8/772.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone


In [9]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import Reader

ratings_df = pd.read_csv('/content/preprocessed_ratings.csv')

# Building the Recommendation System with Matrix Factorization (SVD)

# Create a Surprise Dataset object from the DataFrame
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the SVD model
svd_model = SVD(n_factors=100, n_epochs=20, random_state=42)

# Fit the model on the training data
svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7999326c3b20>

In [10]:
user_id_to_predict = 6
user_ratings = ratings_df[ratings_df['user_id'] == user_id_to_predict]
movies_not_rated_by_user = ratings_df[~ratings_df['movie_id'].isin(user_ratings['movie_id'])]


movies_to_predict = list(movies_not_rated_by_user['movie_id'])

predictions = [svd_model.predict(user_id_to_predict, movie_id) for movie_id in movies_to_predict]

# Sort the predictions by estimated rating (higher first)
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

# Get the top N recommended movie IDs
top_n = 10
top_movie_ids = [prediction.iid for prediction in sorted_predictions[:top_n]]

# Get the movie titles corresponding to the recommended movie IDs
recommended_movies = movies_df[movies_df['movie_id'].isin(top_movie_ids)]['title']

print("Top {} movie recommendations for user {}:".format(top_n, user_id_to_predict))
for idx, movie in enumerate(recommended_movies, start=1):
    print("{}. {}".format(idx, movie))


test_predictions = svd_model.test(testset)
rmse = accuracy.rmse(test_predictions)
print("Root Mean Squared Error (RMSE) on test set: {:.4f}".format(rmse))

Top 10 movie recommendations for user 6:
1. 0
2. 0
3. 0
4. 0
5. 0
6. 0
7. 1
8. 0
9. 0
10. 0
11. 0
12. 0
13. 0
14. 0
15. 0
16. 0
17. 0
18. 0
19. 0
20. 0
21. 0
22. 0
23. 0
24. 0
25. 0
26. 0
27. 1
28. 0
29. 1
30. 0
31. 0
32. 0
33. 0
34. 0
35. 0
36. 0
37. 0
38. 0
39. 0
40. 0
41. 0
42. 0
43. 0
44. 0
45. 0
46. 0
47. 0
48. 0
49. 0
50. 0
51. 0
52. 0
53. 0
54. 0
55. 0
56. 0
57. 0
58. 0
59. 0
60. 0
61. 0
62. 0
63. 0
64. 1
65. 0
66. 0
67. 0
68. 0
69. 0
70. 0
71. 0
72. 0
73. 0
74. 0
75. 0
76. 0
77. 0
78. 0
79. 0
80. 0
81. 0
82. 0
83. 0
84. 0
85. 0
86. 0
87. 0
88. 1
89. 0
90. 0
91. 0
92. 0
93. 0
94. 0
95. 0
96. 0
97. 0
98. 0
99. 0
100. 0
101. 0
102. 0
103. 0
104. 0
105. 0
106. 0
107. 0
108. 0
109. 0
110. 0
111. 0
112. 0
113. 0
114. 0
115. 0
116. 0
117. 0
118. 0
119. 0
120. 0
121. 0
122. 0
123. 0
124. 0
125. 0
126. 0
127. 0
128. 0
129. 0
130. 0
131. 0
132. 0
133. 0
134. 0
135. 0
136. 0
137. 0
138. 0
139. 0
140. 0
141. 0
142. 0
143. 0
144. 0
145. 0
146. 0
147. 0
148. 0
149. 0
150. 0
151. 0
152. 0
153