In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [None]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,date
0,1,16,4.0,1217897793,2008-08-05 00:56:33
1,1,24,1.5,1217895807,2008-08-05 00:23:27
2,1,32,4.0,1217896246,2008-08-05 00:30:46
3,1,47,4.0,1217896556,2008-08-05 00:35:56
4,1,50,4.0,1217896523,2008-08-05 00:35:23


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'])

In [None]:
movies = movies.drop_duplicates(subset=['movieId'])
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [None]:
ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [None]:
ratings_with_titles = pd.merge(ratings, movies, on='movieId')
ratings_with_titles

Unnamed: 0,userId,movieId,rating,timestamp,date,title,genres
0,1,16,4.0,1217897793,2008-08-05 00:56:33,Casino (1995),Crime|Drama
1,1,24,1.5,1217895807,2008-08-05 00:23:27,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,1217896246,2008-08-05 00:30:46,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,1217896556,2008-08-05 00:35:56,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,1217896523,2008-08-05 00:35:23,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...,...
105334,668,142488,4.0,1451535844,2015-12-31 04:24:04,Spotlight (2015),Thriller
105335,668,142507,3.5,1451535889,2015-12-31 04:24:49,Pawn Sacrifice (2015),Drama
105336,668,143385,4.0,1446388585,2015-11-01 14:36:25,Bridge of Spies (2015),Drama|Thriller
105337,668,144976,2.5,1448656898,2015-11-27 20:41:38,Bone Tomahawk (2015),Horror|Western


In [None]:
df = ratings_with_titles.pivot_table(index='userId', columns='movieId', values='rating')
df

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,,2.0,,3.0,,,,,,...,,,,,,,,,,
3,,,,,3.0,,3.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,,,,,,,,,,,...,,,,,,,,,,
665,,,,,,,,,,,...,,,,,,,,,,
666,,,,,,,,,,,...,,,,,,,,,,
667,,,,,,,,,,,...,,,,,,,,,,


In [None]:
user_item_matrix = df.fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
user_item_matrix.shape

(668, 10325)

In [None]:
user_means = user_item_matrix.mean(axis=1)
normalized_matrix = user_item_matrix.sub(user_means, axis=0)
normalized_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,...,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709,-0.039709
2,4.989056,-0.010944,1.989056,-0.010944,2.989056,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,...,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944,-0.010944
3,-0.026828,-0.026828,-0.026828,-0.026828,2.973172,-0.026828,2.973172,-0.026828,-0.026828,-0.026828,...,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828,-0.026828
4,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,...,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976,-0.049976
5,3.979031,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,...,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969,-0.020969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,...,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153,-0.029153
665,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,...,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605,-0.070605
666,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,...,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833,-0.072833
667,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,...,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167,-0.027167


In [None]:
svd = TruncatedSVD(n_components=200)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_

In [None]:
predicted_ratings = np.dot(user_factors, item_factors)
denormalized_ratings = predicted_ratings + user_means.values[:, np.newaxis]
denormalized_ratings

array([[ 6.94772559e-01, -2.39012351e-01,  1.85688005e-01, ...,
         1.79241101e-02,  2.98661913e-02,  1.06623324e-02],
       [ 4.40473972e+00,  3.80646395e-01,  2.05993460e+00, ...,
        -3.16379377e-04,  6.65753412e-03, -4.06994248e-03],
       [-2.79240278e-01,  4.60619006e-01,  5.85030962e-01, ...,
         2.05341647e-02,  5.25051178e-03,  1.84361906e-02],
       ...,
       [-1.40448016e-01,  4.36564731e-01, -3.30994558e-02, ...,
         6.78419073e-02,  1.03230037e-01,  6.61782332e-02],
       [ 6.92692993e-01, -4.01412703e-01, -4.62405534e-01, ...,
         2.74046540e-02,  1.02299646e-01,  2.74838486e-02],
       [ 4.51224532e+00,  4.52733817e+00,  3.52140254e+00, ...,
         1.53814870e+00,  6.03719698e+00,  1.53811271e+00]])

In [None]:
min_rating = 1
max_rating = 5
[471, 695, 2056, 426, 4457, 3885, 971, 44, 922, 982]
denormalized_ratings_clipped = np.clip(denormalized_ratings, min_rating, max_rating)

In [None]:
def get_recommendations(predictions, user_id, original_ratings, top_n=10):
    user_predictions = predictions[user_id - 1]
    user_rated_movies = original_ratings.loc[user_id, original_ratings.loc[user_id] > 0].index.tolist()
    unrated_movies = [i for i in range(len(user_predictions)) if i not in user_rated_movies]
    sorted_movie_indices = sorted(unrated_movies, key=lambda x: user_predictions[x], reverse=True)
    return sorted_movie_indices[:top_n]

recommendations = get_recommendations(denormalized_ratings_clipped, user_id=20, original_ratings=user_item_matrix)

In [None]:
recommendations

[8626, 7711, 7666, 8347, 7910, 7197, 8117, 4802, 5880, 7270]

In [None]:
recommended_movie_ids = df.columns[recommendations]
recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ids)]['title'].tolist()
recommended_movie_titles

['Pirates of the Caribbean: The Curse of the Black Pearl (2003)', 'Incredibles, The (2004)', 'Casino Royale (2006)', 'Ratatouille (2007)', 'Dark Knight, The (2008)', 'Iron Man (2008)', 'Slumdog Millionaire (2008)', 'Up (2009)', 'Avatar (2009)', 'Inception (2010)']
