In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD


In [2]:
ratings = pd.read_csv(
    "../data/ml-100k/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"]
)

user_movie = ratings.pivot(
    index="user_id",
    columns="movie_id",
    values="rating"
)

# Mean centering
user_means = user_movie.mean(axis=1)
user_movie_centered = user_movie.sub(user_means, axis=0)

# Fill NaNs with 0
matrix = user_movie_centered.fillna(0).values


In [3]:
k = 20


In [4]:
svd = TruncatedSVD(n_components=k, random_state=42)
U = svd.fit_transform(matrix)
Sigma = svd.singular_values_
VT = svd.components_


In [5]:
print(U.shape)      # (users, k)
print(Sigma.shape) # (k,)
print(VT.shape)    # (k, movies)


(943, 20)
(20,)
(20, 1682)


In [6]:
Sigma_matrix = np.diag(Sigma)
reconstructed = np.dot(np.dot(U, Sigma_matrix), VT)


In [7]:
reconstructed += user_means.values.reshape(-1, 1)


In [8]:
reconstructed_df = pd.DataFrame(
    reconstructed,
    index=user_movie.index,
    columns=user_movie.columns
)

reconstructed_df.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60.806546,-5.408948,-6.002044,5.934714,-16.008613,6.89839,58.582252,21.155579,55.280187,13.08713,...,3.610509,3.598407,3.624802,3.656386,3.517752,2.88416,3.568709,3.226434,3.635476,3.162724
2,2.756193,3.354869,-1.245336,4.315434,2.486831,5.218668,9.993978,12.164082,13.30838,1.957832,...,3.558561,3.716423,3.649097,3.517209,3.656629,3.648479,3.706173,3.677326,3.708244,3.74831
3,0.64109,1.581151,-0.543322,5.652423,2.917747,3.109414,1.322581,2.573878,4.824746,2.524461,...,2.773451,2.799435,2.759447,2.679222,2.796683,3.120161,2.814844,2.967502,2.793966,2.785063
4,9.315613,4.412405,4.789807,3.073277,2.585963,4.467742,7.314908,5.626426,5.698263,4.980957,...,4.365363,4.334576,4.323666,4.302621,4.327335,4.0393,4.316494,4.177897,4.33507,4.299425
5,41.732761,1.505962,-8.421217,12.874197,-3.96509,4.085836,47.768134,18.27546,-3.068532,8.421112,...,2.976195,2.906664,2.803447,2.649225,2.944176,3.051174,2.884416,2.967795,2.878416,2.728988


In [9]:
movies = pd.read_csv(
    "../data/ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    header=None
)

movies = movies[[0, 1]]
movies.columns = ["movie_id", "title"]


In [11]:
def recommend_movies(user_id, n=5):
    user_ratings = user_movie.loc[user_id]
    unseen_movies = user_ratings[user_ratings.isna()].index
    
    predictions = reconstructed_df.loc[user_id, unseen_movies]
    top_movies = predictions.sort_values(ascending=False).head(n)
    
    return movies[movies["movie_id"].isin(top_movies.index)]


In [12]:
recommend_movies(user_id=10, n=5)


Unnamed: 0,movie_id,title
78,79,"Fugitive, The (1993)"
88,89,Blade Runner (1982)
186,187,"Godfather: Part II, The (1974)"
317,318,Schindler's List (1993)
426,427,To Kill a Mockingbird (1962)
