In [27]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD


In [29]:
# Ratings file: tab-separated with no header row, so column names are supplied.
ratings = pd.read_csv(
    "../data/ml-100k/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)

# Wide user x movie table; (user, movie) pairs absent from `ratings` are NaN.
user_movie = ratings.pivot(index="user_id", columns="movie_id", values="rating")

# Remove each user's own average (mean() skips NaN) so values express
# preference relative to that user's typical rating.
user_means = user_movie.mean(axis=1)
user_movie_centered = user_movie.subtract(user_means, axis="index")

# After centering, filling the missing cells with 0 treats them as
# "exactly this user's average"; the dense array feeds the SVD below.
matrix = user_movie_centered.fillna(0).to_numpy()


In [30]:
# Number of latent components kept by the truncated SVD.
k = 20

# Fixed random_state keeps the factorisation reproducible across runs.
svd = TruncatedSVD(random_state=42, n_components=k)
U = svd.fit_transform(matrix)  # one row per user, k columns
VT = svd.components_           # k rows, one column per movie

# Sanity-check the factor shapes: (users, k) then (k, movies).
print(U.shape, VT.shape, sep="\n")


(943, 20)
(20, 1682)


In [31]:
# Low-rank estimate of the centered matrix, with each user's mean added back.
# The means are turned into a column vector so they broadcast across each row.
reconstructed = (U @ VT) + user_means.to_numpy()[:, np.newaxis]

# Re-attach the user/movie labels of the original pivot table.
reconstructed_df = pd.DataFrame(
    data=reconstructed,
    index=user_movie.index,
    columns=user_movie.columns,
)

reconstructed_df.describe()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
count,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,...,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0
mean,3.711254,3.560007,3.536308,3.595376,3.566383,3.59139,3.697193,3.692207,3.669452,3.595757,...,3.58777,3.588194,3.587485,3.585947,3.587775,3.585689,3.588048,3.586869,3.588232,3.587689
std,0.515761,0.464118,0.463922,0.465484,0.471006,0.447937,0.489666,0.490124,0.545523,0.455703,...,0.445098,0.445216,0.445571,0.446343,0.445211,0.448818,0.445428,0.44708,0.445232,0.445256
min,1.291477,1.043823,1.657879,1.138084,1.219273,1.526157,1.378006,1.417499,0.766104,1.326778,...,1.496407,1.492958,1.48824,1.480155,1.494119,1.41569,1.487586,1.451638,1.491704,1.492408
25%,3.408684,3.293891,3.252529,3.311406,3.296617,3.333732,3.407644,3.397598,3.351648,3.314687,...,3.324768,3.323053,3.319698,3.313672,3.323637,3.31356,3.321821,3.319209,3.322913,3.325432
50%,3.731784,3.594451,3.560307,3.639877,3.603325,3.626825,3.712444,3.714571,3.686205,3.629154,...,3.617739,3.620746,3.618716,3.618177,3.619491,3.620209,3.620545,3.620805,3.620704,3.62014
75%,4.023956,3.850906,3.832826,3.90655,3.863563,3.871754,4.017835,4.011389,4.022685,3.886321,...,3.867831,3.869482,3.869699,3.869168,3.868773,3.86801,3.869456,3.868411,3.869618,3.869048
max,5.740086,4.875745,4.87044,5.373849,4.862381,4.869923,5.681264,5.00614,5.8407,4.869301,...,4.869749,4.869603,4.86939,4.86901,4.869695,4.868454,4.869502,4.868978,4.869565,4.869519


In [33]:
# Names of the per-genre indicator columns, in the order they appear in u.item.
genre_cols = [
    "unknown",
    "Action", "Adventure", "Animation",
    "Children", "Comedy", "Crime",
    "Documentary", "Drama",
    "Fantasy", "Film-Noir",
    "Horror",
    "Musical", "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War", "Western",
]

# Movie metadata: pipe-separated, latin-1 encoded, no header row.
movies = pd.read_csv(
    "../data/ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    header=None,
)

# Five descriptive columns followed by one column per genre.
movies.columns = [
    "movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
    *genre_cols,
]


In [34]:
def get_movie_genres(movie_row, genres=None):
    """Return the genres flagged on a movie row as a comma-separated string.

    Parameters
    ----------
    movie_row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``movie_row[name]`` for every genre name that is checked.
    genres : iterable of str, optional
        Genre names to check, in the order they should appear in the result.
        When omitted, the notebook-level ``genre_cols`` list is used, so the
        function can still be passed directly to ``DataFrame.apply``.

    Returns
    -------
    str
        The names whose value in ``movie_row`` equals 1, joined by ``", "``;
        an empty string when no genre is flagged.
    """
    if genres is None:
        # Resolved at call time so the default follows the notebook's list.
        genres = genre_cols
    return ", ".join(name for name in genres if movie_row[name] == 1)


In [49]:
def show_user_history(user_id, top_n=5):
    """Return the user's highest-rated movies with title, genres and rating.

    Looks up the user's row in ``user_movie``, keeps the ``top_n`` largest
    non-missing ratings, and joins them to ``movies`` for titles and genres.
    The result is sorted by rating, highest first.
    """
    rated = user_movie.loc[user_id].dropna()
    top_rated = rated.sort_values(ascending=False).head(top_n)

    is_top = movies["movie_id"].isin(top_rated.index)
    history = movies.loc[is_top].assign(
        rating=lambda df: df["movie_id"].map(top_rated),
        genres=lambda df: df.apply(get_movie_genres, axis=1),
    )

    summary = history[["title", "genres", "rating"]]
    return summary.sort_values(by="rating", ascending=False)


In [50]:
def recommend_movies_verbose(user_id, n=5):
    """Return the ``n`` best-predicted movies the user has not rated.

    Movies with a missing entry in the user's ``user_movie`` row count as
    unseen. Their scores are read from ``reconstructed_df`` and the top ``n``
    are joined to ``movies`` for titles and genres. The result is sorted by
    predicted rating, highest first.
    """
    row = user_movie.loc[user_id]
    unrated_ids = row.index[row.isna().to_numpy()]

    scores = reconstructed_df.loc[user_id, unrated_ids]
    best = scores.sort_values(ascending=False).head(n)

    is_best = movies["movie_id"].isin(best.index)
    recs = movies.loc[is_best].assign(
        predicted_rating=lambda df: df["movie_id"].map(best),
        genres=lambda df: df.apply(get_movie_genres, axis=1),
    )

    summary = recs[["title", "genres", "predicted_rating"]]
    return summary.sort_values(by="predicted_rating", ascending=False)


In [53]:
# Walkthrough for one user: first their top-rated titles, then the
# highest-scoring titles they have not rated yet.
user_id = 15

print(" Movies this user has watched:")
display(show_user_history(user_id=user_id))

print("\n\n\n Recommended movies for this user:")
display(recommend_movies_verbose(user_id=user_id))


 Movies this user has watched:


Unnamed: 0,title,genres,rating
49,Star Wars (1977),"Action, Adventure, Romance, Sci-Fi, War",5.0
180,Return of the Jedi (1983),"Action, Adventure, Romance, Sci-Fi, War",5.0
254,My Best Friend's Wedding (1997),"Comedy, Romance",5.0
307,FairyTale: A True Story (1997),"Children, Drama, Fantasy",5.0
753,Red Corner (1997),"Crime, Thriller",5.0





 Recommended movies for this user:


Unnamed: 0,title,genres,predicted_rating
171,"Empire Strikes Back, The (1980)","Action, Adventure, Drama, Romance, Sci-Fi, War",3.548411
271,Good Will Hunting (1997),Drama,3.540637
312,Titanic (1997),"Action, Drama, Romance",3.47122
275,Leaving Las Vegas (1995),"Drama, Romance",3.425776
474,Trainspotting (1996),Drama,3.307253
