In [22]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [23]:
# Load the ml-100k ratings file: tab-separated, and because explicit column
# names are supplied the first line is read as data rather than as a header.
rating_columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("../data/ml-100k/u.data", sep="\t", names=rating_columns)

ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [24]:
# Names of the genre indicator columns, in the order they are assigned to the
# trailing columns of the item file below.
genre_cols = [
    "unknown",
    "Action", "Adventure", "Animation",
    "Children", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western",
]

# Pipe-separated, latin-1 encoded item file read without a header row; the
# column names are attached afterwards.
movies = pd.read_csv("../data/ml-100k/u.item", sep="|", encoding="latin-1", header=None)

movie_meta_cols = ["movie_id", "title", "release_date", "video_release_date", "imdb_url"]
movies.columns = movie_meta_cols + genre_cols

# Convert the genre indicator columns into a single text column: for each
# movie, join the names of every genre whose flag equals 1.
genre_flags = movies[genre_cols]
movies["genres"] = genre_flags.apply(
    lambda flags: ", ".join(flags.index[flags == 1]),
    axis=1,
)

# Keep only the columns used by the rest of the notebook.
movies = movies[["movie_id", "title", "genres"]]


In [25]:
# Seed the legacy global NumPy RNG so the hold-out sample below is repeatable.
np.random.seed(42)

# Hold out a fixed fraction of the rating rows (sampled without replacement
# from the row labels) as the test set; everything else is used for training.
test_size = 0.2
n_test = int(len(ratings) * test_size)
test_indices = np.random.choice(ratings.index, size=n_test, replace=False)

train_ratings = ratings.drop(index=test_indices)
test_ratings = ratings.loc[test_indices]

print("Train size:", len(train_ratings))
print("Test size:", len(test_ratings))


Train size: 80000
Test size: 20000


In [26]:
# Reshape the long-format training ratings into a user x movie matrix.
# (user, movie) pairs with no training rating are left as NaN.
train_matrix = pd.pivot(
    train_ratings,
    index="user_id",
    columns="movie_id",
    values="rating",
)

train_matrix.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,,3.0,,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [27]:
# Per-user mean rating over the movies that user rated (NaNs are skipped).
user_means = train_matrix.mean(axis="columns")

# Subtract each user's mean from that user's row.
train_centered = train_matrix.sub(user_means, axis="index")

# Unrated cells become 0 in centered space, i.e. they sit exactly at the
# user's own mean once the mean is added back.
train_filled = train_centered.fillna(0)


In [28]:
k = 20  # number of SVD components to keep

# Factorise the centered, zero-filled user x movie matrix.
train_values = train_filled.values
svd = TruncatedSVD(random_state=42, n_components=k)

U = svd.fit_transform(train_values)  # one row per user, k columns
VT = svd.components_                 # k rows, one column per movie


In [29]:
# Rank-k reconstruction of the centered matrix, with every user's mean rating
# added back to that user's row (column vector broadcast across the movies).
# Values are not clipped, so predictions may fall outside the 1-5 rating scale.
user_offsets = user_means.values[:, np.newaxis]
reconstructed = U @ VT + user_offsets

# Re-attach the user_id / movie_id labels of the training matrix.
reconstructed_df = pd.DataFrame(
    data=reconstructed,
    columns=train_matrix.columns,
    index=train_matrix.index,
)

reconstructed_df.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.555872,3.596436,3.847759,3.705887,3.418536,3.585401,4.358061,3.778103,4.140089,3.982395,...,3.687319,3.687319,3.684941,3.663604,3.6892,3.70362,3.700753,3.688845,3.694799,3.687187
2,3.740188,3.783417,3.710227,3.850129,3.773119,3.85782,3.864452,3.678228,3.927539,3.763002,...,3.805285,3.805285,3.800552,3.811242,3.799423,3.801953,3.799797,3.803886,3.801842,3.804457
3,2.809567,2.805548,2.775474,2.857743,2.815641,2.820019,2.738165,2.779729,2.89028,2.783789,...,2.799943,2.799943,2.801479,2.802215,2.801371,2.791793,2.803066,2.800311,2.801689,2.800033
4,4.473987,4.534571,4.511729,4.453607,4.48796,4.493504,4.47573,4.548026,4.467326,4.508135,...,4.499945,4.499945,4.499063,4.496348,4.500826,4.499322,4.500521,4.500053,4.500287,4.499947
5,3.270686,2.859401,2.748516,2.766138,2.97463,2.832107,3.665294,2.882139,2.266857,2.909806,...,2.862547,2.862547,2.860454,2.869228,2.870986,2.865899,2.855168,2.861593,2.858381,2.862479


In [30]:
# Per-movie summary statistics of the predicted ratings across all users;
# the min/max rows show whether predictions stay within the rating scale.
reconstructed_df.describe()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
count,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,...,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0
mean,3.667468,3.556855,3.553327,3.593552,3.569146,3.591783,3.67614,3.668116,3.662432,3.600751,...,3.588974,3.588974,3.58775,3.586043,3.588355,3.586972,3.586218,3.588501,3.587359,3.588728
std,0.483349,0.463532,0.468756,0.476195,0.466529,0.456541,0.487379,0.469386,0.529164,0.462099,...,0.453317,0.453317,0.454402,0.453824,0.453424,0.45474,0.455684,0.45368,0.454658,0.453467
min,1.48234,0.824927,1.690143,0.932522,1.586861,1.443721,1.658894,1.325892,1.280215,1.305264,...,1.497041,1.497041,1.47229,1.504326,1.496278,1.476273,1.440012,1.487586,1.463799,1.493084
25%,3.372481,3.291737,3.286506,3.311362,3.295745,3.32514,3.393562,3.394832,3.352457,3.330775,...,3.320072,3.320072,3.320096,3.317191,3.320048,3.324253,3.316552,3.318274,3.317833,3.319357
50%,3.694439,3.589718,3.57513,3.627001,3.600649,3.622419,3.689454,3.708341,3.679363,3.628746,...,3.617777,3.617777,3.617134,3.613426,3.615742,3.619411,3.615841,3.617077,3.614553,3.617723
75%,3.978489,3.837653,3.857009,3.914438,3.870772,3.88342,4.002973,3.976577,4.009304,3.904121,...,3.874142,3.874142,3.873403,3.872973,3.872273,3.87494,3.872239,3.873998,3.872559,3.873993
max,4.943533,4.944704,4.950316,5.342958,4.951434,4.947792,5.610465,4.943768,5.477029,4.94674,...,4.947322,4.947322,4.947737,4.947168,4.94739,4.947752,4.948056,4.947438,4.947747,4.947365


In [31]:
# Build the evaluation frame: one row per held-out rating for which the model
# can produce a prediction.
#
# Test pairs whose user or movie never appears in the training matrix have no
# row/column in `reconstructed_df` and are skipped, so the metrics computed
# from `eval_df` cover only the remaining pairs.
known_mask = (
    test_ratings["user_id"].isin(reconstructed_df.index)
    & test_ratings["movie_id"].isin(reconstructed_df.columns)
)
test_known = test_ratings.loc[known_mask]

# Translate labels to positions once and pull every prediction with a single
# NumPy fancy-indexing call instead of a Python loop with scalar `.loc` reads.
row_pos = reconstructed_df.index.get_indexer(test_known["user_id"])
col_pos = reconstructed_df.columns.get_indexer(test_known["movie_id"])

# Constructed from named columns so the frame keeps its schema even when no
# test pair is covered (the downstream cells index these columns by name).
eval_df = pd.DataFrame({
    "user_id": test_known["user_id"].values,
    "movie_id": test_known["movie_id"].values,
    "true_rating": test_known["rating"].values,
    "predicted_rating": reconstructed_df.values[row_pos, col_pos],
})

eval_df.head()


Unnamed: 0,user_id,movie_id,true_rating,predicted_rating
0,877,381,4,3.784644
1,815,602,3,3.800217
2,94,431,4,3.787612
3,416,875,2,3.760779
4,500,182,2,3.605116


In [32]:
# RMSE is computed as sqrt(MSE) explicitly rather than through the
# `squared=False` keyword of `mean_squared_error`: that keyword was deprecated
# and later removed from scikit-learn, so relying on it raises a TypeError on
# current releases. This form works on both old and new versions.
mse = mean_squared_error(
    eval_df["true_rating"],
    eval_df["predicted_rating"]
)
rmse = np.sqrt(mse)

mae = mean_absolute_error(
    eval_df["true_rating"],
    eval_df["predicted_rating"]
)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


RMSE: 0.9888
MAE: 0.7834


In [33]:
# Number of training ratings per user.
user_rating_counts = train_ratings.groupby("user_id").size()

# Attach each evaluated row's user activity level and its absolute error.
eval_df["user_rating_count"] = eval_df["user_id"].map(user_rating_counts)

rating_error = eval_df["true_rating"] - eval_df["predicted_rating"]
eval_df["abs_error"] = rating_error.abs()

# Mean absolute error within each quartile of user activity.
count_quartiles = pd.qcut(eval_df["user_rating_count"], 4)
eval_df.groupby(count_quartiles)["abs_error"].mean()


user_rating_count
(11.999, 77.0]    0.818434
(77.0, 139.0]     0.792501
(139.0, 223.0]    0.749157
(223.0, 578.0]    0.773378
Name: abs_error, dtype: float64

In [34]:
def show_user_history(user_id, user_movie, movies, top_n=5):
    """Return the user's highest-rated movies together with title and genres.

    Parameters
    ----------
    user_id
        Row label of the user in ``user_movie``.
    user_movie : pandas.DataFrame
        User x movie rating matrix; NaN marks a movie the user has not rated.
    movies : pandas.DataFrame
        Movie metadata with at least ``movie_id``, ``title`` and ``genres``
        columns.
    top_n : int, default 5
        Maximum number of rated movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres`` and ``rating``, sorted by ``rating`` in
        descending order.
    """
    # Ratings the user actually gave, best first, truncated to top_n.
    rated = user_movie.loc[user_id].dropna()
    seen = rated.sort_values(ascending=False).head(top_n)

    # Pull the metadata rows for those movies and attach the user's rating.
    in_history = movies["movie_id"].isin(seen.index)
    history = movies[in_history].copy()
    history["rating"] = history["movie_id"].map(seen)

    output_cols = ["title", "genres", "rating"]
    return history[output_cols].sort_values("rating", ascending=False)


In [35]:
def recommend_movies_verbose(user_id, user_movie, reconstructed_df, movies, n=5):
    """Recommend the top-``n`` movies the user has not rated, by predicted rating.

    Parameters
    ----------
    user_id
        Row label of the user in both ``user_movie`` and ``reconstructed_df``.
    user_movie : pandas.DataFrame
        User x movie rating matrix; NaN marks a movie the user has not rated.
    reconstructed_df : pandas.DataFrame
        Predicted ratings, indexed by user and with one column per movie.
    movies : pandas.DataFrame
        Movie metadata with at least ``movie_id``, ``title`` and ``genres``
        columns.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres`` and ``predicted_rating``, sorted by
        ``predicted_rating`` in descending order.
    """
    # Candidate movies are those with no rating from this user.
    rated = user_movie.loc[user_id]
    unseen_movies = rated.index[rated.isna()]

    # Rank the candidates by the model's predicted rating.
    scores = reconstructed_df.loc[user_id, unseen_movies]
    top_preds = scores.sort_values(ascending=False).head(n)

    # Attach title/genres to the selected movies.
    is_recommended = movies["movie_id"].isin(top_preds.index)
    recs = movies[is_recommended].copy()
    recs["predicted_rating"] = recs["movie_id"].map(top_preds)

    output_cols = ["title", "genres", "predicted_rating"]
    return recs[output_cols].sort_values("predicted_rating", ascending=False)


In [36]:
# ===============================
# Recommendation demo (Notebook 04)
# Model trained with 20% removed
# ===============================

user_id = 10   # between 1 and 943

# Top-rated movies for this user, taken from the training matrix only.
print("üé¨ Movies this user has watched (TRAIN DATA ONLY):")
user_history = show_user_history(user_id, train_matrix, movies, top_n=5)
display(user_history)

# Highest predicted ratings among movies with no training rating from this user.
print("\n\n‚≠ê Recommended movies for this user (SVD trained with 20% removed):")
user_recs = recommend_movies_verbose(
    user_id,
    train_matrix,
    reconstructed_df,
    movies,
    n=5,
)
display(user_recs)


üé¨ Movies this user has watched (TRAIN DATA ONLY):


Unnamed: 0,title,rating
132,Gone with the Wind (1939),5.0
133,Citizen Kane (1941),5.0
134,2001: A Space Odyssey (1968),5.0
509,"Magnificent Seven, The (1954)",5.0
602,Rear Window (1954),5.0




‚≠ê Recommended movies for this user (SVD trained with 20% removed):


Unnamed: 0,title,predicted_rating
482,Casablanca (1942),4.542324
317,Schindler's List (1993),4.507726
126,"Godfather, The (1972)",4.411757
426,To Kill a Mockingbird (1962),4.401464
514,"Boot, Das (1981)",4.395845
