In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Location of the cleaned TMDB dataset. Kept in one constant so the notebook can be
# pointed at another copy of the file by editing a single line.
DATA_PATH = "/home/anonymous/code/KHDL/btl/data/tmdb_cleaned.csv"
df = pd.read_csv(DATA_PATH)


def _parse_list_cell(value):
    """Coerce one cell of a list-valued column to a Python list.

    Lists pass through unchanged, strings that look like a list literal
    ("[...]") are parsed with `ast.literal_eval`, and everything else
    (NaN, numbers, other strings, unparsable literals) becomes `[]`.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.strip().startswith("["):
        try:
            return ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            # A single malformed cell should not abort loading the whole dataset.
            return []
    return []


# List columns are stored in the CSV as strings like "[...]"; parse them back to lists.
for col in ["genres", "keywords", "cast_top5"]:
    if col in df.columns:
        df[col] = df[col].apply(_parse_list_cell)
    else:
        # The text-building step indexes these columns unconditionally, so a missing
        # column is materialized as empty lists rather than left absent.
        df[col] = pd.Series([[] for _ in range(len(df))], index=df.index, dtype=object)

# Free-text columns: NaN -> "" and force str. `df.get(col, "")` would hand back a plain
# str when the column is missing (no `.fillna`), so the missing case is handled explicitly.
for col in ["director", "overview"]:
    if col in df.columns:
        df[col] = df[col].fillna("").astype(str)
    else:
        df[col] = ""

# Build the text fed to the vectorizer (same fields as the BERT input, for a fair comparison).
def _join_tokens(items):
    """Join the elements of one list cell into a single space-separated string."""
    return " ".join(str(tok) for tok in items)


_text_parts = [df[name].apply(_join_tokens) for name in ("genres", "keywords", "cast_top5")]
_text_parts += [df["director"], df["overview"]]

# Concatenate left to right with a single space between fields, then trim the ends.
_combined = _text_parts[0]
for _part in _text_parts[1:]:
    _combined = _combined + " " + _part
df["tfidf_text"] = _combined.str.strip()

# Fit the TF-IDF vocabulary on the combined text and vectorize every row.
tfidf = TfidfVectorizer(max_features=50000, stop_words="english")
corpus = df["tfidf_text"]
X = tfidf.fit_transform(corpus)

# Dense pairwise cosine similarity between all rows.
cos_sim_tfidf = cosine_similarity(X, X)

shape_report = ("TF-IDF matrix:", X.shape, "cos_sim:", cos_sim_tfidf.shape)
print(*shape_report)


TF-IDF matrix: (2173, 25433) cos_sim: (2173, 2173)


In [2]:
# Map normalized title (stripped, lower-cased) -> row label in df.
# Series.drop_duplicates() removes duplicate *values*; the values here are the row
# labels, which are already unique, so it never removed repeated titles. Deduplicate
# on the index instead (keeping the first row per title) so a lookup yields a scalar.
normalized_titles = df["title"].astype(str).str.strip().str.lower()
title_to_idx = pd.Series(df.index, index=normalized_titles)
title_to_idx = title_to_idx[~title_to_idx.index.duplicated(keep="first")]

def recommend_tfidf_by_title(title, top_k=10):
    """Return the `top_k` rows most similar to `title` by TF-IDF cosine similarity.

    Args:
        title: Title to look up; matched case-insensitively after stripping
            surrounding whitespace.
        top_k: Maximum number of recommendations to return. Values below 0
            are treated as 0.

    Returns:
        DataFrame with columns `title` and `score` (preceded by `tmdb_id`
        when `df` has that column), sorted by descending score, with a
        fresh 0..n-1 index. The queried row itself is never included.

    Raises:
        ValueError: If no title matches exactly. The message lists up to 10
            titles that contain the query as a literal substring.
    """
    key = str(title).strip().lower()

    if key not in title_to_idx:
        # Suggest near matches. regex=False makes this a literal substring search, so
        # queries containing regex metacharacters such as "(" or "[" cannot raise.
        mask = df["title"].str.lower().str.contains(key, na=False, regex=False)
        cand = df[mask]["title"].head(10).tolist()
        raise ValueError(f"Không tìm thấy title chính xác. Gợi ý gần giống: {cand}")

    match = title_to_idx[key]
    # A title that occurs more than once yields a Series; use its first row.
    label = match.iloc[0] if isinstance(match, pd.Series) else match
    # The similarity matrix is row-aligned with df by position, so convert the
    # stored row label to a position before indexing into it.
    q = int(df.index.get_loc(label))

    sims = cos_sim_tfidf[q]  # (N,)
    top_k = max(int(top_k), 0)
    ranked = np.argsort(sims)[::-1]
    # Drop the query row explicitly: it is not guaranteed to be ranked first
    # (empty text gives self-similarity 0; identical rows tie at the top).
    top_idx = ranked[ranked != q][:top_k]

    cols = ["tmdb_id", "title"] if "tmdb_id" in df.columns else ["title"]
    rec = df.iloc[top_idx][cols].copy()  # top_idx holds positions, hence iloc
    rec["score"] = sims[top_idx]
    return rec.reset_index(drop=True)

# Smoke check: show the ten nearest TF-IDF neighbours of "Avatar".
avatar_recs = recommend_tfidf_by_title("Avatar", top_k=10)
print(avatar_recs)


   tmdb_id                                              title     score
0   216527                                           Avatar 4  0.265345
1    83533                               Avatar: Fire and Ash  0.238332
2    76600                           Avatar: The Way of Water  0.211521
3   393209                                           Avatar 5  0.209452
4      679                                             Aliens  0.116879
5  1059673  Avatar: The Deep Dive - A Special Edition of 2...  0.110826
6   696506                                          Mickey 17  0.101229
7    21778                                         RRRrrrr!!!  0.095766
8    10530                                         Pocahontas  0.090824
9   447365                     Guardians of the Galaxy Vol. 3  0.085286


In [3]:
import joblib
from scipy import sparse

# Persist the fitted vectorizer so new text can be transformed with the same vocabulary.
joblib.dump(tfidf, "tfidf_vectorizer.joblib")
# X is a SciPy sparse matrix, so it is stored in SciPy's .npz format.
sparse.save_npz("tfidf_matrix.npz", X)
np.save("cos_sim_tfidf.npy", cos_sim_tfidf)
# Row order of this CSV matches the rows of the saved matrices. `tmdb_id` is optional
# elsewhere in this notebook, so only include it when the column exists.
index_cols = ["tmdb_id", "title"] if "tmdb_id" in df.columns else ["title"]
df[index_cols].to_csv("movie_index_tfidf.csv", index=False, encoding="utf-8-sig")
print("Saved TF-IDF artifacts")


Saved TF-IDF artifacts


In [4]:
# Side-by-side comparison of both recommenders on the same query.
QUERY_TITLE, QUERY_TOP_K = "Avatar", 10

print("TF-IDF:")
print(recommend_tfidf_by_title(QUERY_TITLE, QUERY_TOP_K))
# Imported here, after the TF-IDF output, to keep the original execution order.
from rcm_bert import recommend_by_title
print("\nBERT:")
print(recommend_by_title(QUERY_TITLE, QUERY_TOP_K))


TF-IDF:
   tmdb_id                                              title     score
0   216527                                           Avatar 4  0.265345
1    83533                               Avatar: Fire and Ash  0.238332
2    76600                           Avatar: The Way of Water  0.211521
3   393209                                           Avatar 5  0.209452
4      679                                             Aliens  0.116879
5  1059673  Avatar: The Deep Dive - A Special Edition of 2...  0.110826
6   696506                                          Mickey 17  0.101229
7    21778                                         RRRrrrr!!!  0.095766
8    10530                                         Pocahontas  0.090824
9   447365                     Guardians of the Galaxy Vol. 3  0.085286
   tmdb_id                     title                       poster_path  \
0    76600  Avatar: The Way of Water  /t6HIqrRAclMCA60NsSmeqe9RmNV.jpg   
1    70981                Prometheus  /qsYQflQhOuhDp