In [1]:
import pandas as pd
import ast
import numpy as np

df = pd.read_csv("/home/anonymous/code/KHDL/btl/data/tmdb_cleaned.csv")

def to_list(x):
    """Đảm bảo output là list[str]"""
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return [str(i) for i in x if pd.notna(i)]
    if isinstance(x, str):
        s = x.strip()
        # nếu là string dạng list: "['a','b']"
        if s.startswith("[") and s.endswith("]"):
            try:
                v = ast.literal_eval(s)
                if isinstance(v, list):
                    return [str(i) for i in v if pd.notna(i)]
            except:
                pass
        # nếu là chuỗi thường, coi như 1 token
        return [s]
    # số/float -> bỏ hoặc cast
    return []

def to_text(x):
    """Đảm bảo output là str"""
    if pd.isna(x):
        return ""
    return str(x)

# Chuẩn hóa 3 cột list
for col in ["genres", "keywords", "cast_top5"]:
    if col in df.columns:
        df[col] = df[col].apply(to_list)
    else:
        df[col] = [[]]*len(df)

# Chuẩn hóa text
df["director"] = df["director"].apply(to_text)
df["overview"] = df["overview"].apply(to_text)

def make_text(row):
    return (
        " ".join(row["genres"]) + " " +
        " ".join(row["keywords"]) + " " +
        " ".join(row["cast_top5"]) + " " +
        row["director"] + " " +
        row["overview"]
    ).strip()

df["bert_text"] = df.apply(make_text, axis=1)

print(df["bert_text"].head(3))
print("Empty bert_text:", (df["bert_text"].str.len() == 0).sum())


0    science_fiction adventure fantasy witch clone ...
1    action mystery drama based_on_novel_or_book le...
2    animation comedy adventure family mystery snak...
Name: bert_text, dtype: object
Empty bert_text: 0


In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer

# df đang có cột bert_text từ bước trước
texts = df["bert_text"].fillna("").astype(str).tolist()

model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # quan trọng: giúp cosine similarity nhanh & ổn hơn
)

# lưu để lần sau khỏi encode lại
np.save("movie_embeddings.npy", embeddings)

# lưu index (cần title + tmdb_id + poster_path nếu có)
keep_cols = ["tmdb_id", "title"]
if "poster_path" in df.columns:
    keep_cols.append("poster_path")

df[keep_cols].to_csv("movie_index.csv", index=False, encoding="utf-8-sig")

print("Saved: movie_embeddings.npy + movie_index.csv")
print("Embeddings shape:", embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Saved: movie_embeddings.npy + movie_index.csv
Embeddings shape: (2173, 384)


In [2]:
import pandas as pd
import numpy as np

idx_df = pd.read_csv("movie_index.csv")
emb = np.load("movie_embeddings.npy")

# map title -> index (lowercase)
title_to_idx = pd.Series(idx_df.index, index=idx_df["title"].astype(str).str.lower()).drop_duplicates()

def recommend_by_title(title, top_k=10):
    key = str(title).strip().lower()
    if key not in title_to_idx:
        # gợi ý gần đúng
        cand = idx_df[idx_df["title"].str.lower().str.contains(key, na=False)]["title"].head(10).tolist()
        raise ValueError(f"Không tìm thấy title chính xác. Gợi ý gần giống: {cand}")

    q = int(title_to_idx[key])

    # vì embeddings đã normalize -> cosine = dot product
    sims = emb @ emb[q]   # (N,)
    # lấy top_k + 1 để bỏ chính nó
    top_idx = np.argsort(sims)[::-1][:top_k+1]

    # bỏ chính nó
    top_idx = [i for i in top_idx if i != q][:top_k]

    rec = idx_df.iloc[top_idx].copy()
    rec["score"] = sims[top_idx]
    return rec.reset_index(drop=True)

print(recommend_by_title("Anaconda", top_k=10))


   tmdb_id                 title                       poster_path     score
0  1386827               Coyotes  /fSzs9ZXYcrln0FRrjGIPCjuB6En.jpg  0.658136
1    34851             Predators  /wdniP8NDaJIydi1hMxhpbJMUfr6.jpg  0.644610
2   629542          The Bad Guys  /6fcFmdVLCCbf1gFt8HlC6BRj8pt.jpg  0.632682
3    10730             King Kong  /paYKhEwUaxKA05vmOfU7FlleTln.jpg  0.628121
4      869    Planet of the Apes  /eEsOgbgKXMvI2FEw1WENamVXi41.jpg  0.610657
5  1175942        The Bad Guys 2  /c1msaKf1wyuKcmLjjJd6rIBPFcd.jpg  0.606672
6      106              Predator  /k3mW4qfJo6SKqe6laRyNGnbB9n5.jpg  0.604366
7     4247           Scary Movie  /fVQFPRuw3yWXojYDJvA5EoFjUOY.jpg  0.602479
8  1196364                Thamma  /udkbDwBbysCGEydt0FHnl9dVO2k.jpg  0.598963
9  1049942  Bambi: The Reckoning  /8oBbWxWDJrrDtNkmd0OzZpPFFUR.jpg  0.595286


In [18]:
from rapidfuzz import process, fuzz

def find_title_close(query, top_n=10):
    titles = idx_df["title"].astype(str).tolist()
    matches = process.extract(query, titles, scorer=fuzz.WRatio, limit=top_n)
    # trả về list (title, score)
    return matches

print(find_title_close("Avatar fire and ash", top_n=10))


[('Avatar', 90.0, 30), ('Avatar: Fire and Ash', 87.17948717948718, 0), ("Harry Potter and the Philosopher's Stone", 85.5, 129), ('The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 85.5, 146), ('Avatar 4', 85.5, 203), ('Harry Potter and the Chamber of Secrets', 85.5, 221), ('Harry Potter and the Goblet of Fire', 85.5, 222), ('Harry Potter and the Order of the Phoenix', 85.5, 227), ('Captain Sabertooth and the Countess of Grel', 85.5, 237), ('Harry Potter and the Prisoner of Azkaban', 85.5, 278)]


In [20]:
from sentence_transformers import SentenceTransformer
import numpy as np

# load model đúng cái mày đã dùng để encode
model = SentenceTransformer("all-MiniLM-L6-v2")

def recommend_by_query_text(query, top_k=10):
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    sims = emb @ q_emb
    top_idx = np.argsort(sims)[::-1][:top_k]
    rec = idx_df.iloc[top_idx].copy()
    rec["score"] = sims[top_idx]
    return rec.reset_index(drop=True)

print(recommend_by_query_text("Avatar fire and ash", top_k=10))


   tmdb_id                                              title  \
0   976573                                          Elemental   
1    83533                               Avatar: Fire and Ash   
2   393209                                           Avatar 5   
3   216527                                           Avatar 4   
4       81                 Nausicaä of the Valley of the Wind   
5      128                                  Princess Mononoke   
6   635302  Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...   
7    12222                                Horton Hears a Who!   
8    25961                       Pokémon: The Rise of Darkrai   
9    11688                           The Emperor's New Groove   

                        poster_path     score  
0  /4Y1WNkd88JXmGfhtWR7dmDAo1T2.jpg  0.485050  
1  /g96wHxU7EnoIFwemb2RgohIXrgW.jpg  0.474995  
2  /rtmmvqkIC5zDMEd638Es2woxbz8.jpg  0.459310  
3  /qzMYKnT4MG1d0gnhwytr4cKhUvS.jpg  0.451863  
4  /tcrkfB8SRPQCgwI88hQScua6nxh.jpg  0.44285