In [9]:
import os
import re
import ast
import numpy as np
import pandas as pd
from unidecode import unidecode
from sentence_transformers import SentenceTransformer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# ========= normalize helpers =========
def norm_text(s: str) -> str:
    """Normalize free text into a lowercase ASCII search key.

    The value is transliterated to ASCII with ``unidecode``, lowercased,
    ``_`` and ``-`` are turned into spaces, every remaining character that is
    not ``a-z``, ``0-9`` or whitespace is replaced by a space, and finally
    whitespace runs are collapsed and the result is trimmed.

    Args:
        s: Value to normalize. ``None`` and float NaN count as missing; any
            other value is converted with ``str()`` first.

    Returns:
        The normalized key, or ``""`` for missing input.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    # Lowercase AFTER transliteration: unidecode can emit capital letters even
    # for caseless input (e.g. CJK text becomes "Bei Jing"), and the
    # [^a-z0-9\s] filter below would otherwise delete those capitals.
    s = unidecode(str(s).strip()).lower()
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def parse_list_field(x):
    """Coerce a list-like cell value into a list of strings.

    Args:
        x: ``None``/float NaN, an actual ``list``, or any other value, which
            is converted with ``str()``. A string shaped like a Python list
            literal (``"[...]"``) is parsed with ``ast.literal_eval``.

    Returns:
        ``[]`` for missing or blank input and for a ``[...]`` string that
        cannot be parsed; the stringified items of a list or list literal;
        otherwise a one-element list holding the stripped string.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, list):
        return [str(t) for t in x]
    s = str(x).strip()
    if not s:
        return []
    if s.startswith("[") and s.endswith("]"):
        try:
            v = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best-effort: a malformed literal yields no items. Only the
            # errors literal_eval raises for bad input are swallowed, so
            # KeyboardInterrupt / SystemExit still propagate.
            return []
        if isinstance(v, list):
            return [str(t) for t in v]
    return [s]

def make_person_fields(name: str):
    """Return two matchable variants of a person's name.

    The first is the normalized name with spaces kept (e.g. "rich lee"), the
    second is the same text with the spaces removed (e.g. "richlee").
    """
    spaced = norm_text(name)
    joined = "".join(spaced.split(" "))
    return spaced, joined

def build_search_text(row):
    """Assemble the search text and person fields for one movie row.

    Args:
        row: Mapping-like object read through ``.get`` for the keys "title",
            "overview", "genres", "keywords", "cast_top5" and "director".

    Returns:
        A 5-tuple ``(search_text, d_norm, d_join, cast_norms, cast_joins)``:
        the space-joined text for TF-IDF / embedding, the director's spaced
        and space-free name, and the lists of spaced and space-free cast
        names.
    """
    def _joined_norms(field):
        # Normalize every item of a list-like field and join with spaces.
        return " ".join(norm_text(item) for item in parse_list_field(row.get(field, "")))

    title_text = norm_text(row.get("title", ""))
    overview_text = norm_text(row.get("overview", ""))
    genre_text = _joined_norms("genres")
    keyword_text = _joined_norms("keywords")

    person_pairs = [
        make_person_fields(person)
        for person in parse_list_field(row.get("cast_top5", ""))
    ]
    cast_norms = [spaced for spaced, _ in person_pairs if spaced]
    cast_joins = [joined for _, joined in person_pairs if joined]

    d_norm, d_join = make_person_fields(row.get("director", ""))

    # Both the spaced and the space-free name variants go into the text so a
    # query typed as "richlee" still hits.
    ordered = (
        title_text,
        genre_text,
        keyword_text,
        " ".join(cast_norms),
        " ".join(cast_joins),
        d_norm,
        d_join,
        overview_text,
    )
    search_text = " ".join(filter(None, ordered)).strip()
    return search_text, d_norm, d_join, cast_norms, cast_joins

# ========= paths =========
# NOTE(review): absolute, machine-specific paths; the input CSV and every
# output artifact live in the same directory.
tmdb_csv = "/home/anonymous/code/KHDL/btl/web/backend/data/tmdb_cleaned.csv"
out_dir  = "/home/anonymous/code/KHDL/btl/web/backend/data"

out_index = os.path.join(out_dir, "movie_index_new.csv")
out_emb   = os.path.join(out_dir, "movie_embeddings_new.npy")
out_vec   = os.path.join(out_dir, "tfidf_vectorizer.pkl")
out_xtfidf= os.path.join(out_dir, "X_tfidf_new.npz")

# ========= load =========
df = pd.read_csv(tmdb_csv)

# build fields
# One entry per row of df, in row order, collected from build_search_text.
search_texts = []
director_norms = []
director_joins = []
cast_norms_col = []
cast_joins_col = []

for _, row in df.iterrows():
    stext, dnorm, djoin, cnorms, cjoins = build_search_text(row)
    search_texts.append(stext)
    director_norms.append(dnorm)
    director_joins.append(djoin)
    cast_norms_col.append(cnorms)
    cast_joins_col.append(cjoins)

# Attach the derived columns; cast_norm / cast_join cells hold Python lists.
df["search_text"]   = search_texts
df["director_norm"] = director_norms
df["director_join"] = director_joins
df["cast_norm"]     = cast_norms_col
df["cast_join"]     = cast_joins_col
df["title_norm"]    = df["title"].map(norm_text)

# ========= TFIDF save =========
# Character n-grams (3 to 5 chars, within word boundaries); n-grams seen in
# fewer than 2 documents are dropped. Rows of X follow the row order of df.
tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2)
X = tfidf.fit_transform(df["search_text"].fillna(""))

# Persist both the fitted vectorizer and the document-term matrix.
joblib.dump(tfidf, out_vec)
sparse.save_npz(out_xtfidf, X)

# ========= Embedding save =========
# Encode the same search_text column, in the same row order, requesting
# normalized embeddings returned as a NumPy array.
model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(df["search_text"].tolist(), convert_to_numpy=True, normalize_embeddings=True)

np.save(out_emb, emb)

# ========= index save =========
# save the metadata needed by the UI + search
# search_text itself is not in this list, so it is not written to the index.
keep_cols = [
    "tmdb_id","title","overview","release_date","genres","runtime",
    "vote_average","vote_count","popularity","cast_top5","director","keywords","poster_path",
    "title_norm","director_norm","director_join","cast_norm","cast_join"
]
# Keep only the columns that actually exist in df so missing ones do not raise.
keep_cols = [c for c in keep_cols if c in df.columns]
# NOTE(review): cast_norm / cast_join are list-valued here; presumably the CSV
# stores their string form, so readers must parse them back - confirm.
df[keep_cols].to_csv(out_index, index=False, encoding="utf-8-sig")

print("Saved:")
print(" -", out_index)
print(" -", out_emb)
print(" -", out_vec)
print(" -", out_xtfidf)


Saved:
 - /home/anonymous/code/KHDL/btl/web/backend/data/movie_index_new.csv
 - /home/anonymous/code/KHDL/btl/web/backend/data/movie_embeddings_new.npy
 - /home/anonymous/code/KHDL/btl/web/backend/data/tfidf_vectorizer.pkl
 - /home/anonymous/code/KHDL/btl/web/backend/data/X_tfidf_new.npz


In [10]:
def recommend_person_first(df, query, top_k=10):
    """Return movies whose director or cast exactly matches a person query.

    The query is normalized with ``norm_text`` and compared in two forms:
    spaced ("rich lee") and space-free ("richlee"). Director matches are
    tried first; cast matches are used only when no director matches.

    Args:
        df: DataFrame with the columns "director_norm", "director_join",
            "cast_norm", "cast_join", "popularity" and "vote_count".
        query: Person name typed by the user.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the matching rows with an added "score" column (1.0 for a
        director match, 0.9 for a cast match), sorted by popularity then
        vote_count, descending and limited to ``top_k`` rows. ``None`` when
        the query is blank after normalization or nothing matches.
    """
    qn = norm_text(query)
    if not qn:
        # A blank key would equal the "" stored for rows without a director
        # and return all of them as perfect matches.
        return None
    qj = qn.replace(" ", "")

    def ranked(mask, score):
        out = df[mask].copy()
        out["score"] = score
        return out.sort_values(["popularity", "vote_count"], ascending=False).head(top_k)

    # 1) exact director match
    director_mask = (df["director_norm"] == qn) | (df["director_join"] == qj)
    if director_mask.any():
        return ranked(director_mask, 1.0)

    # 2) exact cast match (cast_norm / cast_join cells are lists of names)
    def has_cast(names):
        if not isinstance(names, (list, tuple, set)):
            # A cell that is not already a collection (e.g. a "[...]" string
            # or NaN) is parsed first; `in` on a raw string would be a
            # substring test, not an exact-name test.
            names = parse_list_field(names)
        return (qn in names) or (qj in names)

    cast_mask = df["cast_norm"].apply(has_cast) | df["cast_join"].apply(has_cast)
    if cast_mask.any():
        return ranked(cast_mask, 0.9)

    return None


In [11]:
import re
from unidecode import unidecode

def norm_text(s: str) -> str:
    """Normalize free text into a lowercase ASCII search key.

    This redefinition replaces the ``norm_text`` defined earlier in the
    notebook once this cell runs, so it follows the same contract: ``None``
    and float NaN give ``""``; other values are transliterated with
    ``unidecode``, lowercased, stripped of everything except ``a-z``,
    ``0-9`` and single spaces.

    Args:
        s: Value to normalize; non-strings are converted with ``str()``.

    Returns:
        The normalized key, or ``""`` for missing input.
    """
    # NaN is the only float not equal to itself; without this guard a missing
    # value would be normalized to the literal key "nan".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    # Lowercase AFTER transliteration: unidecode can emit capital letters even
    # for caseless input (e.g. CJK text becomes "Bei Jing"), and the
    # [^a-z0-9\s] filter below would otherwise delete those capitals.
    s = unidecode(str(s).strip()).lower()
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def director_exact_boost(idx_df, query, top_k=10):
    """Return rows whose director exactly matches the query, scored 1.0.

    The raw "director" column and the query are both normalized with
    ``norm_text``; a row matches when either the spaced form or the
    space-free form is equal.

    Args:
        idx_df: DataFrame with the columns "tmdb_id", "title" and "director".
        query: Director name typed by the user.
        top_k: Maximum number of rows to return.

    Returns:
        The first ``top_k`` matching rows restricted to
        ``["tmdb_id", "title", "director", "score"]``, or ``None`` when the
        query is blank after normalization or nothing matches.
    """
    qn = norm_text(query)
    if not qn:
        # A blank query can never be an exact person match.
        return None
    qj = qn.replace(" ", "")

    # director raw -> norm/join
    directors = idx_df["director"]
    # astype(str) turns a missing director into the text "nan" / "None";
    # blank those rows afterwards so a query like "nan" cannot match them.
    d_norm = directors.astype(str).map(norm_text).where(directors.notna(), "")
    d_join = d_norm.str.replace(" ", "", regex=False)

    m = (d_norm == qn) | (d_join == qj)
    out = idx_df[m].copy()
    if len(out) == 0:
        return None

    out["score"] = 1.0
    return out[["tmdb_id","title","director","score"]].head(top_k)
