In [1]:
# ──────────────────────────────────────────────
# Build anime_features.csv from clean_anime.csv
# ──────────────────────────────────────────────
from pathlib import Path
import pandas as pd
import numpy as np

# 0️⃣–2️⃣  LOAD the cleaned file, normalise the row index, and rename the
#          "Title" column to lowercase "title" in a single chain.
df = (
    pd.read_csv("../models/clean_anime.csv")    # adjust path if different
    .reset_index(drop=True)                     # guarantees a 0 … N-1 index
    .rename(columns={"Title": "title"})
)

# Sequential identifier in row order: 1 … N (appended as the last column).
df["anime_id"] = range(1, len(df) + 1)

# 3️⃣  MAP genre columns you already have
#     A column is treated as a genre flag when it is not a meta column, has at
#     least one non-missing value, and every non-missing value is 0 or 1.
#     Matching columns are renamed to "genre_<Title-Cased Name>".
genres_already = []
for col in df.columns:
    if col in ("title", "anime_id"):            # exclude meta
        continue
    observed = df[col].dropna()
    # `.all()` on an empty Series is vacuously True, so a column that is
    # entirely NaN would otherwise be mistaken for a 0/1 genre flag.
    if observed.empty:
        continue
    if observed.isin([0, 1]).all():             # only 0/1 columns
        genres_already.append(col)

genre_map = {g: f"genre_{g.title()}" for g in genres_already}
df = df.rename(columns=genre_map)

# 4️⃣  ADD one placeholder column per media type.
#     No type information is available yet, so every flag is the constant 0.
TYPES = ["Movie","Music","ONA","OVA","Special","TV","Unknown"]
df = df.assign(**{f"type_{kind}": 0 for kind in TYPES})

# 5️⃣  APPROXIMATE hist_score‑* with rounded Score  (one‑hot, 10 → 1)
#     Each row gets a 1 in the bucket matching its rounded score and 0
#     elsewhere. Rows whose Score is missing get 0 in every bucket.
HIST = list(range(10,0,-1))

# Keep the rounded score as float: casting to int raises as soon as a single
# Score is NaN, whereas a NaN simply compares unequal to every bucket below.
# NOTE: Series.round uses round-half-to-even, so 6.5 → 6 while 7.5 → 8.
score_round = df["Score"].round().clip(lower=1, upper=10)
for s in HIST:
    df[f"hist_score-{s}"] = score_round.eq(s).astype(int)

# 6️⃣  SELECT the output columns, in schema order:
#     identifiers, then whatever genre_* columns exist (absent ones are simply
#     not included), then the type_* flags, then the hist_score-* buckets.
keep_cols = ["anime_id", "title"]
keep_cols += [name for name in df.columns if name.startswith("genre_")]
keep_cols += [f"type_{kind}" for kind in TYPES]
keep_cols += [f"hist_score-{bucket}" for bucket in HIST]

anime_features = df.loc[:, keep_cols]

# 7️⃣  WRITE the feature table to disk and report where it went.
FEAT_CSV = Path("../models/anime_features.csv")

# Create the target folder (and any missing parents) before writing.
output_dir = FEAT_CSV.parent
output_dir.mkdir(parents=True, exist_ok=True)

anime_features.to_csv(FEAT_CSV, index=False)    # no index column in the file

print("✅  anime_features.csv saved →", FEAT_CSV)
print("Shape:", anime_features.shape)


✅  anime_features.csv saved → ..\models\anime_features.csv
Shape: (13231, 95)
