# Content-based modelling


In [2]:
"""
Feature engineering that produces a tidy DataFrame.
- SBERT embeddings for Synopsis
- One-hot Genres (multi-label safe)
- One-hot Type / Rating
- Scaled numeric columns
"""

from __future__ import annotations
import pandas as pd
import re
import numpy as np
from anime_sensei.constant import *
from anime_sensei.utils.utility import read_file_from_S3, save_data_to_S3
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    Normalizer
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import scipy.sparse as sp

# S3 object key of the cleaned anime description CSV (Data_Cleaning artifacts).
key = "Artifacts/Data_Cleaning/06-16-2025_20-05-59/Cleaned_Anime_Description.csv"
# Load through the project helper; presumably returns a pandas DataFrame
# (the preview below is a frame) — TODO confirm against read_file_from_S3.
df = read_file_from_S3(key = key)
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,anime_id,Name,Score,Genres,Synopsis,Type,Episodes,Rating,Popularity,Favorites,Scored By,Members,Image URL,Duration_mins
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,R - 17+ (violence & profanity),602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...,115
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,PG-13 - Teens 13 or older,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...,24
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,PG-13 - Teens 13 or older,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...,25
4,8,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,PG - Children,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...,23


In [None]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """
    Stateless text scrubber used as the first step of the synopsis branch.

    Replaces HTML-like tags and line breaks with a single space and turns
    missing values into empty strings.
    """

    # Three alternatives, each replaced by one space:
    #   <.*?>   shortest HTML-like tag
    #   \\n     an escaped newline that survived as two characters (backslash + n)
    #   \r?\n   a real line break (LF or CRLF) – the form the synopses actually contain
    _PATTERN = re.compile(r"<.*?>|\\n|\r?\n")

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """
        Parameters
        ----------
        X : 1-D collection of strings (may contain missing values)

        Returns a pandas Series of cleaned strings.
        """
        return pd.Series(X).fillna("").str.replace(self._PATTERN, " ", regex=True)

    def get_feature_names_out(self, input_features=None):
        """Echo the incoming feature names unchanged, as an object array."""
        return np.asarray(input_features, dtype=object)


class GenreOneHot(BaseEstimator, TransformerMixin):
    """
    Turn 'Action, Sci-Fi' → multi-hot columns.
    Output is *dense* ndarray so it merges cleanly with the SBERT matrix.

    Parameters
    ----------
    sep : delimiter between genre labels inside one cell

    Attributes
    ----------
    categories_ : sorted list of distinct, stripped, non-empty labels seen in
                  ``fit``. Deliberately NOT created in ``__init__`` so that
                  ``check_is_fitted`` can tell fitted from unfitted instances.
    """
    def __init__(self, sep: str = ","):
        self.sep = sep

    def _split(self, X) -> pd.Series:
        """Return a Series of raw label lists (missing cell → [''])."""
        return pd.Series(X).fillna("").str.split(self.sep)

    def fit(self, X, y=None):
        """Learn the label vocabulary from X; y is ignored."""
        all_genres = (
            self._split(X)
            .explode()
            .str.strip()
            .loc[lambda s: s != ""]
        )
        self.categories_ = sorted(all_genres.unique())
        return self

    def transform(self, X):
        """
        Encode X as a float32 array of shape (len(X), len(categories_)).

        Labels not seen during ``fit`` are ignored.
        Raises NotFittedError when called before ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "categories_")
        # label → column index, so each row costs only its own label count
        column_of = {g: j for j, g in enumerate(self.categories_)}
        rows = self._split(X)
        dense = np.zeros((len(rows), len(column_of)), dtype=np.float32)
        for i, labels in enumerate(rows):
            for label in labels:
                j = column_of.get(label.strip())
                if j is not None:
                    dense[i, j] = 1.0
        return dense

    def get_feature_names_out(self, input_features=None):
        """Return one ``genre_<label>`` name per learned category."""
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "categories_")
        return np.array([f"genre_{g}" for g in self.categories_], dtype=object)


# ──────────────────────────────────────────────────────────────
# 1-B  SBERT embedder
# ──────────────────────────────────────────────────────────────
class SBERTVectorizer(BaseEstimator, TransformerMixin):
    """
    Embed text with a pretrained Sentence-Transformers model.

    Parameters
    ----------
    model_name        : identifier handed to ``SentenceTransformer``
    batch_size        : forwarded to ``encode``
    show_progress_bar : forwarded to ``encode``

    Attributes
    ----------
    embedding_dim_ : embedding width reported by the loaded model, set in
                     ``fit`` (may be None if the model does not report one)
    """

    # Width used for feature names when the real one is unknown. This is the
    # value the names were previously hard-coded to.
    _FALLBACK_DIM = 384

    def __init__(self,
                 model_name: str = "all-MiniLM-L6-v2",
                 batch_size: int = 64,
                 show_progress_bar: bool = False):
        self.model_name = model_name
        self.batch_size = batch_size
        self.show_progress_bar = show_progress_bar
        self._model: SentenceTransformer | None = None

    def fit(self, X, y=None):
        """Load the model named by ``model_name``; X and y are not used."""
        self._model = SentenceTransformer(self.model_name)
        # Record the true width so feature names follow the chosen model.
        self.embedding_dim_ = self._model.get_sentence_embedding_dimension()
        return self

    def transform(self, X):
        """
        Encode X (1-D collection of strings; missing → "") into a float32
        array with one L2-normalised embedding per row.

        Raises NotFittedError when called before ``fit``.
        """
        if self._model is None:
            # NotFittedError derives from AttributeError, which is what an
            # unfitted call raised before, so existing handlers still match.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "SBERTVectorizer is not fitted yet; call fit() before transform()."
            )
        text = pd.Series(X).fillna("").tolist()
        vecs = self._model.encode(
            text,
            batch_size=self.batch_size,
            show_progress_bar=self.show_progress_bar,
            normalize_embeddings=True
        )
        return np.asarray(vecs, dtype=np.float32)

    def get_feature_names_out(self, input_features=None):
        """Return ``sbert_0 … sbert_{d-1}`` where d is the model's width."""
        dim = getattr(self, "embedding_dim_", None) or self._FALLBACK_DIM
        return np.array([f"sbert_{i}" for i in range(dim)], dtype=object)


# ──────────────────────────────────────────────────────────────
# 1-C  column setup
# ──────────────────────────────────────────────────────────────
# Column routed to the synopsis (text-embedding) branch of the pipeline.
TEXT_COL = "Synopsis"
# Column holding the delimited genre labels, routed to the multi-hot branch.
GENRE_COL = "Genres"
# Single-valued categorical columns, one-hot encoded.
CAT_COLS = ["Type", "Rating"]
# Numeric columns, min-max scaled.
NUM_COLS = [
    "Score",
    "Episodes",
    "Popularity",
    "Favorites",
    "Members",
    "Duration_mins",
]


def make_feature_pipeline() -> ColumnTransformer:
    """
    Assemble the unfitted ColumnTransformer with four branches:
    synopsis embedding, multi-hot genres, one-hot categoricals and
    min-max-scaled numerics. Columns not named in a branch are dropped.
    """
    # text: scrub → embed → row-normalise
    text_steps = [
        ("clean", TextCleaner()),
        ("embed", SBERTVectorizer()),
        ("norm", Normalizer()),
    ]

    branches = [
        ("synopsis", Pipeline(text_steps), TEXT_COL),
        # custom multi-label encoder (one cell can carry several genres)
        ("genres", GenreOneHot(), GENRE_COL),
        ("cats", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT_COLS),
        ("nums", MinMaxScaler(), NUM_COLS),
    ]

    return ColumnTransformer(branches, remainder="drop", sparse_threshold=0.3)


# ──────────────────────────────────────────────────────────────
# 2   public helper – returns DataFrame
# ──────────────────────────────────────────────────────────────
def featurize_dataframe(
    df: pd.DataFrame,
    *,
    pipeline: ColumnTransformer | None = None,
    fit: bool = True,
) -> tuple[pd.DataFrame, ColumnTransformer]:
    """
    Run the feature pipeline over ``df`` and wrap the result in a DataFrame.

    Parameters
    ----------
    df        : raw input rows
    pipeline  : transformer to use; a fresh one is built when omitted
    fit       : True → fit_transform, False → transform only

    Returns  (feature_df, pipeline) — feature_df shares ``df``'s index and is
    labelled with the pipeline's output feature names.
    """
    transformer = make_feature_pipeline() if pipeline is None else pipeline

    if fit:
        matrix = transformer.fit_transform(df)
    else:
        matrix = transformer.transform(df)

    # The transformer may hand back a scipy sparse matrix; densify it.
    dense = matrix.toarray() if sp.issparse(matrix) else matrix

    names = transformer.get_feature_names_out()
    feature_frame = pd.DataFrame(dense, index=df.index, columns=names)
    return feature_frame, transformer


In [7]:
# Build and fit a fresh pipeline on df (no pipeline is passed, fit defaults to
# True); keep both the feature table and the fitted transformer.
feats, pipe = featurize_dataframe(df)
print("Final shape :", feats.shape)
# Peek at the first eight feature columns, rounded for readability.
print("First 8 cols:\n", feats.iloc[:, :8].round(3))

Final shape : (20370, 425)
First 8 cols:
        synopsis__sbert_0  synopsis__sbert_1  synopsis__sbert_2  \
0                 -0.102             -0.027             -0.031   
1                 -0.092              0.064              0.032   
2                  0.019              0.089             -0.042   
3                 -0.047             -0.061             -0.050   
4                 -0.036              0.068             -0.100   
...                  ...                ...                ...   
20365             -0.009              0.012              0.034   
20366             -0.055              0.069              0.049   
20367             -0.057              0.063             -0.010   
20368             -0.039             -0.032              0.009   
20369             -0.054              0.020             -0.000   

       synopsis__sbert_3  synopsis__sbert_4  synopsis__sbert_5  \
0                 -0.000              0.073              0.081   
1                  0.039         

In [8]:
# Rich preview of the feature table (first rows, all branches side by side).
feats.head()

Unnamed: 0,synopsis__sbert_0,synopsis__sbert_1,synopsis__sbert_2,synopsis__sbert_3,synopsis__sbert_4,synopsis__sbert_5,synopsis__sbert_6,synopsis__sbert_7,synopsis__sbert_8,synopsis__sbert_9,...,cats__Rating_PG-13 - Teens 13 or older,cats__Rating_R - 17+ (violence & profanity),cats__Rating_R+ - Mild Nudity,cats__Rating_Rx - Hentai,nums__Score,nums__Episodes,nums__Popularity,nums__Favorites,nums__Members,nums__Duration_mins
0,-0.101728,-0.026759,-0.031014,-0.000403,0.07304,0.080651,0.063086,0.007205,-0.009755,-0.027027,...,0.0,1.0,0.0,0.0,0.951724,0.008505,0.00174,0.360859,0.47309,0.016
1,-0.092297,0.063572,0.031729,0.038844,0.075663,-0.008594,0.046599,0.05149,-0.038646,0.005079,...,0.0,1.0,0.0,0.0,0.90069,0.000327,0.024356,0.006654,0.096401,0.076667
2,0.018597,0.089144,-0.042042,0.016114,0.058198,0.04077,0.143477,0.034467,-0.003781,0.033659,...,1.0,0.0,0.0,0.0,0.878621,0.008505,0.009953,0.069093,0.194217,0.016
3,-0.04702,-0.061326,-0.049626,0.077566,-0.003977,0.020479,0.060575,0.019401,-0.032922,0.064706,...,1.0,0.0,0.0,0.0,0.744828,0.008505,0.072622,0.002817,0.029892,0.016667
4,-0.03577,0.067944,-0.100477,-0.044423,0.034286,0.07692,0.03377,0.010735,-0.011177,-0.001312,...,0.0,0.0,0.0,0.0,0.702069,0.01701,0.207388,6.4e-05,0.004006,0.015333


In [10]:
# Dump every feature name to check the branch prefixes.
# NOTE(review): this prints the full column list; consider slicing it.
print(list(feats.columns))

['synopsis__sbert_0', 'synopsis__sbert_1', 'synopsis__sbert_2', 'synopsis__sbert_3', 'synopsis__sbert_4', 'synopsis__sbert_5', 'synopsis__sbert_6', 'synopsis__sbert_7', 'synopsis__sbert_8', 'synopsis__sbert_9', 'synopsis__sbert_10', 'synopsis__sbert_11', 'synopsis__sbert_12', 'synopsis__sbert_13', 'synopsis__sbert_14', 'synopsis__sbert_15', 'synopsis__sbert_16', 'synopsis__sbert_17', 'synopsis__sbert_18', 'synopsis__sbert_19', 'synopsis__sbert_20', 'synopsis__sbert_21', 'synopsis__sbert_22', 'synopsis__sbert_23', 'synopsis__sbert_24', 'synopsis__sbert_25', 'synopsis__sbert_26', 'synopsis__sbert_27', 'synopsis__sbert_28', 'synopsis__sbert_29', 'synopsis__sbert_30', 'synopsis__sbert_31', 'synopsis__sbert_32', 'synopsis__sbert_33', 'synopsis__sbert_34', 'synopsis__sbert_35', 'synopsis__sbert_36', 'synopsis__sbert_37', 'synopsis__sbert_38', 'synopsis__sbert_39', 'synopsis__sbert_40', 'synopsis__sbert_41', 'synopsis__sbert_42', 'synopsis__sbert_43', 'synopsis__sbert_44', 'synopsis__sbert_45

In [13]:
# keep the columns you need to display later
lookup = df[["anime_id", "Name"]].reset_index(drop=True)
# align index with feats
assert len(lookup) == len(feats)

In [14]:
from sklearn.neighbors import NearestNeighbors
import joblib, pathlib

# Directory that receives the persisted artifact (created if missing).
MODEL_DIR = pathlib.Path("models")
MODEL_DIR.mkdir(exist_ok=True)

# Brute-force cosine nearest-neighbour index over the feature rows.
# n_neighbors=50 is only the default neighbourhood size; a query may ask for
# a different count.
knn = NearestNeighbors(n_neighbors=50, metric="cosine", algorithm="brute")
knn.fit(feats.values)

# Bundle the fitted index, the feature table and the id/name lookup in one file.
artifact = {"knn": knn, "features": feats, "lookup": lookup}
joblib.dump(artifact, MODEL_DIR / "content_knn.joblib")
print("✅ saved → models/content_knn.joblib")

✅ saved → models/content_knn.joblib


In [None]:
import numpy as np
import joblib

# Reload the persisted bundle. joblib.load unpickles the file, so only load
# artifacts from a trusted source.
ART = joblib.load("models/content_knn.joblib")
# KN: fitted neighbour index, F: feature table as an ndarray, LU: anime_id/Name
# lookup. Keys match the dict written by the joblib.dump call earlier on.
KN, F, LU = ART["knn"], ART["features"].values, ART["lookup"]

def recommend(anime_id: int, k: int = 10):
    """
    Return up to ``k`` nearest neighbours of ``anime_id`` in feature space.

    Parameters
    ----------
    anime_id : id to look up in ``LU["anime_id"]``
    k        : number of recommendations wanted

    Returns a copy of the matching ``LU`` rows, closest first, with an extra
    ``similarity`` column equal to ``1 - distance`` (cosine similarity,
    assuming ``KN`` was fitted with metric="cosine" as in the training cell).

    Raises KeyError when ``anime_id`` is not present in ``LU``.
    """
    # 1. locate the row *position* (not the index label): both F and LU.iloc
    #    below are positional, so this stays correct for any LU index.
    hits = np.flatnonzero((LU["anime_id"] == anime_id).to_numpy())
    if hits.size == 0:
        raise KeyError(f"Anime ID {anime_id} not found")
    pos = int(hits[0])

    # 2. query one extra neighbour to leave room for the item itself, but
    #    never more than the index holds (kneighbors rejects larger requests).
    n_query = min(k + 1, KN.n_samples_fit_)
    dist, idxs = KN.kneighbors(F[pos].reshape(1, -1), n_neighbors=n_query)
    dist, idxs = dist[0], idxs[0]

    # 3. drop the query row by identity rather than assuming it is ranked
    #    first: rows with identical features tie at distance 0 in any order.
    keep = idxs != pos
    dist, idxs = dist[keep][:k], idxs[keep][:k]

    recs = LU.iloc[idxs].copy()
    recs["similarity"] = 1 - dist
    return recs



In [24]:
# Spot-check anime_id 21 (shown as "One Piece" in the output below):
# print its title, its synopsis, then its five nearest neighbours.
print(df[df['anime_id'] == 21]['Name'])
print(df[df['anime_id'] == 21]['Synopsis'].values)
print("\n", recommend(anime_id=21, k=5))

11    One Piece
Name: Name, dtype: object
['Gol D. Roger was known as the "Pirate King," the strongest and most infamous being to have sailed the Grand Line. The capture and execution of Roger by the World Government brought a change throughout the world. His last words before his death revealed the existence of the greatest treasure in the world, One Piece. It was this revelation that brought about the Grand Age of Pirates, men who dreamed of finding One Piece—which promises an unlimited amount of riches and fame—and quite possibly the pinnacle of glory and the title of the Pirate King.\n\nEnter Monkey D. Luffy, a 17-year-old boy who defies your standard definition of a pirate. Rather than the popular persona of a wicked, hardened, toothless pirate ransacking villages for fun, Luffy\'s reason for being a pirate is one of pure wonder: the thought of an exciting adventure that leads him to intriguing people and ultimately, the promised treasure. Following in the footsteps of his childho

In [25]:
# Same spot-check for anime_id 11061 (shown as "Hunter x Hunter (2011)" in the
# output below): title, synopsis, then five nearest neighbours.
print(df[df['anime_id'] == 11061]['Name'])
print(df[df['anime_id'] == 11061]['Synopsis'].values)
print("\n", recommend(anime_id=11061, k=5))

6360    Hunter x Hunter (2011)
Name: Name, dtype: object
["Hunters devote themselves to accomplishing hazardous tasks, all from traversing the world's uncharted territories to locating rare items and monsters. Before becoming a Hunter, one must pass the Hunter Examination—a high-risk selection process in which most applicants end up handicapped or worse, deceased.\n\nAmbitious participants who challenge the notorious exam carry their own reason. What drives 12-year-old Gon Freecss is finding Ging, his father and a Hunter himself. Believing that he will meet his father by becoming a Hunter, Gon takes the first step to walk the same path.\n\nDuring the Hunter Examination, Gon befriends the medical student Leorio Paladiknight, the vindictive Kurapika, and ex-assassin Killua Zoldyck. While their motives vastly differ from each other, they band together for a common goal and begin to venture into a perilous world."]

       anime_id                Name  similarity
115        136     Hunter 

In [27]:
# Synopsis texts pasted in by hand for the two titles inspected above
# (One Piece and Hunter x Hunter (2011)), used for a text-only comparison.
ONE_PIECE_SYNOPSIS_STRING = "Gol D. Roger was known as the 'Pirate King,' the strongest and most infamous being to have sailed the Grand Line. The capture and execution of Roger by the World Government brought a change throughout the world. His last words before his death revealed the existence of the greatest treasure in the world, One Piece. It was this revelation that brought about the Grand Age of Pirates, men who dreamed of finding One Piece—which promises an unlimited amount of riches and fame—and quite possibly the pinnacle of glory and the title of the Pirate King.\n\nEnter Monkey D. Luffy, a 17-year-old boy who defies your standard definition of a pirate. Rather than the popular persona of a wicked, hardened, toothless pirate ransacking villages for fun, Luffy\'s reason for being a pirate is one of pure wonder: the thought of an exciting adventure that leads him to intriguing people and ultimately, the promised treasure. Following in the footsteps of his childhood hero, Luffy and his crew travel across the Grand Line, experiencing crazy adventures, unveiling dark mysteries and battling strong enemies, all in order to reach the most coveted of all fortunes—One Piece."
HUNTER_SYNOPSIS_STRING = "Hunters devote themselves to accomplishing hazardous tasks, all from traversing the world's uncharted territories to locating rare items and monsters. Before becoming a Hunter, one must pass the Hunter Examination—a high-risk selection process in which most applicants end up handicapped or worse, deceased.\n\nAmbitious participants who challenge the notorious exam carry their own reason. What drives 12-year-old Gon Freecss is finding Ging, his father and a Hunter himself. Believing that he will meet his father by becoming a Hunter, Gon takes the first step to walk the same path.\n\nDuring the Hunter Examination, Gon befriends the medical student Leorio Paladiknight, the vindictive Kurapika, and ex-assassin Killua Zoldyck. While their motives vastly differ from each other, they band together for a common goal and begin to venture into a perilous world."
from sentence_transformers import SentenceTransformer, util
# Same model name as SBERTVectorizer's default, loaded standalone here.
model = SentenceTransformer("all-MiniLM-L6-v2")

s1 = ONE_PIECE_SYNOPSIS_STRING
s2 = HUNTER_SYNOPSIS_STRING

# Encode both texts in one call; embeddings are L2-normalised.
emb1, emb2 = model.encode([s1, s2], normalize_embeddings=True)
# Cosine similarity of the two synopsis embeddings only (no genre, category
# or numeric features), for comparison with the full-feature similarity.
synopsis_sim = float(util.cos_sim(emb1, emb2))
print(f"Synopsis-only similarity = {synopsis_sim:.3f}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Synopsis-only similarity = 0.273


In [20]:
# Spot-check: five nearest neighbours of anime_id 1735.
print(recommend(anime_id=1735, k=5))

       anime_id                             Name  similarity
10           20                           Naruto    0.958416
12088     34566  Boruto: Naruto Next Generations    0.928029
245         269                           Bleach    0.910462
6360      11061           Hunter x Hunter (2011)    0.893309
131         153                     Juuni Kokuki    0.888856
