 # Anime Popularity Prediction: Data Preprocessing and Feature Engineering

 ## Imports and Setup

In [1]:
import datetime
import re

import numpy as np
import pandas as pd
from tqdm.auto import tqdm  # noqa
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MultiLabelBinarizer, OrdinalEncoder

# Seed NumPy's legacy global RNG once, up front, so a fresh "Restart & Run All" repeats.
np.random.seed(42)  # Set random seed for reproducibility

 ## Data Loading

In [2]:
def load_data(anime_file, character_file):
    """Read the anime and character CSV files.

    Args:
        anime_file: Path (or buffer) of the anime CSV, passed to ``pd.read_csv``.
        character_file: Path (or buffer) of the character CSV.

    Returns:
        A 2-tuple ``(anime_df, character_df)`` of DataFrames, in that order.
    """
    sources = (anime_file, character_file)
    loaded = tuple(pd.read_csv(source) for source in sources)
    return loaded


# Load both raw tables from CSV files under the relative data/ directory.
anime_df, character_df = load_data("data/anime.csv", "data/character.csv")

 ## Data Preprocessing

In [3]:
def select_features(df, features):
    """Return an independent copy of ``df`` restricted to ``features``.

    The selection is copied explicitly because later pipeline steps assign
    new columns onto the result in place. Without the copy, pandas versions
    that track parent frames flag those assignments with
    ``SettingWithCopyWarning``.

    Args:
        df: Source DataFrame.
        features: Column label or list of column labels to keep.

    Returns:
        A new object holding only the requested columns.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    return df[features].copy()


# Columns kept from the anime table; the preprocessing and feature-engineering
# functions below read these columns by name.
anime_features = [
    "anime_id",
    "type",
    "source",
    "episodes",
    "status",
    "aired_from",
    "aired_to",
    "duration",
    "rating",
    "score",
    "scored_by",
    "rank",
    "popularity",
    "members",
    "favorites",
    "synopsis",
    "producers",
    "studios",
    "genres",
    "themes",
]

# Columns kept from the character table. Only anime_id, role and favorites are
# read by engineer_character_features further down.
character_features = [
    "character_id",
    "anime_id",
    "role",
    "favorites",
    "about",
]

# Narrow both frames to the selected columns.
anime_df = select_features(anime_df, anime_features)
character_df = select_features(character_df, character_features)

In [4]:
def handle_missing_values(df):
    """Fill missing values in the basic anime columns, in place.

    Fill rules:
        * ``episodes`` and ``scored_by``: 0
        * ``score``: column mean
        * ``synopsis``: empty string
        * ``producers``: ``"Unknown"``; ``studios``: ``"Unknown Studio"``
        * ``type``: most frequent value (left missing if no value is observed)
        * ``rank``: one past the worst observed rank

    Args:
        df: Anime DataFrame containing the columns listed above. It is
            modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    df["episodes"] = df["episodes"].fillna(0)
    df["score"] = df["score"].fillna(df["score"].mean())
    df["synopsis"] = df["synopsis"].fillna("")
    df["producers"] = df["producers"].fillna("Unknown")

    # mode() is empty when every value is missing; indexing it would raise.
    type_modes = df["type"].mode()
    if not type_modes.empty:
        df["type"] = df["type"].fillna(type_modes.iloc[0])

    df["scored_by"] = df["scored_by"].fillna(0)
    # Unranked titles are placed just after the worst ranked one.
    df["rank"] = df["rank"].fillna(df["rank"].max() + 1)
    df["studios"] = df["studios"].fillna("Unknown Studio")
    return df


# Fill the basic gaps first; later steps read score, episodes and rank.
anime_df = handle_missing_values(anime_df)

In [5]:
def parse_date(date_str):
    """Parse a date-like value into a timezone-naive timestamp (best effort).

    Args:
        date_str: Value to parse, typically a date string.

    Returns:
        The parsed timestamp with any timezone information dropped via
        ``tz_localize(None)``, or ``pd.NaT`` when the value cannot be parsed
        (including ``None``, for which ``pd.to_datetime`` returns ``None``).
    """
    try:
        return pd.to_datetime(date_str).tz_localize(None)
    except Exception:
        # Deliberately best-effort: any parsing failure maps to NaT. Catching
        # Exception rather than using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate, so a long ``.apply`` can be interrupted.
        return pd.NaT


def impute_aired_from(df):
    """Parse ``aired_from`` and impute missing start dates with a model.

    Dates are converted to "days since 1970-01-01" and imputed with an
    ``IterativeImputer`` (random-forest estimator) using ``score``,
    ``members``, ``favorites`` and ``episodes`` as predictors. The result is
    clipped to the range [1900-01-01, now].

    Args:
        df: Anime DataFrame with ``aired_from``, ``score``, ``members``,
            ``favorites`` and ``episodes`` columns. Modified in place.

    Returns:
        The same DataFrame with ``aired_from`` as datetimes and a matching
        ``aired_from_days`` column. If no start date could be parsed at all,
        both columns are left missing.
    """
    df["aired_from"] = df["aired_from"].apply(parse_date)
    reference_date = pd.Timestamp("1970-01-01")
    df["aired_from_days"] = (df["aired_from"] - reference_date).dt.days

    # With no observed dates there is nothing to learn from. The imputer also
    # drops all-missing columns by default, which would make column 0 below
    # refer to ``score`` instead of the dates.
    if not df["aired_from_days"].notna().any():
        return df

    features_for_imputation = ["aired_from_days", "score", "members", "favorites", "episodes"]
    imputation_data = df[features_for_imputation].copy()

    imputer = IterativeImputer(
        # Seed the forest as well: the imputer's own random_state does not
        # make an unseeded estimator deterministic.
        estimator=RandomForestRegressor(random_state=42),
        max_iter=10,
        random_state=42,
        n_nearest_features=5,
    )
    imputed_data = imputer.fit_transform(imputation_data)

    # Column 0 is aired_from_days (first entry of features_for_imputation).
    df["aired_from"] = reference_date + pd.to_timedelta(
        imputed_data[:, 0].round(),
        unit="D",
    )

    min_date = pd.Timestamp("1900-01-01")
    max_date = pd.Timestamp.now()
    df["aired_from"] = df["aired_from"].clip(lower=min_date, upper=max_date)

    # Keep the numeric column consistent with the final dates; previously it
    # still held NaN for exactly the rows that had been imputed.
    df["aired_from_days"] = (df["aired_from"] - reference_date).dt.days

    return df


def impute_aired_to(df):
    """Parse ``aired_to`` and fill in missing end dates.

    Rules:
        * Rows whose ``status`` is ``"Currently Airing"`` get the current
          timestamp as end date, whatever ``aired_to`` held.
        * Other rows with a missing end date get ``aired_from`` plus the
          median observed airing duration, truncated to whole days.
        * If no duration is observed at all, missing end dates stay ``NaT``.

    Args:
        df: Anime DataFrame with ``aired_to``, ``status`` and a datetime
            ``aired_from`` column. Modified in place.

    Returns:
        The same DataFrame with ``aired_to`` as datetimes.
    """
    df["aired_to"] = df["aired_to"].apply(parse_date)

    currently_airing = df["status"] == "Currently Airing"
    needs_fill = df["aired_to"].isna() & ~currently_airing

    if needs_fill.any():
        # Computed once for the whole frame instead of once per missing row.
        typical_duration = (df["aired_to"] - df["aired_from"]).median()
        if pd.notna(typical_duration):
            fallback_end = df["aired_from"] + pd.Timedelta(days=typical_duration.days)
            df["aired_to"] = df["aired_to"].fillna(fallback_end)

    # A single "now" for all running shows keeps the column self-consistent.
    df["aired_to"] = df["aired_to"].mask(currently_airing, pd.Timestamp.now())

    return df


# Order matters: impute_aired_to derives missing end dates from aired_from,
# so the start dates have to be parsed and imputed first.
anime_df = impute_aired_from(anime_df)
anime_df = impute_aired_to(anime_df)

In [6]:
def transform_values(df):
    """Normalise ``rating`` and turn ``genres`` / ``themes`` into label lists.

    * ``rating`` is cast to ``str`` and cut down to its first
      space-separated token.
    * ``genres`` and ``themes`` become lists of lower-cased labels split on
      ``", "``. A missing value yields an empty list rather than a bogus
      ``"nan"`` label, and empty labels are dropped.

    Values that are already lists (e.g. when the cell is re-run) are
    normalised instead of being stringified.

    Args:
        df: Anime DataFrame with ``rating``, ``genres`` and ``themes``
            columns. Modified in place.

    Returns:
        The same DataFrame.
    """

    def _split_labels(value):
        # Already tokenised: just normalise case and drop blanks.
        if isinstance(value, (list, tuple, set)):
            return [str(label).lower().strip() for label in value if str(label).strip()]
        if pd.isna(value):
            return []
        return [label.strip() for label in str(value).lower().split(", ") if label.strip()]

    df["rating"] = df["rating"].astype(str).apply(lambda x: x.split(" ")[0])
    df["genres"] = df["genres"].apply(_split_labels)
    df["themes"] = df["themes"].apply(_split_labels)
    return df


# Shorten rating to its leading token and split genres/themes into lists.
anime_df = transform_values(anime_df)

In [7]:
def encode_categorical_variables(df):
    """Ordinal-encode the categorical columns and add binary indicator flags.

    ``type``, ``source``, ``status`` and ``rating`` are encoded into
    ``<name>_encoded`` columns, a set of ``is_*`` indicator columns is
    derived from the raw values, and the four raw columns are then dropped.

    Args:
        df: Anime DataFrame containing the four categorical columns.

    Returns:
        A new DataFrame with the encoded and indicator columns added and the
        raw categorical columns removed.
    """
    features = ["type", "source", "status", "rating"]
    oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoded = oe.fit_transform(df[features])

    encoded_df = pd.DataFrame(
        encoded,
        columns=[f"{col}_encoded" for col in features],
        # Reuse the caller's index: concat(axis=1) aligns on labels, so a
        # fresh RangeIndex would misalign rows (and add NaN rows) whenever
        # ``df`` does not carry the default 0..n-1 index.
        index=df.index,
    )
    df = pd.concat([df, encoded_df], axis=1)

    df["is_tv"] = (df["type"] == "TV").astype(int)
    df["is_movie"] = (df["type"] == "Movie").astype(int)
    df["is_original"] = (df["source"] == "Original").astype(int)
    df["is_manga"] = (df["source"] == "Manga").astype(int)

    novel_sources = ["Novel", "Light novel", "Visual novel"]
    df["is_novel"] = (df["source"].isin(novel_sources)).astype(int)
    df["is_finished"] = (df["status"] == "Finished Airing").astype(int)
    df["is_adult"] = (df["rating"].isin(["R", "R+", "Rx"])).astype(int)

    df = df.drop(columns=features)
    return df


# Replace type/source/status/rating with encoded columns and is_* flags.
anime_df = encode_categorical_variables(anime_df)

 ## Feature Engineering

In [8]:
def create_date_features(df, current_date=None):
    """Derive age and airing-duration features from the airing dates.

    Adds ``years_since_aired`` (whole days between ``aired_from`` and
    ``current_date``, divided by 365.25) and ``days_aired`` (whole days
    between ``aired_from`` and ``aired_to``), then drops ``aired_to``.
    ``aired_from`` is kept.

    Args:
        df: Anime DataFrame with datetime ``aired_from`` and ``aired_to``
            columns. The new columns are added in place before the drop.
        current_date: Reference "today" used for ``years_since_aired``.
            Defaults to ``datetime.datetime.now()``; pass a fixed value to
            make the feature reproducible across runs.

    Returns:
        A DataFrame with the two new columns and without ``aired_to``.
    """
    if current_date is None:
        current_date = datetime.datetime.now()

    time_since_aired = current_date - df["aired_from"]
    df["years_since_aired"] = time_since_aired.dt.days / 365.25

    airing_duration = df["aired_to"] - df["aired_from"]
    df["days_aired"] = airing_duration.dt.days

    df = df.drop(columns=["aired_to"])
    return df


# Adds years_since_aired and days_aired, then drops aired_to (aired_from stays).
anime_df = create_date_features(anime_df)

In [9]:
def extract_minutes(duration_str, num_episodes):
    """Convert a textual per-episode duration into total runtime in minutes.

    Recognises ``"<n> hr"``, ``"<n> min"`` and ``"<n> sec"`` components,
    e.g. ``"24 min per ep"``, ``"1 hr 30 min"`` or ``"45 sec"``. The first
    occurrence of each unit is used.

    Args:
        duration_str: Duration text; may be missing.
        num_episodes: Number of episodes the per-episode duration is
            multiplied by.

    Returns:
        Per-episode minutes times ``num_episodes``. ``np.nan`` when the
        duration is missing or holds no recognisable time component (for
        example ``"Unknown"``), so unknown runtimes are not mistaken for a
        real runtime of zero.
    """
    if pd.isna(duration_str):
        return np.nan

    text = str(duration_str)
    hours = re.search(r"(\d+)\s*hr", text)
    minutes = re.search(r"(\d+)\s*min", text)
    seconds = re.search(r"(\d+)\s*sec", text)

    if not (hours or minutes or seconds):
        return np.nan

    total_minutes = 0
    if hours:
        total_minutes += int(hours.group(1)) * 60
    if minutes:
        total_minutes += int(minutes.group(1))
    if seconds:
        total_minutes += int(seconds.group(1)) / 60
    return total_minutes * num_episodes


def create_duration_feature(df):
    """Add ``duration_minutes`` and remove the textual ``duration`` column.

    Each row's ``duration`` text and ``episodes`` count are passed to
    ``extract_minutes``; the result is stored in ``duration_minutes``.

    Args:
        df: Anime DataFrame with ``duration`` and ``episodes`` columns. The
            new column is added in place before the drop.

    Returns:
        A DataFrame with ``duration_minutes`` and without ``duration``.
    """

    def _total_runtime(record):
        return extract_minutes(record["duration"], record["episodes"])

    df["duration_minutes"] = df.apply(_total_runtime, axis=1)
    return df.drop(columns=["duration"])


# Replace the textual duration with a numeric total (duration x episodes).
anime_df = create_duration_feature(anime_df)

In [10]:
def create_seasonal_features(df):
    """Add popularity relative to the average of the same airing season.

    The start month is bucketed into four seasons (months 1-3 Winter,
    4-6 Spring, 7-9 Summer, 10-12 Fall). ``popularity_vs_season_avg`` is each
    row's ``popularity`` divided by the mean popularity of its season. The
    helper columns and ``aired_from`` are dropped afterwards.

    Args:
        df: Anime DataFrame with a datetime ``aired_from`` column and a
            ``popularity`` column. New columns are added in place before the
            drop.

    Returns:
        A DataFrame with ``popularity_vs_season_avg`` and without
        ``aired_from``, ``air_month`` and ``air_season``.
    """
    month_edges = [0, 3, 6, 9, 12]
    season_names = ["Winter", "Spring", "Summer", "Fall"]

    df["air_month"] = df["aired_from"].dt.month
    df["air_season"] = pd.cut(
        df["air_month"],
        bins=month_edges,
        labels=season_names,
        include_lowest=True,
    )

    popularity_by_season = df.groupby("air_season", observed=False)["popularity"]
    df["popularity_vs_season_avg"] = df["popularity"] / popularity_by_season.transform("mean")

    helper_columns = ["aired_from", "air_month", "air_season"]
    return df.drop(columns=helper_columns)


# Must run after create_date_features: this step drops aired_from, which that one reads.
anime_df = create_seasonal_features(anime_df)

In [11]:
def apply_svd(encoded_matrix, prefix, max_components=10):
    """Reduce an encoded indicator matrix with truncated SVD.

    Args:
        encoded_matrix: 2-D array-like exposing ``.shape``; one column per
            encoded label.
        prefix: Prefix for the output columns, which are named
            ``<prefix>_svd_<i>``.
        max_components: Upper bound on the number of SVD components. The
            actual number is ``min(n_features - 1, max_components)``.
            Defaults to 10, the previously hard-coded value.

    Returns:
        A DataFrame with one column per component and a default integer
        index, or an empty DataFrame when no component can be extracted
        (fewer than two input columns, or ``max_components <= 0``).
    """
    n_features = encoded_matrix.shape[1]
    n_components = min(n_features - 1, max_components)

    # Guard clause: nothing to reduce.
    if n_components <= 0:
        return pd.DataFrame()

    svd = TruncatedSVD(n_components=n_components, random_state=42)
    reduced = svd.fit_transform(encoded_matrix)
    column_names = [f"{prefix}_svd_{i}" for i in range(n_components)]
    return pd.DataFrame(reduced, columns=column_names)


def create_high_level_categories(row):
    """Build coarse 0/1 category flags from one row's genres and themes.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose ``"genres"``
            and ``"themes"`` entries are iterables of lower-case labels.

    Returns:
        A ``pd.Series`` of ten integer flags (``is_action_adventure`` ...
        ``has_supernatural``), each 1 when a matching label is present.
    """
    genre_set = set(row["genres"])
    theme_set = set(row["themes"])

    def _has_genre(*labels):
        # 1 when at least one of the labels is among the row's genres.
        return int(not genre_set.isdisjoint(labels))

    def _has_theme(*labels):
        # 1 when at least one of the labels is among the row's themes.
        return int(not theme_set.isdisjoint(labels))

    # Supernatural is flagged either by genre or by a related theme.
    supernatural = _has_genre("supernatural") or _has_theme("mythology", "vampire")

    flags = {
        "is_action_adventure": _has_genre("action", "adventure"),
        "is_romance": _has_genre("romance"),
        "is_comedy": _has_genre("comedy"),
        "is_drama": _has_genre("drama"),
        "is_scifi_fantasy": _has_genre("sci-fi", "fantasy"),
        "is_isekai": _has_theme("isekai"),
        "has_harem": _has_theme("harem"),
        "has_school": _has_theme("school"),
        "has_military": _has_theme("military"),
        "has_supernatural": int(supernatural),
    }
    return pd.Series(flags)


def engineer_genre_theme_features(df):
    """Replace the ``genres`` / ``themes`` lists with numeric features.

    Both columns are multi-hot encoded and reduced with ``apply_svd`` into
    ``genre_svd_*`` / ``theme_svd_*`` columns. The flags produced by
    ``create_high_level_categories`` are added as well, and the raw list
    columns are dropped.

    Args:
        df: Anime DataFrame whose ``genres`` and ``themes`` columns hold
            lists of labels.

    Returns:
        A new DataFrame with the SVD and category columns added and without
        ``genres`` and ``themes``.
    """
    mlb_genres = MultiLabelBinarizer()
    mlb_themes = MultiLabelBinarizer()

    genres_encoded = mlb_genres.fit_transform(df["genres"])
    themes_encoded = mlb_themes.fit_transform(df["themes"])

    genres_df = apply_svd(genres_encoded, "genre")
    themes_df = apply_svd(themes_encoded, "theme")

    # The SVD frames come back with a fresh 0..n-1 index. concat(axis=1)
    # aligns on labels, so give them the caller's index to keep rows paired.
    # A frame with no rows means no components were produced; leave it alone.
    for reduced in (genres_df, themes_df):
        if len(reduced) == len(df):
            reduced.index = df.index

    high_level_categories = df.apply(create_high_level_categories, axis=1)

    df = pd.concat([df, genres_df, themes_df, high_level_categories], axis=1)
    df = df.drop(columns=["genres", "themes"])

    return df


# Swap the genres/themes lists for SVD components plus coarse category flags.
anime_df = engineer_genre_theme_features(anime_df)

In [12]:
def calculate_reputation_score(entity, df):
    """Score a studio or producer by the average rank of its titles.

    A title counts for ``entity`` when the name occurs (case-insensitively,
    as a literal substring) in either its ``studios`` or its ``producers``
    text.

    Args:
        entity: Studio or producer name. Matched literally, so characters
            such as ``+``, ``(`` or ``.`` in the name are not interpreted as
            regular-expression syntax.
        df: DataFrame with ``studios``, ``producers`` and ``rank`` columns.

    Returns:
        ``1 / (average_rank + 1)`` over the matching titles, or 0 when no
        title matches or none of the matches has a rank.
    """
    credited = (
        df[["studios", "producers"]]
        .apply(lambda col: col.str.contains(entity, case=False, na=False, regex=False))
        .any(axis=1)
    )
    avg_rank = df.loc[credited, "rank"].mean()
    return 1 / (avg_rank + 1) if pd.notna(avg_rank) else 0


def calculate_mean_reputation(entities, reputation_scores):
    """Average the reputation scores of a comma-separated list of names.

    Args:
        entities: Comma-separated names, or a missing value.
        reputation_scores: Mapping from name to score; names absent from the
            mapping count as 0.

    Returns:
        The mean score of the listed names, or 0 when ``entities`` is
        missing.
    """
    if not pd.notna(entities):
        return 0

    names = [part.strip() for part in str(entities).split(",")]
    scores = [reputation_scores.get(name, 0) for name in names]
    return np.mean(scores)


def create_reputation_scores(df):
    """Add ``studios_reputation`` / ``producers_reputation`` columns.

    For each of the two columns, every distinct name (split on ``", "``) is
    scored with ``calculate_reputation_score``. Each row then receives the
    mean score of the names it lists, and both text columns are dropped.

    Args:
        df: Anime DataFrame with ``studios``, ``producers`` and ``rank``
            columns. The reputation columns are added in place before the
            drop.

    Returns:
        A DataFrame with the two reputation columns and without ``studios``
        and ``producers``.
    """
    entity_columns = ["studios", "producers"]

    for column in entity_columns:
        distinct_names = df[column].str.split(", ").explode().unique()

        reputation_scores = {}
        for name in distinct_names:
            if pd.notna(name):
                reputation_scores[name] = calculate_reputation_score(name, df)

        df[f"{column}_reputation"] = df[column].apply(
            calculate_mean_reputation,
            args=(reputation_scores,),
        )

    return df.drop(columns=entity_columns)


# Turn the studios/producers text into rank-based reputation scores.
anime_df = create_reputation_scores(anime_df)

In [13]:
def engineer_engagement_features(df):
    """Add ratio and interaction features built from audience metrics.

    New columns:
        * ``score_to_scored_by_ratio``: ``score / (scored_by + 1)``
        * ``favorites_to_members_ratio``: ``favorites / (members + 1)``
        * ``score_popularity_interaction``: ``score * log1p(popularity)``

    The ``+ 1`` in the denominators avoids division by zero.

    Args:
        df: Anime DataFrame with ``score``, ``scored_by``, ``favorites``,
            ``members`` and ``popularity`` columns. Modified in place.

    Returns:
        The same DataFrame object.
    """
    score = df["score"]
    vote_count = df["scored_by"] + 1
    member_count = df["members"] + 1
    log_popularity = np.log1p(df["popularity"])

    df["score_to_scored_by_ratio"] = score / vote_count
    df["favorites_to_members_ratio"] = df["favorites"] / member_count
    df["score_popularity_interaction"] = score * log_popularity
    return df


# Add engagement ratios and the score x log(popularity) interaction.
anime_df = engineer_engagement_features(anime_df)

In [14]:
def engineer_character_features(anime_df, character_df):
    """Aggregate per-character data and left-join it onto the anime table.

    Per ``anime_id`` the following columns are computed:
        * ``main_character_count``: number of characters with role ``"Main"``
        * ``avg_char_favorites`` / ``max_char_favorites`` /
          ``std_char_favorites``: mean, max and standard deviation of
          ``favorites``
        * ``character_diversity``: ``std_char_favorites / avg_char_favorites``

    Args:
        anime_df: Anime DataFrame with an ``anime_id`` column.
        character_df: Character DataFrame with ``anime_id``, ``role`` and
            ``favorites`` columns. It is not modified.

    Returns:
        ``anime_df`` left-merged with the aggregated columns. Titles with no
        characters get missing values in the new columns.
    """
    # assign() works on a copy, so no helper column leaks into the caller's
    # character table.
    flagged = character_df.assign(is_main=(character_df["role"] == "Main").astype(int))

    # Named aggregation ties each output name to its source column and
    # statistic, instead of renaming a flattened column list by position.
    char_features = (
        flagged.groupby("anime_id")
        .agg(
            main_character_count=("is_main", "sum"),
            avg_char_favorites=("favorites", "mean"),
            max_char_favorites=("favorites", "max"),
            std_char_favorites=("favorites", "std"),
        )
        .reset_index()
    )

    char_features["character_diversity"] = char_features["std_char_favorites"] / char_features["avg_char_favorites"]

    anime_df = pd.merge(anime_df, char_features, on="anime_id", how="left")
    return anime_df


# Left-merge the per-anime character aggregates on anime_id.
anime_df = engineer_character_features(anime_df, character_df)

In [15]:
def create_synopsis_embeddings(df):
    """Replace ``synopsis`` with sentence-embedding columns.

    Each synopsis is encoded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model. One
    ``synopsis_emb_<i>`` column is added per embedding dimension and the
    text column is dropped.

    Args:
        df: Anime DataFrame with a ``synopsis`` column; missing texts are
            encoded as the empty string.

    Returns:
        A new DataFrame with the embedding columns and without ``synopsis``.
    """
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Pass a plain list: indexing a Series with integers is label-based, so
    # the encoder could pick the wrong rows on a non-default index.
    texts = df["synopsis"].fillna("").astype(str).tolist()
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
    )

    synopsis_df = pd.DataFrame(
        embeddings,
        columns=[f"synopsis_emb_{i}" for i in range(embeddings.shape[1])],
        # Align with the caller's rows; concat(axis=1) matches on labels.
        index=df.index,
    )

    df = pd.concat([df, synopsis_df], axis=1)
    df = df.drop(columns=["synopsis"])

    return df


# Encode every synopsis with the sentence-transformer model (shows a progress bar).
anime_df = create_synopsis_embeddings(anime_df)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

 ## Final Dataset

In [16]:
# Quick sanity check of the engineered table: its shape and the full column list.
print(f"Final dataset shape: {anime_df.shape}")
print("\nColumns in the final dataset:")
print(anime_df.columns.tolist())

Final dataset shape: (400, 448)

Columns in the final dataset:
['anime_id', 'episodes', 'score', 'scored_by', 'rank', 'popularity', 'members', 'favorites', 'aired_from_days', 'type_encoded', 'source_encoded', 'status_encoded', 'rating_encoded', 'is_tv', 'is_movie', 'is_original', 'is_manga', 'is_novel', 'is_finished', 'is_adult', 'years_since_aired', 'days_aired', 'duration_minutes', 'popularity_vs_season_avg', 'genre_svd_0', 'genre_svd_1', 'genre_svd_2', 'genre_svd_3', 'genre_svd_4', 'genre_svd_5', 'genre_svd_6', 'genre_svd_7', 'genre_svd_8', 'genre_svd_9', 'theme_svd_0', 'theme_svd_1', 'theme_svd_2', 'theme_svd_3', 'theme_svd_4', 'theme_svd_5', 'theme_svd_6', 'theme_svd_7', 'theme_svd_8', 'theme_svd_9', 'is_action_adventure', 'is_romance', 'is_comedy', 'is_drama', 'is_scifi_fantasy', 'is_isekai', 'has_harem', 'has_school', 'has_military', 'has_supernatural', 'studios_reputation', 'producers_reputation', 'score_to_scored_by_ratio', 'favorites_to_members_ratio', 'score_popularity_inter

In [17]:
# Write the processed table to the working directory, without the index column.
anime_df.to_csv("processed_anime_data.csv", index=False)
print("Processed dataset saved to 'processed_anime_data.csv'")

Processed dataset saved to 'processed_anime_data.csv'
