 # Anime Popularity Prediction: Data Preprocessing and Feature Engineering

 ## Imports and Setup

In [1]:
import json
import re

import numpy as np
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm  # noqa

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable across runs.
np.random.seed(42)

 ## Data Loading

In [3]:
# Load the raw anime table from data/anime.csv (path is relative to the working directory).
anime_df = pd.read_csv("data/anime.csv")

 ## Data Preprocessing

### Feature Selection

In [4]:
# Columns of the raw table that the rest of the notebook works with;
# every other column is discarded by the selection below.
anime_features = [
    "anime_id",
    "type",
    "source",
    "episodes",
    "status",
    "aired_from",
    "aired_to",
    "duration",
    "rating",
    "score",
    "scored_by",
    "rank",
    "popularity",
    "members",
    "favorites",
    "synopsis",
    "producers",
    "studios",
    "genres",
    "themes",
]

# Keep only the selected columns, in the order listed above.
anime_df = anime_df[anime_features]

In [5]:
# Drop titles whose status is "Not yet aired"; rows with a missing status are kept.
anime_df = anime_df[anime_df["status"] != "Not yet aired"]

### Feature Transformation

In [6]:
def parse_date(date_str):
    """Parse a single date value into a timezone-naive timestamp.

    Args:
        date_str: Value to parse (typically an ISO-like date string).

    Returns:
        The parsed timestamp with any timezone information removed (the
        local wall-clock time is kept), or ``pd.NaT`` when the value is
        missing or cannot be parsed.
    """
    try:
        return pd.to_datetime(date_str, errors="coerce").tz_localize(None)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError covers results without ``tz_localize`` (e.g. a
        # ``None`` result); the others cover values pandas cannot convert.
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        return pd.NaT


def split_and_lower(x, separator=", "):
    """Lower-case a delimited string and split it into a list.

    Args:
        x: Scalar value to split; it is converted with ``str`` first.
        separator: Delimiter between entries.

    Returns:
        The list of lower-cased entries, or an empty list when ``x`` is
        missing (``None`` / NaN).
    """
    if pd.isna(x):
        return []
    lowered = str(x).lower()
    return lowered.split(separator)


def transform_values(df):
    """Normalise the date, list-like and rating columns of the anime table.

    The frame is modified in place and also returned.

    Args:
        df: Frame containing ``aired_from``, ``aired_to``, ``genres``,
            ``themes``, ``studios``, ``producers`` and ``rating``.

    Returns:
        The same frame with the columns above transformed.
    """
    # Dates: every cell is parsed individually by ``parse_date``.
    date_columns = ["aired_from", "aired_to"]
    df[date_columns] = df[date_columns].map(parse_date)

    # Delimited text -> list of lower-cased entries (empty list when missing).
    for list_col in ("genres", "themes", "studios", "producers"):
        df[list_col] = df[list_col].apply(split_and_lower)

    # Keep only the first whitespace-delimited token of the rating.
    # Because of the ``astype(str)`` cast, a missing rating becomes "nan".
    rating_text = df["rating"].astype(str)
    df["rating"] = rating_text.str.split().str[0]

    return df


anime_df = transform_values(anime_df)

In [7]:
def clean_text(text, nlp):
    """Reduce a text to its space-joined lemmas.

    The text is lower-cased and passed through ``nlp``; stop words,
    punctuation and tokens whose lemma is empty or whitespace-only are
    dropped.

    Args:
        text: Raw text to clean.
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``lemma_``, ``is_stop`` and ``is_punct``.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        if tok.lemma_.strip():
            kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)


def clean_synopsis(df):
    """Replace each synopsis with its cleaned, lemmatised form.

    Loads the ``en_core_web_sm`` spaCy pipeline, registers a tqdm progress
    bar for pandas, and rewrites the ``synopsis`` column in place.

    Args:
        df: Frame with a ``synopsis`` column.

    Returns:
        The same frame; missing synopses become empty strings.
    """
    nlp = spacy.load("en_core_web_sm")
    tqdm.pandas(desc="Cleaning synopsis")

    def _clean_or_blank(raw):
        # Missing synopses are mapped to "" instead of being sent to spaCy.
        if pd.notna(raw):
            return clean_text(raw, nlp)
        return ""

    df["synopsis"] = df["synopsis"].progress_apply(_clean_or_blank)
    return df


anime_df = clean_synopsis(anime_df)

Cleaning synopsis:   0%|          | 0/361 [00:00<?, ?it/s]

### Data Imputation

In [8]:
def handle_missing_values(df):
    """Fill missing values in the numeric and type columns.

    The frame is modified in place and also returned.

    Fill rules:
        * ``episodes`` and ``scored_by``: 0
        * ``score``: mean of the observed scores
        * ``type``: most frequent observed type (left untouched when no type
          is observed at all)
        * ``rank``: one past the largest observed rank

    Args:
        df: Frame with ``episodes``, ``scored_by``, ``score``, ``type`` and
            ``rank`` columns.

    Returns:
        The same frame with the fills applied.
    """
    # TODO: Date imputation

    df["episodes"] = df["episodes"].fillna(0)
    df["scored_by"] = df["scored_by"].fillna(0)
    df["score"] = df["score"].fillna(df["score"].mean())

    # ``mode()`` is empty when the column has no non-null values; indexing it
    # with [0] would raise, so only fill when a mode actually exists.
    type_modes = df["type"].mode()
    if not type_modes.empty:
        df["type"] = df["type"].fillna(type_modes.iloc[0])

    df["rank"] = df["rank"].fillna(df["rank"].max() + 1)
    return df


# Apply the missing-value fills to the working frame.
anime_df = handle_missing_values(anime_df)

In [9]:
# Show the column names of the working frame at this point in the notebook.
print(anime_df.columns.to_list())

['anime_id', 'type', 'source', 'episodes', 'status', 'aired_from', 'aired_to', 'duration', 'rating', 'score', 'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis', 'producers', 'studios', 'genres', 'themes']


 ## Feature Engineering

### Temporal Features

In [10]:
def create_age_bucket(df, current_date=None):
    """Add an ordinal ``age_bucket`` column derived from ``aired_from``.

    Age is measured in years (days / 365.25) between ``aired_from`` and
    ``current_date`` and binned as:
    0: [0, 1], 1: (1, 5], 2: (5, 10], 3: (10, 20], 4: (20, inf).
    Rows with a missing or future ``aired_from`` get a missing bucket.

    Args:
        df: Frame with a datetime ``aired_from`` column. A temporary ``age``
            column and the new ``age_bucket`` column are written to it.
        current_date: Reference timestamp used to compute the age. Defaults
            to the current time; pass a fixed value for reproducible output.

    Returns:
        A new frame with ``age_bucket`` added and the temporary ``age``
        column removed.
    """
    if current_date is None:
        current_date = pd.Timestamp.now()

    df["age"] = (current_date - df["aired_from"]).dt.days / 365.25
    df["age_bucket"] = pd.cut(
        df["age"],
        bins=[0, 1, 5, 10, 20, float("inf")],
        labels=[0, 1, 2, 3, 4],
        # Without this, an age of exactly 0 falls outside the first
        # (left-open) interval and would be left without a bucket.
        include_lowest=True,
    )
    return df.drop(columns=["age"])


def create_status_features(df):
    """Derive ``is_ongoing`` and ``days_aired`` and drop ``status``.

    ``days_aired`` is the number of days between ``aired_from`` and
    ``aired_to``; for titles that are currently airing it is instead the
    number of days between ``aired_from`` and now.

    Args:
        df: Frame with ``status``, ``aired_from`` and ``aired_to``; the two
            new columns are written to it.

    Returns:
        A new frame with the new columns and without ``status``.
    """
    ongoing = df["status"] == "Currently Airing"
    df["is_ongoing"] = ongoing.astype(int)

    airing_span = df["aired_to"] - df["aired_from"]
    df["days_aired"] = airing_span.dt.days

    # Ongoing titles: measure from the start date up to the present moment.
    still_running = df["is_ongoing"] == 1
    elapsed = pd.Timestamp.now() - df.loc[still_running, "aired_from"]
    df.loc[still_running, "days_aired"] = elapsed.dt.days

    return df.drop(columns=["status"])


def create_season_encoding(df):
    """Encode the start month of each title as a point on the unit circle.

    Adds ``season_sin`` and ``season_cos``, the sine and cosine of
    ``2 * pi * month / 12`` where ``month`` is taken from ``aired_from``.

    Args:
        df: Frame with a datetime ``aired_from`` column; a temporary
            ``month`` column and the two new columns are written to it.

    Returns:
        A new frame with the two encodings and without ``month``.
    """
    df["month"] = df["aired_from"].dt.month

    # Angle of the month on a 12-step cycle.
    angle = 2 * np.pi * df["month"] / 12
    df["season_sin"] = np.sin(angle)
    df["season_cos"] = np.cos(angle)

    return df.drop(columns=["month"])


def extract_minutes(duration_str):
    """Convert a textual duration such as ``"1 hr 30 min"`` to minutes.

    The first ``<n> hr``, ``<n> min`` and ``<n> sec`` components found in the
    text are added together.

    Args:
        duration_str: Duration text, or a missing value.

    Returns:
        Total minutes (an ``int`` when only hours/minutes are present, a
        ``float`` when seconds contribute), or NaN when the value is missing
        or contains no recognisable component.
    """
    if pd.isna(duration_str):
        return np.nan

    text = str(duration_str)
    hours = re.search(r"(\d+)\s*hr", text)
    minutes = re.search(r"(\d+)\s*min", text)
    seconds = re.search(r"(\d+)\s*sec", text)

    # Nothing parseable (e.g. "Unknown"): report it as missing rather than
    # as a real zero-minute duration.
    if not (hours or minutes or seconds):
        return np.nan

    total_minutes = 0
    if hours:
        total_minutes += int(hours.group(1)) * 60
    if minutes:
        total_minutes += int(minutes.group(1))
    if seconds:
        total_minutes += int(seconds.group(1)) / 60
    return total_minutes


def create_runtime_density(df):
    """Add ``runtime_density``: total runtime divided by the airing span in days.

    Total runtime is the per-episode duration (parsed from ``duration``)
    multiplied by ``episodes``. An airing span of 0 days is treated as 1 day
    so the division is defined.

    Args:
        df: Frame with ``duration``, ``episodes``, ``aired_from`` and
            ``aired_to``; helper columns and the result are written to it.

    Returns:
        A new frame with ``runtime_density`` and without the helper columns.
    """
    helper_columns = ["episode_duration", "total_runtime", "airing_days"]

    df["episode_duration"] = df["duration"].apply(extract_minutes)
    df["total_runtime"] = df["episode_duration"] * df["episodes"]

    airing_span = df["aired_to"] - df["aired_from"]
    df["airing_days"] = airing_span.dt.days

    # Avoid division by zero for titles that start and end on the same day.
    safe_days = df["airing_days"].replace(0, 1)
    df["runtime_density"] = df["total_runtime"] / safe_days

    return df.drop(columns=helper_columns)


def create_temporal_features(df):
    """Run every temporal feature step, in order, on the frame.

    Steps: age bucket, status features, season encoding, runtime density.

    Args:
        df: Frame accepted by the individual steps.

    Returns:
        The frame produced by the last step.
    """
    steps = (
        create_age_bucket,
        create_status_features,
        create_season_encoding,
        create_runtime_density,
    )
    for step in steps:
        df = step(df)
    return df


anime_df = create_temporal_features(anime_df)

### Categorical Features

In [11]:
def create_boolean_features(df):
    """Turn ``type``, ``source`` and ``rating`` into 0/1 indicator columns.

    Adds ``is_tv``, ``is_movie``, ``is_original``, ``is_manga``, ``is_novel``
    and ``is_adult`` (in that order) and drops the three source columns.

    Args:
        df: Frame with ``type``, ``source`` and ``rating``; the indicator
            columns are written to it.

    Returns:
        A new frame with the indicators and without the source columns.
    """
    # Indicators that test a column against a single value.
    exact_matches = {
        "is_tv": ("type", "TV"),
        "is_movie": ("type", "Movie"),
        "is_original": ("source", "Original"),
        "is_manga": ("source", "Manga"),
    }
    for flag, (column, value) in exact_matches.items():
        df[flag] = (df[column] == value).astype(int)

    # Indicators that test a column against a group of values.
    group_matches = {
        "is_novel": ("source", ["Novel", "Light novel", "Visual novel"]),
        "is_adult": ("rating", ["R", "R+", "Rx"]),
    }
    for flag, (column, values) in group_matches.items():
        df[flag] = df[column].isin(values).astype(int)

    return df.drop(columns=["type", "source", "rating"])


# Replace type/source/rating in the working frame with their 0/1 indicator columns.
anime_df = create_boolean_features(anime_df)

In [12]:
def load_category_mappings(file_path):
    """Load the category mapping from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (presumably a dict of category name ->
        list of genre/theme names, as consumed by ``map_categories``).
    """
    # JSON text is UTF-8; stating it avoids depending on the platform's
    # default encoding when the file contains non-ASCII names.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


def map_categories(genres_and_themes, mapping):
    """Map genre/theme names to the broader categories that contain them.

    The item ``"isekai"`` is never used for matching.

    Args:
        genres_and_themes: Iterable of genre/theme names.
        mapping: Dict of category name -> collection of genre/theme names.

    Returns:
        The matching category names, each once, in the order they appear in
        ``mapping``; ``["Other"]`` when nothing matches.
    """
    candidates = [item for item in genres_and_themes if item != "isekai"]

    # Walk the mapping (insertion-ordered) instead of collecting into a set,
    # so the result order is the same on every run.
    matched = [
        category
        for category, genre_list in mapping.items()
        if any(item in genre_list for item in candidates)
    ]
    return matched if matched else ["Other"]


def create_categories(df, mappings):
    """Add one 0/1 indicator column per category and drop genres/themes.

    Each row's ``genres`` and ``themes`` lists are concatenated and mapped to
    categories with ``map_categories``; for every key of ``mappings`` a
    column ``is_<lower-cased key>`` records whether the row belongs to it.

    Args:
        df: Frame whose ``genres`` and ``themes`` columns hold lists; a
            temporary ``categories`` column and the indicators are written
            to it.
        mappings: Dict of category name -> collection of genre/theme names.

    Returns:
        A new frame with the indicator columns and without ``genres``,
        ``themes`` and the temporary ``categories`` column.
    """
    df["categories"] = df["genres"] + df["themes"]
    df["categories"] = df["categories"].apply(lambda x: map_categories(x, mappings))

    # Iterate the dict directly (insertion order) rather than a set of its
    # keys, so the indicator columns come out in the same order on every run.
    for category in mappings:
        df[f"is_{category.lower()}"] = df["categories"].apply(
            lambda x, category=category: int(category in x)
        )

    return df.drop(columns=["genres", "themes", "categories"])


category_mappings = load_category_mappings("data/category_mappings.json")
anime_df = create_categories(anime_df, category_mappings)

In [13]:
def create_synopsis_embeddings(df, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Replace the ``synopsis`` text column with sentence-embedding columns.

    Args:
        df: Frame with a ``synopsis`` column of strings.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A new frame, re-indexed from 0, in which ``synopsis`` is replaced by
        ``synopsis_emb_0`` ... ``synopsis_emb_{d-1}``, where ``d`` is the
        number of columns of the embedding matrix.
    """
    # A fresh 0..n-1 index lets the embedding rows line up in the concat below.
    df = df.reset_index(drop=True)
    model = SentenceTransformer(model_name)

    # ``encode`` is documented for a string or a list of strings, so hand it
    # a plain list instead of relying on a pandas Series being accepted.
    sentences = df["synopsis"].tolist()
    embeddings = model.encode(sentences, show_progress_bar=True)

    synopsis_df = pd.DataFrame(
        embeddings,
        columns=[f"synopsis_emb_{i}" for i in range(embeddings.shape[1])],
    )

    df = pd.concat([df, synopsis_df], axis=1)
    return df.drop(columns=["synopsis"])


anime_df = create_synopsis_embeddings(anime_df)

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

 ## Final Dataset

In [14]:
# Summarise the engineered dataset: its shape, then every column name.
print(f"Final dataset shape: {anime_df.shape}")
print("\nColumns in the final dataset:")
print(anime_df.columns.tolist())

Final dataset shape: (361, 428)

Columns in the final dataset:
['anime_id', 'episodes', 'aired_from', 'aired_to', 'duration', 'score', 'scored_by', 'rank', 'popularity', 'members', 'favorites', 'producers', 'studios', 'age_bucket', 'is_ongoing', 'days_aired', 'season_sin', 'season_cos', 'runtime_density', 'is_tv', 'is_movie', 'is_original', 'is_manga', 'is_novel', 'is_adult', 'is_comedy', 'is_game', 'is_fantasy', 'is_sciencefiction', 'is_artistic style', 'is_setting', 'is_historical', 'is_action', 'is_drama', 'is_sports', 'is_lifestyle', 'is_professional', 'is_adventure', 'is_thematic', 'is_demographic', 'is_relationship', 'is_ecchi', 'is_psychological', 'is_supernatural', 'synopsis_emb_0', 'synopsis_emb_1', 'synopsis_emb_2', 'synopsis_emb_3', 'synopsis_emb_4', 'synopsis_emb_5', 'synopsis_emb_6', 'synopsis_emb_7', 'synopsis_emb_8', 'synopsis_emb_9', 'synopsis_emb_10', 'synopsis_emb_11', 'synopsis_emb_12', 'synopsis_emb_13', 'synopsis_emb_14', 'synopsis_emb_15', 'synopsis_emb_16', 'syno

In [15]:
# Write the engineered dataset to CSV without the DataFrame index, then confirm the output file name.
anime_df.to_csv("processed_anime_data.csv", index=False)
print("Processed dataset saved to 'processed_anime_data.csv'")

Processed dataset saved to 'processed_anime_data.csv'
