## Imports & Paths

In [1]:
from  pathlib import Path
import pandas as pd

In [2]:
# Output of this notebook: the cleaned table with one-hot genre columns.
CLEAN_FILE = Path('../models/clean_anime.csv')
# Input of this notebook: the raw anime dataset.
RAW_FILE = Path('../data/anime_dataset.csv')

# Create the output directory (and any missing parents) up front so the final
# write cannot fail on a missing folder; a no-op when it already exists.
CLEAN_FILE.parent.mkdir(parents=True, exist_ok=True)

## Load raw data

In [3]:
# Read the raw dataset into memory and report its size before any cleaning.
df = pd.read_csv(filepath_or_buffer=RAW_FILE)
print("Raw shape:", df.shape)

# Show the first five rows as the cell output.
df.head(n=5)

Raw shape: (13891, 13)


Unnamed: 0,Rank,Title,English Title,URL,Image URL,Score,Users Rated,Episodes,Genres,Studios,Synopsis,Popularity,Ranked
0,1,Frieren: Beyond Journey's End,Frieren: Beyond Journey's End,https://myanimelist.net/anime/52991/Sousou_no_...,https://cdn.myanimelist.net/r/50x70/images/ani...,9.31,621043.0,28,"Adventure, Drama, Fantasy, Shounen",Madhouse,During their decade-long quest to defeat the D...,#157,#1
1,2,Fullmetal Alchemist: Brotherhood,,https://myanimelist.net/anime/5114/Fullmetal_A...,https://cdn.myanimelist.net/r/50x70/images/ani...,9.1,2202757.0,64,"Action, Adventure, Drama, Fantasy, Military, S...",Bones,After a horrific alchemy experiment goes wrong...,#3,#2
2,3,Steins;Gate,,https://myanimelist.net/anime/9253/Steins_Gate,https://cdn.myanimelist.net/r/50x70/images/ani...,9.07,1454070.0,24,"Drama, Sci-Fi, Suspense, Psychological, Time T...",White Fox,Eccentric scientist Rintarou Okabe has a never...,#14,#3
3,4,Attack on Titan Season 3 Part 2,Attack on Titan Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,https://cdn.myanimelist.net/r/50x70/images/ani...,9.05,1678118.0,10,"Action, Drama, Suspense, Gore, Military, Survi...",Wit Studio,Seeking to restore humanity's diminishing hope...,#21,#4
4,5,One Piece Fan Letter,,https://myanimelist.net/anime/60022/One_Piece_...,https://cdn.myanimelist.net/r/50x70/images/ani...,9.05,72811.0,1,"Action, Adventure, Fantasy, Shounen",Toei Animation,Although the golden age of piracy is about to ...,#2281,#5


## Clean & Transform

In [4]:
# Remove the 'English Title' column. errors='ignore' makes this a no-op when
# the column is already gone, so the cell can be re-run safely.
df = df.drop(labels=['English Title'], axis='columns', errors='ignore')

In [5]:
# Keep only the first row for each exact 'Title' value. The surviving rows
# keep their original index labels, so the index has gaps from here on.
df = df.drop_duplicates(subset=['Title'], keep='first')

In [6]:
# Normalized title: surrounding whitespace trimmed and lower-cased.
# astype(str) converts every value to text first, so a missing Title would
# become the literal text 'nan' rather than stay missing.
df['Title_norm'] = (
    df['Title']
    .astype(str)
    .str.strip()
    .str.lower()
)

In [7]:
def parse_genres(genres_cell: str) -> list[str]:
    """Split a comma-separated genre cell into a list of normalized genre names.

    Each piece is stripped of surrounding whitespace and lower-cased; empty
    pieces (for example from a trailing or doubled comma) are dropped.

    Args:
        genres_cell: Raw cell value, e.g. ``"Action, Sci-Fi"``. Non-string
            values are converted with ``str()``; ``None`` and float NaN are
            treated as "no genres".

    Returns:
        The genre names in their original order, e.g. ``["action", "sci-fi"]``;
        an empty list when the cell is missing or blank.
    """
    # A missing cell must not be stringified: str(float("nan")) is "nan" and
    # str(None) is "None", and either would come back as a bogus genre.
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(genres_cell, float) and genres_cell != genres_cell
    if genres_cell is None or is_nan:
        return []

    pieces = (piece.strip() for piece in str(genres_cell).split(","))
    return [piece.lower() for piece in pieces if piece]

In [8]:
# Replace missing genre cells with an empty string so they parse to an empty
# list, then turn every cell into a list of normalized genre names.
genres_text = df['Genres'].fillna("")
df['Genres_list'] = genres_text.apply(parse_genres)

In [9]:
# Spot-check the parsing: raw genre text next to the parsed list, first 3 rows.
preview_columns = ["Title", "Genres", "Genres_list"]
df.loc[:, preview_columns].head(3)

Unnamed: 0,Title,Genres,Genres_list
0,Frieren: Beyond Journey's End,"Adventure, Drama, Fantasy, Shounen","[adventure, drama, fantasy, shounen]"
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Military, S...","[action, adventure, drama, fantasy, military, ..."
2,Steins;Gate,"Drama, Sci-Fi, Suspense, Psychological, Time T...","[drama, sci-fi, suspense, psychological, time ..."


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
# One-hot encode the genre lists: every distinct genre becomes its own 0/1
# column, named after the genre (mlb.classes_).
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genres_list'])

# Reuse df's index so the rows line up with df when the frames are combined;
# df's index has gaps after the duplicate-title rows were dropped.
genre_df = pd.DataFrame(genre_matrix, index=df.index, columns=mlb.classes_)

In [12]:
# Display the one-hot genre table (one row per title, one 0/1 column per genre).
genre_df

Unnamed: 0,action,adult cast,adventure,anthropomorphic,avant garde,award winning,boys love,cgdct,childcare,combat sports,...,survival,suspense,team sports,time travel,urban fantasy,vampire,video game,villainess,visual arts,workplace
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13873,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13874,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13875,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13877,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Append the one-hot genre columns to the main frame (aligned on the index).
# Genre columns left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of appending a duplicate set.
# errors='ignore' makes the drop a no-op on the first run.
df = pd.concat(
    [df.drop(columns=genre_df.columns, errors='ignore'), genre_df],
    axis=1,
)
print("✅ Added one-hot genre columns:", list(mlb.classes_))

✅ Added one-hot genre columns: ['action', 'adult cast', 'adventure', 'anthropomorphic', 'avant garde', 'award winning', 'boys love', 'cgdct', 'childcare', 'combat sports', 'comedy', 'crossdressing', 'delinquents', 'detective', 'drama', 'ecchi', 'educational', 'fantasy', 'gag humor', 'girls love', 'gore', 'gourmet', 'harem', 'high stakes game', 'historical', 'horror', 'idols (female)', 'idols (male)', 'isekai', 'iyashikei', 'josei', 'kids', 'love polygon', 'love status quo', 'magical sex shift', 'mahou shoujo', 'martial arts', 'mecha', 'medical', 'military', 'music', 'mystery', 'mythology', 'organized crime', 'otaku culture', 'parody', 'performing arts', 'pets', 'psychological', 'racing', 'reincarnation', 'reverse harem', 'romance', 'samurai', 'school', 'sci-fi', 'seinen', 'shoujo', 'shounen', 'showbiz', 'slice of life', 'space', 'sports', 'strategy game', 'super power', 'supernatural', 'survival', 'suspense', 'team sports', 'time travel', 'urban fantasy', 'vampire', 'video game', 'vill

In [14]:
# Final (rows, columns) of the cleaned frame, including the added genre columns.
df.shape

(13231, 90)

In [15]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): 'Genres_list' holds Python lists, which CSV stores as their
# text representation; confirm that downstream readers re-parse that column.
df.to_csv(path_or_buf=CLEAN_FILE, index=False)