In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from anime_sensei.constant import *
from anime_sensei.utils.utility import parse_duration_to_minutes
# Populate the process environment (via python-dotenv) before the Kaggle
# credentials are looked up below.
load_dotenv()

# Either value is None when the corresponding variable is not set.
kaggle_username, kaggle_key = (
    os.environ.get(var_name) for var_name in ("KAGGLE_USERNAME", "KAGGLE_KEY")
)

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# CSV file to read from the Kaggle dataset, and the dataset's handle.
file_path = "anime-dataset-2023.csv"
dataset_handle = "dbdmobile/myanimelist-dataset"

# The handle carries no version suffix, so the latest version is loaded.
# The PANDAS adapter returns the file as a DataFrame.
anime = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

In [3]:
# Preview the first five rows of the raw anime table.
anime.head(n=5)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


## Data Exploration and Transformation

### 1. Anime dataset

In [4]:
# List every column label in the anime table.
anime.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [5]:
# Per-column count of missing (NaN/None) values in the raw table.
anime.isna().sum()

anime_id        0
Name            0
English name    0
Other name      0
Score           0
Genres          0
Synopsis        0
Type            0
Episodes        0
Aired           0
Premiered       0
Status          0
Producers       0
Licensors       0
Studios         0
Source          0
Duration        0
Rating          0
Rank            0
Popularity      0
Favorites       0
Scored By       0
Members         0
Image URL       0
dtype: int64

In [6]:
# Per-column flag: does any cell hold the literal string 'UNKNOWN'?
has_unknown = anime.isin(['UNKNOWN']).any(axis=0)

# Keep only the labels of the flagged columns and show them.
columns_with_unknown = anime.columns[has_unknown]
print(columns_with_unknown)

Index(['English name', 'Other name', 'Score', 'Genres', 'Type', 'Episodes',
       'Premiered', 'Producers', 'Licensors', 'Studios', 'Rating', 'Rank',
       'Scored By'],
      dtype='object')


In [7]:
# Turn the textual placeholders into real missing values so that
# isnull()/dropna()/fillna() can see them.
#
# Results are assigned back to the frame rather than calling
# `anime[col].replace(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which warns on pandas 2.x and leaves the
# frame untouched under Copy-on-Write (the default from pandas 3).
anime = anime.replace("UNKNOWN", np.nan)

# This fixed sentence stands in for a missing synopsis.
anime['Synopsis'] = anime['Synopsis'].replace(
    "No description available for this anime.", np.nan
)

# A missing 'Scored By' value is stored as 0.
# NOTE(review): the column still shows as object dtype in the anime.info()
# output further down — convert to a numeric dtype if it is used numerically.
anime['Scored By'] = anime['Scored By'].fillna(0)

In [8]:
# Missing values per column once the placeholders have been converted to NaN.
anime.isna().sum()

anime_id            0
Name                0
English name    14577
Other name        128
Score            9213
Genres           4929
Synopsis         4535
Type               74
Episodes          611
Aired               0
Premiered       19399
Status              0
Producers       13350
Licensors       20170
Studios         10526
Source              0
Duration            0
Rating            669
Rank             4612
Popularity          0
Favorites           0
Scored By           0
Members             0
Image URL           0
dtype: int64

In [9]:
# Columns removed from the working frame.
columns_to_drop = [
    'English name', 'Other name', 'Premiered', 'Producers', 'Licensors',
    'Studios', 'Source', 'Aired', 'Status', 'Rank',
]
anime.drop(columns=columns_to_drop, inplace=True)

# Discard every row whose synopsis is missing.
anime.dropna(subset=['Synopsis'], inplace=True)

In [10]:
# Remaining missing values per column after the column and row drops.
anime.isna().sum()

anime_id         0
Name             0
Score         5145
Genres        3262
Synopsis         0
Type            49
Episodes       484
Duration         0
Rating         532
Popularity       0
Favorites        0
Scored By        0
Members          0
Image URL        0
dtype: int64

In [11]:
# --- Score: fill gaps with the mean of the known scores (2 decimals) ---------
# NaN never compares equal to anything, itself included, so the earlier
# `anime['Score'] != np.nan` mask was True for every row and filtered nothing.
# Casting to float and relying on mean() skipping NaN gives the intended result.
average_rating = anime['Score'].astype('float')
mean = round(average_rating.mean(), 2)
anime['Score'] = average_rating.fillna(mean).astype('float64')

# Every fill below is assigned back to the frame. An in-place replace on a
# column selection (`anime[col].replace(..., inplace=True)`) is chained
# assignment: it warns on pandas 2.x and is a no-op under Copy-on-Write.

# --- Episodes: a missing episode count is stored as 0.0 ----------------------
anime['Episodes'] = anime['Episodes'].fillna(0.0).astype('float64')

# --- Type / Genres: label missing categories explicitly ----------------------
anime['Type'] = anime['Type'].fillna("UNKNOWN")
anime['Genres'] = anime['Genres'].fillna("UNKNOWN")

# --- Rating: fill gaps with the most frequent rating -------------------------
mode_ratings = anime['Rating'].value_counts().idxmax()
anime['Rating'] = anime['Rating'].fillna(mode_ratings)

# --- Duration: replace the free-text column with a minutes column ------------
# parse_duration_to_minutes is a project helper; the outputs below show e.g.
# "1 hr 55 min" becoming 115.
anime['Duration_mins'] = anime['Duration'].apply(parse_duration_to_minutes)
anime = anime.drop(columns=['Duration'])

In [12]:
# Confirm that no missing values remain after imputation.
anime.isna().sum()

anime_id         0
Name             0
Score            0
Genres           0
Synopsis         0
Type             0
Episodes         0
Rating           0
Popularity       0
Favorites        0
Scored By        0
Members          0
Image URL        0
Duration_mins    0
dtype: int64

In [13]:
# Print row count, per-column non-null counts, dtypes and memory usage of the
# cleaned anime table.
anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20370 entries, 0 to 24904
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       20370 non-null  int64  
 1   Name           20370 non-null  object 
 2   Score          20370 non-null  float64
 3   Genres         20370 non-null  object 
 4   Synopsis       20370 non-null  object 
 5   Type           20370 non-null  object 
 6   Episodes       20370 non-null  float64
 7   Rating         20370 non-null  object 
 8   Popularity     20370 non-null  int64  
 9   Favorites      20370 non-null  int64  
 10  Scored By      20370 non-null  object 
 11  Members        20370 non-null  int64  
 12  Image URL      20370 non-null  object 
 13  Duration_mins  20370 non-null  int64  
dtypes: float64(2), int64(5), object(7)
memory usage: 2.3+ MB


In [14]:
# Preview the first five rows of the cleaned anime table.
anime.head(n=5)

Unnamed: 0,anime_id,Name,Score,Genres,Synopsis,Type,Episodes,Rating,Popularity,Favorites,Scored By,Members,Image URL,Duration_mins
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,R - 17+ (violence & profanity),602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...,115
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,PG-13 - Teens 13 or older,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...,24
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,PG-13 - Teens 13 or older,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...,25
4,8,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,PG - Children,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...,23


### 2. Ratings

In [15]:
# Location of the ratings CSV. Set the ANIME_RATINGS_CSV environment variable
# to point at another copy; when it is unset, the original machine-specific
# path is used, so existing runs behave exactly as before.
ratings_csv_path = os.getenv(
    "ANIME_RATINGS_CSV",
    "/Users/pparashar21/Desktop/Projects/AnimeRecommender/Artifacts/Data_ingestion/06-14-2025_19-33-18/Anime_Ratings.csv",
)

# Load the user ratings and preview the first rows.
ratings = pd.read_csv(ratings_csv_path)
ratings.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


In [16]:
# (rows, columns) of the ratings table.
ratings.shape

(24325191, 5)

In [17]:
# For each anime: the number of rating rows (count of user_id) and the mean
# rating. anime_id stays a regular column.
per_anime_aggregations = {'user_id': 'count', 'rating': 'mean'}
ratings.groupby('anime_id', as_index=False).agg(per_anime_aggregations)

Unnamed: 0,anime_id,user_id,rating
0,1,64625,8.772673
1,5,29512,8.378287
2,6,38209,8.296920
3,7,10627,7.377905
4,8,1859,7.015062
...,...,...,...
16495,56014,1,6.000000
16496,56023,1,5.000000
16497,56030,1,5.000000
16498,56036,2,7.500000


In [18]:
# Missing values per column in the ratings table.
ratings.isna().sum()

user_id        0
Username       0
anime_id       0
Anime Title    0
rating         0
dtype: int64

In [33]:
# Attach anime metadata to every rating row that has a matching anime_id,
# then remove the title column that came from the ratings table.
# NOTE(review): with how='inner' the `_merge` indicator column can only ever
# be 'both'; consider dropping indicator=True if nothing reads that column.
merged = (
    anime
    .merge(ratings, how='inner', on='anime_id', indicator=True)
    .drop(columns=['Anime Title'])
)
merged.head()

Unnamed: 0,anime_id,Name,Score,Genres,Synopsis,Type,Episodes,Rating,Popularity,Favorites,Scored By,Members,Image URL,Duration_mins,user_id,Username,rating,_merge
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24,1,Xinil,10,both
1,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24,4,Crystal,8,both
2,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24,20,vondur,9,both
3,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24,23,Amuro,9,both
4,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,R - 17+ (violence & profanity),43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...,24,47,kei-clone,7,both
