In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from anime_sensei.exception.handler import ExceptionHandler
from anime_sensei.constant import *
from anime_sensei.entity import config_entity
from anime_sensei.utils.utility import parse_duration_to_minutes
load_dotenv()

kaggle_username = os.getenv("KAGGLE_USERNAME")
kaggle_key = os.getenv("KAGGLE_KEY")

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter


file_path = "anime-dataset-2023.csv"

# Load the latest version
anime = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "dbdmobile/myanimelist-dataset",
  file_path,
)

In [3]:
anime

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24900,55731,Wu Nao Monu,UNKNOWN,无脑魔女,UNKNOWN,"Comedy, Fantasy, Slice of Life",No description available for this anime.,ONA,15.0,"Jul 4, 2023 to ?",...,UNKNOWN,Web manga,Unknown,PG-13 - Teens 13 or older,UNKNOWN,24723,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1386/...
24901,55732,Bu Xing Si: Yuan Qi,Blader Soul,捕星司·源起,UNKNOWN,"Action, Adventure, Fantasy",No description available for this anime.,ONA,18.0,"Jul 27, 2023 to ?",...,UNKNOWN,Web novel,Unknown,PG-13 - Teens 13 or older,0.0,0,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1383/...
24902,55733,Di Yi Xulie,The First Order,第一序列,UNKNOWN,"Action, Adventure, Fantasy, Sci-Fi",No description available for this anime.,ONA,16.0,"Jul 19, 2023 to ?",...,UNKNOWN,Web novel,Unknown,PG-13 - Teens 13 or older,0.0,0,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1130/...
24903,55734,Bokura no Saishuu Sensou,UNKNOWN,僕らの最終戦争,UNKNOWN,UNKNOWN,A music video for the song Bokura no Saishuu S...,Music,1.0,"Apr 23, 2022",...,UNKNOWN,Original,3 min,PG-13 - Teens 13 or older,0.0,0,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1931/...


## Data Exploration and Transformation

### 1. Anime dataset

In [6]:
anime.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
anime.isnull().sum()

anime_id        0
Name            0
English name    0
Other name      0
Score           0
Genres          0
Synopsis        0
Type            0
Episodes        0
Aired           0
Premiered       0
Status          0
Producers       0
Licensors       0
Studios         0
Source          0
Duration        0
Rating          0
Rank            0
Popularity      0
Favorites       0
Scored By       0
Members         0
Image URL       0
dtype: int64

In [8]:
columns_with_unknown = anime.columns[anime.isin(['UNKNOWN']).any()]
print(columns_with_unknown)

Index(['English name', 'Other name', 'Score', 'Genres', 'Type', 'Episodes',
       'Premiered', 'Producers', 'Licensors', 'Studios', 'Rating', 'Rank',
       'Scored By'],
      dtype='object')


In [9]:
anime.replace("UNKNOWN", np.nan, inplace = True)
anime['Synopsis'].replace("No description available for this anime.", np.nan, inplace = True)
anime['Scored By'].replace(np.nan, 0, inplace = True)

In [10]:
anime.isnull().sum()

anime_id            0
Name                0
English name    14577
Other name        128
Score            9213
Genres           4929
Synopsis         4535
Type               74
Episodes          611
Aired               0
Premiered       19399
Status              0
Producers       13350
Licensors       20170
Studios         10526
Source              0
Duration            0
Rating            669
Rank             4612
Popularity          0
Favorites           0
Scored By           0
Members             0
Image URL           0
dtype: int64

In [11]:
anime.drop(['English name', 'Other name', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Aired'], axis = 1, inplace=True)
anime.dropna(subset=['Synopsis'], inplace=True)

In [12]:
anime.isnull().sum()

anime_id         0
Name             0
Score         5145
Genres        3262
Synopsis         0
Type            49
Episodes       484
Status           0
Duration         0
Rating         532
Rank          4207
Popularity       0
Favorites        0
Scored By        0
Members          0
Image URL        0
dtype: int64

In [13]:
average_rating = anime['Score'][anime['Score']!=np.nan]
average_rating = average_rating.astype('float')
mean = round(average_rating.mean(), 2)
anime['Score'].replace(np.nan, mean, inplace = True)
anime['Score'] = anime['Score'].astype('float64')


anime['Rank'].replace(np.nan, -1.0, inplace = True)
anime['Rank'] = anime['Rank'].astype('float64')

anime['Episodes'].replace(np.nan, -1.0, inplace = True)
anime['Episodes'] = anime['Episodes'].astype('float64')


anime['Type'].replace(np.nan, "UNKNOWN", inplace = True)
anime['Genres'].replace(np.nan, "UNKNOWN", inplace = True)

mode_ratings = anime['Rating'].value_counts().idxmax()
anime['Rating'].replace(np.nan, mode_ratings, inplace = True)

anime['Duration_mins'] = anime['Duration'].apply(parse_duration_to_minutes)
anime.drop('Duration', axis = 1, inplace = True)

In [14]:
anime.isnull().sum()

anime_id         0
Name             0
Score            0
Genres           0
Synopsis         0
Type             0
Episodes         0
Status           0
Rating           0
Rank             0
Popularity       0
Favorites        0
Scored By        0
Members          0
Image URL        0
Duration_mins    0
dtype: int64

In [15]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20370 entries, 0 to 24904
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       20370 non-null  int64  
 1   Name           20370 non-null  object 
 2   Score          20370 non-null  float64
 3   Genres         20370 non-null  object 
 4   Synopsis       20370 non-null  object 
 5   Type           20370 non-null  object 
 6   Episodes       20370 non-null  float64
 7   Status         20370 non-null  object 
 8   Rating         20370 non-null  object 
 9   Rank           20370 non-null  float64
 10  Popularity     20370 non-null  int64  
 11  Favorites      20370 non-null  int64  
 12  Scored By      20370 non-null  object 
 13  Members        20370 non-null  int64  
 14  Image URL      20370 non-null  object 
 15  Duration_mins  20370 non-null  int64  
dtypes: float64(3), int64(5), object(8)
memory usage: 2.6+ MB


### 2. Users

In [28]:
users = pd.read_csv("/Users/pparashar21/Desktop/Projects/AnimeRecommender/Artifacts/Data_ingestion/Anime_Users.csv")
users.head(10)

Unnamed: 0,Mal ID,Username,Gender,Birthday,Location,Joined,Days Watched,Mean Score,Watching,Completed,On Hold,Dropped,Plan to Watch,Total Entries,Rewatched,Episodes Watched
0,1,Xinil,Male,1985-03-04T00:00:00+00:00,California,2004-11-05T00:00:00+00:00,142.3,7.37,1.0,233.0,8.0,93.0,64.0,399.0,60.0,8458.0
1,3,Aokaado,Male,,"Oslo, Norway",2004-11-11T00:00:00+00:00,68.6,7.34,23.0,137.0,99.0,44.0,40.0,343.0,15.0,4072.0
2,4,Crystal,Female,,"Melbourne, Australia",2004-11-13T00:00:00+00:00,212.8,6.68,16.0,636.0,303.0,0.0,45.0,1000.0,10.0,12781.0
3,9,Arcane,,,,2004-12-05T00:00:00+00:00,30.0,7.71,5.0,54.0,4.0,3.0,0.0,66.0,0.0,1817.0
4,18,Mad,,,,2005-01-03T00:00:00+00:00,52.0,6.27,1.0,114.0,10.0,5.0,23.0,153.0,42.0,3038.0
5,20,vondur,Male,1988-01-25T00:00:00+00:00,"Bergen, Norway",2005-01-05T00:00:00+00:00,73.1,8.06,11.0,94.0,11.0,2.0,20.0,138.0,7.0,4374.0
6,23,Amuro,,1988-02-22T00:00:00+00:00,Canada,2005-01-23T00:00:00+00:00,142.5,7.41,20.0,298.0,5.0,19.0,50.0,392.0,0.0,8565.0
7,36,Baman,Male,,Land of Rain and Fjords,2005-02-05T00:00:00+00:00,272.1,5.9,27.0,1144.0,11.0,55.0,338.0,1575.0,36.0,16309.0
8,44,beddan,Male,,,2005-02-21T00:00:00+00:00,18.6,7.6,0.0,37.0,0.0,0.0,0.0,37.0,0.0,1083.0
9,47,kei-clone,Male,,31f288172a11dea9f2781a6d87e0a200,2005-03-09T00:00:00+00:00,34.5,6.84,15.0,104.0,22.0,3.0,19.0,163.0,1.0,2054.0


In [19]:
users.columns

Index(['Mal ID', 'Username', 'Gender', 'Birthday', 'Location', 'Joined',
       'Days Watched', 'Mean Score', 'Watching', 'Completed', 'On Hold',
       'Dropped', 'Plan to Watch', 'Total Entries', 'Rewatched',
       'Episodes Watched'],
      dtype='object')

### 3. Ratings

In [18]:
ratings = pd.read_csv("/Users/pparashar21/Desktop/Projects/AnimeRecommender/Artifacts/Data_ingestion/Anime_Ratings.csv")
ratings.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


In [29]:
ratings.isnull().sum()

user_id        0
Username       0
anime_id       0
Anime Title    0
rating         0
dtype: int64

In [37]:
ratings['Anime Title'][ratings['anime_id'] == 48]

1           .hack//Sign
1192        .hack//Sign
1235        .hack//Sign
1856        .hack//Sign
2023        .hack//Sign
               ...     
24315670    .hack//Sign
24317204    .hack//Sign
24320332    .hack//Sign
24321749    .hack//Sign
24322572    .hack//Sign
Name: Anime Title, Length: 21292, dtype: object

In [38]:
anime['Name'][anime['anime_id'] == 48]

29    .hack//Sign
Name: Name, dtype: object