## Scrape the Jikan Database

In [None]:
import requests
import json

api_url = 'https://api.jikan.moe/v4'

def scrape_page(endpoint, page, file_path, timeout=30):
    """Download one page of a Jikan list endpoint and save its records as JSON.

    Args:
        endpoint: Path appended to ``api_url``, e.g. ``'/anime'``.
        page: Page number, sent as the ``page`` query parameter.
        file_path: Destination file; it receives the ``'data'`` entry of the
            JSON response, indented by 4 spaces.
        timeout: Seconds to wait for the server before giving up. Optional;
            without a timeout a stalled connection would block forever.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Let requests build the query string instead of concatenating it by hand.
    response = requests.get(api_url + endpoint, params={'page': page}, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(payload['data'], f, indent=4)

In [None]:
import datetime
import tqdm
import time
import os

wait = 1.2  # minimum seconds between page requests in scrape_jikan_db (a run with 1.15 crashed)

def scrape_jikan_db(database):
    """Scrape every page of a Jikan list endpoint into ``data/raw/<database>/``.

    One file is written per page, named ``page<N>.json`` where N is zero-padded
    to the width of the last page number. Successive page requests start at
    least ``wait`` seconds apart.

    Args:
        database: Endpoint name without the leading slash, e.g. ``'anime'``.

    Raises:
        requests.HTTPError: If the pagination request or a page request
            returns an error status code.
    """
    directory_path = f'data/raw/{database}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(directory_path, exist_ok=True)

    # Ask the API how many pages exist. Fail on an HTTP error here instead of
    # trying to read 'pagination' out of an error response.
    first_response = requests.get(api_url + '/' + database, timeout=30)
    first_response.raise_for_status()
    last_page = first_response.json()['pagination']['last_visible_page']
    length = len(str(last_page))

    print('Started:', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    for page in tqdm.trange(1, last_page + 1):
        start = time.perf_counter()
        scrape_page('/' + database, page, directory_path + f'/page{str(page).zfill(length)}.json')
        elapsed = time.perf_counter() - start
        # Sleep only for whatever part of the `wait` interval the request did not use.
        time.sleep(max(0, wait - elapsed))

    print('Finished:', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

## Merge Files

In [None]:
import shutil

def merge_files(database):
    """Merge the per-page JSON files of ``database`` into a single JSON file.

    Reads every ``*.json`` file in ``data/raw/<database>/`` in sorted file-name
    order, concatenates their lists, writes the result to
    ``data/raw/<database>.json`` and then deletes the per-page directory.

    Args:
        database: Name of the sub-directory of ``data/raw`` to merge, e.g. ``'anime'``.
    """
    directory_path = f'data/raw/{database}'

    # os.listdir() returns names in arbitrary order, so sort to get a
    # deterministic result. Skip anything that is not a JSON file (e.g. a
    # checkpoint directory), which json.load could not read.
    file_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )

    data = []
    for file_name in tqdm.tqdm(file_names):
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
        data.extend(records)

    with open(f'data/raw/{database}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

    # The per-page files are no longer needed once the merged file is written.
    shutil.rmtree(directory_path)

## Actual Scraping

### Anime

In [None]:
# Merge the per-page files in data/raw/anime into data/raw/anime.json
# (merge_files deletes the per-page directory afterwards).
# NOTE(review): this needs data/raw/anime to be populated already; no
# scrape_jikan_db('anime') call is visible in the cells shown — confirm.
merge_files('anime')

100%|██████████| 1070/1070 [00:03<00:00, 348.37it/s]


In [None]:
import pandas as pd
import numpy as np

# Load the merged scrape. The path is relative so that it matches where
# merge_files() writes the file and where the CSV is saved at the end of this cell.
anime = pd.read_json('data/raw/anime.json')

# Drop Duplicates
old_size = anime.shape[0]
anime = anime.drop_duplicates(subset=['mal_id']).reset_index(drop=True)
print('Duplicates:', old_size - anime.shape[0])

# Remove useless columns (airing column = Currently Airing status, explicit_genres is void).
# The asserts check that claim before the columns are discarded.
assert (anime['airing'] == (anime['status'] == 'Currently Airing')).all()
assert (anime['explicit_genres'].astype('str') == '[]').all()
anime = anime.drop(columns=['airing', 'explicit_genres'])

# Avoid 'Unknown' and 'None' strings
for col in ['type', 'source', 'duration']:
    anime[col] = anime[col].replace('Unknown', np.nan)
anime['rating'] = anime['rating'].replace('None', np.nan)

# Avoid unnecessary floats (nullable Int64 keeps missing values)
for col in ['scored_by', 'episodes', 'year']:
    anime[col] = anime[col].astype('Int64')

# Simplify aired dates: keep only the 'from' / 'to' entries, as plain dates
anime['aired_from'] = pd.to_datetime(anime['aired'].str['from']).dt.date
anime['aired_to'] = pd.to_datetime(anime['aired'].str['to']).dt.date
anime = anime.drop(columns=['aired'])

# Use popularity=0 to detect 'pending approval' animes
anime['pending_approval'] = anime['popularity'] == 0

# Drop rank and popularity, as they sort equal score / members alphabetically...
anime = anime.drop(columns=['rank', 'popularity'])

# Missing synopsis and background
old_default_synopsis = 'No synopsis has been added for this series yet. Click here to update this information.'
anime['synopsis'] = anime['synopsis'].replace('', np.nan).replace(old_default_synopsis, np.nan)
anime['background'] = anime['background'].replace('', np.nan)

# season and year should be for all, premiered only in TV shows
anime = anime.rename(columns={'season': 'premiered_season', 'year': 'premiered_year'})

# Simplify broadcast: keep only the 'day' and 'time' entries
anime['broadcast_day'] = anime['broadcast'].str['day']
anime['broadcast_time'] = anime['broadcast'].str['time']
anime = anime.drop(columns=['broadcast'])

# Only keep names. Anything that is not a list (i.e. a missing value) becomes
# an empty list; isinstance is used because an identity test against np.nan
# misses missing-value objects that are not that exact singleton.
for col in ['producers', 'licensors', 'studios', 'genres', 'themes', 'demographics']:
    anime[col] = anime[col].apply(lambda x: [dic['name'] for dic in x] if isinstance(x, list) else [])

# Replace old Themes names
old_themes_names = {'Police': 'Detective', 'Cars': 'Racing', 'Demons': 'Mythology', 'Game': 'Strategy Game'}
anime['themes'] = anime['themes'].apply(lambda x: [old_themes_names.get(t, t) for t in x])

# R18+ definition by MAL (not ranked)
anime['nsfw'] = anime['genres'].apply(lambda x: 'Hentai' in x or 'Erotica' in x)

# Simplify trailer. Pictures: default.jpg and prefixes sd, mq, hq, maxres
anime['trailer_url'] = anime['trailer'].str['url']
anime = anime.drop(columns=['trailer'])

# Simplify main_picture, delete default. Options: .jpg, t.jpg, l.jpg, .webp, t.webp, l.webp
default_image = 'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png'
anime['main_picture'] = anime['images'].str['jpg'].str['image_url'].replace(default_image, np.nan)
anime = anime.drop(columns=['images'])

# Better order
order = ['mal_id', 'title', 'type', 'score', 'scored_by', 'status', 'episodes', 'aired_from', 'aired_to', 'source',
         'members', 'favorites', 'duration', 'rating', 'nsfw', 'pending_approval', 'premiered_season', 'premiered_year',
         'broadcast_day', 'broadcast_time', 'genres', 'themes', 'demographics', 'studios', 'producers', 'licensors',
         'synopsis', 'background', 'main_picture', 'url', 'trailer_url', 'title_english', 'title_japanese', 'title_synonyms']

# Sort by Top Anime: sum of the descending ranks of score and scored_by.
# .assign() works on a new frame, so the temporary sort key never gets written
# onto a slice of the original (which raised SettingWithCopyWarning).
anime = (
    anime[order]
    .assign(tmp=lambda df: df['score'].rank(ascending=False) + df['scored_by'].rank(ascending=False))
    .sort_values('tmp')
    .drop(columns=['tmp'])
    .reset_index(drop=True)
)

# Save as csv
anime.to_csv('data/anime.csv', index=False)

print(anime.shape)

pd.options.display.max_columns = None
anime.head(1)

Duplicates: 113


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime['tmp'] = anime['score'].rank(ascending=False) + anime['scored_by'].rank(ascending=False)


(26633, 34)


Unnamed: 0,mal_id,title,type,score,scored_by,status,episodes,aired_from,aired_to,source,members,favorites,duration,rating,nsfw,pending_approval,premiered_season,premiered_year,broadcast_day,broadcast_time,genres,themes,demographics,studios,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.09,2125584,Finished Airing,64,2009-04-05,2010-07-04,Manga,3353933,226439,24 min per ep,R - 17+ (violence & profanity),False,False,spring,2009,Sundays,17:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1208/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...


In [None]:
import pandas as pd
import ast

# Reload the cleaned CSV and restore the column types that CSV cannot store.
# The path is relative so that it matches where the cleaning cell saves the file.
anime = pd.read_csv('data/anime.csv')

# Nullable integers (plain int cannot hold the missing values)
for col in ['scored_by', 'episodes', 'premiered_year']:
    anime[col] = anime[col].astype('Int64')

# Date strings -> datetime.date
for col in ['aired_from', 'aired_to']:
    anime[col] = pd.to_datetime(anime[col]).dt.date

# An explicit format avoids the slow per-element parsing fallback and its warning.
# NOTE(review): assumes every non-missing value is 'HH:MM', as in the sample
# output ('17:00'); to_datetime raises on any value that does not match.
anime['broadcast_time'] = pd.to_datetime(anime['broadcast_time'], format='%H:%M').dt.time

# List columns are stored as their string repr in the CSV; parse them back into lists.
for col in ['genres', 'themes', 'demographics', 'studios', 'producers', 'licensors', 'title_synonyms']:
    anime[col] = anime[col].apply(ast.literal_eval)


  anime['broadcast_time'] = pd.to_datetime(anime['broadcast_time']).dt.time


Unnamed: 0,mal_id,title,type,score,scored_by,status,episodes,aired_from,aired_to,source,members,favorites,duration,rating,nsfw,pending_approval,premiered_season,premiered_year,broadcast_day,broadcast_time,genres,themes,demographics,studios,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.09,2125584,Finished Airing,64,2009-04-05,2010-07-04,Manga,3353933,226439,24 min per ep,R - 17+ (violence & profanity),False,False,spring,2009,Sundays,17:00:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1208/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...


In [None]:
# Display the reloaded frame (the rendered output is truncated to the first and last rows).
anime

Unnamed: 0,mal_id,title,type,score,scored_by,status,episodes,aired_from,aired_to,source,members,favorites,duration,rating,nsfw,pending_approval,premiered_season,premiered_year,broadcast_day,broadcast_time,genres,themes,demographics,studios,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.09,2125584,Finished Airing,64,2009-04-05,2010-07-04,Manga,3353933,226439,24 min per ep,R - 17+ (violence & profanity),False,False,spring,2009,Sundays,17:00:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1208/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...
1,11061,Hunter x Hunter (2011),TV,9.03,1768158,Finished Airing,148,2011-10-02,2014-09-24,Manga,2841616,210917,23 min per ep,PG-13 - Teens 13 or older,False,False,fall,2011,Sundays,10:55:00,"[Action, Adventure, Fantasy]",[],[Shounen],[Madhouse],"[VAP, Nippon Television Network, Shueisha]",[VIZ Media],Hunters devote themselves to accomplishing haz...,,https://cdn.myanimelist.net/images/anime/1337/...,https://myanimelist.net/anime/11061/Hunter_x_H...,https://www.youtube.com/watch?v=D9iTQRB4XRk,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,[HxH (2011)]
2,38524,Shingeki no Kyojin Season 3 Part 2,TV,9.05,1592959,Finished Airing,10,2019-04-29,2019-07-01,Manga,2285318,58771,23 min per ep,R - 17+ (violence & profanity),False,False,spring,2019,Mondays,00:10:00,"[Action, Drama, Suspense]","[Gore, Military, Survival]",[Shounen],[Wit Studio],"[Production I.G, Dentsu, Mainichi Broadcasting...",[Funimation],Seeking to restore humanity's diminishing hope...,Shingeki no Kyojin Season 3 Part 2 adapts cont...,https://cdn.myanimelist.net/images/anime/1517/...,https://myanimelist.net/anime/38524/Shingeki_n...,https://www.youtube.com/watch?v=hKHepjfj5Tw,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,[]
3,9253,Steins;Gate,TV,9.07,1403149,Finished Airing,24,2011-04-06,2011-09-14,Visual novel,2570007,189959,24 min per ep,PG-13 - Teens 13 or older,False,False,spring,2011,Wednesdays,02:05:00,"[Drama, Sci-Fi, Suspense]","[Psychological, Time Travel]",[],[White Fox],"[Frontier Works, Media Factory, Kadokawa Shote...",[Funimation],Eccentric scientist Rintarou Okabe has a never...,Steins;Gate is based on 5pb. and Nitroplus' vi...,https://cdn.myanimelist.net/images/anime/1935/...,https://myanimelist.net/anime/9253/Steins_Gate,https://www.youtube.com/watch?v=27OZc-ku6is,Steins;Gate,STEINS;GATE,[]
4,28851,Koe no Katachi,Movie,8.93,1619864,Finished Airing,1,2016-09-17,NaT,Manga,2337848,88035,2 hr 10 min,PG-13 - Teens 13 or older,False,False,,,,NaT,"[Award Winning, Drama]",[Romantic Subtext],[Shounen],[Kyoto Animation],"[Shochiku, Pony Canyon, Kodansha, ABC Animatio...","[Eleven Arts, NYAV Post]","As a wild youth, elementary school student Sho...",Koe no Katachi won the following awards: Japan...,https://cdn.myanimelist.net/images/anime/1122/...,https://myanimelist.net/anime/28851/Koe_no_Kat...,https://www.youtube.com/watch?v=XBNWo25izJ8,A Silent Voice,聲の形,[The Shape of Voice]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26628,41458,Origami Ninja Koyankinte,TV,,,Currently Airing,,2020-04-07,NaT,,334,0,,PG - Children,False,False,spring,2020,Tuesdays,07:30:00,[Comedy],[],[Kids],[Directions],[TV Tokyo],[],"Koyan, the Origami Ninja, came to Earth from h...",,https://cdn.myanimelist.net/images/anime/1860/...,https://myanimelist.net/anime/41458/Origami_Ni...,,,おりがみにんじゃ コーヤン＠きんてれ,[Happy Smile ♡ Dream]
26629,41460,Kore ga Masakano Are deshita: Wakamono no Shou...,ONA,,,Finished Airing,1,2020-03-12,NaT,Original,285,0,3 min,G - All Ages,False,False,,,,NaT,[Drama],[],[],[Studio Placebo],[],[],The adult age in japan will be reduced from 20...,,https://cdn.myanimelist.net/images/anime/1900/...,https://myanimelist.net/anime/41460/Kore_ga_Ma...,,,コレがまさかのアレでした ～若者の消費者被害～,[]
26630,41470,Micchiri Wanko! Animation,TV,,,Finished Airing,24,2020-04-01,2020-09-09,Other,881,1,1 min per ep,G - All Ages,False,False,spring,2020,Wednesdays,07:30:00,[Comedy],[],[Kids],[Charaction],[],[],Spin-off of Micchiri Neko.,,https://cdn.myanimelist.net/images/anime/1793/...,https://myanimelist.net/anime/41470/Micchiri_W...,,,みっちりわんこ！あにめ～しょん,[]
26631,41475,Hulu Xiao Jin Gang,OVA,,,Finished Airing,6,1989-01-01,1991-01-01,Original,209,1,18 min per ep,PG - Children,False,False,,,,NaT,"[Action, Adventure]",[Super Power],[Kids],[Shanghai Animation Film Studio],[],[],,,https://cdn.myanimelist.net/images/anime/1298/...,https://myanimelist.net/anime/41475/Hulu_Xiao_...,,,葫芦小金刚,"[Diamond Brother, Hulu Brothers 2, Calabash Br..."
