# SI 507 final project: analysis of anime data

final project of SI 507

##  Data preparation

### Generate authentication

In [8]:
import json
import requests
import secrets
import csv
import time
from bs4 import BeautifulSoup
import mal_client.py

In [None]:
CLIENT_ID = mal_client.CLIENT_ID
CLIENT_SECRET = mal_client.CLIENT_SECRET

In [None]:
# 1. Generate a new Code Verifier / Code Challenge.
def get_new_code_verifier() -> str:
    token = secrets.token_urlsafe(100)
    return token[:128]


# 2. Print the URL needed to authorise your application.
def print_new_authorisation_url(code_challenge: str):
    global CLIENT_ID

    url = f'https://myanimelist.net/v1/oauth2/authorize?response_type=code&client_id={CLIENT_ID}&code_challenge={code_challenge}'
    print(f'Authorise your application by clicking here: {url}\n')


# 3. Once you've authorised your application, you will be redirected to the webpage you've
#    specified in the API panel. The URL will contain a parameter named "code" (the Authorisation
#    Code). You need to feed that code to the application.
def generate_new_token(authorisation_code: str, code_verifier: str) -> dict:
    global CLIENT_ID, CLIENT_SECRET

    url = 'https://myanimelist.net/v1/oauth2/token'
    data = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'code': authorisation_code,
        'code_verifier': code_verifier,
        'grant_type': 'authorization_code'
    }

    response = requests.post(url, data)
    response.raise_for_status()  # Check whether the requests contains errors

    token = response.json()
    response.close()
    print('Token generated successfully!')

    with open('token.json', 'w') as file:
        json.dump(token, file, indent = 4)
        print('Token saved in "token.json"')

    return token


# 4. Test the API by requesting your profile information
def print_user_info(access_token: str):
    url = 'https://api.myanimelist.net/v2/anime?q=cowboy%20bebop'
    response = requests.get(url, headers = {
        'Authorization': f'Bearer {access_token}'
        })
    
    response.raise_for_status()
    user = response.json()
    response.close()

    print(f"\n>>> Greetings {user}! <<<")


In [None]:
code_verifier = code_challenge = get_new_code_verifier()
print_new_authorisation_url(code_challenge)

authorisation_code = input('Copy-paste the Authorisation Code: ').strip()
token = generate_new_token(authorisation_code, code_verifier)

print_user_info(token['access_token'])


Authorise your application by clicking here: https://myanimelist.net/v1/oauth2/authorize?response_type=code&client_id=206c0e0531f8910a3898c210ea19012f&code_challenge=g0nB01cfWnIC1tsFyWmGkHseIB988cDgOHxPgj-vInUwiLDnIEkTA0Iq5mISfUKxNgd2KbGcCwj1OC9teDHAolR387ZqKmMLHoncc6vKFiVQJotbSZ6Sbz3oF6Cz7dfa

Copy-paste the Authorisation Code: def502009009b02b521974ca2efa9a126d9924387cb4251a9760502cd16e61ec917816c174ebd91f91469474c2779aad9a5e2b48a08254c60209594a927c6509f5e89b006f6b9bf14542e25eaf1d3c8379fc229c0e0a823296c092ee07cf2d521dbb0e01b764760697643a010bb3adc1ea91218b34a70a79537b4f672309b4535421fb6b465069f2568e3d9cd352f285e8ba6a1ae3ef967c7639e4dbcf248b1850e79d5c5e802de197b32e341a2f10bb70989a1bd658ba34223a6b5c79be0a720313b1ae3a16409cdeac6f9b7347617961e9a72e1ca35347df7409c5ce28fbeaaeb2cf7923cb7392ad0c450cad941b6c114e407b1b1b3bf8a2d1d8bb8c16139ed93886ce241b23d3231fba3472cccf12cffa2ad7e2e58f0f08f77d74c30e3ff2dc67cd2177408ba813ae5601af52bb3e6eb921df0271f852efe79a5ce355b46099e6ec355468f5263efedefcc5fd1

Note: fail to correctly receive the callback of authentication. My original redirect URI is localhost/oauth, however, since I don't have an application on that port, this page won't be loaded. However, I found the returned authorization code in the url. I can build a server on localhost next time and listen for the code

### get all animes in one season

In [None]:
url = 'https://api.myanimelist.net/v2/anime/season/1970/fall?offset=10'
response = requests.get(url, headers = {
    'Authorization': f"Bearer {token['access_token']}"
    })

response.raise_for_status()
cb = response.json()
response.close()

print(json.dumps(cb, indent=1))

{
 "data": [
  {
   "node": {
    "id": 3858,
    "title": "Konchuu Monogatari Minashigo Hutch",
    "main_picture": {
     "medium": "https://api-cdn.myanimelist.net/images/anime/12/6900.jpg",
     "large": "https://api-cdn.myanimelist.net/images/anime/12/6900l.jpg"
    }
   }
  },
  {
   "node": {
    "id": 8234,
    "title": "Muumin",
    "main_picture": {
     "medium": "https://api-cdn.myanimelist.net/images/anime/8/21297.jpg",
     "large": "https://api-cdn.myanimelist.net/images/anime/8/21297l.jpg"
    }
   }
  },
  {
   "node": {
    "id": 8778,
    "title": "Avignon no Hashi de",
    "main_picture": {
     "medium": "https://api-cdn.myanimelist.net/images/anime/12/24341.jpg",
     "large": "https://api-cdn.myanimelist.net/images/anime/12/24341l.jpg"
    }
   }
  },
  {
   "node": {
    "id": 20025,
    "title": "Itazura Tenshi Chippo-chan",
    "main_picture": {
     "medium": "https://api-cdn.myanimelist.net/images/anime/10/53413.jpg",
     "large": "https://api-cdn.myanimeli

### get details of an anime by its id

In [None]:
url = 'https://api.myanimelist.net/v2/anime/37599?fields=id,title,main_picture,alternative_titles,start_date,end_date,type,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,my_list_status,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics'
response = requests.get(url, headers = {
    'Authorization': f"Bearer {token['access_token']}"
    })

response.raise_for_status()
cb = response.json()
response.close()

print(json.dumps(cb, indent=1))


{
 "id": 37599,
 "title": "Magatsu Wahrheit: Zuerst",
 "main_picture": {
  "medium": "https://api-cdn.myanimelist.net/images/anime/1109/108983.jpg",
  "large": "https://api-cdn.myanimelist.net/images/anime/1109/108983l.jpg"
 },
 "alternative_titles": {
  "synonyms": [
   "MWZ"
  ],
  "en": "MAGATSU WAHRHEIT",
  "ja": "\u798d\u3064\u30f4\u30a1\u30fc\u30eb\u30cf\u30a4\u30c8 -ZUERST-"
 },
 "start_date": "2020-10-13",
 "synopsis": "Two young people living in the Wahrheit Empire are planning to relocate to the capital city: the shy Inumael who works as a transporter and left his beloved sister behind at his parents' home, and Leocadio, a naive new soldier of the Empire who spends his days in the imperial city thinking of his future as a soldier.\n\nIn the Empire, countless human beings were once extinguished from the world. The return of the \"light,\" the disaster that summoned a ferocious monster into the world, is predicted, and the remaining humans' spirits are devastated. Meanwhile, th

In [None]:
# test datetime conversion
from datetime import datetime
datetime.strptime("2021-01-01", "%Y-%m-%d")

datetime.datetime(2021, 1, 1, 0, 0)

### Data acquisition

In [None]:
# api call function
seasons = ['winter', 'spring', 'summer', 'fall']
years = [1970]

# fetch video info by season/year?offset=number format
def fetch_video_list(year=1970, season='fall', url = ''):
  if len(url) == 0:
    url = 'https://api.myanimelist.net/v2/anime/season/{}/{}'.format(year, season)
  # print(url)
  # url = 'https://api.myanimelist.net/v2/anime/season/fall/2020'
  # print(url)
  response = requests.get(url, headers = {
      'Authorization': f"Bearer {token['access_token']}"
      })

  response.raise_for_status()
  cb = response.json()
  response.close()
  return cb

In [None]:
fetch_video_list().get('paging',{}).get('next','')

'https://api.myanimelist.net/v2/anime/season/1970/fall?offset=10'

In [None]:
# fetch and 
def fetch_store_video_info(seasons, years):
  # data = [["id", "title", "start_date", "end_date", "mean", "rank", "popularity", "num_scoring_users", "media_type", "genres", "num_episodes",
  #          "year", "season", "day_of_the_week", "start_time", "source", "average_episode_duration", 'watching', "completed", "on_hold", "dropped", "plan_to_watch"]]
  # # with open("anime.csv", "w", newline='') as f:
  # #   writer = csv.writer(f)
  # #   writer.writerows(data)
  data = []
  ttime = 0
  for year in years:
    for season in seasons:
      ytime = 0
      data = []
      tic = time.perf_counter()
      print('start process y{}s{}'.format(year, season))
      next = ''
      hasStarted = False
      while next != '' or not hasStarted:
        hasStarted = True
        video_list = fetch_video_list(year, season, next)
        for video in video_list['data']:
          data.append(fetch_video_info(video['node']['id']))
        next = video_list.get('paging',{}).get('next','')
      with open("anime.csv", "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data)
      toc = time.perf_counter()
      print("Used time: {}".format((toc-tic)/60))
      ttime += (toc-tic) / 60
      ytime += (toc-tic) / 60
    print("Year {} time: {}".format(year, ytime))
    print("Total time: {}".format(ttime))

In [None]:
# fetch information of a single video
def fetch_video_info(id):
  url = 'https://api.myanimelist.net/v2/anime/{}?fields=id,title,main_picture,alternative_titles,start_date,end_date,type,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,my_list_status,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics'.format(id)
  response = requests.get(url, headers = {
      'Authorization': f"Bearer {token['access_token']}"
      })

  response.raise_for_status()
  cb = response.json()
  response.close()
  return [cb['id'],cb['title'],cb.get('start_date', ''),cb.get('end_date', ''),cb.get('mean', ''),cb.get('rank', ''),cb.get('popularity', ''),
          cb.get('num_scoring_users', ''),cb.get('media_type', ''),"+".join([genre['name'] for genre in cb.get('genres', [])]),cb.get('num_episodes', 0),
          cb['start_season']['year'],cb['start_season']['season'],cb.get('broadcast',{}).get('day_of_the_week',''),cb.get('broadcast', {}).get('start_time',''),cb.get('source',''),
          cb.get('average_episode_duration', 0),int(cb['statistics']['status']['watching']),int(cb['statistics']['status']['completed']),int(cb['statistics']['status']['on_hold']),
          int(cb['statistics']['status']['dropped']),int(cb['statistics']['status']['plan_to_watch'])]

In [None]:
tic = time.perf_counter()
years = range(2011, 2020)
videos = fetch_store_video_info(seasons, years)
toc = time.perf_counter()
print((toc - tic) / 60)

start process y2011swinter
Used time: 1.6907011211499896
start process y2011sspring
Used time: 1.6795477227333322
start process y2011ssummer
Used time: 1.8612784772833644
start process y2011sfall
Used time: 1.7625550643332948
Year 2011 time: 1.7625550643332948
Total time: 6.994082385499981
start process y2012swinter
Used time: 1.965443065049961
start process y2012sspring
Used time: 1.7513806882999536
start process y2012ssummer
Used time: 2.0311697654166587
start process y2012sfall
Used time: 2.1514745365666994
Year 2012 time: 2.1514745365666994
Total time: 14.893550440833252
start process y2013swinter
Used time: 1.914355641866617
start process y2013sspring
Used time: 1.9136716327000007
start process y2013ssummer
Used time: 2.1980807131833595
start process y2013sfall
Used time: 2.1593086773000323
Year 2013 time: 2.1593086773000323
Total time: 23.07896710588326
start process y2014swinter
Used time: 2.369868429883354
start process y2014sspring
Used time: 2.236590076249983
start process y2

The collected data is around 4MB, and contain anime from 1970-2020.

### Bilibili crawler

It's annoying that bilibili.com has adopted anti-crawler technique, when I use the static crawler technique taught in class, I can only get a *loading* message back. So I used the combination of selenium + bs.

In [2]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

Collecting selenium
  Using cached https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http:/

  if sys.path[0] == '':


In [22]:
# save to cache
CACHE_FILENAME = "bilibili_cache.json"

def open_cache():
    ''' Opens the cache file if it exists and loads the JSON into
    the CACHE_DICT dictionary.
    if the cache file doesn't exist, creates a new cache dictionary
    
    Parameters
    ----------
    None
    
    Returns
    -------
    The opened cache: dict
    '''
    try:
        cache_file = open(CACHE_FILENAME, 'r')
        cache_contents = cache_file.read()
        cache_dict = json.loads(cache_contents)
        cache_file.close()
    except:
        cache_dict = {}
    return cache_dict


def save_cache(cache_dict):
    ''' Saves the current state of the cache to disk
    
    Parameters
    ----------
    cache_dict: dict
        The dictionary to save
    
    Returns
    -------
    None
    '''
    dumped_json_cache = json.dumps(cache_dict)
    fw = open(CACHE_FILENAME,"w")
    fw.write(dumped_json_cache)
    fw.close() 

In [None]:
# collect data for top popular anime in each category
# fetch popularity for top 100 anime for original / manga sourced / novel sourced / game sourced anime
base_url_source = "https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id={}&order=3&st=1&sort=0&page={}"
sources = [10010, 10011, 10012, 10013] # original, manga, novel, game
pages = [1,2,3,4,5]
sources_name = {10010:"original", 10011:"manga sourced", 10012:"novel sourced", 10013:"game sourced"}
source_popularity = {"source":[], "popularity":[]}

bilibili_cache = open_cache()
prev = ""
for source in sources:
  for page in pages:
    cur_url = base_url_source.format(source, page)
    data = ""
    if cur_url not in bilibili_cache or divmod(time.time() - bilibili_cache[cur_url+'!time'], 3600)[0] >= 12:
      print('A new search' + cur_url)
      wd.get(cur_url)
      data = wd.page_source
      bilibili_cache[cur_url] = data
      bilibili_cache[cur_url+"!time"] = time.time()
      save_cache(bilibili_cache)
    else:
      data = bilibili_cache[cur_url]
    soup = BeautifulSoup(data, 'html.parser')
    for bangumi in soup.find_all('li', class_='bangumi-item'):
      source_popularity['source'].append(sources_name[source])
      print(bangumi)
      source_popularity['popularity'].append(float(bangumi.find(class_="shadow").text[:-3]))

In [None]:
# fetch popularity for top 20 anime of each anime genre
# frist get each genre name
genres = []
wd.get(base_url_source.format(10016, 1))
soup = BeautifulSoup(wd.page_source, 'html.parser')
for genre in soup.find_all('li', class_='filter-item'):
  genres.append(genre.text)
genres = genres[genres.index('热血'): genres.index('职场')+1]
genre_popularity={"genre":[], "popularity":[]}
j = 0
for i in range(10016, 10049):
  if i != 10019:
    cur_url = base_url_source.format(i, 1)
    if cur_url not in bilibili_cache or divmod(time.time() - bilibili_cache[cur_url+'!time'], 3600)[0] >= 12:
      wd.get(cur_url)
      data = wd.page_source
      bilibili_cache[cur_url] = data
      bilibili_cache[cur_url+"!time"] = time.time()
      save_cache(bilibili_cache)
    else:
      data = bilibili_cache[cur_url]
    print(i, genres[j], len(data))
    soup = BeautifulSoup(data, 'html.parser')
    for bangumi in soup.find_all('li', class_='bangumi-item'):
      genre_popularity['genre'].append(genres[j])
      print(bangumi)
      genre_popularity['popularity'].append(float(bangumi.find(class_="shadow").text[:-3]))
    j += 1

In [84]:
import pandas as pd
pd_source = pd.DataFrame(data=source_popularity)
pd_genre = pd.DataFrame(data=genre_popularity)
pd_source.to_csv("source.csv")
pd_genre.to_csv("genre.csv")

In [86]:
pd_mal = pd.read_csv("anime.csv")

### Save to database

In [87]:
import sqlite3
BILIBILI_DBNAME = 'bilibili.sqlite'
MAL_DBNAME = "mal.sqlite"

In [88]:
conn = sqlite3.connect(BILIBILI_DBNAME)
pd_source.to_sql("source", conn)
pd_genre.to_sql("genre", conn)
conn.close()
conn = sqlite3.connect(MAL_DBNAME)
pd_mal.to_sql("anime", conn)
conn.close()

In [94]:
conn = sqlite3.connect(MAL_DBNAME)
for i in conn.execute("select * from anime limit 10").fetchall():
  print(i)
conn.close()

(0, 38731, 'Diamond no Ace: Act II', '2019-04-02', '2020-03-31', 8.23, 271.0, 2194, 19897, 'tv', 'Comedy+Sports+School+Shounen', 52, 2019, 'spring', 'tuesday', '17:55', 'manga', 1440, 11949, 22640, 1808, 1007, 12351)
(1, 37804, 'Shirobako Movie', '2020-02-29', '2020-02-29', 7.45, 1746.0, 2980, 372, 'movie', 'Comedy+Drama', 1, 2020, 'winter', None, None, 'original', 7200, 528, 559, 419, 65, 26694)
(2, 40230, 'Housekishou Richard-shi no Nazo Kantei', '2020-01-09', '2020-03-26', 7.16, 2935.0, 2575, 12153, 'tv', 'Drama+Mystery+Slice of Life', 12, 2020, 'winter', 'thursday', '23:30', 'novel', 1420, 4274, 14941, 1310, 3051, 13720)
(3, 40351, 'Pokemon (2019)', '2019-11-17', None, 7.33, 2209.0, 2614, 9828, 'tv', 'Action+Adventure+Comedy+Kids+Fantasy', 0, 2019, 'fall', 'friday', '18:55', 'game', 1415, 20711, 3, 3280, 2794, 9250)
(4, 37890, 'Oshi ga Budoukan Ittekuretara Shinu', '2020-01-10', '2020-03-27', 7.46, 1684.0, 3123, 10507, 'tv', 'Comedy+Music+Seinen+Shoujo Ai+Slice of Life', 12, 2020, 