In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import json
import re

* Assumes the wiki is the gold-standard source of truth; the scraper relies on its page structure, chronology, etc.

In [2]:
# Root of the wiki; category and character paths are appended to this.
base_url = "https://toarumajutsunoindex.fandom.com"

In [3]:
# Series markers: each is searched for as a substring of the section-headline
# ids on a character page to decide which series the character appears in.
INDEX = "Toaru_Majutsu_no_Index"
RAILGUN = "Toaru_Kagaku_no_Railgun"
ASTRAL_BUDDY = "Astral_Buddy"
ACCEL = "Toaru_Kagaku_no_Accelerator"
DARK_MATTER = "Toaru_Kagaku_no_Dark_Matter"

In [4]:
def get_character_data(wiki_url, affiliation):
    """Scrape a single character page and return its data.

    Args:
        wiki_url: Absolute URL of the character's wiki page.
        affiliation: Affiliation label to record for this character.

    Returns:
        A dict with the keys ``name_en``, ``name_jp``, ``img_url``,
        ``affiliation`` (single-item list), ``series`` (list of series labels,
        each listed once) and ``is_supporting_character``; or ``None`` when the
        page should be skipped: it has no page title, no Japanese name, no
        thumbnail image, or no headline that mentions a known series.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout).
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(wiki_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tags = soup.select('h1.page-header__title')
    kanji_box = soup.find("div", {"data-source": "Kanji"})

    # only get characters with both japanese and english names
    if not title_tags or kanji_box is None:
        return None
    kanji_value = kanji_box.find("div", {"class": "pi-data-value pi-font"})
    if kanji_value is None:
        return None

    # default image; a page without a thumbnail is skipped rather than
    # crashing on ``None.get``
    thumbnail = soup.find("a", {"class": "image-thumbnail"})
    img_url = thumbnail.get('href') if thumbnail is not None else None
    if not img_url:
        return None

    # (marker, label) pairs, checked in this order for every headline.
    series_markers = (
        (INDEX, '禁書'),
        (RAILGUN, '超電磁砲'),
        (ASTRAL_BUDDY, 'アストラル・バディ'),
        (ACCEL, '一方通行'),
        (DARK_MATTER, '未元物質'),
    )
    series = []
    for headline in soup.select("span.mw-headline"):
        headline_id = headline.get('id', '')
        for marker, label in series_markers:
            if marker in headline_id:
                # several headlines can mention the same series; list it once
                if label not in series:
                    series.append(label)
                # first matching marker wins for a given headline
                break

    # ignore characters that are not in any of the main series/spinoffs
    if not series:
        return None

    meta_contents = [meta.get('content') for meta in soup.find_all("meta")]
    intro_text = ''.join(content for content in meta_contents if content)

    # heuristics to find minor characters
    is_stub = soup.find('div', {'id': 'stub'}) is not None
    is_few_sections = len(soup.find_all('li', {'class': 'toclevel-2'})) < 3
    # explicit mentions
    minor_pattern = r'\bis a (small )?(minor|supporting|background|side)\b'
    is_minor = bool(re.search(minor_pattern, intro_text, re.IGNORECASE))

    return {
        'name_en': title_tags[0].text,
        'name_jp': kanji_value.text,
        'img_url': img_url,
        'affiliation': [affiliation],
        'series': series,
        'is_supporting_character': is_stub or is_few_sections or is_minor,
    }

* Affiliation (magic/science/other) is based on the wiki's own character categories.

In [5]:
# One entry per affiliation: 'path' is the wiki category page (relative to
# base_url) and 'char_urls' starts empty, to be filled with character page URLs.
categories = {
    side: {'path': category_path, 'char_urls': []}
    for side, category_path in (
        ('magic', '/wiki/Category:Magic_Side_Characters'),
        ('science', '/wiki/Category:Science_Side_Characters'),
        ('other', '/wiki/Category:Normal_Characters'),
    )
}

In [6]:
def get_character_pages(url, char_urls=None):
    """Collect character page URLs from a paginated wiki category listing.

    Starting at ``url``, follows the "next page" link until there is none.
    A member is kept only if its listing entry does not use the placeholder
    image and its page answers 200 without redirecting.

    Args:
        url: URL of the first category page.
        char_urls: Optional list to append to; a new list is created when
            omitted.

    Returns:
        The list of character page URLs (the same object as ``char_urls`` when
        one was passed), each discovered member appended exactly once.

    Raises:
        AssertionError: if a category page does not answer with status 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    if char_urls is None:
        char_urls = []

    # Iterate over the pages instead of recursing. The recursive form passed
    # the accumulator down and then extended it with the returned (identical)
    # list, doubling the contents once per pagination level.
    page_url = url
    while page_url:
        curr_page = requests.get(page_url, allow_redirects=False, timeout=30)
        assert curr_page.status_code == 200, (
            "unexpected status {code} for {url}".format(code=curr_page.status_code, url=page_url)
        )

        curr_soup = BeautifulSoup(curr_page.text, 'html.parser')

        for member_div in curr_soup.find_all("div", {"class": "category-page__member-left"}):
            # only get characters with pictures
            if 'Template_Placeholder_other.png' in str(member_div):
                continue
            member_link = member_div.find('a')
            if member_link is None:
                continue

            # only get characters with valid pages (& no redirects)
            curr_char_url = "{base}{suffix}".format(base=base_url, suffix=member_link.get('href'))
            status = requests.get(curr_char_url, allow_redirects=False, timeout=30).status_code
            if status != 200:
                continue
            char_urls.append(curr_char_url)

        next_page = curr_soup.find("a", {"class": "category-page__pagination-next"})
        page_url = next_page.get("href") if next_page is not None else None

    return char_urls

In [7]:
# Crawl every category listing and store the character page URLs it yields.
# A descriptive loop variable is used instead of `_`, which IPython reserves
# for the last cell output.
for category_info in categories.values():
    category_info['char_urls'] = get_character_pages(base_url + category_info['path'])

Get data

* Deal format is array of characters
* Target json - [{name:"name", img: "img.png", opts: {series:["a"], affiliation:["b"]}}]

In [8]:
class Character:
    """Mutable record for one character, exportable in the target dict shape."""

    def __init__(self):
        # display name
        self.name = ""
        # affiliation labels; a set so repeated labels collapse
        self.affiliation = set()
        # series labels; a set so repeated labels collapse
        self.series = set()
        self.is_supporting_character = False
        # image entry for the character
        self.img = ""

    def get_data(self):
        """Return the character as ``{name, img, opts: {...}}``.

        The affiliation and series sets are emitted as sorted lists: plain
        ``list(set)`` order depends on string hash randomisation, which would
        make the exported data differ between kernel restarts.
        """
        return {
            'name': self.name,
            'img': self.img,
            'opts': {
                'affiliation': sorted(self.affiliation),
                'series': sorted(self.series),
                'is_supporting_character': self.is_supporting_character,
            },
        }

In [9]:
# Maps "<japanese name> (<english name>)" to its Character record.
characters = dict()

for category, category_info in categories.items():
    for char_url in category_info['char_urls']:

        curr_data = get_character_data(char_url, category)
        if not curr_data:
            continue

        name = "{jp} ({en})".format(jp=curr_data['name_jp'], en=curr_data['name_en'])

        curr_char = characters.get(name)
        if curr_char is None:
            # The image entry is the token after "latest?cb=" in the image URL
            # (raw string: "\?" is a regex escape, not a Python string escape).
            img_match = re.search(r'latest\?cb=(.+)', curr_data['img_url'], re.IGNORECASE)
            if img_match is None:
                raise ValueError(
                    "no 'latest?cb=' token in image url {url!r} for {name}".format(
                        url=curr_data['img_url'], name=name
                    )
                )

            curr_char = Character()
            curr_char.name = name
            # note that .png extension comes after postprocessing
            curr_char.img = img_match.group(1) + '.png'
            curr_char.is_supporting_character = curr_data['is_supporting_character']
            characters[name] = curr_char

        # Merge for new AND already-seen characters, so that someone listed
        # under several categories ends up with every affiliation.
        curr_char.affiliation.update(curr_data['affiliation'])
        curr_char.series.update(curr_data['series'])

# TODO if only appear in SS then minor

In [10]:
# some manual updates until i figure out better heuristics
# Characters to tag with the Astral Buddy series by hand.
chars_in_ab = [
    '御坂 美琴 (Misaka Mikoto)',
    '食蜂 操祈 (Shokuhou Misaki)',
    '白井 黒子 (Shirai Kuroko)',
    '初春 飾利 (Uiharu Kazari)',
    '佐天 涙子 (Saten Ruiko)',
]
# Characters whose supporting-character flag is cleared by hand.
chars_not_minor = [
    '初春 飾利 (Uiharu Kazari)',
    '佐天 涙子 (Saten Ruiko)',
    "神苑小路 瑠璃懸巣 (Shin'enkouji Rurikakesu)",
    '婚后 光子 (Kongou Mitsuko)',
    '湾内 絹保 (Wannai Kinuho)',
    '御坂 美鈴 (Misaka Misuzu)',
]
for char in chars_in_ab:
    characters[char].series.add("アストラル・バディ")
for char in chars_not_minor:
    characters[char].is_supporting_character = False

In [11]:
# Export every collected character as a plain dict, in dictionary order.
output_data = [char_obj.get_data() for char_obj in characters.values()]

Ended up downloading the images using wget ._.

To preserve aspect ratio, the downloaded images were resized and padded instead. See the other notebook in this repo.

In [12]:
# Opt-in dump of each character's image entry to img.txt, one per line.
# Disabled by default; flip the flag to regenerate the file.
write_image_urls = False
if write_image_urls:
    with open('img.txt', 'w') as f:
        f.writelines(entry['img'] + '\n' for entry in output_data)