In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import json
import re

* Assumes the wiki is the gold-standard source of truth; the scraper relies on its page structure, chronology, etc.

In [2]:
# Root URL of the wiki being scraped. Site-relative paths (category listings and
# character page hrefs) are appended to it to form full URLs.
base_url = 'https://toarumajutsunoindex.fandom.com'

In [3]:
# Series markers: substrings looked for in the ids of a character page's section
# headlines. A headline id containing one of these marks the character as
# appearing in the corresponding series (see get_character_data).
INDEX = 'Toaru_Majutsu_no_Index'
RAILGUN = 'Toaru_Kagaku_no_Railgun'
ASTRAL_BUDDY = 'Astral_Buddy'
ACCEL = 'Toaru_Kagaku_no_Accelerator'
DARK_MATTER = 'Toaru_Kagaku_no_Dark_Matter'

In [4]:
def get_character_data(wiki_url, affiliation):
    """Scrape one character page and return its metadata, or None to skip it.

    Args:
        wiki_url: Full URL of the character's wiki page.
        affiliation: Category label the character was found under; stored as a
            single-element list under the 'affiliation' key.

    Returns:
        A dict with keys 'name_en', 'name_jp', 'img_url', 'affiliation',
        'series' and 'is_supporting_character', or None when the page lacks a
        title, a Kanji infobox value, a thumbnail image, or any headline that
        references one of the known series.
    """
    soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')

    title = soup.select_one('h1.page-header__title')
    kanji_row = soup.find("div", {"data-source": "Kanji"})

    # only get characters with both japanese and english names
    if title is None or kanji_row is None:
        return None
    kanji_value = kanji_row.find("div", {"class": "pi-data-value pi-font"})
    if kanji_value is None:
        return None

    # default image: first thumbnail link on the page. A page without one is
    # skipped instead of raising AttributeError on a missing element.
    thumbnail = soup.find("a", {"class": "image-thumbnail"})
    img_url = thumbnail.get('href') if thumbnail is not None else None
    if not img_url:
        return None

    # Checked in this order for every headline; the first marker found in the
    # headline id decides which series label that headline contributes.
    series_by_marker = (
        (INDEX, '禁書'),
        (RAILGUN, '超電磁砲'),
        (ASTRAL_BUDDY, 'アストラル・バディ'),
        (ACCEL, '一方通行'),
        (DARK_MATTER, '未元物質'),
    )
    series = []
    for headline in soup.select("span.mw-headline"):
        headline_id = headline.get('id') or ''
        for marker, label in series_by_marker:
            if marker in headline_id:
                # several headlines may reference the same series; list it once
                if label not in series:
                    series.append(label)
                break

    # ignore characters that are not in any of the main series/spinoffs
    if not series:
        return None

    return {
        'name_en': title.text.strip(),
        'name_jp': kanji_value.text.strip(),
        'img_url': img_url,
        'affiliation': [affiliation],
        'series': series,
        # a page carrying the wiki's "stub" box is treated as a supporting character
        'is_supporting_character': soup.find('div', {'id': 'stub'}) is not None,
    }

* Affiliation (magic/science/other) is based on the wiki's character categories.

In [5]:
# Affiliation label -> wiki category listing path, plus a list that is later
# filled with the character page URLs found under that category.
categories = {
    side: {'path': path, 'char_urls': []}
    for side, path in [
        ('magic', '/wiki/Category:Magic_Side_Characters'),
        ('science', '/wiki/Category:Science_Side_Characters'),
        ('other', '/wiki/Category:Normal_Characters'),
    ]
}

In [6]:
def get_character_pages(url, char_urls=None):
    """Collect character page URLs from a (possibly paginated) category listing.

    Starting at `url`, every listing page is fetched and its "next" pagination
    link is followed until there is none. Members shown with the placeholder
    image are ignored, and a character URL is kept only if requesting it
    (without following redirects) answers with HTTP 200.

    Args:
        url: URL of the first category listing page.
        char_urls: Optional list to append to; a new list is created when omitted.

    Returns:
        The list of character page URLs (the same object as `char_urls` when
        one was passed in), each URL added once per listing entry.

    Raises:
        AssertionError: if a listing page does not answer with HTTP 200.
    """
    if char_urls is None:
        char_urls = []

    # Iterate over the pages instead of recursing. The earlier recursive form
    # passed the accumulator down and then extended it with the returned list,
    # which was the same object, so every URL got duplicated per extra page.
    page_url = url
    while page_url:
        curr_page = requests.get(page_url, allow_redirects=False)
        assert curr_page.status_code == 200, (
            "category page {url} returned HTTP {code}".format(
                url=page_url, code=curr_page.status_code
            )
        )

        curr_soup = BeautifulSoup(curr_page.text, 'html.parser')

        # only get characters with pictures
        member_links = [
            div.find('a')
            for div in curr_soup.find_all("div", {"class": "category-page__member-left"})
            if 'Template_Placeholder_other.png' not in str(div)
        ]

        for link in member_links:
            href = link.get('href') if link is not None else None
            if not href:
                continue
            curr_char_url = "{base}{suffix}".format(base=base_url, suffix=href)
            # only get characters with valid pages (& no redirects)
            if requests.get(curr_char_url, allow_redirects=False).status_code != 200:
                continue
            char_urls.append(curr_char_url)

        next_page = curr_soup.find("a", {"class": "category-page__pagination-next"})
        page_url = next_page.get("href") if next_page else None

    return char_urls

In [7]:
# Crawl each category listing and store the character page URLs it yields.
# The loop value gets a real name: `_` is IPython's last-output shortcut and
# conventionally a throwaway, so using it as a working variable is misleading.
for category_name, category_info in categories.items():
    category_info['char_urls'] = get_character_pages(base_url + category_info['path'])

Get data

* Deal format is array of characters
* Target JSON: `[{name: "name", img: "img.png", opts: {series: ["a"], affiliation: ["b"], is_supporting_character: false}}]`

In [8]:
# Display name "<jp> (<en>)" -> merged record for that character.
characters = dict()

for category, category_info in categories.items():
    for char_url in category_info['char_urls']:
        curr_data = get_character_data(char_url, category)
        if not curr_data:
            continue

        name = "{jp} ({en})".format(jp=curr_data['name_jp'], en=curr_data['name_en'])

        # Keep only what follows "latest?cb=" in the image URL. Raw string so
        # that `\?` is a regex escape rather than an invalid string escape.
        img_match = re.search(r'latest\?cb=(.+)', curr_data['img_url'] or '', re.IGNORECASE)
        if img_match is None:
            print("skipping {name}: unexpected image url {url!r}".format(
                name=name, url=curr_data['img_url']
            ))
            continue

        # Reuse the record of a character already seen (e.g. listed under a
        # second category) so affiliations/series accumulate instead of the
        # later category overwriting the earlier one.
        data = characters.setdefault(name, defaultdict(list))
        data['img'] = img_match.group(1)
        data['is_supporting_character'] = curr_data['is_supporting_character']

        # De-duplicate while keeping first-seen order so the output is
        # reproducible from run to run (set ordering of strings is not).
        for key in ('affiliation', 'series'):
            data[key] = list(dict.fromkeys(data[key] + curr_data[key]))

In [9]:
# Flatten the name -> record mapping into the list-of-dicts layout that gets
# exported: one entry per character, with its filterable options under 'opts'.
output_data = [
    {
        'name': char_name,
        'img': record['img'],
        'opts': {
            'series': record['series'],
            'affiliation': record['affiliation'],
            'is_supporting_character': record['is_supporting_character'],
        },
    }
    for char_name, record in characters.items()
]

In [12]:
# manual updates: make sure these three characters carry the Astral Buddy
# series tag, adding it only when it is not already present
for i, x in enumerate(output_data):
    if (
        any(n in x['name'] for n in ['Misaka Mikoto', 'Shokuhou Misaki', 'Shirai Kuroko'])
        and "アストラル・バディ" not in x['opts']['series']
    ):
        x['opts']['series'].append("アストラル・バディ")

Ended up downloading the images using wget ._.

To preserve aspect ratio, the downloaded images were resized and padded instead. See the other notebook in this repo.

In [11]:
#with open('img.txt', 'w') as f:
#    for _ in output_data:
#        f.write(_['img'])
#        f.write('\n')