In [1]:
import requests
from bs4 import BeautifulSoup

* Assumes the wiki is the gold-standard source of truth; the scraper relies on its page structure, chronology, etc.

In [2]:
# Root of the Fandom wiki; category and character paths are appended to it.
base_url = "https://toarumajutsunoindex.fandom.com"

In [3]:
# Series identifiers, matched as substrings against the ids of the
# section headlines on each character page.
INDEX = "Toaru_Majutsu_no_Index"              # main series
RAILGUN = "Toaru_Kagaku_no_Railgun"           # spinoff
ASTRAL_BUDDY = "Astral_Buddy"                 # spinoff
ACCEL = "Toaru_Kagaku_no_Accelerator"         # spinoff
DARK_MATTER = "Toaru_Kagaku_no_Dark_Matter"   # spinoff

In [11]:
def get_character_data(wiki_url, affiliation, timeout=30):
    """Scrape a single character page into a flat record.

    Args:
        wiki_url: URL of the character's wiki page.
        affiliation: Label stored unchanged under the ``'affiliation'`` key.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        A dict with the keys ``name_en``, ``name_jp``, ``img_url``,
        ``affiliation`` and the boolean series flags ``in_index``,
        ``in_railgun``, ``in_ab``, ``in_accel``, ``in_dm``.
        ``None`` when the page has no title, no Japanese (Kanji) name, or no
        section headline mentioning any of the tracked series.
        ``img_url`` stays ``''`` when the page has no thumbnail link.
    """
    data = {
        'name_en': '',
        'name_jp': '',
        'img_url': '',
        'affiliation': affiliation,
        'in_index': False,
        'in_railgun': False,
        'in_ab': False,
        'in_accel': False,
        'in_dm': False
    }

    soup = BeautifulSoup(requests.get(wiki_url, timeout=timeout).text, 'html.parser')
    titles = soup.select('h1.page-header__title')
    name_jp_raw = soup.find("div", {"data-source": "Kanji"})

    # only get characters with both japanese and english names
    if not titles or name_jp_raw is None:
        return None
    name_jp_value = name_jp_raw.find("div", {"class": "pi-data-value pi-font"})
    if name_jp_value is None:
        # the Kanji row exists but carries no value element
        return None

    data['name_en'] = titles[0].text
    data['name_jp'] = name_jp_value.text

    # default image: first thumbnail link on the page, if there is one
    thumbnail = soup.find("a", {"class": "image-thumbnail"})
    if thumbnail is not None:
        data['img_url'] = thumbnail.get('href', '')

    # (series marker, flag key) pairs in priority order: a headline sets only
    # the flag of the first marker it contains.
    series_flags = (
        (INDEX, 'in_index'),
        (RAILGUN, 'in_railgun'),
        (ASTRAL_BUDDY, 'in_ab'),
        (ACCEL, 'in_accel'),
        (DARK_MATTER, 'in_dm'),
    )
    for span in soup.select("span.mw-headline"):
        headline = span.get('id')
        if not headline:
            # headline without an id attribute: nothing to match against
            continue
        for marker, flag in series_flags:
            if marker in headline:
                data[flag] = True
                break

    # ignore characters that are not in any of the main series/spinoffs
    if not any(data[flag] for marker, flag in series_flags):
        return None

    return data

* Affiliation (magic/science/other) is based on the wiki's category pages; a character can be listed under more than one category.

In [5]:
# One entry per affiliation: 'path' is the wiki category listing to crawl and
# 'char_urls' starts empty, to be filled with that category's character URLs.
categories = {
    affiliation: {'path': path, 'char_urls': []}
    for affiliation, path in (
        ('magic', '/wiki/Category:Magic_Side_Characters'),
        ('science', '/wiki/Category:Science_Side_Characters'),
        ('other', '/wiki/Category:Normal_Characters'),
    )
}

In [6]:
def get_character_pages(url, char_urls=None, timeout=30):
    """Collect character page URLs from a paginated wiki category listing.

    Starting at ``url``, every listing page is parsed and the "next" link is
    followed until there is none. A member is kept only if its listing entry
    does not use the placeholder image and its page answers with HTTP 200
    without redirecting.

    Args:
        url: URL of the first category listing page.
        char_urls: Optional list to append to; a new list is created when
            omitted. The same list object is returned.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The list of character page URLs, each listed member appended once.

    Raises:
        AssertionError: if a listing page does not answer with HTTP 200.
    """
    from urllib.parse import urljoin

    if char_urls is None:
        char_urls = []

    # Iterate over the pagination instead of recursing: the earlier recursive
    # version extended the shared list with itself, doubling every entry once
    # per extra listing page.
    while url:
        curr_page = requests.get(url, allow_redirects=False, timeout=timeout)
        assert curr_page.status_code == 200, \
            "unexpected status {code} for {url}".format(code=curr_page.status_code, url=url)

        curr_soup = BeautifulSoup(curr_page.text, 'html.parser')

        for div in curr_soup.find_all("div", {"class": "category-page__member-left"}):
            # only get characters with pictures
            if 'Template_Placeholder_other.png' in str(div):
                continue
            link = div.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                continue

            # only get characters with valid pages (& no redirects)
            curr_char_url = "{base}{suffix}".format(base=base_url, suffix=href)
            response = requests.get(curr_char_url, allow_redirects=False, timeout=timeout)
            if response.status_code == 200:
                char_urls.append(curr_char_url)

        next_page = curr_soup.find("a", {"class": "category-page__pagination-next"})
        next_href = next_page.get("href") if next_page is not None else None
        # urljoin leaves an absolute link untouched and resolves a relative one
        url = urljoin(url, next_href) if next_href else None

    return char_urls

In [7]:
# Crawl each category listing and store the character page URLs on its entry.
# The loop value is not named `_`: IPython uses `_` for the last cell output
# and stops updating it once user code rebinds the name.
for name, category in categories.items():
    category_url = base_url + category['path']
    category['char_urls'] = get_character_pages(category_url)

In [13]:
# Scrape every collected character page and keep the usable records.
# get_character_data returns None for pages it rejects; those are skipped.
characters = []
for name, category in categories.items():
    for char_url in category['char_urls']:
        char_data = get_character_data(char_url, name)
        if char_data is not None:
            characters.append(char_data)

{'name_en': 'Agata', 'name_jp': 'アガター', 'img_url': 'https://static.wikia.nocookie.net/to-aru-majutsu-no-index/images/2/23/Agata_profile.jpg/revision/latest?cb=20110122092043', 'affiliation': 'magic', 'in_index': True, 'in_railgun': False, 'in_ab': False, 'in_accel': False, 'in_dm': False}
{'name_en': 'Aihana Etsu', 'name_jp': '藍花 悦', 'img_url': 'https://static.wikia.nocookie.net/to-aru-majutsu-no-index/images/b/ba/Aihana_Etsu_ambiguous_pfp.jpg/revision/latest?cb=20200719024739', 'affiliation': 'science', 'in_index': True, 'in_railgun': True, 'in_ab': False, 'in_accel': False, 'in_dm': False}
{'name_en': 'Charles Conder', 'name_jp': 'チャールズ＝コンダー', 'img_url': 'https://static.wikia.nocookie.net/to-aru-majutsu-no-index/images/6/6a/Charles_ConderProfile.jpg/revision/latest?cb=20101212032417', 'affiliation': 'other', 'in_index': True, 'in_railgun': False, 'in_ab': False, 'in_accel': False, 'in_dm': False}


In [16]:
# Character pages that are listed under both the science and the magic category.
science_urls = set(categories['science']['char_urls'])
magic_urls = set(categories['magic']['char_urls'])
list(science_urls & magic_urls)

['https://toarumajutsunoindex.fandom.com/wiki/Ellis_Warrior',
 'https://toarumajutsunoindex.fandom.com/wiki/Ladylee_Tangleroad',
 'https://toarumajutsunoindex.fandom.com/wiki/Etzali',
 'https://toarumajutsunoindex.fandom.com/wiki/Tsuchimikado_Motoharu',
 'https://toarumajutsunoindex.fandom.com/wiki/Kihara_Kagun',
 'https://toarumajutsunoindex.fandom.com/wiki/Xochitl']

In [None]:
# NOTE(review): `a` and `b` are not defined anywhere in the visible notebook, so
# this unexecuted cell would raise NameError on a fresh Restart & Run All.
# It looks like a leftover template of the science/magic intersection cell —
# TODO confirm, then either bind `a`/`b` to real URL lists or drop the cell.
list(set(a) & set(b))