In [1]:
import pandas as pd
import requests
from tqdm import tqdm

# Base URLs of the scraped sites. The fetch helpers append the Pokémon name
# (or, for pokemondb, its mapped slug) directly to these strings, so each one
# must keep its trailing slash.
fandom_url = "https://pokemon.fandom.com/wiki/"
gameinfo_url = "https://pokemon.gameinfo.io/en/pokemon/"
pokemondb_url = "https://pokemondb.net/pokedex/"

def get_bulbapedia_url(pokemon):
    """Build the Bulbapedia article URL for ``pokemon``.

    The article title is the name followed by the percent-encoded suffix
    ``_(Pokémon)``; the name itself is inserted as given.
    """
    wiki_root = "https://bulbapedia.bulbagarden.net/wiki/"
    article_title = f"{pokemon}_(Pok%C3%A9mon)"
    return wiki_root + article_title

# Load the Pokémon table from the working directory. Later cells read its
# "Name", "Generation" and "#" columns.
pokemon_df = pd.read_csv("./pokemon.csv")

In [6]:
# Names to scrape: only rows whose "Generation" is 1.
# To scrape every row instead, use pokemon_df["Name"].to_list().
is_first_generation = pokemon_df["Generation"] == 1
pokemon_name_list = pokemon_df.loc[is_first_generation, "Name"].to_list()

# Names that are looked up on pokemondb under a different URL slug.
pokemondb_mapping = {
    "Nidoran♀": "nidoran-f",
    "Nidoran♂": "nidoran-m",
    "Farfetch'd": "Farfetchd",
    "Mr. Mime": "mr-mime",
}

In [7]:
def get_index(pokemon):
    """Return the "#" value of the first ``pokemon_df`` row named ``pokemon``.

    Raises IndexError when no row has that name.
    """
    name_matches = pokemon_df["Name"] == pokemon
    matching_rows = pokemon_df.loc[name_matches]
    return matching_rows["#"].values[0]

In [8]:
import re
# Matches an HTML tag, or a named / decimal / hexadecimal character reference.
# Case-insensitive so references such as &Eacute; or &#xA0; are recognised too.
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)


def cleanhtml(raw_html):
    """Strip HTML tags from ``raw_html`` and decode its character references.

    Tags are removed. Character references are replaced by the character they
    stand for (a non-breaking space becomes a plain space) instead of being
    deleted, so "Farfetch&#39;d" keeps its apostrophe and "6.9&#160;kg" keeps
    the gap between number and unit. References that cannot be decoded are
    dropped.

    Args:
        raw_html: HTML fragment as a string.

    Returns:
        The fragment as plain text.
    """
    import html

    def _replacement(match):
        token = match.group(0)
        if token.startswith("<"):
            return ""
        decoded = html.unescape(token)
        if decoded == token:
            # Unknown reference: nothing sensible to substitute, drop it.
            return ""
        # Normalise non-breaking spaces so later space-collapsing applies.
        return " " if decoded == "\xa0" else decoded

    # Single pass: decoded output is never re-scanned, so "&amp;lt;" yields
    # the literal text "&lt;" rather than being decoded twice.
    return CLEANR.sub(_replacement, raw_html)

In [9]:
def get_fandom_data(pokemon, timeout=30):
    """Download the Fandom page for ``pokemon`` and return one section of it.

    The returned HTML is the text between the "Biology" headline (or, when the
    page has none, the "Physiology" headline) and the next headline whose id
    starts with "Game".

    Args:
        pokemon: Name appended to ``fandom_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw HTML of the section, or None when the request fails or the
        start headline is missing (a message is printed in that case).
    """
    biology_marker = "<span class=\"mw-headline\" id=\"Biology\">Biology</span>"
    physiology_marker = "<span class=\"mw-headline\" id=\"Physiology\">Physiology</span>"
    end_marker = "<span class=\"mw-headline\" id=\"Game"
    try:
        # The request lives inside the try so that a network failure skips
        # this source instead of aborting the whole scraping loop.
        text = requests.get(fandom_url+pokemon, timeout=timeout).text
        start_marker = biology_marker if biology_marker in text else physiology_marker
        # IndexError here means the start headline is absent from the page.
        subtext = text.split(start_marker)[1].split(end_marker)[0]
    except (requests.RequestException, IndexError):
        print(f"Fandom problem {pokemon}")
        return None
    return subtext

def get_bulbapedia_data(pokemon, timeout=30):
    """Download the Bulbapedia article for ``pokemon`` and return its Biology section.

    The returned HTML is the text between the "Biology" headline and the
    "In the anime" headline (or the end of the page when the latter is absent).

    Args:
        pokemon: Name passed to ``get_bulbapedia_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw HTML of the section, or None when the request fails or the
        "Biology" headline is missing (a message is printed in that case).
    """
    start_marker = "<span class=\"mw-headline\" id=\"Biology\">Biology</span>"
    end_marker = "<span class=\"mw-headline\" id=\"In_the_anime\">In the anime</span>"
    bulba_url = get_bulbapedia_url(pokemon)
    try:
        # The request lives inside the try so that a network failure skips
        # this source instead of aborting the whole scraping loop.
        text = requests.get(bulba_url, timeout=timeout).text
        # IndexError here means the "Biology" headline is absent from the page.
        subtext = text.split(start_marker)[1].split(end_marker)[0]
    except (requests.RequestException, IndexError):
        print(f"Bulbapedia problem {pokemon}")
        return None
    return subtext

def get_gameinfo_data(pokemon, timeout=30):
    """Download the gameinfo page for ``pokemon`` and return its "about" article.

    The returned HTML is the content of the first
    ``<article class="pokemon-about">`` element.

    Args:
        pokemon: Name appended to ``gameinfo_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw HTML inside the article, or None when the request fails or the
        article is missing (a message is printed in that case).
    """
    try:
        # The request lives inside the try so that a network failure skips
        # this source instead of aborting the whole scraping loop.
        text = requests.get(gameinfo_url+pokemon, timeout=timeout).text
        # IndexError here means the page has no "pokemon-about" article.
        article_and_rest = text.split("<article class=\"pokemon-about\">")[1]
        subtext = article_and_rest.split("</article>")[0]
    except (requests.RequestException, IndexError):
        print(f"Gameinfo problem {pokemon}")
        return None
    return subtext

def get_pokedex_data(pokemon, timeout=30):
    """Download the pokemondb page for ``pokemon`` and return its Pokédex entries.

    The returned HTML is the body of the first table that follows the
    "Pokédex entries" heading, with the ``<th>`` cells removed. The search is
    bounded by the "Moves learned by <name>" heading when the page has one.

    Args:
        pokemon: Dataset name; translated through ``pokemondb_mapping`` when
            the site uses a different URL slug.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The table-body HTML, or None when the request fails or the expected
        markup is missing (a message is printed in that case).
    """
    slug = pokemondb_mapping.get(pokemon, pokemon)
    try:
        # The request lives inside the try so that a network failure skips
        # this source instead of aborting the whole scraping loop.
        text = requests.get(pokemondb_url+slug, timeout=timeout).text
        entries = text.split("<h2>Pokédex entries</h2>")[1]
        # Use this Pokémon's own heading as the end marker rather than a fixed
        # one. When the heading is not found the split leaves the text as is.
        entries = entries.split(f"<h2>Moves learned by {pokemon}</h2>")[0]
        # IndexError above or here means the heading or table is absent.
        subtext = entries.split("<tbody>")[1].split("</tbody>")[0]
        subtext = re.sub(r'<th>.*?</th>', '', subtext)
    except (requests.RequestException, IndexError):
        print(f"Pokemondb problem {slug}")
        return None
    return subtext

def clean_data(subtext):
    """Turn an HTML fragment into a list of unique, tidied sentences.

    The fragment is passed through ``cleanhtml`` and split on ".". Pieces that
    mention "artwork" are skipped. For each remaining piece only the text after
    its last newline is kept; tabs and double quotes are removed, semicolons
    become spaces, runs of spaces are collapsed and the ends are trimmed.
    Empty results and repeats are dropped; first-seen order is preserved.
    """
    unique_sentences = []
    already_seen = set()
    for fragment in cleanhtml(subtext).split("."):
        if "artwork" in fragment:
            continue
        # Keep only what follows the last line break (the whole piece if none).
        tidy = fragment.rsplit("\n", 1)[-1]
        tidy = tidy.replace("\t", "").replace("\"", "").replace(";", " ")
        tidy = re.sub(' +', ' ', tidy).strip()
        if not tidy or tidy in already_seen:
            continue
        already_seen.add(tidy)
        unique_sentences.append(tidy)
    return unique_sentences

In [None]:
# Scrape every source for each name and collect the cleaned sentences per Pokémon.
pokemon_desc_dict = {}
source_fetchers = (
    get_fandom_data,
    get_bulbapedia_data,
    get_gameinfo_data,
    get_pokedex_data,
)
for pokemon in tqdm(pokemon_name_list):
    # Query all sources first, in a fixed order, then clean whatever came back.
    raw_sections = [fetch(pokemon) for fetch in source_fetchers]
    pokemon_list = []
    for section in raw_sections:
        # A fetcher returns None when its source could not be scraped.
        if section is not None:
            pokemon_list.extend(clean_data(section))

    pokemon_desc_dict[pokemon] = pokemon_list

In [11]:
# Write one "text;name;labels" row per collected sentence. The label is the
# Pokémon's "#" value minus one.
with open('./all_pokemon_description.csv','w', encoding="utf-8") as file:
    file.write("text;name;labels\n")
    for pokemon, sentences in pokemon_desc_dict.items():
        if not sentences:
            # No rows for this Pokémon, so no label lookup is needed either.
            continue
        # The label depends only on the Pokémon: look it up once, not once
        # per sentence (each lookup scans the DataFrame).
        label = get_index(pokemon)-1
        file.writelines(f"{sentence};{pokemon};{label}\n" for sentence in sentences)

In [12]:
# Number of Pokémon names collected by the scraping loop (one dict key per name).
# NOTE(review): the cell execution counts are non-sequential, so the saved
# output may not reflect the current name filter — restart and re-run to confirm.
len(pokemon_desc_dict)

752