In [68]:
# Import definitions
import pwiki
import pwiki.wiki
import mwparserfromhell
import re
import os

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD, OWL


In [69]:
# pwiki configuration: one shared client bound to the Bulbapedia MediaWiki API endpoint

bulbabot = pwiki.wiki.Wiki(api_endpoint="https://bulbapedia.bulbagarden.net/w/api.php")

In [None]:
# pwiki basic tests: print what `exists` and `parse` return for the title "Bulbasaur"
print(bulbabot.exists("Bulbasaur"))

bulbasaur_test = bulbabot.parse(title="Bulbasaur")
print( bulbasaur_test)

In [70]:
# Bulbasaur page content test: fetch the text of the "Bulbasaur (Pokémon)" page
# (this string is the input of the infobox extraction test further down)
bulbasaur_test2 = bulbabot.page_text(title="Bulbasaur (Pokémon)")
bulbasaur_test2

'{{PokémonPrevNext/Head|type=Grass|type2=Poison}}\n{{PokémonPrevNext/GO|species=Bulbasaur}}\n{{PokémonPrevNext/Pokémon|type=Grass|nextnum=0002|next=Ivysaur|round=none}}\n{{PokémonPrevNext/Disambig|species=Bulbasaur}}\n|}\n{{Pokémon Infobox\n|name=Bulbasaur\n|jname=フシギダネ\n|tmname=Fushigidane\n|ndex=0001\n|type1=Grass\n|type2=Poison\n|category=Seed\n|height-ftin=2\'04"\n|height-m=0.7\n|weight-lbs=15.2\n|weight-kg=6.9\n|abilityn=d\n|ability1=Overgrow\n|abilityd=Chlorophyll\n|egggroupn=2\n|egggroup1=Monster\n|egggroup2=Grass\n|eggcycles=20\n|evtotal=1\n|evsa=1\n|expyield=64\n|oldexp=64\n|lv100exp=1,059,860\n|gendercode=31\n|color=Green\n|catchrate=45\n|body=08\n|generation=1\n|pokefordex=bulbasaur\n|friendship=70\n}}\n\'\'\'Bulbasaur\'\'\' ([[List of Japanese Pokémon names|Japanese]]: \'\'\'フシギダネ\'\'\' \'\'Fushigidane\'\') is a dual-type {{2t|Grass|Poison}} {{OBP|Pokémon|species}} introduced in [[Generation I]].\n\nIt [[Evolution|evolves]] into {{p|Ivysaur}} starting at [[level]] 16, which

In [81]:
# Links test: display what `links_on_page` returns for the page "Red".
# NOTE(review): the variable name mentions Bulbasaur images, but the call asks for the
# links of "Red" - confirm which one was intended.
bulbasaur_test2_images= bulbabot.links_on_page(title="Red")
bulbasaur_test2_images

['Ash Ketchum',
 'Core series',
 'FireRed & LeafGreen arc (Adventures)',
 'Generation I',
 'Generation III',
 'Green (Groundbreakers 39)',
 'List of Pokémon by color',
 'PO01',
 'Pokédex',
 'Pokémon Adventures',
 'Pokémon FireRed and LeafGreen Versions',
 'Pokémon Masters EX',
 'Pokémon Mystery Dungeon: Red Rescue Team and Blue Rescue Team',
 'Pokémon Mystery Dungeon series',
 'Pokémon Origins',
 'Pokémon Pocket Monsters',
 'Pokémon Red and Blue Versions',
 'Pokémon Red and Green Versions',
 'Pokémon Ruby-Sapphire',
 'Pokémon Trading Figure Game',
 'Pokémon games',
 'Pokémon the Series',
 'Red, Green & Blue arc (Adventures)',
 'Red (Adventures)',
 'Red (Masters)',
 'Red (Next Quest 42)',
 'Red (Origins)',
 'Red (Pocket Monsters)',
 'Red (Ruby-Sapphire)',
 'Red (game)',
 'Trainer figure (TFG)',
 'Yellow (Unnamed Third Set 42)',
 'Bulbapedia:Disambiguation']

In [57]:
# FUNCTION DEFINITION : infobox text extraction (Pokémon, region, character, move, location)
def _extract_template(content, template_prefix):
    """Return the first template of `content` that opens with ``{{`` + `template_prefix`.

    The closing ``}}`` is found by tracking brace nesting depth, so templates
    nested inside the infobox (for example ``{{tt|a|b}}`` in a field value) no
    longer cut the result short at their own ``}}``.

    Args:
        content: page wikitext to search.
        template_prefix: text expected right after the opening ``{{``
            (prefix match, e.g. "Pokémon Infobox").

    Returns:
        The template text from its opening ``{{`` to the matching ``}}``
        (both included), or None when `content` is empty/None, the opening is
        not found, or the braces never balance.
    """
    if not content:
        return None

    start = content.find("{{" + template_prefix)
    if start == -1:
        return None

    depth = 0
    pos = start
    last = len(content) - 1  # a brace pair needs two characters
    while pos < last:
        pair = content[pos:pos + 2]
        if pair == "{{":
            depth += 1
            pos += 2
        elif pair == "}}":
            depth -= 1
            pos += 2
            if depth == 0:
                # Back at the nesting level of the infobox itself: this is its end.
                return content[start:pos]
        else:
            pos += 1
    return None


def extract_infobox(content):
    """Return the ``{{Pokémon Infobox ...}}`` template of `content`, or None if absent."""
    return _extract_template(content, "Pokémon Infobox")


def extract_region_infobox(content):
    """Return the ``{{RegionInfobox ...}}`` template of `content`, or None if absent."""
    return _extract_template(content, "RegionInfobox")


def extract_character_infobox(content):
    """Return the ``{{Character Infobox ...}}`` template of `content`, or None if absent."""
    return _extract_template(content, "Character Infobox")


def extract_move_infobox(content):
    """Return the ``{{MoveInfobox ...}}`` template of `content`, or None if absent."""
    return _extract_template(content, "MoveInfobox")


def extract_location_infobox(content):
    """Return the ``{{Infobox location ...}}`` template of `content`, or None if absent."""
    return _extract_template(content, "Infobox location")

In [None]:
# infobox extraction test: run extract_infobox on the Bulbasaur page text and display the result
infobox = extract_infobox(bulbasaur_test2) 
infobox

In [4]:
# FUNCTION DEFINITION : parsing of the infobox text
def parse_infobox(infobox):
    """Parse the wikitext of an infobox template into a ``{key: value}`` dict.

    Only lines of the form ``|key=value`` are considered (leading whitespace
    before the ``|`` is tolerated). For each value, HTML comments are removed,
    the link/template markers ``[[``, ``]]``, ``{{`` and ``}}`` are dropped and
    surrounding whitespace is stripped. For the key ``size`` the text ``px`` is
    removed as well. Values that still contain ``|`` after this cleanup are
    skipped. When a key appears more than once, the last line wins.

    Args:
        infobox: infobox wikitext, one parameter per line.

    Returns:
        dict mapping parameter names to cleaned string values.

    Raises:
        ValueError: if `infobox` is None (no infobox was found on the page).
    """
    if infobox is None:
        raise ValueError("no infobox text to parse (infobox is None)")

    properties = {}
    for line in infobox.split("\n"):
        line = line.lstrip()
        if not line.startswith("|"):
            continue

        # Drop the leading `|` and split at the first `=` only: values may contain `=`.
        key, separator, value = line[1:].partition("=")
        if not separator:
            continue

        key = key.strip()
        value = re.sub(r'<!--.*?-->', '', value, flags=re.DOTALL)
        for markup in ("[[", "]]", "{{", "}}"):
            value = value.replace(markup, "")
        # Strip last, so whitespace left in front of a removed comment does not survive.
        value = value.strip()

        if key == "size":
            value = value.replace("px", "")
        if "|" not in value:
            properties[key] = value
    return properties

In [None]:
# Infobox text parsing test: turn the extracted infobox text into a properties dict
properties = parse_infobox(infobox)
properties

In [None]:
# Vocabulary graph: load every Turtle file of the vocabulary folder, then derive
# `property_mappings`, a dict {property URI (str): [range term, value kind]}.

#path
vocabulary_path = 'C:/Users/HP/Documents/07. EMSE/Master DSC/WS/Project-BulbapediaKG/BulbapediaKG1/Vocabulary/'

vocabulary_graph = Graph()

for file_name in os.listdir(vocabulary_path):
    if not file_name.endswith('.ttl'):
        continue
    vocabulary_graph.parse(os.path.join(vocabulary_path, file_name), format="ttl")
    print(file_name)


# Value kind used for each XSD range; any other range (or no range) maps to 'URIRef'.
range_kinds = {
    XSD.string: 'Literal',
    XSD.integer: 'Integer',
    XSD.date: 'Date',
    XSD.decimal: 'Decimal',
}

property_mappings = {}

# For every rdfs:Class of the vocabulary, record each property whose rdfs:domain is that class.
for subject, predicate, obj in vocabulary_graph:
    if predicate != RDF.type or obj != RDFS.Class:
        continue
    class_uri = subject

    for property_uri, _, _ in vocabulary_graph.triples((None, RDFS.domain, class_uri)):
        # Keep the last rdfs:range yielded for the property; None when it declares none.
        range_type = None
        for _, _, range_term in vocabulary_graph.triples((property_uri, RDFS.range, None)):
            range_type = range_term

        values_list = [range_type, range_kinds.get(range_type, 'URIRef')]
        property_mappings[str(property_uri)] = values_list


property_mappings


In [31]:
# FUNCTION DEFINITION : mapping of infobox properties to an existing graph
def map_infobox_to_rdf(name_space, graph, property_mappings, properties):
    """Add one infobox resource and its mapped properties to `graph`.

    The resource URI is built in `name_space` from ``properties["name"]`` (spaces
    replaced by ``_``); when no name is present, ``properties["translated_name"]``
    is used instead, and "Unknown" as a last resort. The resource is typed with
    `name_space` + the last path segment of the namespace.

    Every entry of `property_mappings` whose URI contains `name_space` and whose
    last path segment is a key of `properties` produces one triple, according to
    the entry's value kind:

    * "Literal"  -> plain literal
    * "Integer"  -> integer literal (values that are not integers are skipped)
    * "Date"     -> literal typed xsd:date
    * "Decimal"  -> literal typed xsd:decimal
    * otherwise  -> URI reference: ``<range>/<value>`` when the range lies in
      `name_space`, else the bare value, which is then also typed with the range
      when a range is declared.

    Empty values are skipped.

    Args:
        name_space: rdflib Namespace of the resource kind (e.g. ".../pokemon/").
        graph: rdflib Graph to add the triples to (modified in place).
        property_mappings: dict {property URI (str): [range term or None, value kind]}.
        properties: dict of infobox parameters, as produced by parse_infobox.

    Returns:
        The same `graph` object that was passed in.
    """
    rdf_type = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

    # Last path segment of the namespace names the resource class,
    # e.g. "http://example.org/pokemon/" -> "pokemon".
    nstype = str(name_space).rstrip('/').split('/')[-1]

    resource_name = properties.get("name", "Unknown").replace(" ", "_")
    if resource_name == "Unknown":
        # No usable "name" parameter: fall back to the translated name.
        resource_name = properties.get("translated_name", "Unknown").replace(" ", "_")
    resource_uri = URIRef(name_space[resource_name])
    graph.add((resource_uri, rdf_type, URIRef(name_space + nstype)))

    for property_uri, property_restriction in property_mappings.items():
        if name_space not in property_uri:
            continue

        property_name = property_uri[property_uri.rfind('/') + 1:]
        raw_value = properties.get(property_name)
        if not raw_value:
            # Parameter absent from this infobox, or present but empty.
            continue

        predicate = URIRef(property_uri)
        range_term, value_kind = property_restriction[0], property_restriction[1]

        if value_kind == "Literal":
            graph.add((resource_uri, predicate, Literal(raw_value)))
        elif value_kind == "Integer":
            try:
                int_value = int(raw_value)
            except ValueError:
                # Not an integer (e.g. "1,059,860" or free text): skip this property
                # only, so the remaining properties of the resource are still mapped.
                continue
            graph.add((resource_uri, predicate, Literal(int_value)))
        elif value_kind == "Date":
            graph.add((resource_uri, predicate, Literal(raw_value, datatype=XSD.date)))
        elif value_kind == "Decimal":
            graph.add((resource_uri, predicate, Literal(raw_value, datatype=XSD.decimal)))
        else:
            uri_string = raw_value.replace(" ", "_")
            if range_term is not None and str(name_space) in str(range_term):
                graph.add((resource_uri, predicate, URIRef(range_term + "/" + uri_string)))
            else:
                target = URIRef(uri_string)
                graph.add((resource_uri, predicate, target))
                if range_term is not None:
                    # A property without rdfs:range has no class to type the target with.
                    graph.add((target, rdf_type, range_term))

    return graph


In [None]:
# Single-resource mapping test: map the `properties` dict built in the parsing
# test cell into a fresh graph and print it as Turtle.
pokemon_infobox_graph = Graph()
POKEMON = Namespace("http://example.org/pokemon/")
pokemon_infobox_graph = map_infobox_to_rdf(POKEMON, pokemon_infobox_graph, property_mappings, properties=properties) 
print(pokemon_infobox_graph.serialize(format="turtle"))

In [None]:
# Get the list of pokemons: fetch the text of the National Pokédex list page.
# At this point `poke_list` holds that page text; a later cell replaces it with a list of titles.
poke_list = bulbabot.page_text("List_of_Pokémon_by_National_Pokédex_number")
poke_list


In [42]:
# FUNCTION DEFINITION : get pokemon list
def extract_third_substrings(input_string):
    """Return the third ``|``-separated field of every ``{{ndex|...}}`` template.

    For ``{{ndex|0001|Bulbasaur|Grass|Poison}}`` the extracted value is
    ``"Bulbasaur"``. Fields after the third one are optional.

    Args:
        input_string: wikitext containing ``{{ndex|<digits>|<name>...}}`` entries.

    Returns:
        list of the extracted names, in order of appearance.
    """
    # The name may not contain `|`, braces or a newline. Without that restriction
    # a three-field entry such as `{{ndex|0001|Bulbasaur}}` lets the name run past
    # its own `}}` into the next entry.
    pattern = r"\{\{ndex\|\d+\|([^|{}\n]+)(?:\|.*?)?\}\}"
    return re.findall(pattern, input_string)

In [None]:
# Turn the list-page text into page titles of the form "<name> (Pokémon)".
# Guarded on the type so that re-running this cell does not feed the already
# built list back into the regex (which raises TypeError).
if isinstance(poke_list, str):
    poke_list = [name + " (Pokémon)" for name in extract_third_substrings(poke_list)]
poke_list

In [None]:
POKEMON = Namespace("http://example.org/pokemon/")
intial_graph = Graph()
final_graph = Graph()

# Per page: page text -> infobox text -> properties dict -> RDF triples.
# Every page is mapped into `intial_graph`; `final_graph` is rebound to the graph
# returned by the mapping after each successful page.
for pokemon in poke_list:
    try:
        page_wikitext = bulbabot.page_text(title=pokemon)
        infobox_text = extract_infobox(page_wikitext)
        infobox_properties = parse_infobox(infobox_text)
        final_graph = map_infobox_to_rdf(
            POKEMON, intial_graph, property_mappings, infobox_properties
        )
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Error processing {pokemon}: {e}")

In [None]:
# Save and print the created graph
graph_path = "GeneratedGraphs/"
# open(..., "w") does not create missing folders, so make sure the output folder exists.
os.makedirs(graph_path, exist_ok=True)

# Serialize once and reuse the text for both the printout and the file.
poke_turtle = final_graph.serialize(format="turtle")
print(poke_turtle)
with open(os.path.join(graph_path, "poke_graph.ttl"), "w", encoding="utf-8") as file:
    file.write(poke_turtle)

In [7]:
# Page titles that the region pipeline below fetches and searches for a RegionInfobox.
# NOTE(review): the name is misspelt ("wiht"); it is kept because the region loop refers to it.
Regions_wiht_infobox = [
    "Kanto",
    "Johto",
    "Hoenn",
    "Orre",
    "Sevii Islands",
    "Orange Islands",
    "Pokémon Island",
    "White City",
    "Fiore",
    "Sinnoh",
    "Pokétopia",
    "Pokémon world (Mystery Dungeon)",
    "Mintale Town",
    "Almia",
    "Trading Card Game Islands",
    "Oblivia",
    "Unova",
    "PokéPark (game)",
    "Ransei",
    "Decolore Islands",
    "Kalos",
    "Ferrum",
    "Alola",
    "Carmonte Island",
    "Ryme City",
    "Tumblecube Island",
    "Galar",
    "Pasio",
    "Isle of Armor",
    "Crown Tundra",
    "Lental",
    "Hisui",
    "Aeos Island",
    "Paldea",
    "Kitakami",
    "Blueberry Academy"
]

In [8]:
final_region_graph = Graph()
intial_region_graph = Graph()
REGION = Namespace("http://example.org/region/")

# Per page: page text -> region infobox text -> properties dict -> RDF triples.
# Every region is mapped into `intial_region_graph`; `final_region_graph` is rebound
# to the graph returned by the mapping after each successful page.
for region in Regions_wiht_infobox:
    try:
        page_wikitext = bulbabot.page_text(title=region)
        infobox_text = extract_region_infobox(page_wikitext)
        infobox_properties = parse_infobox(infobox_text)
        final_region_graph = map_infobox_to_rdf(
            REGION, intial_region_graph, property_mappings, infobox_properties
        )
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Error processing {region}: {e}")
        print(e)

In [None]:
# Print the region graph as Turtle and write the same text to region_graph.ttl
# (`graph_path` comes from the cell that saves the Pokémon graph).
region_turtle = final_region_graph.serialize(format="turtle")
print(region_turtle)
region_graph_file = os.path.join(graph_path, "region_graph.ttl")
with open(region_graph_file, "w", encoding="utf-8") as file:
    file.write(region_turtle)

In [None]:
# Load the character page titles, one per line; blank lines are ignored.
# The path is built with os.path.join: the former '\c' in the literal is an invalid
# escape sequence and the backslash separator only works on Windows.
# NOTE(review): no explicit encoding is given, so the platform default is used -
# confirm the encoding of the list file.
with open(os.path.join('RessourcesList', 'character_list.txt'), 'r') as file:
    character_list = [line.strip() for line in file if line.strip()]

print(character_list)

In [None]:
CHARACTER = Namespace("http://example.org/character/")
intial_character_graph = Graph()
final_character_graph = Graph()

# Per page: page text -> character infobox text -> properties dict -> RDF triples.
# Every character is mapped into `intial_character_graph`; `final_character_graph`
# is rebound to the graph returned by the mapping after each successful page.
for character in character_list:
    try:
        page_wikitext = bulbabot.page_text(title=character)
        infobox_text = extract_character_infobox(page_wikitext)
        infobox_properties = parse_infobox(infobox_text)
        final_character_graph = map_infobox_to_rdf(
            CHARACTER, intial_character_graph, property_mappings, infobox_properties
        )
        print(f"Successfully processed {character}")
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Error processing {character}: {e}")

In [None]:
# Print the character graph as Turtle and write the same text to character_graph.ttl
# (`graph_path` comes from the cell that saves the Pokémon graph).
character_turtle = final_character_graph.serialize(format="turtle")
print(character_turtle)
character_graph_file = os.path.join(graph_path, "character_graph.ttl")
with open(character_graph_file, "w", encoding="utf-8") as file:
    file.write(character_turtle)

In [None]:
# Load the move page titles, one per line; blank lines are ignored.
# The path is built with os.path.join: the former '\m' in the literal is an invalid
# escape sequence and the backslash separator only works on Windows.
# NOTE(review): no explicit encoding is given, so the platform default is used -
# confirm the encoding of the list file.
with open(os.path.join('RessourcesList', 'move_list.txt'), 'r') as file:
    move_list = [line.strip() for line in file if line.strip()]

print(move_list)

In [None]:
MOVE = Namespace("http://example.org/move/")
intial_move_graph = Graph()
final_move_graph = Graph()

# Per page: page text -> move infobox text -> properties dict -> RDF triples.
# Every move is mapped into `intial_move_graph`; `final_move_graph` is rebound to
# the graph returned by the mapping after each successful page.
for individual_move in move_list:
    try:
        page_wikitext = bulbabot.page_text(title=individual_move)
        infobox_text = extract_move_infobox(page_wikitext)
        infobox_properties = parse_infobox(infobox_text)
        final_move_graph = map_infobox_to_rdf(
            MOVE, intial_move_graph, property_mappings, infobox_properties
        )
        print(f"Successfully processed {individual_move}")
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Error processing {individual_move}: {e}")

In [None]:
# Print the move graph as Turtle and write the same text to move_graph.ttl
# (`graph_path` comes from the cell that saves the Pokémon graph).
move_turtle = final_move_graph.serialize(format="turtle")
print(move_turtle)
move_graph_file = os.path.join(graph_path, "move_graph.ttl")
with open(move_graph_file, "w", encoding="utf-8") as file:
    file.write(move_turtle)

In [None]:
# Load the location page titles, one per line; blank lines are ignored.
# The path is built with os.path.join: the former '\l' in the literal is an invalid
# escape sequence and the backslash separator only works on Windows.
# NOTE(review): no explicit encoding is given, so the platform default is used -
# confirm the encoding of the list file.
with open(os.path.join('RessourcesList', 'location_list.txt'), 'r') as file:
    location_list = [line.strip() for line in file if line.strip()]

print(location_list)

In [None]:
LOCATION = Namespace("http://example.org/location/")
intial_location_graph = Graph()
final_location_graph = Graph()

# Per page: page text -> location infobox text -> properties dict -> RDF triples.
# Every location is mapped into `intial_location_graph`; `final_location_graph` is
# rebound to the graph returned by the mapping after each successful page.
for individual_location in location_list:
    try:
        page_wikitext = bulbabot.page_text(title=individual_location)
        infobox_text = extract_location_infobox(page_wikitext)
        infobox_properties = parse_infobox(infobox_text)
        final_location_graph = map_infobox_to_rdf(
            LOCATION, intial_location_graph, property_mappings, infobox_properties
        )
        print(f"Successfully processed {individual_location}")
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Error processing {individual_location}: {e}")

In [None]:
# Print the location graph as Turtle.
# Writing location_graph.ttl is currently disabled (the two lines below are commented out).
print(final_location_graph.serialize(format="turtle"))
# with open(os.path.join(graph_path,"location_graph.ttl"), "w", encoding="utf-8") as file:
#     file.write(final_location_graph.serialize(format="turtle"))

In [73]:
# def map_infobox_to_rdf(properties):
#     # Define RDF namespaces
#     POKEMON = Namespace("http://example.org/pokemon/")
#     RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

#     # Initialize RDF graph
#     g = Graph()

#     # Add Pokémon URI
#     pokemon_uri = URIRef(POKEMON[properties.get("name", "Unknown").replace(" ", "_")])
#     g.add((pokemon_uri, RDF.type, POKEMON.Pokémon))

#     # Map general properties
#     mappings = {
#         "name": POKEMON.name,
#         "jname": POKEMON.japaneseName,
#         "tmname": POKEMON.trademarkedJapaneseName,
#         "jtranslit": POKEMON.japaneseTransliteration,
#         "category": POKEMON.category,
#         "ndex": POKEMON.nationalDexNumber,
#         "forme": POKEMON.numberOfForms,
#         "type1": POKEMON.primaryType,
#         "type2": POKEMON.secondaryType,
#         "ability1": POKEMON.ability1,
#         "ability2": POKEMON.ability2,
#         "abilityd": POKEMON.hiddenAbility,
#         "height-m": POKEMON.heightInMeters,
#         "height-ftin": POKEMON.heightInFeetInches,
#         "weight-kg": POKEMON.weightInKilograms,
#         "weight-lbs": POKEMON.weightInPounds,
#         "catchrate": POKEMON.catchRate,
#         "gendercode": POKEMON.genderRatioCode,
#         "egggroup1": POKEMON.eggGroup1,
#         "egggroup2": POKEMON.eggGroup2,
#         "color": POKEMON.color,
#         "friendship": POKEMON.baseFriendship,
#         "generation": POKEMON.generationIntroduced,
#         "expyield": POKEMON.baseExperienceYield,
#         "evtotal": POKEMON.evTotal,
#         "evhp": POKEMON.evYieldHP,
#         "evat": POKEMON.evYieldAttack,
#         "evde": POKEMON.evYieldDefense,
#         "evsa": POKEMON.evYieldSpecialAttack,
#         "evsd": POKEMON.evYieldSpecialDefense,
#         "evsp": POKEMON.evYieldSpeed
#     }

#     # Map form-specific properties dynamically
#     for i in range(2, 7):  # Forms 2 through 6
#         form_suffix = f"form{i}"
#         mappings.update({
#             f"{form_suffix}": POKEMON[f"form{i}Name"],
#             f"{form_suffix}type1": POKEMON[f"form{i}PrimaryType"],
#             f"{form_suffix}type2": POKEMON[f"form{i}SecondaryType"],
#             f"height-m{i}": POKEMON[f"form{i}HeightInMeters"],
#             f"height-ftin{i}": POKEMON[f"form{i}HeightInFeetInches"],
#             f"weight-kg{i}": POKEMON[f"form{i}WeightInKilograms"],
#             f"weight-lbs{i}": POKEMON[f"form{i}WeightInPounds"],
#         })

#     # Generate RDF triples for properties that exist in the input
#     for key, predicate in mappings.items():
#         if key in properties and properties[key]:
#             g.add((pokemon_uri, predicate, Literal(properties[key])))

#     return g

In [None]:
# grap = map_infobox_to_rdf(properties)
# print(grap.serialize(format="turtle"))

In [100]:
# # FUNCTION DEFINITION : mapping of character infobox properties to existing graph
# def map_character_infobox_to_rdf(graph, property_mappings, properties):
#     CHARACTER = Namespace("http://example.org/character/")
#     RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

#     character_uri = URIRef(CHARACTER[properties.get("name", "Unknown").replace(" ", "_")])
#     graph.add((character_uri, RDF.type, CHARACTER.Character))

#     mappings = {
#         "color": CHARACTER.color,
#         "bordercolor": CHARACTER.borderColor,
#         "corecolor": CHARACTER.coreColor,
#         "name": CHARACTER.name,
#         "jname": CHARACTER.japaneseName,
#         "tmname": CHARACTER.romanizedJapaneseName,
#         "jtranslit": CHARACTER.japaneseTransliteration,
#         "text": CHARACTER.textColor,
#         "sloganline": CHARACTER.sloganLine,
#         "image": CHARACTER.image,
#         "size": CHARACTER.imageSize,
#         "caption": CHARACTER.imageCaption,
#         "age": CHARACTER.age,
#         "birthday": CHARACTER.birthday,
#         "gender": CHARACTER.gender,
#         "height": CHARACTER.height,
#         "eyes": CHARACTER.eyeColor,
#         "hair": CHARACTER.hairColor,
#         "hometown": CHARACTER.hometown,
#         "region": CHARACTER.region,
#         "relatives": CHARACTER.relatives,
#     }

#     for key, predicate in mappings.items():
#         if key in properties and properties[key]:
#             property_type = property_mappings.get(str(URIRef(CHARACTER + key)))
#             match property_type:
#                 case "Literal":
#                     graph.add((character_uri, predicate, Literal(properties[key])))
#                 case "Integer":
#                     if isinstance(int(properties[key]), int):
#                         graph.add((character_uri, predicate, Literal(int(properties[key]))))
#                 case _:
#                     print(property_type)
#                     uriString = properties[key].replace(" ","_")
#                     if not("<" in uriString) and not(">" in uriString):
#                         graph.add((character_uri, predicate, URIRef(CHARACTER + uriString)))

#     return graph

In [90]:
# # FUNCTION DEFINITION : mapping of region infobox properties to existing graph
# def map_region_infobox_to_rdf(graphModify, property_mappings, properties):

#     REGION = Namespace("http://example.org/region/")
#     RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

#     region_uri = URIRef(REGION[properties.get("name", "Unknown").replace(" ", "_")])
#     graphModify.add((region_uri, RDF.type, REGION.Region))

#     mappings = {
#         "regioncolor": REGION.colorTemplate,
#         "name": REGION.name,
#         "jname": REGION.japaneseName,
#         "tmname": REGION.romanizedJapaneseName,
#         "region": REGION.isRegion,
#         "image": REGION.image,
#         "size": REGION.imageSize,
#         "caption": REGION.imageCaption,
#         "introduction": REGION.introductionMedia,
#         "professor": REGION.professor,
#         "firstpartner": REGION.firstPartnerPokemon,
#         "villain": REGION.villainousOrganization,
#         "league": REGION.league,
#         "location": REGION.leagueLocation,
#         "pokedext": REGION.pokedex,
#         "series": REGION.animationSeries,
#         "season": REGION.animationSeason,
#         "generation": REGION.generation,
#         "games": REGION.games,
#         "manga": REGION.mangaChapter,
#     }
    
#     for key, predicate in mappings.items():
#         if key in properties and properties[key]:
#             property_type = property_mappings.get(str(URIRef(REGION + key)))
#             match property_type:
#                 case "Literal":
#                     graphModify.add((region_uri, predicate, Literal(properties[key])))
#                 case "Integer":
#                     if isinstance(int(properties[key]), int):
#                         graphModify.add((region_uri, predicate, Literal(int(properties[key]))))
#                 case _:
#                     uriString = properties[key].replace(" ","_")
#                     graphModify.add((region_uri, predicate, URIRef(REGION + uriString)))

#     return graphModify