In [1]:
# Import definitions
import pwiki
import pwiki.wiki
import mwparserfromhell
import re
import os
import urllib.parse
import rdflib

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD, OWL


In [3]:
#Name spaces definition
RESOURCE = Namespace("http://example.org/resource/")
BULBAPEDIA = Namespace("https://bulbapedia.bulbagarden.net/w/api.php")

In [4]:
# piwiki configuration
bulbabot = pwiki.wiki.Wiki(api_endpoint="https://bulbapedia.bulbagarden.net/w/api.php")
print(bulbabot.exists("Bulbasaur (Pokémon)"))

True


In [32]:
# General inforbox extraction based on the standar output of the bulbabot.page_text(title) function
# content = bulbabot.page_text(title)
# pattern = r"{{Pokémon Infobox.*?}}" for example

def extract_infobox(content, pattern):
    # pattern = pattern = r'{{' + re.escape(pattern) + r'\s*(?:[^{}]++|{{[^{}]++}})*}}'
    pattern = pattern = r'{{' + re.escape(pattern).replace(r'\ ', " ") + r'\s*(?:[^{}]++|{{[^{}]++}})*}}'
    # print(pattern)
    # pattern =  r'\{\{' + re.escape(pattern).replace(r'\ ', " ") + r'([^{]*(?:\{[^{]*\}[^{}]*)*)\}\}'
    # print(pattern)
    match = re.search(pattern, content, re.DOTALL)
    if match:
        return match.group(0)
    return None

def load_regex_patterns(file_path):
    regex_dict = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Strip whitespace, skip empty lines or comments
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                
                # Split the line into key and regex pattern by '='
                key, pattern = line.split('=', 1)
                
                # Add to the dictionary
                regex_dict[key.strip()] = pattern.strip()
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except Exception as e:
        print(f"Error: {e}")
    
    return regex_dict

# parsing of the infobox text 
def parse_infobox(infobox):
    properties = {}
    if infobox is None:
        return properties
    lines = infobox.split("\n")
    for line in lines:
        if line.startswith("|"):
            key_value = line[1:].split("=", 1)  # Remove leading `|` and split at `=`
            if len(key_value) == 2:
                key, value = key_value
                new_key = key.strip()
                new_value = value
                new_value = re.sub(r'<.*?>', '', new_value.replace("{{","_").replace("}}","_").replace("|","_").replace(" ", "_"))
                new_value = re.sub(r'<!--.*?-->', '', new_value, flags=re.DOTALL)
                new_value = new_value.strip().replace("[[","").replace("]]","").replace("{{","").replace("}}","").replace("?","").replace("\\", "").replace("\"", "").replace("'", "")
                if new_key == "size":
                    new_value = new_value.replace("px", "")
                if not("|" in new_value):
                    properties[new_key] = new_value
    return properties

def is_valid_xsd_date(date_string):
    # Regex pattern for the date format YYYY-MM-DD
    xsd_date_pattern = r"^\d{4}-\d{2}-\d{2}$"
    
    # Check if the date string matches the pattern
    if re.match(xsd_date_pattern, date_string):
        return True
    else:
        return False
    
def is_valid_uri(uri_string):
    # Simple regex for checking if the string is a valid URI format
    pattern = re.compile(r'^[a-zA-Z][a-zA-Z\d+\-.]*://(?:[^\s/?#]+(?:[^\s/?#]*[^\s/?#])?/?[^\s]*)?$')
    return bool(pattern.match(uri_string))

In [6]:
def read_vocabulary(initial_vocabulary_graph, vocabulary_path): 
    for file_name in os.listdir(vocabulary_path):
        if file_name.endswith('.ttl'):
            initial_vocabulary_graph.parse(os.path.join(vocabulary_path,file_name), format="ttl", encoding="utf-8" )
            print("Read: " + file_name)

    property_mappings = {}

    # Iterate over the graph to get class properties and their types
    for subject, predicate, obj in initial_vocabulary_graph:
        if predicate == RDF.type and obj == RDFS.Class:
            # Class found, check for properties of this class
            class_uri = subject
            for s, p, o in initial_vocabulary_graph.triples((None, RDFS.domain, class_uri)):
                if p == RDFS.domain and o == class_uri:
                    # Now we know this property is for our class
                    property_uri = s
                    # Check for range (type) of the property
                    range_type = None
                    values_list = []

                    for _, p, o in initial_vocabulary_graph.triples((property_uri, RDFS.range, None)):
                        range_type = o

                    values_list.append(range_type)
                    match range_type:
                        case XSD.string:
                            values_list.append('Literal')
                        case XSD.integer:
                            values_list.append('Integer')
                        case XSD.date:
                            values_list.append('Date')
                        case XSD.decimal:
                            values_list.append('Decimal')
                        case _:
                            values_list.append('URIRef')

                    property_mappings[str(property_uri)] = values_list
                        

    return initial_vocabulary_graph, property_mappings

In [11]:
# FUNCTION DEFINITION : mapping of pokemon infobox properties to existing graph
def map_infobox_to_rdf(name_space, graph, property_mappings, properties, bulbapedia_title, resource_category):
    
    # POKEMON = Namespace("http://example.org/pokemon/")
    # name_space = RESOURCE = Namespace("http://example.org/resource/")
    nstype = str(resource_category.replace("Infobox", "").replace("infobox", "").replace(" ", ""))
    ns1 = URIRef(name_space +  nstype.lower())
    PROPERTY = Namespace("http://example.org/property/")

    # print(name_space)
    # print(nstype)
    # print(ns1)
    
    resoucre_uri = URIRef(name_space+ nstype.lower() +"/" + properties.get("name", "Unknown").replace(" ", "_"))
    # Probably best to enhance URI management, must be revised and decided afterwards:
    # resoucre_uri = URIRef(name_space[properties.get("name", "Unknown").replace(" ", "_")])

    if properties.get("name", "Unknown").replace(" ", "_") == "Unknown":
        resoucre_uri = URIRef(name_space+ nstype.lower() +"/" +bulbapedia_title.replace(" ", "_"))
    graph.add((resoucre_uri, RDF.type, URIRef(ns1 + "/" + nstype )))

    # print(graph.serialize(format="turtle"))

    # for propertyUri, propertyRestriction in property_mappings.items():
    #     if name_space in propertyUri:
    #         property_name = propertyUri[propertyUri.rfind('/') + 1:]
    #         print(property_name)
    #         print(propertyUri)
    #         print("----------------------------------------------------------------")

    #         if property_name in properties:
    #             match propertyRestriction[1]:
    #                 case "Literal":
    #                     # print("here1")
    #                     graph.add((resoucre_uri, URIRef(propertyUri), Literal(properties[property_name])))
    #                 case "Integer":
    #                     # print("here2")
    #                     if isinstance(int(properties[property_name]), int):
    #                         graph.add((resoucre_uri, URIRef(propertyUri), Literal(int(properties[property_name]))))
    #                 case "Date": 
    #                     # print("here3")
    #                     graph.add((resoucre_uri, URIRef(propertyUri), Literal(properties[property_name], datatype=XSD.date)))
    #                 case "Decimal": 
    #                     # print("here4")
    #                     graph.add((resoucre_uri, URIRef(propertyUri), Literal(properties[property_name], datatype=XSD.decimal)))
    #                 case _:
    #                     uriString = re.sub(r"<[^>]*>", "", re.sub(r'<.*?>', '', properties[property_name].replace(" ","_").replace(";", "_")))
    #                     if str(name_space) in str(propertyRestriction[0]):
    #                         # print("here5")
    #                         graph.add((resoucre_uri, URIRef(propertyUri), URIRef(propertyRestriction[0] +"/" +uriString)))
    #                     else:
    #                         graph.add((resoucre_uri, URIRef(propertyUri), URIRef(uriString)))
    #                         graph.add((URIRef(uriString), RDF.type ,propertyRestriction[0]))
    #             mapped_properties.append(property_name)

    for propertyToBeIncluded, propertyValue in properties.items():
        #Test if it exist in the property mappings directory 
        testing_property = PROPERTY + propertyToBeIncluded
        if testing_property in property_mappings :      
            propertyRestriction = property_mappings[str(testing_property)]
            propertyValue = re.sub(r'<.*?>', '', propertyValue.replace("{{","_").replace("}}","_").replace("|","_"))
            match propertyRestriction[1]:
                case "Literal":
                    # print("here1")
                    graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue)))
                case "Integer":
                    # print("here2")
                    # if isinstance(int(propertyValue), int):
                    try:
                        graph.add((resoucre_uri, URIRef(testing_property), Literal(int(propertyValue))))
                    except :
                        graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue)))
                case "Date": 
                    # print("here3")
                    try:
                        if is_valid_xsd_date(propertyValue):
                            graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue, datatype=XSD.date)))
                        else: 
                            graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue)))
                    except :
                        graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue)))
                case "Decimal": 
                    # print("here4")
                    try:
                        graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue, datatype=XSD.decimal)))
                    except:
                        graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyValue)))
                case _:
                    uriString = re.sub(r"<[^>]*>", "", re.sub(r'<.*?>', '', propertyValue.replace(" ","_").replace(";", "_")))
                    if (str(name_space) in str(propertyRestriction[0])) and (is_valid_uri(propertyRestriction[0] +"/" +uriString)) :
                        try:
                            graph.add((resoucre_uri, URIRef(testing_property), URIRef(re.sub(r'/[^/]+$', '', propertyRestriction[0]) +"/" +uriString)))
                            graph.add((URIRef(uriString), RDF.type , propertyRestriction[0]))
                        except:
                            graph.add((resoucre_uri, URIRef(testing_property), Literal(propertyRestriction[0]) +"/" +uriString))
                            graph.add((URIRef(uriString), RDF.type , propertyRestriction[0]))
                    else:
                        graph.add((resoucre_uri, URIRef(testing_property), URIRef(uriString)))
                        graph.add((URIRef(uriString), RDF.type , propertyRestriction[0]))


        #If not exist in mappings directorym then adds it as a generic property and a its valua as a literal string
        # algo declare it as a generic property
        else:
    #         print("not")
            propertyToBeIncluded = re.sub(r'<.*?>', '', propertyToBeIncluded.replace("{{","_").replace("}}","_").replace("- ", "").replace("|","_").replace(" ", "_") )
            propertyValue = re.sub(r'<.*?>', '', propertyValue.replace("{{","_").replace("}}","_").replace("|","_").replace(" ", "_"))
            # propertyToBeIncluded = urllib.parse.quote(propertyToBeIncluded)
            # propertyValue = urllib.parse.quote(propertyValue)
            try: 
                graph.add((resoucre_uri, URIRef(name_space+ nstype.lower() +"/" + propertyToBeIncluded), Literal(propertyValue)))
            # graph.add((URIRef(name_space+ nstype.lower() +propertyToBeIncluded), RDF.type, URIRef(name_space+ nstype.lower() +"/genericproperty")))
            except:
                print("Imposible to add property key " + propertyToBeIncluded)
                print("property value:" +propertyValue)
    # print(graph.serialize(format="turtle"))
    # print("----------------------------------------------------------------")

    return graph


In [8]:
def final_function(resources_list, infoboxes_patterns, vocabulary_property_mappings, resource_namespace ):
    for resource in resources_list:
        if resource.endswith('.txt'):
            with open(resource, 'r', encoding='utf-8') as file:
                bulbapedia_titles = [line.strip() for line in file]

            resource_category = resource.split('/')[1].split('_')[0].replace("&", "/")

    
            graph_path = "GeneratedGraphs/"
            if os.path.exists(os.path.join(graph_path, resource_category + "_graph.ttl")):
                continue
            initial_graph = Graph()
            final_graph = Graph()
            for bulbapedia_title in bulbapedia_titles:
                try:
                    # Step 1: Fetch the page text
                    result1 = bulbabot.page_text(title=bulbapedia_title)
                    
                    # # Step 2: Extract the infobox                    
                    result2 = extract_infobox(result1, infoboxes_patterns[resource_category])
                    
                    # # Step 3: Parse the infobox
                    result3 = parse_infobox(result2)
                    # print(resource_category)
                    # a = resource_category.replace("Infobox", "").replace("infobox", "").replace(" ", "")
                    # print(a)
                    # print(URIRef(RESOURCE+a))
                    
                    # Step 4: Map the parsed infobox to RDF
                    final_graph = map_infobox_to_rdf(resource_namespace, initial_graph, vocabulary_property_mappings, result3, bulbapedia_title, resource_category)
                    # Log or handle the final result (optional)
                    with open(os.path.join(graph_path, resource_category + "_graph.ttl"), "w", encoding="utf-8") as file:
                        file.write(final_graph.serialize(format="turtle"))
                    # print(final_graph.serialize(format="turtle"))
                    # print(f"Successfully processed {bulbapedia_title}")
                except Exception as e:
                    # Handle any errors that occur in the pipeline
                    print(f"Error processing {bulbapedia_title}: {e}")
                    break
                

    

In [9]:
regex_infoboxes_path = "Infoboxes\infoboxes.txt"
regex_infoboxes = load_regex_patterns(regex_infoboxes_path)
regex_infoboxes

{'AbilityInfobox': 'AbilityInfobox',
 'AbilityInfoboxNoncat': 'AbilityInfoboxNoncat',
 'AdvArcInfobox': 'AdvArcInfobox',
 'AnimeLocationInfobox': 'AnimeLocationInfobox',
 'AnimePokémonInfobox': 'AnimePokémonInfobox',
 'ArchetypeInfobox': 'ArchetypeInfobox',
 'BadgeInfobox': 'BadgeInfobox',
 'BattleEInfobox': 'BattleEInfobox',
 'BattrioExpansionInfobox': 'BattrioExpansionInfobox',
 'BattrioPuckInfobox': 'BattrioPuckInfobox',
 'BerryInfobox': 'BerryInfobox',
 'Blueinfobox': 'Blueinfobox',
 'Boardgame Infobox': 'Boardgame Infobox',
 'BookInfobox': 'BookInfobox',
 'CDInfobox': 'CDInfobox',
 'ChapInfobox': 'ChapInfobox',
 'Chapterinfobox': 'Chapterinfobox',
 'Character Infobox': 'Character Infobox',
 'CharInfobox': 'CharInfobox',
 'CharInfoboxUser': 'CharInfoboxUser',
 'Company Infobox': 'Company Infobox',
 'Console infobox': 'Console infobox',
 'CountryInfobox': 'CountryInfobox',
 'DeckInfobox': 'DeckInfobox',
 'DuelInfobox': 'DuelInfobox',
 'Dungeon infobox': 'Dungeon infobox',
 'DWGame':

In [12]:
vocabulary_path = f'C:/Users/HP/Documents/07. EMSE/Master DSC/WS/Project-BulbapediaKG/BulbapediaKG1/Vocabulary/'
vocabulary_graph = Graph()

vocabulary = read_vocabulary(vocabulary_graph, vocabulary_path)
vocabulary[1]


Read: infoboxCharacter.ttl
Read: infoboxLocation.ttl
Read: infoboxMove.ttl
Read: infoboxPokemon.ttl
Read: infoboxRegion.ttl


{'http://example.org/resource/move/n': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'),
  'Integer'],
 'http://schema.org/name': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'),
  'Literal'],
 'http://example.org/resource/move/jname': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'),
  'Literal'],
 'http://example.org/resource/move/jtrans': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'),
  'Literal'],
 'http://example.org/resource/move/jtranslit': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'),
  'Literal'],
 'http://example.org/resource/move/gameimage': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#anyURI'),
  'URIRef'],
 'http://example.org/resource/move/gameimage2': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#anyURI'),
  'URIRef'],
 'http://example.org/resource/move/gameimagewidth': [rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'),
  'Literal'],
 'http://example.org/resource/

In [35]:
# Get the ressources path, this must be the folder with all the txt files representing the title pages in bulbapedia 
# that uses an specific infobox template, ie the ressource, namin convention for those files is:
# 1. Get the bulbapedia infobox name, ex MovieInfobox.
# 2. keep only the part before Infobox and add _list.txt

resources_path = 'RessourcesList' 
all_files_and_dirs = os.listdir(resources_path)
resource_list = [("RessourcesList/"+f) for f in all_files_and_dirs if os.path.isfile(os.path.join(resources_path, f))]

test = final_function(resource_list, regex_infoboxes,vocabulary[1], RESOURCE)


http://example.org/resource/region/Region/UnknownCynthia:_''"My_house_Even_the_region_is_a_secret!"''_(_g_Platinum_,_at_the__player_'s_villa) does not look like a valid URI, trying to serialize this will break.
http://example.org/resource/region/UnknownCynthia:_''"My_house_Even_the_region_is_a_secret!"''_(_g_Platinum_,_at_the__player_'s_villa) does not look like a valid URI, trying to serialize this will break.
UnknownCynthia:_''"My_house_Even_the_region_is_a_secret!"''_(_g_Platinum_,_at_the__player_'s_villa) does not look like a valid URI, trying to serialize this will break.


Error processing Cynthia: "UnknownCynthia:_''"My_house_Even_the_region_is_a_secret!"''_(_g_Platinum_,_at_the__player_'s_villa)" does not look like a valid URI, I cannot serialize this as N3/Turtle. Perhaps you wanted to urlencode it?


In [22]:
bulbasaur_test = bulbabot.page_text(title="Bulbasaur (Pokémon)")
bulbasaur_infoboxtest = extract_infobox(bulbasaur_test, regex_infoboxes["Pokémon Infobox"])
bulbasaur_infoboxparsed = parse_infobox(bulbasaur_infoboxtest)

In [33]:
bulbasaur_test = bulbabot.page_text(title="Cynthia")
# print(bulbasaur_test)
# print("-----------------------")
# print(regex_infoboxes["Character Infobox"])
bulbasaur_infoboxtest = extract_infobox(bulbasaur_test, regex_infoboxes["Character Infobox"])
bulbasaur_infoboxtest
bulbasaur_infoboxparsed = parse_infobox(bulbasaur_infoboxtest)
bulbasaur_infoboxparsed


{'color': '_cynthia_color_',
 'bordercolor': '_cynthia_color_dark_',
 'corecolor': '_cynthia_color_light_',
 'name': 'Cynthia',
 'jname': 'シロナ',
 'tmname': 'Shirona',
 'text': 'FFF',
 'slogan': 'no',
 'image': 'Brilliant_Diamond_Shining_Pearl_Cynthia.png',
 'size': '180',
 'caption': 'Art_from__pkmn_Brilliant_Diamond_and_Shining_Pearl_',
 'gender': 'Female',
 'eyes': 'Gray',
 'hair': 'Blonde',
 'hometown': 'UnknownCelestic_Town_tt_*_Pokémon_Adventures_',
 'region': 'UnknownCynthia:_\'\'"My_house_Even_the_region_is_a_secret!"\'\'_(_g_Platinum_,_at_the__player_\'s_villa)',
 'relatives': 'Professor_Carolina_Unnamed_grandmother_and_grandfatherUnnamed_younger_sister'}

In [None]:
vocabulary[1]