[Harnessing the Power of Knowledge Graphs: Enriching an LLM with Structured Data](https://towardsdatascience.com/harnessing-the-power-of-knowledge-graphs-enriching-an-llm-with-structured-data-997fabc62386)

In [None]:
!pip3 install pandas rdflib SPARQLWrapper tqdm

In [None]:
import requests
import json
import pandas as pd

# World Bank "Documents & Reports" search endpoint and the query sent to it.
url = 'https://search.worldbank.org/api/v2/wds'
params = {
  'format': 'json',
  'display_title': '"sustainable development"',
  'rows': 20,
  'page': 1
}

# Number of consecutive result pages to download, starting at params['page'].
num_pages = 1

metadata_list = []

for _ in range(num_pages):
  # A timeout keeps the cell from hanging forever if the API stops responding.
  response = requests.get(url, params=params, timeout=30)
  # Fail loudly on an HTTP error instead of trying to parse an error page.
  response.raise_for_status()
  data = response.json()
  # data['documents'] maps an identifier to that entry's metadata record;
  # only the records are kept.
  metadata_list.extend(data['documents'].values())

  params['page'] += 1

# One row per downloaded metadata record.
df = pd.DataFrame(metadata_list)

In [None]:
# Preview the first rows with the rich DataFrame display instead of printing
# the whole frame as plain text.
df.head()

In [None]:
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, XSD
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

# Namespaces for the terms created in this notebook and for Wikidata entities
schema = Namespace('http://schema.org/')
wd = Namespace('http://www.wikidata.org/entity/')

# Empty RDF graph that the following cells populate
g = Graph()

# Prefix -> namespace table, also reused when the graph is serialized
prefixes = dict(schema=schema, wd=wd, skos=SKOS, xsd=XSD)

# Register every prefix on the graph
for prefix, namespace in prefixes.items():
  g.bind(prefix, namespace)

In [None]:
def create_subclass_country(column):
  """Add one RDFS class per distinct value of ``df[column]`` to the graph ``g``.

  Every distinct value (except the string "nan") becomes a subclass of
  ``schema:country``. Wikidata is queried for an instance of country
  (``wdt:P31 wd:Q6256``) whose English label equals the value; when one is
  found, its entity URI is attached to the subclass through ``schema:wd_URI``
  and written to the ``country_URI`` column of ``df``.

  Side effects: mutates the module-level ``g`` (adds triples) and ``df``
  (casts ``df[column]`` to ``str`` and sets ``country_URI`` on matching rows),
  and performs one HTTP request to the Wikidata SPARQL endpoint per distinct
  value.

  Args:
    column: name of the ``df`` column holding the country names.
  """
  newClass = URIRef(schema + "country")
  g.add((newClass, RDFS.label, Literal("country", lang='en')))
  df[column] = df[column].astype(str)

  # One client is enough: only the query text changes between iterations.
  sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
  sparql.setReturnFormat(JSON)

  # Property that links a country class to its Wikidata entity
  wd_URI_property = URIRef(schema + "wd_URI")

  for value in df[column].unique():
    # Missing values were turned into the string "nan" by astype(str) above
    if value == "nan":
      continue

    # Escape characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL string literal the value is embedded in.
    escaped = (value.replace('\\', '\\\\')
                    .replace('"', '\\"')
                    .replace('\n', '\\n')
                    .replace('\r', '\\r'))

    # Check Wikidata for a matching class
    query = f"""
                SELECT ?class ?label WHERE {{
                    ?class wdt:P31 wd:Q6256 .
                    ?class rdfs:label "{escaped}"@en .
                    OPTIONAL {{ ?class skos:prefLabel ?label FILTER(lang(?label) = "en") }}
                    FILTER(REGEX(STR(?class), "^http://www.wikidata.org/entity/Q[0-9]+$"))
                }}
            """
    sparql.setQuery(query)
    results = sparql.query().convert()
    bindings = results['results']['bindings']

    # If there is a match, use the Wikidata class as a subclass
    if bindings:

      #Get URI from Wikidata (first match wins)
      uri = bindings[0]['class']['value']
      country_label = value

      #Create a subclass for each country under the country class
      subclass = URIRef(schema + country_label.replace(' ', '_'))
      g.add((subclass, RDF.type, RDFS.Class))
      g.add((subclass, RDFS.subClassOf, newClass))

      # Update the "country_URI" column with the URI for the current country
      df.loc[df[column] == value, "country_URI"] = uri
      uri = URIRef(uri)

      # Declare the property and its label in the graph; re-adding the same
      # triples for later countries is harmless because a graph is a set.
      g.add((wd_URI_property, RDF.type, RDF.Property))
      g.add((wd_URI_property, RDFS.label, Literal("Wikidata URI", lang="en")))

      #Add Wikidata URI as a property to each country class
      g.add((subclass, wd_URI_property, uri))

      #Label the Wikidata entity as the code for this particular country
      g.add((uri, RDFS.label, Literal(f"{country_label} wikidata code", lang='en')))
      g.add((subclass, RDFS.label, Literal(value, lang='en')))
    else:
      # No Wikidata match: still create the class, just without a Wikidata link
      subclass = URIRef(schema + value.replace(' ', '_').replace('-','_'))
      g.add((subclass, RDF.type, RDFS.Class))
      g.add((subclass, RDFS.subClassOf, newClass))
      g.add((subclass, RDFS.label, Literal(value, lang='en')))

In [None]:
#Save graph as ttl file for use in protégé
# (the encoding must be a valid codec name: 'utf-8', not 'urf-8')
g.serialize('worldBankKG.ttl',format='turtle',prefixes = prefixes, encoding='utf-8')