In [7]:
def ensure(package, import_name=None):
    """Make sure a package is importable, installing it with pip if it is not.

    Args:
        package: Distribution name handed to ``pip install``.
        import_name: Module name to try importing. Defaults to ``package``;
            pass it when the two differ, e.g.
            ``ensure("scikit-learn", import_name="sklearn")``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib

    module_name = import_name or package
    try:
        importlib.import_module(module_name)
    except ImportError:
        import subprocess, sys

        # Run pip through the interpreter executing this code so the package
        # is installed for that interpreter rather than another Python on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # The package was installed while this process was already running;
        # clear the import finders' caches so a later import can see it.
        importlib.invalidate_caches()
# Third-party packages the rest of the notebook imports.
for required_package in ("rdflib", "requests"):
    ensure(required_package)


In [6]:
import requests
from rdflib import Graph, URIRef, Literal, Namespace

# Namespace used to build schema.org predicate URIs
schema = Namespace("https://schema.org/")

# Load the N-Triples file containing the artist data into an in-memory graph
input_path = "wikidata-personen-attributen.nt"
g = Graph()
g.parse(source=input_path, format="nt")

# Fetch the page summary and page URL from Wikipedia
def get_wikipedia_info(name, timeout=10):
    """Look up a page summary on English Wikipedia by title.

    Args:
        name: Page title. Spaces are replaced with underscores and the result
            is percent-encoded so characters such as ``/``, ``?`` or ``#``
            stay part of the title instead of altering the URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(summary, page_url)`` tuple taken from the ``extract`` and
        ``content_urls.desktop.page`` fields of the response; either element
        is ``None`` if its field is absent. Returns ``(None, None)`` when the
        request fails, the status code is not 200, or the body is not JSON.
    """
    from urllib.parse import quote

    # safe="" also encodes "/", which would otherwise split the title into
    # several path segments.
    title = quote(name.replace(" ", "_"), safe="")
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network failures like a missing page so one bad lookup does
        # not abort the caller's loop.
        return None, None

    if response.status_code != 200:
        return None, None

    try:
        data = response.json()
    except ValueError:
        return None, None

    summary = data.get("extract")
    page_url = data.get("content_urls", {}).get("desktop", {}).get("page")
    return summary, page_url

# Output graph that receives only the triples derived from Wikipedia
g_out = Graph()

# Predicate whose English-language values are used as Wikipedia page titles
name_predicate = URIRef("https://schema.org/name")

# Turn every English artist name into description / mainEntityOfPage triples.
# Querying by predicate avoids scanning triples that cannot match.
for s, _, o in g.triples((None, name_predicate, None)):
    # Only literals carry a language tag; a non-literal object (e.g. a URIRef)
    # has no `.language` attribute, so check the type before reading it.
    if not isinstance(o, Literal) or o.language != 'en':
        continue

    artist_name = str(o)
    summary, page_url = get_wikipedia_info(artist_name)

    # Add description to graph
    if summary:
        g_out.add((s, schema.description, Literal(summary, lang="en")))

    # Add mainEntityOfPage
    if page_url:
        g_out.add((s, schema.mainEntityOfPage, URIRef(page_url)))

# Save to a new .nt file
output_path = "artist_descriptions_with_wikipedia.nt"
g_out.serialize(destination=output_path, format="nt")
print(f"File saved to {output_path}")


File saved to artist_descriptions_with_wikipedia.nt


