In [6]:
from pathlib import Path

import pandas as pd
from rdflib import URIRef
from rdflib.namespace import RDFS

from question_types.sparql import SparqlQueries

In [7]:
# Instantiating SparqlQueries loads the graph (exposed as `sparql.graph` and
# used further down). In the recorded run roughly 74 seconds passed between
# the "Parsing graph" and "Graph parsed" log lines, so avoid re-running this
# cell unnecessarily.
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-15 21:15:46,632 | INFO | __init__ | [92mParsing graph[0m
2024-10-15 21:17:00,410 | INFO | __init__ | [92mGraph parsed[0m


In [8]:
# Accumulates one (subject, predicate, object) tuple of display labels per
# triple in the graph; it is turned into a DataFrame once the loop finishes.
triples_data = []

def get_label(graph, uri):
    """Return a human-readable label for an RDF term.

    Lookup order:
      1. the first ``rdfs:label`` value of ``uri`` in ``graph``;
      2. the first ``schema:name`` (``http://schema.org/name``) value;
      3. the abbreviated form produced by ``decode_uri``.

    Args:
        graph: Graph object exposing ``objects(subject, predicate)``.
        uri: The term to label. Subjects, predicates and objects of a triple
            are all passed through this function.

    Returns:
        The label converted to ``str`` when one is found, otherwise whatever
        ``decode_uri(uri)`` returns.
    """
    # The predicate must be a full URIRef. The previous implementation passed
    # element [2] of compute_qname(), i.e. the bare local name "name", which
    # never equals a stored predicate, so the schema:name fallback could not
    # produce a result.
    label_predicates = (RDFS.label, URIRef("http://schema.org/name"))

    for predicate in label_predicates:
        # Only the first value is used; any further labels are ignored.
        for label in graph.objects(uri, predicate):
            return str(label)

    return decode_uri(uri)

# Namespace URI -> short prefix, used to abbreviate terms that have no label.
schema_mapping = {
    "http://ddis.ch/atai/": "ddis",
    "http://www.wikidata.org/entity/": "wd",
    "http://www.wikidata.org/prop/direct/": "wdt",
    "http://schema.org/": "schema"
}

def decode_uri(uri):
    """Abbreviate a full URI to ``prefix:rest`` form using ``schema_mapping``.

    Args:
        uri: A string (or ``str`` subclass) holding a URI or other term text.

    Returns:
        ``"<prefix>:<rest>"`` when ``uri`` starts with one of the namespaces in
        ``schema_mapping``; otherwise the text of ``uri`` unchanged. The result
        is always a plain ``str``.
    """
    for namespace, prefix in schema_mapping.items():
        if uri.startswith(namespace):
            return f"{prefix}:{uri[len(namespace):]}"
    # Coerce to a plain str so both branches return the same type. Without
    # this, str subclasses passed in by callers would leak into the result
    # (and into anything the result is later serialised to).
    return str(uri)


# Each distinct term recurs across many triples, so remember its label after
# the first lookup instead of querying the graph again on every occurrence.
term_labels = {}

for triple in sparql.graph:
    row_labels = []
    for term in triple:
        if term not in term_labels:
            term_labels[term] = get_label(sparql.graph, term)
        row_labels.append(term_labels[term])
    triples_data.append(tuple(row_labels))

# Long format: one row per triple, every cell already resolved to a label.
df_triples = pd.DataFrame(triples_data, columns=['Subject', 'Predicate', 'Object'])

# Collapse multi-valued predicates into one comma-separated string so that each
# (Subject, Predicate) pair is unique, which pivot() requires.
# NOTE(review): grouping is by label, so distinct entities that share the same
# label end up merged into a single row - confirm this is intended.
df_grouped = (
    df_triples
    .groupby(['Subject', 'Predicate'])['Object']
    .agg(', '.join)
    .reset_index()
)

# Wide format: one row per subject, one column per predicate.
df_pivot = df_grouped.pivot(index='Subject', columns='Predicate', values='Object')


In [9]:
# Persist the wide subject/predicate table for later use.
export_path = Path("./exports/graph.pkl")
# to_pickle does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail after the expensive steps above.
export_path.parent.mkdir(parents=True, exist_ok=True)
df_pivot.to_pickle(export_path)