In [2]:
from question_types.sparql import SparqlQueries
from rdflib.namespace import RDFS
import pandas as pd

In [3]:
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-16 07:59:20,536 | INFO | __init__ | [92mParsing graph[0m
2024-10-16 08:00:08,149 | INFO | __init__ | [92mGraph parsed[0m


In [4]:
triples_data = []

def get_label(graph, uri):
    """Function to get the rdfs:label or schema:name for a URI if available."""
    for label in graph.objects(uri, RDFS.label):
        return str(label)
    
    schema_name = "http://schema.org/name"
    
    for label in graph.objects(uri, graph.namespace_manager.compute_qname(schema_name)[2]):
        return str(label)
    
    return decode_uri(uri)

schema_mapping = {
    "http://ddis.ch/atai/": "ddis",
    "http://www.wikidata.org/entity/": "wd",
    "http://www.wikidata.org/prop/direct/": "wdt",
    "http://schema.org/": "schema"
}

def decode_uri(uri):
    """Helper function to replace full URIs with prefixes and extract local names"""
    for schema, prefix in schema_mapping.items():
        if uri.startswith(schema):
            return f"{prefix}:{uri[len(schema):]}" 
    return uri  


for i, (subj, pred, obj) in enumerate(sparql.graph):
    subject_label = get_label(sparql.graph, subj)
    predicate_label = get_label(sparql.graph, pred)
    object_label = get_label(sparql.graph, obj)
    
    triples_data.append((subject_label, predicate_label, object_label))

df_triples = pd.DataFrame(triples_data, columns=['Subject', 'Predicate', 'Object'])

df_grouped = df_triples.groupby(['Subject', 'Predicate'])['Object'].agg(lambda x: ', '.join(x)).reset_index()

df_pivot = df_grouped.pivot(index='Subject', columns='Predicate', values='Object')

In [5]:
df_pivot.to_pickle("./exports/graph.pkl")

In [6]:
subject_as_object_set = set(df_triples['Subject']).intersection(set(df_triples['Object']))
print("Stage 1 - Subjects that are also objects:", len(subject_as_object_set))

object_to_subjects_predicates = {}
for index, row in df_triples.iterrows():
    if row['Object'] in subject_as_object_set:
        if row['Object'] not in object_to_subjects_predicates:
            object_to_subjects_predicates[row['Object']] = []
        object_to_subjects_predicates[row['Object']].append((row['Subject'], row['Predicate']))
print("Stage 2 - Mapping objects to their subjects and predicates:", len(object_to_subjects_predicates))

new_triples_data = df_triples.values.tolist()
for index, row in df_triples.iterrows():
    if row['Subject'] in object_to_subjects_predicates:
        new_triples_data.extend([(row['Subject'], predicate, new_object) for new_object, predicate in object_to_subjects_predicates[row['Subject']]])
print("Stage 3 - Added new triples:", len(new_triples_data))

df_new_triples = pd.DataFrame(new_triples_data, columns=['Subject', 'Predicate', 'Object'])

Stage 1 - Subjects that are also objects: 131021
Stage 2 - Mapping objects to their subjects and predicates: 131021
Stage 3 - Added new triples: 55980651


In [10]:
df_new_triples = df_new_triples.drop_duplicates()

df_grouped_new = df_new_triples.groupby(['Subject', 'Predicate'])['Object'].agg(lambda x: ', '.join(x)).reset_index()

df_pivot_new = df_grouped_new.pivot(index='Subject', columns='Predicate', values='Object')

In [12]:
import json
with open("./exports/graph.json", 'w') as file:
    json.dump(df_pivot_new.to_dict(), file, indent=4)
    
df_pivot_new.to_csv("./exports/extended_graph_triples.csv", index=False)

df_pivot_new.to_pickle("./exports/extended_graph_triples.pkl")