In [1]:
from question_types.sparql import SparqlQueries
from rdflib.namespace import RDFS
import pandas as pd

In [2]:
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-17 07:22:26,666 | INFO | __init__ | [92mParsing graph[0m
2024-10-17 07:23:16,246 | INFO | __init__ | [92mGraph parsed[0m


In [3]:
triples_data = []

def get_label(graph, uri):
    """Function to get the rdfs:label or schema:name for a URI if available."""
    for label in graph.objects(uri, RDFS.label):
        return str(label)
    
    schema_name = "http://schema.org/name"
    
    for label in graph.objects(uri, graph.namespace_manager.compute_qname(schema_name)[2]):
        return str(label)
    
    return decode_uri(uri)

schema_mapping = {
    "http://ddis.ch/atai/": "ddis",
    "http://www.wikidata.org/entity/": "wd",
    "http://www.wikidata.org/prop/direct/": "wdt",
    "http://schema.org/": "schema"
}

def decode_uri(uri):
    """Helper function to replace full URIs with prefixes and extract local names"""
    for schema, prefix in schema_mapping.items():
        if uri.startswith(schema):
            return f"{prefix}:{uri[len(schema):]}" 
    return uri  

for i, (subject_id, predicate_id, object_id) in enumerate(sparql.graph):   
    triples_data.append((subject_id, predicate_id, object_id))

df_triples = pd.DataFrame(triples_data, columns=['subject_id', 'predicate_id', 'object_id'])

In [4]:
subject_as_object_set = set(df_triples['subject_id']).intersection(set(df_triples['object_id']))
print("Stage 1 - subject_ids that are also objects:", len(subject_as_object_set))

object_to_subjects_predicates = {}
for index, row in df_triples.iterrows():
    if row['object_id'] in subject_as_object_set:
        if row['object_id'] not in object_to_subjects_predicates:
            object_to_subjects_predicates[row['object_id']] = []
        object_to_subjects_predicates[row['object_id']].append((row['subject_id'], row['predicate_id']))

new_triples_data = df_triples.values.tolist()
for index, row in df_triples.iterrows():
    if row['subject_id'] in object_to_subjects_predicates:
        new_triples_data.extend([(row['subject_id'], predicate, new_object) for new_object, predicate in object_to_subjects_predicates[row['subject_id']]])

print("Stage 2 - Newly added triples:", len(new_triples_data) - len(df_triples))

df_new_triples = pd.DataFrame(new_triples_data, columns=['subject_id', 'predicate_id', 'object_id'])

Stage 1 - subject_ids that are also objects: 136263
Stage 2 - Newly added triples: 48741780


In [5]:
df_new_triples = df_new_triples.drop_duplicates()

df_new_triples['subject_label'] = df_new_triples['subject_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['predicate_label'] = df_new_triples['predicate_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['object_label'] = df_new_triples['object_id'].apply(lambda x: get_label(sparql.graph, x))

df_grouped_new = df_new_triples.groupby(['subject_id', 'subject_label', 'predicate_label'])['object_label'].agg(lambda x: ', '.join(x)).reset_index()

df_grouped_new = df_grouped_new.drop_duplicates()

print(df_grouped_new[df_grouped_new['subject_id'].str.contains('Q13909')])

                                     subject_id      subject_label  \
217018    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217019    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217020    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217021    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217022    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217023    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217024    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217025    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217026    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217027    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217028    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217029    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217030    http://www.wikidata.org/entity/Q13909     Angelina Jolie   
217031    http://www

In [6]:
subject_label_df = df_grouped_new[['subject_id', 'subject_label']].copy()

subject_label_df = subject_label_df.drop_duplicates()

reverse_index = subject_label_df.groupby('subject_label')['subject_id'].agg(lambda x: ', '.join(x)).reset_index()

print(reverse_index[reverse_index.subject_label == "Angelina Jolie"])

       subject_label                             subject_id
8179  Angelina Jolie  http://www.wikidata.org/entity/Q13909


In [7]:
df_grouped_new.to_pickle("./exports/extended_graph_triples.pkl")
reverse_index.to_pickle("./exports/reverse_index.pkl")
df_grouped_new.to_csv("./exports/extended_graph_triples.csv")