In [15]:
from question_types.sparql import SparqlQueries
from rdflib.namespace import RDFS
import pandas as pd

In [16]:
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-16 19:48:40,750 | INFO | __init__ | [92mParsing graph[0m
2024-10-16 19:49:22,979 | INFO | __init__ | [92mGraph parsed[0m


In [17]:
triples_data = []

def get_label(graph, uri):
    """Function to get the rdfs:label or schema:name for a URI if available."""
    for label in graph.objects(uri, RDFS.label):
        return str(label)
    
    schema_name = "http://schema.org/name"
    
    for label in graph.objects(uri, graph.namespace_manager.compute_qname(schema_name)[2]):
        return str(label)
    
    return decode_uri(uri)

schema_mapping = {
    "http://ddis.ch/atai/": "ddis",
    "http://www.wikidata.org/entity/": "wd",
    "http://www.wikidata.org/prop/direct/": "wdt",
    "http://schema.org/": "schema"
}

def decode_uri(uri):
    """Helper function to replace full URIs with prefixes and extract local names"""
    for schema, prefix in schema_mapping.items():
        if uri.startswith(schema):
            return f"{prefix}:{uri[len(schema):]}" 
    return uri  

for i, (subject_id, predicate_id, object_id) in enumerate(sparql.graph):   
    triples_data.append((subject_id, predicate_id, object_id))

df_triples = pd.DataFrame(triples_data, columns=['subject_id', 'predicate_id', 'object_id'])

In [18]:
subject_as_object_set = set(df_triples['subject_id']).intersection(set(df_triples['object_id']))
print("Stage 1 - subject_ids that are also objects:", len(subject_as_object_set))

object_to_subjects_predicates = {}
for index, row in df_triples.iterrows():
    if row['object_id'] in subject_as_object_set:
        if row['object_id'] not in object_to_subjects_predicates:
            object_to_subjects_predicates[row['object_id']] = []
        object_to_subjects_predicates[row['object_id']].append((row['subject_id'], row['predicate_id']))

new_triples_data = df_triples.values.tolist()
for index, row in df_triples.iterrows():
    if row['subject_id'] in object_to_subjects_predicates:
        new_triples_data.extend([(row['subject_id'], predicate, new_object) for new_object, predicate in object_to_subjects_predicates[row['subject_id']]])

print("Stage 2 - Newly added triples:", len(new_triples_data) - len(df_triples))

df_new_triples = pd.DataFrame(new_triples_data, columns=['subject_id', 'predicate_id', 'object_id'])

Stage 1 - subject_ids that are also objects: 136263
Stage 2 - Newly added triples: 48741780


In [19]:
df_new_triples = df_new_triples.drop_duplicates()

df_new_triples['subject_label'] = df_new_triples['subject_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['predicate_label'] = df_new_triples['predicate_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['object_label'] = df_new_triples['object_id'].apply(lambda x: get_label(sparql.graph, x))

df_grouped_new = df_new_triples.groupby(['subject_id', 'subject_label', 'predicate_label'])['object_label'].agg(lambda x: ', '.join(x)).reset_index()

df_grouped_new = df_grouped_new.drop_duplicates()

print(df_grouped_new[df_grouped_new['subject_id'].str.contains('Q13909')])

After dropping duplicates:
                                 subject_id  \
0  http://www.wikidata.org/entity/Q28489891   
1    http://www.wikidata.org/entity/Q845176   
2   http://www.wikidata.org/entity/Q6786487   
3   http://www.wikidata.org/entity/Q2656265   
4    http://www.wikidata.org/entity/Q486826   

                                predicate_id  \
0  http://www.wikidata.org/prop/direct/P5970   
1   http://www.wikidata.org/prop/direct/P161   
2              http://schema.org/description   
3   http://www.wikidata.org/prop/direct/P175   
4   http://www.wikidata.org/prop/direct/P161   

                                   object_id  
0   http://www.wikidata.org/entity/Q56887490  
1     http://www.wikidata.org/entity/Q312483  
2  character in the animated Pixar film Cars  
3    http://www.wikidata.org/entity/Q5790509  
4     http://www.wikidata.org/entity/Q336689  
After applying labels:
                                 subject_id  \
0  http://www.wikidata.org/entity/Q28489891   
1 

In [20]:
subject_label_df = df_grouped_new[['subject_id', 'subject_label']].copy()

subject_label_df = subject_label_df.drop_duplicates()

reverse_index = subject_label_df.groupby('subject_label')['subject_id'].agg(lambda x: ', '.join(x)).reset_index()

print(reverse_index[reverse_index.subject_label == "Angelina Jolie"])

       subject_label                             subject_id
8179  Angelina Jolie  http://www.wikidata.org/entity/Q13909


In [21]:
df_grouped_new.to_pickle("./exports/extended_graph_triples.pkl")
reverse_index.to_pickle("./exports/reverse_index.pkl")
df_grouped_new.to_csv("./exports/extended_graph_triples.csv")