In [1]:
from question_types.sparql import SparqlQueries
from rdflib.namespace import RDFS
import pandas as pd
import unicodedata
import re

In [2]:
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-18 13:03:39,476 | INFO | __init__ | [92mParsing graph[0m
2024-10-18 13:04:25,678 | INFO | __init__ | [92mGraph parsed[0m


In [3]:
def normalize_string(s):
    """Cleans the input entity to a uniform naming convention, by removing non ascii characters, encoding it to utf, setting it to lowercase, and removing redundant spaces"""
    s = s.lower()
    s = unicodedata.normalize('NFKD', s)
    s = s.encode('ascii', 'ignore').decode('utf-8')
    s = re.sub(r'[^\w\s]', '', s)
    s = ' '.join(s.split())
    return s

In [4]:
def get_label(graph, uri):
    """Function to get the rdfs:label or schema:name for a URI if available."""
    for label in graph.objects(uri, RDFS.label):
        return normalize_string(str(label))
    
    schema_name = "http://schema.org/name"
    
    for label in graph.objects(uri, graph.namespace_manager.compute_qname(schema_name)[2]):
        return normalize_string(str(label))
    
    return decode_uri(uri)

schema_mapping = {
    "http://ddis.ch/atai/": "ddis",
    "http://www.wikidata.org/entity/": "wd",
    "http://www.wikidata.org/prop/direct/": "wdt",
    "http://schema.org/": "schema"
}

def decode_uri(uri):
    """Helper function to replace full URIs with prefixes and extract local names"""
    for schema, prefix in schema_mapping.items():
        if uri.startswith(schema):
            return f"{prefix}:{uri[len(schema):]}" 
    return uri

triples_data = [(subject_id, predicate_id, object_id) for subject_id, predicate_id, object_id in sparql.graph]

df_triples = pd.DataFrame(triples_data, columns=['subject_id', 'predicate_id', 'object_id'])

In [5]:
subject_as_object_set = set(df_triples['subject_id']).intersection(set(df_triples['object_id']))
print("Stage 1 - subject_ids that are also objects:", len(subject_as_object_set))

object_to_subjects_predicates = {}
for index, row in df_triples.iterrows():
    if row['object_id'] in subject_as_object_set:
        if row['object_id'] not in object_to_subjects_predicates:
            object_to_subjects_predicates[row['object_id']] = []
        object_to_subjects_predicates[row['object_id']].append((row['subject_id'], row['predicate_id']))

new_triples_data = df_triples.values.tolist()
for index, row in df_triples.iterrows():
    if row['subject_id'] in object_to_subjects_predicates:
        new_triples_data.extend([(row['subject_id'], predicate, new_object) for new_object, predicate in object_to_subjects_predicates[row['subject_id']]])

print("Stage 2 - Newly added triples:", len(new_triples_data) - len(df_triples))

df_new_triples = pd.DataFrame(new_triples_data, columns=['subject_id', 'predicate_id', 'object_id'])
df_new_triples.head()

Stage 1 - subject_ids that are also objects: 136263
Stage 2 - Newly added triples: 48741780


Unnamed: 0,subject_id,predicate_id,object_id
0,http://www.wikidata.org/entity/Q5964032,http://www.wikidata.org/prop/direct/P750,http://www.wikidata.org/entity/Q907311
1,http://www.wikidata.org/entity/Q3617582,http://www.wikidata.org/prop/direct/P27,http://www.wikidata.org/entity/Q38
2,http://www.wikidata.org/entity/Q163868,http://www.wikidata.org/prop/direct/P161,http://www.wikidata.org/entity/Q3369287
3,http://www.wikidata.org/entity/Q302181,http://www.wikidata.org/prop/direct/P437,http://www.wikidata.org/entity/Q723685
4,http://www.wikidata.org/entity/Q337840,http://www.wikidata.org/prop/direct/P136,http://www.wikidata.org/entity/Q205049


In [6]:
df_new_triples = df_new_triples.drop_duplicates()

df_new_triples['subject_label'] = df_new_triples['subject_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['predicate_label'] = df_new_triples['predicate_id'].apply(lambda x: get_label(sparql.graph, x))
df_new_triples['object_label'] = df_new_triples['object_id'].apply(lambda x: get_label(sparql.graph, x))

df_grouped_new = df_new_triples.groupby(['subject_id', 'subject_label', 'predicate_label'])['object_label'].agg(lambda x: ', '.join(x)).reset_index()

df_grouped_new = df_grouped_new.drop_duplicates()

# print(df_grouped_new[df_grouped_new['subject_id'].str.contains('Q13909')])

                                     subject_id      subject_label  \
217018    http://www.wikidata.org/entity/Q13909     angelina jolie   
217019    http://www.wikidata.org/entity/Q13909     angelina jolie   
217020    http://www.wikidata.org/entity/Q13909     angelina jolie   
217021    http://www.wikidata.org/entity/Q13909     angelina jolie   
217022    http://www.wikidata.org/entity/Q13909     angelina jolie   
217023    http://www.wikidata.org/entity/Q13909     angelina jolie   
217024    http://www.wikidata.org/entity/Q13909     angelina jolie   
217025    http://www.wikidata.org/entity/Q13909     angelina jolie   
217026    http://www.wikidata.org/entity/Q13909     angelina jolie   
217027    http://www.wikidata.org/entity/Q13909     angelina jolie   
217028    http://www.wikidata.org/entity/Q13909     angelina jolie   
217029    http://www.wikidata.org/entity/Q13909     angelina jolie   
217030    http://www.wikidata.org/entity/Q13909     angelina jolie   
217031    http://www

In [7]:
subject_label_df = df_grouped_new[['subject_id', 'subject_label']].copy()

subject_label_df = subject_label_df.drop_duplicates()

reverse_index = subject_label_df.groupby('subject_label')['subject_id'].agg(lambda x: ', '.join(x)).reset_index()

print(reverse_index[reverse_index.subject_label == "Angelina Jolie"])

Empty DataFrame
Columns: [subject_label, subject_id]
Index: []


In [8]:
df_grouped_new.to_pickle("./exports/extended_graph_triples.pkl")
reverse_index.to_pickle("./exports/reverse_index.pkl")
df_grouped_new.to_csv("./exports/extended_graph_triples.csv")