In [28]:
import os.path

from rdflib import Graph, URIRef, Literal
from tqdm import tqdm
import pandas as pd
import argparse

In [29]:
# Notebook configuration, declared as command-line style options so the same
# settings can be reused from a script.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--graph_file',
    type=str,
    default='slovenia.ttl',
    help='ttl file containing the graph to update',
)
parser.add_argument(
    '--prediction_file',
    type=str,
    default='predictions_annotated_advanced.csv',
    help='csv file containing uslp predictions',
)
parser.add_argument(
    '--output_file',
    type=str,
    default='updated_graph.ttl',
    help='file location to write updated graph to',
)
parser.add_argument(
    '--cut_off',
    type=float,
    default=1.5,
    help='minimum score to achieve for updates to occur',
)
# An explicit empty argument list is parsed, so every option takes its default
# and the kernel's own command line is never consulted.
args = parser.parse_args(args=[])

In [7]:
# Start from an empty graph and load the configured file into it.
g = Graph()
g.parse(source=args.graph_file)

<Graph identifier=N21c0cebd6c31437bbb06c8d498c8aabd (<class 'rdflib.graph.Graph'>)>

In [26]:
# Read the prediction table and keep only the columns the update step uses.
predictions = (
    pd.read_csv(args.prediction_file)
    [['s', 'p', 'o', 'literal', 'score']]
)

In [15]:
# Prefix -> namespace URI lookup built from the (prefix, uri) pairs the graph reports.
namespace = dict(g.namespaces())

In [27]:
def cast_uri(term:str, namespace:dict) -> "URIRef":
    """Expand a prefixed name of the form ``prefix:local`` into a full URI.

    Args:
        term: Prefixed name. Everything before the first ``:`` is the prefix;
            everything after it is the local part.
        namespace: Mapping from prefix to namespace URI.

    Returns:
        The namespace URI registered for the prefix with the local part
        appended. The result type is whatever ``+`` yields for the mapping's
        values (presumably ``URIRef`` when the mapping comes from an rdflib
        graph -- confirm against the caller).

    Raises:
        ValueError: If ``term`` contains no ``:``.
        KeyError: If the prefix is not a key of ``namespace``.
    """
    # Split on the first colon only: a local part may itself contain ':'
    # (e.g. 'ex:a:b'), which previously failed with "too many values to unpack".
    prefix, fragment = term.split(':', 1)
    return namespace[prefix] + fragment

In [30]:
# Apply every prediction whose score is above the cut-off: the triple
# (s, p, literal) is removed from the graph and (s, p, o) is added.
for idx, row in tqdm(predictions.iterrows(), total=len(predictions)):
    if row['score'] > args.cut_off:
        subject = cast_uri(row['s'], namespace)
        predicate = cast_uri(row['p'], namespace)
        # Named `obj`, not `object`: this loop runs at notebook top level, so
        # assigning to `object` would shadow the builtin for all later cells.
        obj = cast_uri(row['o'], namespace)
        # NOTE(review): the literal is rebuilt from the CSV value as pandas
        # parsed it; presumably it has to match the stored literal exactly
        # (including datatype/language) for the removal to hit -- confirm.
        literal = Literal(row['literal'])
        g.remove((subject, predicate, literal))
        g.add((subject, predicate, obj))

100%|██████████| 1188/1188 [00:00<00:00, 10094.78it/s]


In [31]:
# Write the updated graph to the configured output location as UTF-8 Turtle.
g.serialize(
    destination=args.output_file,
    format="turtle",
    encoding="utf-8",
)

<Graph identifier=N21c0cebd6c31437bbb06c8d498c8aabd (<class 'rdflib.graph.Graph'>)>

In [3]:
# Raw string: the backslashes of this Windows path must stay literal. In a
# normal string '\D' and '\e' are invalid escape sequences, which Python warns
# about and plans to reject.
# NOTE(review): hard-coded absolute local path; os.path only treats '\' as a
# separator on Windows, so the recorded output is platform-specific.
os.path.dirname(r'E:\Datasets\elliptic dataset\elliptic_txs_classes.csv')

'E:\\Datasets\\elliptic dataset'