## Populate RDF database



To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

We need to install <code>RDFLib</code>

<code>pip3 install rdflib </code> [Documentation](https://rdflib.readthedocs.io/en/stable/gettingstarted.html)

In [16]:
# required libraries
import pandas as pd
import os
from pathlib import Path
import gc
import json
import hashlib
from datetime import datetime

In [2]:
# Root folder of the evaluated notebooks. load_json() scans exactly one level of
# sub-directories below it and reads every *.json file found there.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
folder = "/locale/data/jupyter/prando/wd-project/2022/notebook_evaluated"
def load_json(directory, verbose=False):
    """Load every ``*.json`` file found one level below ``directory``.

    ``directory`` is expected to contain sub-directories; each ``*.json`` file
    inside a sub-directory is parsed and stored under the value of its
    top-level ``"name"`` key. Entries of ``directory`` that are not
    directories (stray files) are ignored instead of aborting the scan.

    Parameters
    ----------
    directory : str or os.PathLike
        Root folder to scan.
    verbose : bool, optional
        When true, print a start message and a summary with the number of
        loaded files.

    Returns
    -------
    dict
        Mapping ``name -> parsed JSON document``. If two files declare the
        same ``"name"``, the one visited last (in sorted order) wins.

    Raises
    ------
    KeyError
        If a JSON document has no ``"name"`` key.
    json.JSONDecodeError
        If a ``*.json`` file is not valid JSON.
    """
    if verbose:
        print("Start to load files from:", directory)
    files = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the returned dict (and anything derived from iterating it) reproducible.
    for entry in sorted(os.listdir(directory)):
        subdir = os.path.join(directory, entry)
        if not os.path.isdir(subdir):
            # e.g. a README or .DS_Store next to the sub-directories
            continue
        for filename in sorted(os.listdir(subdir)):
            if not filename.endswith(".json"):
                continue
            # the context manager closes the handle even if parsing fails
            with open(os.path.join(subdir, filename), "r", encoding="utf-8") as fd:
                notebook = json.load(fd)
            files[notebook["name"]] = notebook
    if verbose:
        string = "Successfully load the json notebooks from the folder "+str(directory)+":\n"
        string += "--> Total number of files: "+str(len(files))
        print(string)
    return files

# Parse every evaluated notebook once; verbose=True prints a short summary.
files = load_json(folder, True)

# Directory where the generated Turtle files are written.
savePath =  '/locale/data/jupyter/prando/notebook/2022/rdf/'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(savePath, exist_ok=True)

Start to load files from: /locale/data/jupyter/prando/notebook/2022/notebook_evaluated
Successfully load the json notebooks from the folder /locale/data/jupyter/prando/notebook/2022/notebook_evaluated:
--> Total number of files: 123


In [17]:
# Load the required libraries
from rdflib import Graph, Literal, RDF,BNode, URIRef, Namespace
# rdflib knows about some namespaces, like XSD
from rdflib.namespace import XSD

# Namespaces used to build the graphs below, declared explicitly:
#   WDO  - project vocabulary (Track, SearchTopic, SearchTask, SearchWorkflow, SearchJob, Worker, ...)
#   LSQV - LSQ vocabulary (parseError, resultSize, Execution)
#   SP   - SPIN SPARQL vocabulary (Query, text)
#   DCT  - Dublin Core terms (issued)
WDO = Namespace("http://www.dei.unipd.it/exploratory#")
LSQV = Namespace("http://lsq.aksw.org/vocab#")
SP = Namespace("http://spinrdf.org/sp#")
DCT = Namespace("http://purl.org/dc/terms/")

## Topics and Tasks
In this section we model the topics, the tasks and the search workflows, and serialize them to Turtle files.

In [18]:
# Two separate graphs are populated by the cells below:
#   g - topics and tasks
#   h - search workflows
g = Graph()
h = Graph()

# Register readable prefixes so the Turtle output uses xsd:, wdo:, ... instead
# of full IRIs. Both graphs share the first four prefixes, in the same order.
for prefix, namespace in (("xsd", XSD), ("wdo", WDO), ("lsqv", LSQV), ("sp", SP)):
    g.bind(prefix, namespace)
    h.bind(prefix, namespace)

# Only the workflows graph additionally binds Dublin Core terms.
h.bind("dct",DCT)

In [None]:
%%time 
#measure execution time
# Populate both graphs in a single pass over the loaded notebooks:
#   g <- track, topics, tasks
#   h <- workflows, workers, jobs, queries (as RDF lists), executions
topics=[]
workers = []

# metrics copied from a query record onto its sp:Query node when present
metrics = ['recall','precision','accuracy']

# create the URI for the Completeness Track
Track = URIRef(WDO["CompletenessTrack"])
g.add((Track, RDF.type, WDO['Track']))
g.add((Track, WDO['description'], Literal("Completeness Track", datatype=XSD.string)))
index=0          # running counter used to mint query URIs (sp:Q<index>)
executions = 0   # running counter used to mint execution URIs
for filename in files:
    notebook = files[filename]
    topic = notebook['topic']
    name = notebook['name']
    worker = notebook['student']
    macro_topic = notebook['macro_topic']
    # the topic URI is derived from the topic text, so notebooks sharing a
    # topic description map onto the same node
    hash_topic = hashlib.md5(topic.encode()).hexdigest()[:10]
    # create the URI for the current topic
    Topic = URIRef(WDO["TOPIC"+hash_topic])
    # add the topic with all the tasks to the graph G
    topics.append(topic)
    # Add triples using store's add() method.
    g.add((Topic, RDF.type, WDO['SearchTopic']))
    # add the description
    g.add((Topic, WDO['description'], Literal(topic, datatype=XSD.string)))
    # add the macro topic
    g.add((Topic, WDO['macroTopic'], Literal(macro_topic, datatype=XSD.string)))
    # add the link to the Track
    g.add((Topic, WDO['partOf'], Track))
    # add the search tasks
    goals = notebook['goals']
    # keep the URI of the Tasks saved to use later (goal key -> task URI)
    tasks = {}
    for goal in goals:
        # hash topic + goal key so the same goal number under different
        # topics yields different task URIs
        no_hashed_goal = topic+goal
        hash_goal = hashlib.md5(no_hashed_goal.encode()).hexdigest()[:10]
        Task = URIRef(WDO["TASK"+str(hash_goal)])
        tasks[goal] = Task
        ## add the Tasks
        g.add((Task, RDF.type, WDO['SearchTask']))
        g.add((Task, WDO['description'], Literal(goals[goal], datatype=XSD.string)))
        g.add((Task, WDO['number'], Literal(str(goal), datatype=XSD.string)))
        g.add((Task, WDO['belongsTo'], Topic))


    ## add the search workflow
    Workflow = URIRef(WDO[name])
    h.add((Workflow, RDF.type, WDO['SearchWorkflow']))
    # add the related topic
    h.add((Workflow, WDO['implements'], Topic))

    #create the Worker
    Worker = URIRef(WDO["WORKER"+str(worker)])
    h.add((Workflow, WDO['wroteBy'], Worker))

    if worker not in workers:
        ## add the Worker
        workers.append(worker)
        h.add((Worker, RDF.type, WDO['Worker']))
        ## add also the score of the worker given is exam score

    search_workflow = notebook['search_workflow']
    for job in search_workflow:
        job_queries = search_workflow[job]
        # the Job's URI is the concatenation of [JOB, number of the task, W, name of the file]
        Job = URIRef(WDO['JOB'+str(job)+'W'+name])
        h.add((Job, RDF.type, WDO['SearchJob']))
        # add the relation hasPart to the search workflow
        h.add((Workflow, WDO['hasPart'], Job))
        # add the relation performs to the search task.
        # The job key is the task number, so it must be looked up with `job`;
        # using the leftover `goal` variable of the loop above would link every
        # job to the last task of the topic.
        if job in tasks:
            h.add((Job, WDO['performs'], tasks[job]))
        else:
            print("WARNING: job", job, "of workflow", name,
                  "has no matching goal; wdo:performs not added")
        # add the query list (head of an RDF collection)
        Queries = BNode()
        h.add((Queries, RDF.type, RDF.List))
        h.add((Job, WDO['queries'], Queries))


        ## create the list of the query
        last_position = len(job_queries) - 1
        for i, query in enumerate(job_queries):
            text = query['query']
            Query = URIRef(SP['Q'+str(index)])
            h.add((Query, RDF.type, SP['Query']))
            h.add((Query, SP['text'], Literal(text, datatype=XSD.string)))
            # add the parse Error if it exists
            if 'parseError' in query and query['parseError'] is not None:
                h.add((Query, LSQV['parseError'], Literal(query['parseError'], datatype=XSD.string)))
            # add the narrative if it exists
            if 'narrative' in query and query['narrative'] is not None:
                h.add((Query, WDO['narrative'], Literal(query['narrative'], datatype=XSD.string)))
            # add the size of the result set if it exists
            if 'output' in query and query['output'] is not None:
                h.add((Query, LSQV['resultSize'], Literal(str(len(query['output'])), datatype=XSD.long)))

            # add the metrics; `datatype` belongs to Literal(), not to Graph.add()
            for m in metrics:
                if m in query and query[m] is not None:
                    h.add((Query, WDO[m], Literal(query[m], datatype=XSD.float)))
            # add the executions
            # example: "22/Dec/2022:19:41:16"
            for ex in query.get('execution') or []:
                t_ex = datetime.strptime(ex,'%d/%b/%Y:%H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')
                Execution = URIRef(DCT['EX'+str(executions)])
                h.add((Execution, RDF.type, LSQV['Execution']))
                # The timestamp belongs to the workflows graph `h` (the only
                # one that binds the dct prefix), not to the topics graph `g`.
                # NOTE(review): the Execution node is typed but never linked to
                # its Query, and the timestamp is attached to the Query itself;
                # confirm the intended modelling.
                h.add((Query, DCT['issued'], Literal(t_ex, datatype=XSD.dateTime)))
                executions+=1

            # append the query to the RDF list: first = item, rest = next cell or nil
            h.add((Queries, RDF.first, Query))
            if i < last_position:
                Next = BNode()
                h.add((Next, RDF.type, RDF.List))
                h.add((Queries, RDF.rest, Next))
                Queries = Next
            else:
                h.add((Queries, RDF.rest, RDF.nil))
            index+=1
       

In [None]:
%%time
# Serialize the topics/tasks graph `g` in Turtle format to <savePath>/topics.ttl
ttlname = 'topics.ttl'
print("--- saving serialization for the topics ---")
# Turtle documents are UTF-8 by specification, so the encoding is set explicitly
# instead of relying on the platform default.
with open(os.path.join(savePath, ttlname), 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

In [36]:
%%time
# Serialize the workflows graph `h` in Turtle format to <savePath>/workflows.ttl
ttlname = 'workflows.ttl'
print("--- saving serialization for the workflows ---")
# Turtle documents are UTF-8 by specification, so the encoding is set explicitly
# instead of relying on the platform default.
with open(os.path.join(savePath, ttlname), 'w', encoding='utf-8') as file:
    file.write(h.serialize(format='turtle'))

--- saving serialization for the last aircrafts ---
CPU times: user 1.52 s, sys: 51.4 ms, total: 1.57 s
Wall time: 1.56 s


### Workers quality

In [15]:
# Build a worker -> quality mapping from ../workers.csv.
# The first line is skipped as a header. For every other row the key is
# "<column 0>_<column 1>" and the value is derived from the integer in column 2:
# 0 stays 0.0, any other score s becomes (s - 17) / 14 (so 17 -> 0.0, 31 -> 1.0).
# NOTE(review): 17 and 14 presumably encode the exam grading scale -- confirm.
with open("../workers.csv","r") as f:
    # splitlines() copes with a missing trailing newline, where slicing
    # split("\n")[1:-1] would silently drop the last worker
    lines = f.read().splitlines()[1:]

workers = {}
for w in lines:
    if not w.strip():
        # tolerate blank lines (e.g. at the end of the file)
        continue
    spl = w.split(",")
    name = spl[0]+"_"+spl[1]
    score = int(spl[2])
    workers[name] = (score - 17.0) / 14.0 if score != 0 else 0.0

print(workers)

{'2022_0': 0.0, '2022_1': 0.5714285714285714, '2022_2': 1.0, '2022_3': 0.9285714285714286, '2022_4': 0.5714285714285714, '2022_5': 0.5, '2022_6': 0.9285714285714286, '2022_7': 0.0, '2022_8': 0.0, '2022_9': 0.0, '2022_10': 0.9285714285714286, '2022_11': 0.5, '2022_12': 1.0, '2022_13': 0.7857142857142857, '2022_14': 0.0, '2022_15': 0.9285714285714286, '2022_16': 0.6428571428571429, '2022_17': 0.7857142857142857, '2022_18': 0.0, '2022_19': 0.7857142857142857, '2022_20': 0.5714285714285714, '2022_21': 0.0, '2022_22': 1.0, '2022_23': 1.0, '2022_24': 0.0, '2022_25': 0.9285714285714286, '2022_26': 0.7142857142857143, '2022_27': 1.0, '2022_28': 0.7142857142857143, '2022_29': 0.2857142857142857, '2022_30': 0.9285714285714286, '2022_31': 0.7857142857142857, '2022_32': 1.0, '2022_33': 0.0, '2022_34': 0.7142857142857143, '2022_35': 0.9285714285714286, '2022_36': 0.0, '2022_37': 1.0, '2022_38': 0.0, '2022_39': 0.8571428571428571, '2022_40': 0.0, '2021_0': 1.0, '2021_1': 0.9285714285714286, '2021_2'

In [None]:
# NOTE(review): this looks like a scratch/debug cell. It adds a placeholder
# topic (wdo:TOPICHHH) to the topics graph `g`, described by whatever value
# `topic` was left holding by an earlier cell, and then overwrites topics.ttl
# with that placeholder included. Confirm whether it belongs in the notebook.
Topic = URIRef(WDO["TOPICHHH"])

# Add triples using store's add() method.
g.add((Topic, RDF.type, WDO['SearchTopic']))
# add the description (re-uses the `topic` variable defined outside this cell)
g.add((Topic, WDO['description'], Literal(topic, datatype=XSD.string)))
# write the topics graph in Turtle format, replacing any existing topics.ttl
ttlname = 'topics.ttl'
print("--- saving serialization for the topics ---")
with open(savePath + ttlname, 'w') as file:
    file.write(g.serialize(format='turtle'))