In [56]:
import pandas as pd
import json
import uuid
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, OWL, Literal, URIRef
from rdflib.namespace import XSD, RDF, SDO, RDFS

KG = Namespace("http://kg-course/model-management/")
BIBO = Namespace("http://purl.org/ontology/bibo/")
SCHEMA = Namespace("http://schema.org/")


g = Graph()
g.bind("kg", KG)
g.bind("bibo", BIBO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("schema1", SCHEMA)
g.bind("xsd", XSD)

#### Models class and its properties

In [57]:
g.add((SCHEMA.Person, RDF.type, OWL.Class))
g.add((SCHEMA.Person, RDFS.label, Literal("Person", lang="en")))
g.add((SCHEMA.Person, RDFS.comment, Literal("A person, company, or organization that created the model", lang="en")))

# Model class
g.add((KG.Model, RDF.type, RDFS.Class))
g.add((KG.Model, RDFS.label, Literal("Model", lang="en")))
g.add((KG.Model, RDFS.comment, Literal("NLP, Computer Vision, Reinforcement Learning etc. models", lang="en")))

# Model name
g.add((KG.name, RDF.type, OWL.DatatypeProperty))
g.add((KG.name, RDFS.label, Literal("Model name", lang="en")))
g.add((KG.name, RDFS.domain, KG.Model))
g.add((KG.name, RDFS.range, RDFS.Literal))

# Model creator
g.add((KG.creator, RDF.type, OWL.ObjectProperty))
g.add((KG.creator, RDFS.label, Literal("Model creator", lang="en")))
g.add((KG.creator, RDFS.domain, KG.Model))
g.add((KG.creator, RDFS.range, SCHEMA.Person))

# Creation date
g.add((KG.created_at, RDF.type, OWL.DatatypeProperty))
g.add((KG.created_at, RDFS.label, Literal("Creation date", lang="en")))
g.add((KG.created_at, RDFS.domain, KG.Model))
g.add((KG.created_at, RDFS.range, XSD.dateTime))

# Downloads
g.add((KG.downloads, RDF.type, OWL.DatatypeProperty))
g.add((KG.downloads, RDFS.label, Literal("Number of times a model has been downloaded", lang="en")))
g.add((KG.downloads, RDFS.domain, KG.Model))
g.add((KG.downloads, RDFS.range, XSD.integer))

# Task
g.add((KG.task, RDF.type, OWL.DatatypeProperty))
g.add((KG.task, RDFS.label, Literal("Model task", lang="en")))
g.add((KG.task, RDFS.comment, Literal("The task the model is designed to perform", lang="en")))
g.add((KG.task, RDFS.domain, KG.Model))
g.add((KG.task, RDFS.range, RDFS.Literal))

# hasPaper property
g.add((KG.hasPaper, RDF.type, OWL.ObjectProperty))
g.add((KG.hasPaper, RDFS.label, Literal("has Paper", lang="en")))
g.add((KG.hasPaper, RDFS.comment, Literal("The ID of the paper that the model is based on or described in", lang="en")))
g.add((KG.hasPaper, RDFS.domain, KG.Model))
g.add((KG.hasPaper, RDFS.range, KG.Paper))

# Base model
g.add((KG.base_model, RDF.type, OWL.DatatypeProperty))
g.add((KG.base_model, RDFS.label, Literal("Base model", lang="en")))
g.add((KG.base_model, RDFS.comment, Literal("The base model the model is built on", lang="en")))
g.add((KG.base_model, RDFS.domain, KG.Model))
g.add((KG.base_model, RDFS.range, RDFS.Literal))

# Language
g.add((KG.language, RDF.type, OWL.DatatypeProperty))
g.add((KG.language, RDFS.label, Literal("Supported languages", lang="en")))
g.add((KG.language, RDFS.comment, Literal("The languages the model supports (e.g. IT, EN, etc.)", lang="en")))
g.add((KG.language, RDFS.domain, KG.Model))
g.add((KG.language, RDFS.range, RDFS.Literal))

# hasEvaluationMetric property to link model to evaluation metrics
g.add((KG.hasEvaluationMetric, RDF.type, OWL.ObjectProperty))
g.add((KG.hasEvaluationMetric, RDFS.label, Literal("has Evaluation Metric", lang="en")))
g.add((KG.hasEvaluationMetric, RDFS.domain, KG.Model))
g.add((KG.hasEvaluationMetric, RDFS.range, KG.EvaluationMetric))

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

#### Evaluation metrics class and properties

In [58]:

# ----------------- Evaluation Metric class -----------------#

g.add((KG.EvaluationMetric, RDF.type, RDFS.Class))
g.add((KG.EvaluationMetric, RDFS.label, Literal("Evaluation Metric", lang="en")))
g.add((KG.EvaluationMetric, RDFS.comment, Literal("The evaluation metric used to evaluate the model", lang="en")))

#----------------- Evaluation Metric properties -----------------#

# Task type
g.add((KG.taskType, RDF.type, OWL.DatatypeProperty))
g.add((KG.taskType, RDFS.label, Literal("Task on which the model was evaluated", lang="en")))
g.add((KG.taskType, RDFS.domain, KG.EvaluationMetric))
g.add((KG.taskType, RDFS.range, RDFS.Literal))

# Dataset name
g.add((KG.datasetName, RDF.type, OWL.DatatypeProperty))
g.add((KG.datasetName, RDFS.label, Literal("Dataset Name", lang="en")))
g.add((KG.datasetName, RDFS.comment, Literal("The name of the dataset used to evaluate the model, e.g. MNIST, IMDB, or custom", lang="en")))
g.add((KG.datasetName, RDFS.domain, KG.EvaluationMetric))
g.add((KG.datasetName, RDFS.range, RDFS.Literal))

# Metric type
g.add((KG.metricType, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricType, RDFS.label, Literal("Metric Type", lang="en")))
g.add((KG.metricType, RDFS.comment, Literal("The type of evaluation metric used, e.g. accuracy, F1 score, etc.", lang="en")))
g.add((KG.metricType, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricType, RDFS.range, RDFS.Literal))

# Metric value
g.add((KG.metricValue, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricValue, RDFS.label, Literal("Metric Value", lang="en")))
g.add((KG.metricValue, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricValue, RDFS.range, XSD.float))

# Metric error
g.add((KG.metricError, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricError, RDFS.label, Literal("Metric Error", lang="en")))
g.add((KG.metricError, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricError, RDFS.range, XSD.float))

# Metric mean
g.add((KG.metricMean, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricMean, RDFS.label, Literal("Metric Mean", lang="en")))
g.add((KG.metricMean, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricMean, RDFS.range, XSD.float))

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

#### Paper class

In [59]:
# ----------------- Paper class -----------------#
g.add((KG.Paper, RDF.type, OWL.Class))
g.add((KG.Paper, RDFS.label, Literal("Scientific paper", lang="en")))
g.add((KG.Paper, RDFS.comment, Literal("A scientific paper that describes a model or research work", lang="en")))
g.add((KG.Paper, OWL.equivalentClass, BIBO.Article))

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

#### Save ontology

In [60]:
g.serialize(destination='./output/graph/ontology.ttl', format='turtle')

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

#### **Graph construction**

In [61]:
merged = pd.read_csv("./data/merged-dataset.csv")
paper_resources = set()

for model_id, group in merged.groupby("id_x"):
    first_row = group.iloc[0]
    model_uri = URIRef(KG + first_row["id_x"])
    g.add((model_uri, RDF.type, KG.Model))
    g.add((model_uri, KG.name, Literal(first_row["id_x"].split("/")[1])))
    
    creator_id = first_row["author"].strip()
    creator_uri = URIRef(KG + "Person/" + creator_id)

    g.add((creator_uri, RDF.type, SCHEMA.Person))  
    g.add((model_uri, KG.creator, creator_uri))

    g.add((model_uri, KG.created_at, Literal(first_row["created_at"], datatype=XSD.dateTime)))
    g.add((model_uri, KG.downloads, Literal(first_row["downloads"], datatype=XSD.integer)))
    if not pd.isna(first_row["pipeline_tag"]):
        g.add((model_uri, KG.task, Literal(first_row["pipeline_tag"])))
    
    if not pd.isna(first_row["base_model"]):
        base_model = first_row["base_model"]
        if "/" in base_model:
            base_model = base_model.split("/")[1]
        g.add((model_uri, KG.base_model, Literal(base_model)))
    
    language_string = first_row["language"]
    if not pd.isna(language_string) and language_string.strip():
        language_list = [lang.strip() for lang in language_string.split(",") if lang.strip()]
        for lang in language_list:
            g.add((model_uri, KG.language, Literal(lang)))
    
    for idx, row in group.iterrows():
        metrics_list = json.loads(row["evaluation_metrics"])
        for metric in metrics_list:
            metric_uri = URIRef(KG + "EvaluationMetric/" + str(uuid.uuid4()))
            g.add((metric_uri, RDF.type, KG.EvaluationMetric))
            g.add((metric_uri, KG.taskType, Literal(metric["task_type"])))
            g.add((metric_uri, KG.datasetName, Literal(metric["dataset_name"])))
            if metric.get("metric_type"):
                g.add((metric_uri, KG.metricType, Literal(metric["metric_type"])))
            if metric.get("metric_value"):
                g.add((metric_uri, KG.metricValue, Literal(metric["metric_value"], datatype=XSD.float)))
            else:
                if metric.get("metric_mean"):
                    g.add((metric_uri, KG.metricMean, Literal(metric["metric_mean"], datatype=XSD.float)))
                if metric.get("metric_error"):
                    g.add((metric_uri, KG.metricError, Literal(metric["metric_error"], datatype=XSD.float)))
            g.add((model_uri, KG.hasEvaluationMetric, metric_uri))

g.serialize(destination="./output/graph/models-metrics-papers.ttl", format="turtle")

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

#### Add papers to the graph and link them to the models

In [None]:
merged_temp = merged.dropna(subset=["id_y"])  # drop rows without paper IDs

for idx, row in merged_temp.iterrows():
    paper_uri = URIRef(KG + "Paper/" + str(row["id_y"]))
    # Add the paper instance to the graph
    g.add((paper_uri, RDF.type, KG.Paper))
    
    model_uri = URIRef(KG + row["id_x"])
  
    # Add the identifier of the paper
    paper_identifier = str(row["id_y"])
    g.add((paper_uri, BIBO.identifier, Literal(paper_identifier)))

    # Add the title of the paper
    title = row["title"]
    g.add((paper_uri, BIBO.title, Literal(title, lang="en")))

    # Add the authors of the paper
    authors = row["authors"] 
    g.add((paper_uri, BIBO.authorList, Literal(authors, lang="en")))

    # Add the abstract of the paper
    abstract = row["summary"]
    g.add((paper_uri, BIBO.abstract, Literal(abstract, lang="en")))

    # Add the hasPaper property to link the model to the paper
    g.add((model_uri, KG.hasPaper, paper_uri))


g.serialize(destination="./output/graph/models-metrics-papers.ttl", format="turtle")

<Graph identifier=Na8c1505d5da94a1dba4e2bbf7a550e8c (<class 'rdflib.graph.Graph'>)>

In [63]:
print(len(g))

4148779
