In [21]:
import pandas as pd
import json
import uuid
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, OWL, Literal, URIRef
from rdflib.namespace import XSD, RDF, SDO, RDFS

KG = Namespace("http://kg-course/model-management/")
BIBO = Namespace("http://purl.org/ontology/bibo/")
SCHEMA = Namespace("http://schema.org/")


g = Graph()
g.bind("kg", KG)
g.bind("bibo", BIBO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("schema1", SCHEMA)
g.bind("xsd", XSD)

#### Models class and its properties

In [22]:
# Model class
g.add((KG.Model, RDF.type, RDFS.Class))
g.add((KG.Model, RDFS.label, Literal("Model", lang="en")))
g.add((KG.Model, RDFS.comment, Literal("NLP, Computer Vision, Reinforcement Learning etc. models", lang="en")))

# Model name
g.add((KG.name, RDF.type, OWL.DatatypeProperty))
g.add((KG.name, RDFS.label, Literal("Model name", lang="en")))
g.add((KG.name, RDFS.domain, KG.Model))
g.add((KG.name, RDFS.range, RDFS.Literal))

# Model creator
g.add((KG.creator, RDF.type, OWL.ObjectProperty))
g.add((KG.creator, RDFS.label, Literal("Model creator", lang="en")))
g.add((KG.creator, RDFS.domain, KG.Model))
g.add((KG.creator, RDFS.range, SCHEMA.Person))

# Creation date
g.add((KG.created_at, RDF.type, OWL.DatatypeProperty))
g.add((KG.created_at, RDFS.label, Literal("Creation date", lang="en")))
g.add((KG.created_at, RDFS.domain, KG.Model))
g.add((KG.created_at, RDFS.range, XSD.dateTime))

# Downloads
g.add((KG.downloads, RDF.type, OWL.DatatypeProperty))
g.add((KG.downloads, RDFS.label, Literal("Number of times a model has been downloaded", lang="en")))
g.add((KG.downloads, RDFS.domain, KG.Model))
g.add((KG.downloads, RDFS.range, XSD.integer))

# Task
g.add((KG.task, RDF.type, OWL.DatatypeProperty))
g.add((KG.task, RDFS.label, Literal("Model task", lang="en")))
g.add((KG.task, RDFS.comment, Literal("The task the model is designed to perform", lang="en")))
g.add((KG.task, RDFS.domain, KG.Model))
g.add((KG.task, RDFS.range, RDFS.Literal))

# hasPaper property
g.add((KG.hasPaper, RDF.type, OWL.ObjectProperty))
g.add((KG.hasPaper, RDFS.label, Literal("has Paper", lang="en")))
g.add((KG.hasPaper, RDFS.comment, Literal("The ID of the paper that the model is based on or described in", lang="en")))
g.add((KG.hasPaper, RDFS.domain, KG.Model))
g.add((KG.hasPaper, RDFS.range, KG.Paper))

# Base model
g.add((KG.base_model, RDF.type, OWL.DatatypeProperty))
g.add((KG.base_model, RDFS.label, Literal("Base model", lang="en")))
g.add((KG.base_model, RDFS.comment, Literal("The base model the model is built on", lang="en")))
g.add((KG.base_model, RDFS.domain, KG.Model))
g.add((KG.base_model, RDFS.range, RDFS.Literal))

# Language
g.add((KG.language, RDF.type, OWL.DatatypeProperty))
g.add((KG.language, RDFS.label, Literal("Supported languages", lang="en")))
g.add((KG.language, RDFS.comment, Literal("The languages the model supports (e.g. IT, EN, etc.)", lang="en")))
g.add((KG.language, RDFS.domain, KG.Model))
g.add((KG.language, RDFS.range, RDFS.Literal))

# hasEvaluationMetric property to link model to evaluation metrics
g.add((KG.hasEvaluationMetric, RDF.type, OWL.ObjectProperty))
g.add((KG.hasEvaluationMetric, RDFS.label, Literal("has Evaluation Metric", lang="en")))
g.add((KG.hasEvaluationMetric, RDFS.domain, KG.Model))
g.add((KG.hasEvaluationMetric, RDFS.range, KG.EvaluationMetric))

<Graph identifier=N064bdd83fcbe46b494f192e550663925 (<class 'rdflib.graph.Graph'>)>

#### Evaluation metrics class and properties

In [23]:

# ----------------- Evaluation Metric class -----------------#

g.add((KG.EvaluationMetric, RDF.type, RDFS.Class))
g.add((KG.EvaluationMetric, RDFS.label, Literal("Evaluation Metric", lang="en")))
g.add((KG.EvaluationMetric, RDFS.comment, Literal("The evaluation metric used to evaluate the model", lang="en")))

#----------------- Evaluation Metric properties -----------------#

# Task type
g.add((KG.taskType, RDF.type, OWL.DatatypeProperty))
g.add((KG.taskType, RDFS.label, Literal("Task on which the model was evaluated", lang="en")))
g.add((KG.taskType, RDFS.domain, KG.EvaluationMetric))
g.add((KG.taskType, RDFS.range, RDFS.Literal))

# Dataset name
g.add((KG.datasetName, RDF.type, OWL.DatatypeProperty))
g.add((KG.datasetName, RDFS.label, Literal("Dataset Name", lang="en")))
g.add((KG.datasetName, RDFS.comment, Literal("The name of the dataset used to evaluate the model, e.g. MNIST, IMDB, or custom", lang="en")))
g.add((KG.datasetName, RDFS.domain, KG.EvaluationMetric))
g.add((KG.datasetName, RDFS.range, RDFS.Literal))

# Metric type
g.add((KG.metricType, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricType, RDFS.label, Literal("Metric Type", lang="en")))
g.add((KG.metricType, RDFS.comment, Literal("The type of evaluation metric used, e.g. accuracy, F1 score, etc.", lang="en")))
g.add((KG.metricType, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricType, RDFS.range, RDFS.Literal))

# Metric value
g.add((KG.metricValue, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricValue, RDFS.label, Literal("Metric Value", lang="en")))
g.add((KG.metricValue, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricValue, RDFS.range, XSD.float))

# Metric error
g.add((KG.metricError, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricError, RDFS.label, Literal("Metric Error", lang="en")))
g.add((KG.metricError, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricError, RDFS.range, XSD.float))

# Metric mean
g.add((KG.metricMean, RDF.type, OWL.DatatypeProperty))
g.add((KG.metricMean, RDFS.label, Literal("Metric Mean", lang="en")))
g.add((KG.metricMean, RDFS.domain, KG.EvaluationMetric))
g.add((KG.metricMean, RDFS.range, XSD.float))

<Graph identifier=N064bdd83fcbe46b494f192e550663925 (<class 'rdflib.graph.Graph'>)>

#### Paper class

In [24]:
# ----------------- Paper class -----------------#
g.add((KG.Paper, RDF.type, OWL.Class))
g.add((KG.Paper, RDFS.label, Literal("Scientific paper", lang="en")))
g.add((KG.Paper, RDFS.comment, Literal("A scientific paper that describes a model or research work", lang="en")))
g.add((KG.Paper, OWL.equivalentClass, BIBO.Article))
g.add((SCHEMA.Person, RDFS.comment, Literal("A person, company, or organization that created the model", lang="en")))

<Graph identifier=N064bdd83fcbe46b494f192e550663925 (<class 'rdflib.graph.Graph'>)>

#### Save ontology

In [25]:
g.serialize(destination='./output/graph/ontology.ttl', format='turtle')

<Graph identifier=N064bdd83fcbe46b494f192e550663925 (<class 'rdflib.graph.Graph'>)>

#### **Graph construction**

In [26]:
'''dataframe = pd.read_csv("./data/merged-dataset.csv")
dataframe = (dataframe.groupby("id_x"))
display(dataframe.head())

for index, row in dataframe.iterrows():
    model_uri = URIRef(KG + row["id"])
    g.add((model_uri, RDF.type, KG.Model))
    g.add((model_uri, KG.name, Literal(row["id"].split("/")[1])))
    g.add((model_uri, KG.creator, Literal(row["author"])))
    g.add((model_uri, KG.created_at, Literal(row["created_at"], datatype=XSD.dateTime)))
    g.add((model_uri, KG.downloads, Literal(row["downloads"], datatype=XSD.integer)))

    if not pd.isna(row["pipeline_tag"]):
       g.add((model_uri, KG.task, Literal(row["pipeline_tag"])))

    if not pd.isna(row["base_model"]):
        if "/" in row["base_model"]:
            row["base_model"] = row["base_model"].split("/")[1]
            g.add((model_uri, KG.base_model, Literal(row["base_model"])))
        else:
            g.add((model_uri, KG.base_model, Literal(row["base_model"])))
    language_string = row["language"]

    if pd.isna(language_string):
        pass
    else:
        language_list = [aid.strip() for aid in language_string.split(",") if aid.strip()] 
        if len(language_list) == 1:
            g.add((model_uri, KG.language, Literal(language_list[0])))
        elif len(language_list) > 1:
            for lang in language_list:
                g.add((model_uri, KG.language, Literal(lang)))
    
    metrics_list = json.loads(row["evaluation_metrics"])
    for metric in metrics_list:
        metric_uri = URIRef(KG + "EvaluationMetric_" + str(uuid.uuid4()))
        g.add((metric_uri, RDF.type, KG.EvaluationMetric))
        g.add((metric_uri, KG.taskType, Literal(metric["task_type"])))
        g.add((metric_uri, KG.datasetName, Literal(metric["dataset_name"])))
        if metric.get("metric_type"):
            g.add((metric_uri, KG.metricType, Literal(metric["metric_type"])))
        if metric.get("metric_value"):
            g.add((metric_uri, KG.metricValue, Literal(metric["metric_value"], datatype=XSD.float)))
        else:
            if metric.get("metric_mean"):
                g.add((metric_uri, KG.metricMean, Literal(metric["metric_mean"], datatype=XSD.float)))
            if metric.get("metric_error"):
                g.add((metric_uri, KG.metricError, Literal(metric["metric_error"], datatype=XSD.float)))
        
        g.add((model_uri, KG.hasEvaluationMetric, metric_uri))

g.serialize(destination="./output/graph/models-metrics.ttl", format="turtle")'''

'dataframe = pd.read_csv("./data/merged-dataset.csv")\ndataframe = (dataframe.groupby("id_x"))\ndisplay(dataframe.head())\n\nfor index, row in dataframe.iterrows():\n    model_uri = URIRef(KG + row["id"])\n    g.add((model_uri, RDF.type, KG.Model))\n    g.add((model_uri, KG.name, Literal(row["id"].split("/")[1])))\n    g.add((model_uri, KG.creator, Literal(row["author"])))\n    g.add((model_uri, KG.created_at, Literal(row["created_at"], datatype=XSD.dateTime)))\n    g.add((model_uri, KG.downloads, Literal(row["downloads"], datatype=XSD.integer)))\n\n    if not pd.isna(row["pipeline_tag"]):\n       g.add((model_uri, KG.task, Literal(row["pipeline_tag"])))\n\n    if not pd.isna(row["base_model"]):\n        if "/" in row["base_model"]:\n            row["base_model"] = row["base_model"].split("/")[1]\n            g.add((model_uri, KG.base_model, Literal(row["base_model"])))\n        else:\n            g.add((model_uri, KG.base_model, Literal(row["base_model"])))\n    language_string = row[

In [None]:
merged = pd.read_csv("./data/merged-dataset.csv")
i = 0
paper_resources = {}

for model_id, group in merged.groupby("id_x"):
    first_row = group.iloc[0]
    model_uri = URIRef(KG + first_row["id_x"])
    g.add((model_uri, RDF.type, KG.Model))
    g.add((model_uri, KG.name, Literal(first_row["id_x"].split("/")[1])))
    g.add((model_uri, KG.creator, Literal(first_row["author"])))
    g.add((model_uri, KG.created_at, Literal(first_row["created_at"], datatype=XSD.dateTime)))
    g.add((model_uri, KG.downloads, Literal(first_row["downloads"], datatype=XSD.integer)))
    if not pd.isna(first_row["pipeline_tag"]):
        g.add((model_uri, KG.task, Literal(first_row["pipeline_tag"])))
    
    if not pd.isna(first_row["base_model"]):
        base_model = first_row["base_model"]
        if "/" in base_model:
            base_model = base_model.split("/")[1]
        g.add((model_uri, KG.base_model, Literal(base_model)))
    
    language_string = first_row["language"]
    if not pd.isna(language_string) and language_string.strip():
        language_list = [lang.strip() for lang in language_string.split(",") if lang.strip()]
        for lang in language_list:
            g.add((model_uri, KG.language, Literal(lang)))
    
    # Process evaluation metrics for each row in the group (if metrics vary per row)
    for idx, row in group.iterrows():
        metrics_list = json.loads(row["evaluation_metrics"])
        for metric in metrics_list:
            metric_uri = URIRef(KG + "EvaluationMetric_" + str(uuid.uuid4()))
            g.add((metric_uri, RDF.type, KG.EvaluationMetric))
            g.add((metric_uri, KG.taskType, Literal(metric["task_type"])))
            g.add((metric_uri, KG.datasetName, Literal(metric["dataset_name"])))
            if metric.get("metric_type"):
                g.add((metric_uri, KG.metricType, Literal(metric["metric_type"])))
            if metric.get("metric_value"):
                g.add((metric_uri, KG.metricValue, Literal(metric["metric_value"], datatype=XSD.float)))
            else:
                if metric.get("metric_mean"):
                    g.add((metric_uri, KG.metricMean, Literal(metric["metric_mean"], datatype=XSD.float)))
                if metric.get("metric_error"):
                    g.add((metric_uri, KG.metricError, Literal(metric["metric_error"], datatype=XSD.float)))
            g.add((model_uri, KG.hasEvaluationMetric, metric_uri))
    
    # Process paper linking for the entire group (unique papers across rows)
    # Process paper linking for the entire group (unique papers across rows)
    unique_papers = group['id_y'].dropna().unique()  # paper_info id column
    for paper_id in unique_papers:
        paper_id_str = str(paper_id).strip()
        if paper_id_str == "":
            continue
        if paper_id_str not in paper_resources:
            paper_uri = URIRef(KG + "Paper_" + paper_id_str)
            paper_resources[paper_id_str] = paper_uri
            g.add((paper_uri, RDF.type, KG.Paper))
            g.add((paper_uri, BIBO.identifier, Literal(paper_id_str)))
            authors = group.iloc[0].get("authors")
            if pd.notna(authors):
                authors = authors.strip()
                g.add((paper_uri, BIBO.authorList, Literal(authors)))
            title = group.iloc[0].get("title")
            if pd.notna(title):
                title = title.strip()
                g.add((paper_uri, BIBO.title, Literal(title, lang="en")))
            summary = group.iloc[0].get("summary")  
            if pd.notna(summary) and summary.strip():
                g.add((paper_uri, BIBO.abstract, Literal(summary, lang="en")))
        else:
            paper_uri = paper_resources[paper_id_str]
        # Link the model to the paper
        g.add((model_uri, KG.hasPaper, paper_uri))

g.serialize(destination="./output/graph/models-metrics-papers.ttl", format="turtle")

<Graph identifier=N064bdd83fcbe46b494f192e550663925 (<class 'rdflib.graph.Graph'>)>

In [28]:
print(len(g))

3674


In [29]:
merged.loc[merged["author"]== "1aurent"]

Unnamed: 0,id_x,author,created_at,downloads,pipeline_tag,arxiv_ids,base_model,language,evaluation_metrics,id_y,authors,title,summary
9572,1aurent/vit_base_patch16_224.owkin_pancancer,1aurent,2023-10-22T22:56:17,110,feature-extraction,,,,"[{""task_type"": ""image-classification"", ""datase...",,,,
9653,1aurent/vit_base_patch16_224.owkin_pancancer_f...,1aurent,2023-10-27T16:52:29,28,image-classification,,1aurent/vit_base_patch16_224.owkin_pancancer,,"[{""task_type"": ""image-classification"", ""datase...",,,,
9663,1aurent/resnet50.tcga_brca_simclr,1aurent,2023-10-28T13:33:20,46,feature-extraction,arxiv:2203.00585,,,"[{""task_type"": ""image-classification"", ""datase...",2203.00585,"Richard J. Chen, Rahul G. Krishnan",Self-Supervised Vision Transformers Learn Visu...,Tissue phenotyping is a fundamental task in le...
9664,1aurent/vit_small_patch16_256.tcga_brca_dino,1aurent,2023-10-28T14:42:11,41,feature-extraction,arxiv:2203.00585,,,"[{""task_type"": ""image-classification"", ""datase...",2203.00585,"Richard J. Chen, Rahul G. Krishnan",Self-Supervised Vision Transformers Learn Visu...,Tissue phenotyping is a fundamental task in le...
9702,1aurent/swin_tiny_patch4_window7_224.CTransPath,1aurent,2023-10-31T01:25:20,2176,feature-extraction,,,,"[{""task_type"": ""image-classification"", ""datase...",,,,
9826,1aurent/phikon-finetuned-lora-kather2016,1aurent,2023-11-08T19:42:56,13,image-classification,,owkin/phikon,,"[{""task_type"": ""image-classification"", ""datase...",,,,
9830,1aurent/phikon-distil-mobilenet_v2-kather2016,1aurent,2023-11-09T10:16:17,235,image-classification,,1aurent/phikon-finetuned-lora-kather2016,,"[{""task_type"": ""image-classification"", ""datase...",,,,
9832,1aurent/phikon-distil-vit-tiny-patch16-224-kat...,1aurent,2023-11-09T11:00:55,254,image-classification,,1aurent/phikon-finetuned-lora-kather2016,,"[{""task_type"": ""image-classification"", ""datase...",,,,
13928,1aurent/vit_small_patch16_224.kaiko_ai_towards...,1aurent,2024-06-07T20:17:03,119,feature-extraction,arxiv:2404.15217,,,"[{""task_type"": ""image-classification"", ""datase...",2404.15217,"kaiko. ai, Nanne Aben, Edwin D. de Jong, Ioann...",Towards Large-Scale Training of Pathology Foun...,Driven by the recent advances in deep learning...
13929,1aurent/vit_small_patch8_224.kaiko_ai_towards_...,1aurent,2024-06-07T20:19:59,138,feature-extraction,arxiv:2404.15217,,,"[{""task_type"": ""image-classification"", ""datase...",2404.15217,"kaiko. ai, Nanne Aben, Edwin D. de Jong, Ioann...",Towards Large-Scale Training of Pathology Foun...,Driven by the recent advances in deep learning...
