In [1]:
import os
import json
import pandas as pd

In [2]:
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` is deprecated. `display` is imported explicitly so the
# helper does not depend on the name being injected by the notebook front-end.
from IPython.display import HTML, display


def data_frame_to_html(df):
    """Render ``df`` in the cell output area as an HTML table.

    Args:
        df: object exposing ``to_html()`` (e.g. a pandas DataFrame).

    Returns:
        None. The table is shown through IPython's ``display``.
    """
    display(HTML(df.to_html()))

In [3]:
# qrels file with the relevance judgements (read line by line further down)
qrels_fp = "qrels.txt"

# directory that contains one sub-folder per dataset
datasets_folder = "../datasets"

In [4]:
# Dataset folders are named with integer ids and are processed in numeric order.
# Only sub-directories with purely numeric names are kept: stray entries such
# as ".DS_Store" would otherwise make int() raise ValueError during the sort.
datasets = sorted(
    (
        entry
        for entry in os.listdir(datasets_folder)
        if entry.isdigit() and os.path.isdir(os.path.join(datasets_folder, entry))
    ),
    key=int,
)

### Load data

In [5]:
# Column layout of the per-extraction table: one row is appended for every
# entry found under the "extracted" key of a dataset's metadata.json.
edf_columns = [
    "dataset_id",
    "size",
    "extracted_using",
    "classes",
    "literals",
    "entities",
    "properties",
    "connections",
    "connected_vertices",
    "average_literals_per_vertex",
]

# Start from an empty frame; rows are added while the metadata is loaded.
edf = pd.DataFrame(columns=edf_columns)

In [6]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    counting finishes, instead of being left open for the garbage collector.
    """
    with open(path) as handle:
        return sum(1 for _ in handle)


# Append one row to `edf` for every entry of the "extracted" list found in
# each dataset's metadata.json.
for dataset in datasets:
    dataset_base_path = f"{datasets_folder}/{dataset}"
    metadata_file_path = f"{dataset_base_path}/metadata.json"

    # strict=False makes the JSON decoder accept control characters inside
    # string values. The file is closed before the rows are built.
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    for e in metadata["extracted"]:
        # The metadata stores file names relative to the dataset folder.
        classes_file = f"{dataset_base_path}/{e['classesFile']}"
        literals_file = f"{dataset_base_path}/{e['literalsFile']}"
        properties_file = f"{dataset_base_path}/{e['propertiesFile']}"
        entities_file = f"{dataset_base_path}/{e['entitiesFile']}"

        # Each count is the number of lines in the corresponding file.
        classes_count = _count_lines(classes_file)
        literals_count = _count_lines(literals_file)
        entities_count = _count_lines(entities_file)
        properties_count = _count_lines(properties_file)

        # Values are listed in the same order as the columns of `edf`.
        edf.loc[len(edf)] = [
            metadata["id"],
            e["size"],
            e["extractedWith"],
            classes_count,
            literals_count,
            entities_count,
            properties_count,
            e["connections"],
            e["connectedVertices"],
            e["averageLiteralsPerVertex"],
        ]

### Estimate missing data

In [7]:
"""
for dataset in datasets:
    dataset_base_path = f"{datasets_folder}/{dataset}"
    metadata_file_path = f"{dataset_base_path}/metadata.json"
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)
        for u in metadata["unusedFiles"]:
            edf.loc[len(edf)] = [
                metadata["id"],
                u["size"],
                "ESTIMATED",
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan
            ]
"""


'\nfor dataset in datasets:\n    dataset_base_path = f"{datasets_folder}/{dataset}"\n    metadata_file_path = f"{dataset_base_path}/metadata.json"\n    with open(metadata_file_path, "r") as f:\n        metadata = json.load(f, strict=False)\n        for u in metadata["unusedFiles"]:\n            edf.loc[len(edf)] = [\n                metadata["id"],\n                u["size"],\n                "ESTIMATED",\n                np.nan,\n                np.nan,\n                np.nan,\n                np.nan,\n                np.nan,\n                np.nan,\n                np.nan\n            ]\n'

In [8]:
"""
from missforest.miss_forest import MissForest
mf = MissForest()
res = mf.fit_transform(edf)

from IPython.core.display import HTML

display(HTML(res.to_html()))
"""

'\nfrom missforest.miss_forest import MissForest\nmf = MissForest()\nres = mf.fit_transform(edf)\n\nfrom IPython.core.display import HTML\n\ndisplay(HTML(res.to_html()))\n'

### Add relevance to data

In [9]:
# Column layout of the per-dataset table: one row per dataset_id, holding the
# totals aggregated from the per-extraction rows.
tdf_columns = [
    "dataset_id",
    "size",
    "number_of_classes",
    "number_of_literals",
    "number_of_entities",
    "number_of_properties",
    "number_of_connections",
    "number_of_connected_vertices",
    "average_literals_per_vertex",
]

# Start from an empty frame; rows are added during aggregation.
tdf = pd.DataFrame(columns=tdf_columns)

In [10]:
# Collapse the per-extraction rows of `edf` into one row per dataset: every
# column listed below is summed over the rows sharing the same dataset_id.
# The order matches the columns of `tdf` that follow "dataset_id".
# NOTE(review): "average_literals_per_vertex" is summed like the counters, so
# for a dataset with several extractions the stored value is a sum of
# averages rather than an average — confirm this is the intended feature.
summed_columns = [
    "size",
    "classes",
    "literals",
    "entities",
    "properties",
    "connections",
    "connected_vertices",
    "average_literals_per_vertex",
]

grouped = edf.groupby("dataset_id")

for dataset_id, group in grouped:
    # The builtin sum() iterates over the values, so it works whatever dtype
    # the column ended up with after the row-by-row inserts.
    totals = [sum(group[column]) for column in summed_columns]
    tdf.loc[len(tdf)] = [dataset_id] + totals

In [11]:
# data_frame_to_html(tdf)

In [12]:
# Maps query_id -> list of (dataset_id, relevance) pairs. Only judgements with
# a relevance greater than zero are kept.
rank = dict()

with open(qrels_fp, "r") as qrels_file:
    for line in qrels_file:
        tokens = line.split()

        # Blank or whitespace-only lines (e.g. extra newlines at the end of
        # the file) produce no tokens and would raise IndexError below.
        if not tokens:
            continue

        # Whitespace-separated columns read here: 0 = query id,
        # 2 = dataset id, 3 = relevance. Column 1 is not used.
        query_id = tokens[0]
        dataset_id = tokens[2]
        relevance_val = int(tokens[3])

        if relevance_val > 0:
            rank.setdefault(query_id, []).append((dataset_id, relevance_val))

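Add a `relevance` column to the dataframe, computed as the mean of the positive relevance judgements collected for each dataset (judgements equal to 0 are dropped when the qrels file is read); datasets without any positive judgement get 0.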

In [13]:
# dataset_id -> list of relevance values gathered for that dataset
relevance_scores = {}

In [14]:
# Regroup the judgements by dataset: every (dataset_id, relevance) pair stored
# in `rank` contributes its relevance to that dataset's list.
# The loop variables avoid the names `list` and `tuple`: at notebook scope
# they would shadow the builtins for every cell executed afterwards.
for judgements in rank.values():
    for dataset_id, relevance_val in judgements:
        relevance_scores.setdefault(dataset_id, []).append(int(relevance_val))

Add relevance score to the dataframe

In [15]:
from statistics import mean


def mean_relevance(dataset_id):
    """Return the mean collected relevance for ``dataset_id``, or 0 if none.

    The keys of ``relevance_scores`` are the string tokens read from the qrels
    file. The id is normalised with ``str()`` so that an id loaded from JSON
    as an integer still matches instead of silently falling back to 0.
    """
    scores = relevance_scores.get(str(dataset_id))
    return mean(scores) if scores else 0


tdf["relevance"] = tdf["dataset_id"].map(mean_relevance)


In [16]:
# Min-max normalise the relevance column into the [0, 1] range.
relevance_min = tdf["relevance"].min()
relevance_span = tdf["relevance"].max() - relevance_min

if relevance_span > 0:
    tdf["relevance"] = (tdf["relevance"] - relevance_min) / relevance_span
else:
    # Every score is identical (or the frame is empty): dividing by a zero
    # range would turn the whole column into NaN, so fall back to 0.
    tdf["relevance"] = 0.0

### Export dataset

In [19]:
# Write the per-dataset feature table to disk; the row index is left out.
tdf.to_csv(path_or_buf="features.csv", index=False)