# Knowledge Graphs - datasets quality review
### Analysis of the quality of top publishers at data.europa.eu
##### Patryk Rakus, Michał Tomczyk

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import SPARQL_to_df
from FAIR import findability, accessibility, interoperability, reusability
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# One SPARQL client for the data.europa.eu endpoint; the query cells below all pass it on.
SPARQL_ENDPOINT = "https://data.europa.eu/sparql"
connection = SPARQLWrapper(SPARQL_ENDPOINT)

In [3]:
# Namespace prefix declarations for the SPARQL queries. They are kept apart from
# the query text and handed to SPARQL_to_df and the FAIR helpers as a separate argument.
PREFIXES = """
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX void: <http://rdfs.org/ns/void#>
"""


Finding the top publishers:

In [4]:
# Rank publishers by the number of dcat:Dataset resources that reference them through
# dct:publisher, largest first. There is no LIMIT, so every publisher is returned.
# NOTE(review): `dct:publisher?publisher` has no space before the variable; the query
# evidently runs as written (see the output below), but a space would read more clearly.
query = '''
SELECT ?publisher (count(*) as ?DatasetsPublished) WHERE {
    ?datasetURI a dcat:Dataset;
    dct:publisher?publisher .
}
GROUP BY ?publisher
ORDER BY DESC(?DatasetsPublished)
'''

In [5]:
# Run the ranking query against the endpoint. The result is shown in the next cell
# as a table with `publisher` and `DatasetsPublished` columns.
df = SPARQL_to_df(connection, query, PREFIXES)

In [6]:
# Preview the top of the publisher ranking.
# NOTE(review): 21 rows are requested while the following cells work with the top 20 —
# presumably to show the first publisher below the cut-off; confirm.
df.head(21)

Unnamed: 0,publisher,DatasetsPublished
0,https://www.data.gv.at/katalog/organization/f0...,42544
1,http://datos.gob.es/recurso/sector-publico/org...,21014
2,http://datos.gob.es/recurso/sector-publico/org...,20482
3,https://opendata.schleswig-holstein.de/organiz...,15193
4,https://data.gov.ie/organization/central-stati...,10699
5,http://publications.europa.eu/resource/authori...,9242
6,http://datos.gob.es/recurso/sector-publico/org...,8217
7,http://standaarden.overheid.nl/owms/terms/Cent...,7740
8,https://www.data.gv.at/katalog/organization/89...,6151
9,http://dataportal.se/organisation/SE2220000315,6122


In [20]:
# Total number of datasets published by the 20 highest-ranked publishers.
top_counts = df['DatasetsPublished'].iloc[:20]
top_counts.astype('Int64').sum()

np.int64(187527)

In [7]:
# Publisher IRIs of the 20 highest-ranked publishers, kept in ranking order.
top_publishers = df['publisher'].head(20).tolist()

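On Python < 3.12, a backslash (e.g. `\n`) cannot appear inside the expression part of an f-string, so the newline character is stored in a variable first and interpolated from there. (Python 3.12 lifted this restriction.)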

In [8]:
# Before Python 3.12 a backslash may not appear inside an f-string replacement field,
# so the newline is bound to a name that the query template below can interpolate.
newline = '\n'

In [9]:
# One "<IRI>" entry per line for the VALUES clause; each entry ends with a newline.
publisher_values = "".join(f"<{uri}>{newline}" for uri in top_publishers)

# Graph pattern (not a full SELECT) that binds ?dataset to the datasets of the selected
# publishers. It is passed to the FAIR helper functions in the cells that follow.
query = f'''
VALUES ?publisher {{
    {publisher_values}
    }}
    ?dataset a dcat:Dataset ;
           dct:publisher ?publisher .
'''

In [10]:
# Per-publisher findability metrics for the datasets matched by the graph pattern in `query`.
df_findability = findability.findability_aggregated_properties(connection, query, PREFIXES)

In [11]:
# Show the findability table (one row per publisher).
df_findability

Unnamed: 0,publisher,totalDatasets,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,http://publications.europa.eu/resource/authori...,3662,1,100.0,100.0,89.13162206444566,0.0,100
1,https://www.bfs.admin.ch/,2904,1,100.0,100.0,98.58815426997245,0.0,0
2,https://org.belgif.be/id/CbeEstablishmentUnit/...,5660,1,100.0,97.54416961130742,100.0,0.0,0
3,https://opendata.schleswig-holstein.de/organiz...,15193,1,100.0,100.0,99.91443427894424,0.085565721055749,0
4,http://datos.gob.es/recurso/sector-publico/org...,8217,1,100.0,100.0,100.0,0.0,0
5,https://www.data.gv.at/katalog/organization/f0...,42544,1,100.0,100.0,0.0,0.0,0
6,https://data.gov.ie/organization/central-stati...,10699,1,100.0,100.0,0.028040003738667,0.0,0
7,https://www.data.gv.at/katalog/organization/89...,6151,1,100.0,100.0,0.0,0.0,0
8,http://standaarden.overheid.nl/owms/terms/Cent...,7740,1,100.0,0.0,0.0,0.0,0
9,http://publications.europa.eu/resource/authori...,9242,1,100.0,90.80285652456178,72.67907379355118,99.95671932482145,0


In [12]:
# Per-publisher interoperability metrics for the datasets matched by the graph pattern in `query`.
df_interoperability = interoperability.interoperability_aggregated_properties(connection, query, PREFIXES)

In [13]:
# Show the interoperability table (one row per publisher).
df_interoperability

Unnamed: 0,publisher,totalDatasets,avgReferences,percentageWithLanguage,percentageWithAccessRights,percentageBeingVersionOf,percentageWithIdentifier,percentageWithRightsHolder
0,https://org.belgif.be/id/CbeEstablishmentUnit/...,5660,1.0,100.0,100.0,0.0,100.0,99.68197879858656
1,http://datos.gob.es/recurso/sector-publico/org...,8217,4.044541803577948,100.0,0.0,0.0,100.0,0.0
2,https://www.bfs.admin.ch/,2904,1.0,100.0,0.0,0.0,100.0,0.0
3,https://data.gov.ie/organization/central-stati...,10699,1.0,99.99065333208712,0.0,0.0,100.0,0.0
4,http://publications.europa.eu/resource/authori...,3662,1.082195521572911,87.73894046968869,0.0,0.0,100.0,0.0
5,https://opendata.schleswig-holstein.de/organiz...,15193,1.0,0.04607384979925,0.0394918712565,2.415586125189232,100.0,0.0
6,https://www.data.gv.at/katalog/organization/f0...,42544,1.0,0.0,0.0,0.0,100.0,0.0
7,https://www.data.gv.at/katalog/organization/89...,6151,1.0,0.0,0.0,0.0,100.0,0.0
8,http://standaarden.overheid.nl/owms/terms/Cent...,7740,1.0,100.0,16.963824289405682,0.0,0.0,0.0
9,http://publications.europa.eu/resource/authori...,9242,1.0,0.0324605063839,94.6872971218351,0.0,99.97835966241072,0.0


In [14]:
# Join the interoperability and findability metrics into a single row per publisher.
# validate='one_to_one' makes pandas raise MergeError if either frame contains a
# repeated publisher, instead of silently multiplying rows in the result.
# NOTE(review): both inputs carry a `totalDatasets` column, so the result contains
# `totalDatasets_x` / `totalDatasets_y`; kept as-is in case later cells use those names.
df_both_attributes = df_interoperability.merge(
    df_findability,
    on='publisher',
    validate='one_to_one',
)

In [18]:
# Show the combined interoperability + findability table.
df_both_attributes

Unnamed: 0,publisher,totalDatasets_x,avgReferences,percentageWithLanguage,percentageWithAccessRights,percentageBeingVersionOf,percentageWithIdentifier,percentageWithRightsHolder,totalDatasets_y,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,https://org.belgif.be/id/CbeEstablishmentUnit/...,5660,1.0,100.0,100.0,0.0,100.0,99.68197879858656,5660,1,100.0,97.54416961130742,100.0,0.0,0
1,http://datos.gob.es/recurso/sector-publico/org...,8217,4.044541803577948,100.0,0.0,0.0,100.0,0.0,8217,1,100.0,100.0,100.0,0.0,0
2,https://www.bfs.admin.ch/,2904,1.0,100.0,0.0,0.0,100.0,0.0,2904,1,100.0,100.0,98.58815426997245,0.0,0
3,https://data.gov.ie/organization/central-stati...,10699,1.0,99.99065333208712,0.0,0.0,100.0,0.0,10699,1,100.0,100.0,0.028040003738667,0.0,0
4,http://publications.europa.eu/resource/authori...,3662,1.082195521572911,87.73894046968869,0.0,0.0,100.0,0.0,3662,1,100.0,100.0,89.13162206444566,0.0,100
5,https://opendata.schleswig-holstein.de/organiz...,15193,1.0,0.04607384979925,0.0394918712565,2.415586125189232,100.0,0.0,15193,1,100.0,100.0,99.91443427894424,0.085565721055749,0
6,https://www.data.gv.at/katalog/organization/f0...,42544,1.0,0.0,0.0,0.0,100.0,0.0,42544,1,100.0,100.0,0.0,0.0,0
7,https://www.data.gv.at/katalog/organization/89...,6151,1.0,0.0,0.0,0.0,100.0,0.0,6151,1,100.0,100.0,0.0,0.0,0
8,http://standaarden.overheid.nl/owms/terms/Cent...,7740,1.0,100.0,16.963824289405682,0.0,0.0,0.0,7740,1,100.0,0.0,0.0,0.0,0
9,http://publications.europa.eu/resource/authori...,9242,1.0,0.0324605063839,94.6872971218351,0.0,99.97835966241072,0.0,9242,1,100.0,90.80285652456178,72.67907379355118,99.95671932482145,0
