# Knowledge Graphs - datasets quality review
### Analysis of the quality of top publishers at data.europa.eu
##### Patryk Rakus, Michał Tomczyk

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import SPARQL_to_df
from FAIR import findability, accessibility, interoperability, reusability
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
connection = SPARQLWrapper("https://data.europa.eu/sparql")

In [3]:
PREFIXES = """
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX void: <http://rdfs.org/ns/void#>
"""


Finding the top themes:

In [4]:
query = '''
SELECT ?theme (count(*) as ?DatasetsPublished) WHERE {
    ?datasetURI a dcat:Dataset;
    dcat:theme ?theme .
}
GROUP BY ?theme
ORDER BY DESC(?DatasetsPublished)
'''

In [5]:
df = SPARQL_to_df(connection, query, PREFIXES)

In [24]:
df.iloc[25:35]

Unnamed: 0,theme,DatasetsPublished
25,https://data.gov.ie/Environment,4089
26,http://eurovoc.europa.eu/6416,4068
27,http://inspire.ec.europa.eu/theme/nz,2674
28,http://standaarden.overheid.nl/owms/terms/Econ...,2672
29,http://standaarden.overheid.nl/owms/terms/Natu...,2660
30,https://data.gov.ie/Health,2377
31,http://standaarden.overheid.nl/owms/terms/Bestuur,2287
32,http://publications.europa.eu/resource/authori...,1731
33,http://standaarden.overheid.nl/owms/terms/Orga...,1426
34,http://inspire.ec.europa.eu/theme/sd,1328


In [25]:
df.iloc[25:35]['DatasetsPublished'].astype('Int64').sum()

np.int64(25312)

In [26]:
top_themes = list(df.iloc[25:35]['theme'])

In [27]:
newline = '\n'

In [28]:
query = f'''
VALUES ?theme {{
    {"".join(f"<{theme}>" + newline for theme in top_themes)}
    }}
    ?dataset a dcat:Dataset ;
           dcat:theme ?publisher .
'''

In [29]:
df_findability = findability.findability_aggregated_properties(connection, query, PREFIXES, 'theme')

EndPointInternalError: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'Virtuoso 42000 Error The estimated execution time 337 (sec) exceeds the limit of 300 (sec).\n\nSPARQL query:\n\n    PREFIX dcat: <http://www.w3.org/ns/dcat#>\n    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>\n    PREFIX dct: <http://purl.org/dc/terms/>\n    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n    PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n    PREFIX void: <http://rdfs.org/ns/void#>\n\n    SELECT ?theme\n        (COUNT(DISTINCT ?dataset) AS ?totalDatasets)\n        (AVG(?keywordCount) AS ?avgKeywords)\n        ((100.0 * SUM(?hasTitle) / COUNT(DISTINCT ?dataset)) AS ?percentageWithTitle)\n        ((100.0 * SUM(?hasIssuedDate) / COUNT(DISTINCT ?dataset)) AS ?percentageWithIssuedDate)\n        ((100.0 * SUM(?hasLocation) / COUNT(DISTINCT ?dataset)) AS ?percentageWithLocation)\n        ((100.0 * SUM(?hasType) / COUNT(DISTINCT ?dataset)) AS ?percentageWithType)\n        ((100.0 * SUM(?hasPartOf) / COUNT(DISTINCT ?dataset)) AS ?percentageWithPartOf)\n    WHERE {\n        {\n            SELECT ?dataset ?theme \n                (COUNT(DISTINCT ?keyword) AS ?keywordCount)\n                ?hasTitle ?hasIssuedDate ?hasLocation ?hasType ?hasPartOf\n            WHERE {\n                \nVALUES ?theme {\n    <https://data.gov.ie/Environment>\n<http://eurovoc.europa.eu/6416>\n<http://inspire.ec.europa.eu/theme/nz>\n<http://standaarden.overheid.nl/owms/terms/Economie>\n<http://standaarden.overheid.nl/owms/terms/Natuur_en_milieu>\n<https://data.gov.ie/Health>\n<http://standaarden.overheid.nl/owms/terms/Bestuur>\n<http://publications.europa.eu/resource/authority/data-theme/undefined>\n<http://standaarden.overheid.nl/owms/terms/Organisatie_en_beleid>\n<http://inspire.ec.europa.eu/theme/sd>\n\n    }\n    ?dataset a dcat:Dataset ;\n           dcat:theme ?publisher .\n\n                \n                OPTIONAL { ?dataset dct:title ?title }\n                OPTIONAL { ?dataset dct:issued ?issuedDate }\n                OPTIONAL { ?dataset dct:type ?datasetType }\n                OPTIONAL { ?dataset dct:spatial ?datasetLocation }\n                OPTIONAL { ?dataset dct:isPartOf ?isPartOf }\n                \n                BIND (IF(BOUND(?title), 1, 0) AS ?hasTitle)\n                BIND (IF(BOUND(?issuedDate), 1, 0) AS ?hasIssuedDate)\n                BIND (IF(BOUND(?datasetLocation), 1, 0) AS ?hasLocation)\n                BIND (IF(BOUND(?datasetType), 1, 0) AS ?hasType)\n                BIND (IF(BOUND(?isPartOf), 1, 0) AS ?hasPartOf)\n            }\n            GROUP BY ?dataset ?theme ?keywordCount ?hasTitle ?hasIssuedDate ?hasLocation ?hasType ?hasPartOf\n        }\n    }\n    GROUP BY ?theme\n    '

In [11]:
df_findability

Unnamed: 0,publisher,totalDatasets,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,http://publications.europa.eu/resource/authori...,3662,1,100.0,100.0,89.13162206444566,0.0,100
1,https://www.bfs.admin.ch/,2904,1,100.0,100.0,98.58815426997245,0.0,0
2,https://org.belgif.be/id/CbeEstablishmentUnit/...,5660,1,100.0,97.54416961130742,100.0,0.0,0
3,https://opendata.schleswig-holstein.de/organiz...,15193,1,100.0,100.0,99.91443427894424,0.085565721055749,0
4,http://datos.gob.es/recurso/sector-publico/org...,8217,1,100.0,100.0,100.0,0.0,0
5,https://www.data.gv.at/katalog/organization/f0...,42544,1,100.0,100.0,0.0,0.0,0
6,https://data.gov.ie/organization/central-stati...,10699,1,100.0,100.0,0.028040003738667,0.0,0
7,https://www.data.gv.at/katalog/organization/89...,6151,1,100.0,100.0,0.0,0.0,0
8,http://standaarden.overheid.nl/owms/terms/Cent...,7740,1,100.0,0.0,0.0,0.0,0
9,http://publications.europa.eu/resource/authori...,9242,1,100.0,90.80285652456178,72.67907379355118,99.95671932482145,0


In [30]:
df_interoperability = interoperability.interoperability_aggregated_properties(connection, query, PREFIXES)

HTTPError: HTTP Error 504: Gateway Time-out

In [None]:
df_interoperability

In [14]:
df_both_attributes = df_interoperability.merge(df_findability, on='publisher')

In [18]:
df_both_attributes

Unnamed: 0,publisher,totalDatasets_x,avgReferences,percentageWithLanguage,percentageWithAccessRights,percentageBeingVersionOf,percentageWithIdentifier,percentageWithRightsHolder,totalDatasets_y,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,https://org.belgif.be/id/CbeEstablishmentUnit/...,5660,1.0,100.0,100.0,0.0,100.0,99.68197879858656,5660,1,100.0,97.54416961130742,100.0,0.0,0
1,http://datos.gob.es/recurso/sector-publico/org...,8217,4.044541803577948,100.0,0.0,0.0,100.0,0.0,8217,1,100.0,100.0,100.0,0.0,0
2,https://www.bfs.admin.ch/,2904,1.0,100.0,0.0,0.0,100.0,0.0,2904,1,100.0,100.0,98.58815426997245,0.0,0
3,https://data.gov.ie/organization/central-stati...,10699,1.0,99.99065333208712,0.0,0.0,100.0,0.0,10699,1,100.0,100.0,0.028040003738667,0.0,0
4,http://publications.europa.eu/resource/authori...,3662,1.082195521572911,87.73894046968869,0.0,0.0,100.0,0.0,3662,1,100.0,100.0,89.13162206444566,0.0,100
5,https://opendata.schleswig-holstein.de/organiz...,15193,1.0,0.04607384979925,0.0394918712565,2.415586125189232,100.0,0.0,15193,1,100.0,100.0,99.91443427894424,0.085565721055749,0
6,https://www.data.gv.at/katalog/organization/f0...,42544,1.0,0.0,0.0,0.0,100.0,0.0,42544,1,100.0,100.0,0.0,0.0,0
7,https://www.data.gv.at/katalog/organization/89...,6151,1.0,0.0,0.0,0.0,100.0,0.0,6151,1,100.0,100.0,0.0,0.0,0
8,http://standaarden.overheid.nl/owms/terms/Cent...,7740,1.0,100.0,16.963824289405682,0.0,0.0,0.0,7740,1,100.0,0.0,0.0,0.0,0
9,http://publications.europa.eu/resource/authori...,9242,1.0,0.0324605063839,94.6872971218351,0.0,99.97835966241072,0.0,9242,1,100.0,90.80285652456178,72.67907379355118,99.95671932482145,0
