# Knowledge Graphs - datasets quality review
### Analysis of the quality of top publishers at data.europa.eu
##### Patryk Rakus, Michał Tomczyk

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import SPARQL_to_df
from FAIR import findability, accessibility, interoperability, reusability
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# SPARQL endpoint of data.europa.eu; the client object is handed to the helper
# functions called in the cells below.
SPARQL_ENDPOINT = "https://data.europa.eu/sparql"
connection = SPARQLWrapper(SPARQL_ENDPOINT)

In [3]:
# Namespace prefix declarations, kept in a single string that is passed next to
# each query to the helper functions used in this notebook.
PREFIXES = """
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX void: <http://rdfs.org/ns/void#>
"""


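Finding the most frequent themes (ranked by number of datasets). The cells below work with rows 10–19 of this ranking, i.e. the 11th–20th most frequent themes: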

In [4]:
# Count the datasets attached to each dcat:theme value and rank the themes by
# that count, largest first. A dataset that has several themes is counted once
# under each of them.
query = '''
SELECT ?theme (count(*) as ?DatasetsPublished) WHERE {
    ?datasetURI a dcat:Dataset;
    dcat:theme ?theme .
}
GROUP BY ?theme
ORDER BY DESC(?DatasetsPublished)
'''

In [5]:
# Run the theme-ranking query; the stored output of the next cell shows the
# result has `theme` and `DatasetsPublished` columns.
df = SPARQL_to_df(connection, query, PREFIXES)

In [65]:
# Rows at positions 10-19 of the ranking (the 11th to 20th most frequent themes).
df.iloc[10:20]

Unnamed: 0,theme,DatasetsPublished
10,http://publications.europa.eu/resource/authori...,57074
11,http://publications.europa.eu/resource/authori...,35656
12,http://inspire.ec.europa.eu/theme/cp,27513
13,http://eurovoc.europa.eu/5079,27511
14,http://eurovoc.europa.eu/100142,26727
15,http://eurovoc.europa.eu/100154,26154
16,http://publications.europa.eu/resource/authori...,26111
17,http://eurovoc.europa.eu/100160,25726
18,http://publications.europa.eu/resource/authori...,14263
19,http://eurovoc.europa.eu/100156,11005


In [66]:
# Sum of the per-theme counts for the selected rows. Because a dataset is
# counted under every theme it has, this is a number of dataset-theme pairs,
# not necessarily of distinct datasets.
# The cast to the nullable 'Int64' dtype is presumably needed because the
# helper returns the counts as strings - TODO confirm in utils.SPARQL_to_df.
(
    df['DatasetsPublished']
    .iloc[10:20]
    .astype('Int64')
    .sum()
)

np.int64(277740)

In [71]:
# Theme URIs of the selected rows, as a plain Python list.
top_themes = df['theme'].iloc[10:20].tolist()

In [72]:
# Newline held in a variable so it can be referenced inside an f-string
# replacement field, where a backslash is not allowed before Python 3.12.
newline = '\n'

In [73]:
# Graph-pattern fragment (not a complete SELECT query): it binds ?theme to the
# selected theme URIs and ?dataset to every dcat:Dataset carrying one of them.
# The fragment is handed to the FAIR helper functions in the cells below.
#
# The VALUES rows are assembled here, one "<uri>" per line, so the cell does not
# rely on a helper variable defined in another cell. The "\n" sits in the
# literal part of the inner f-string, which is valid on every Python version
# that supports f-strings.
theme_values = "".join(f"<{theme}>\n" for theme in top_themes)

query = f'''
VALUES ?theme {{
    {theme_values}
    }}
    ?dataset a dcat:Dataset ;
           dcat:theme ?theme .
'''

In [74]:
# Findability metrics for the selected themes. `query` at this point is the
# graph-pattern fragment (VALUES + dataset/theme triple), not a full query.
# NOTE(review): 'theme' is presumably the variable the helper groups by -
# confirm in FAIR.findability.
# NOTE(review): in the stored output avgKeywords is exactly 1 for every theme,
# which looks suspicious - verify the aggregate inside the helper.
df_findability = findability.findability_aggregated_properties(connection, query, PREFIXES, 'theme')

In [75]:
# Display the findability table (one row per selected theme).
df_findability

Unnamed: 0,theme,totalDatasets,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,http://publications.europa.eu/resource/authori...,57074,1,100.0,44.10239338402775,90.60167501839717,71.23734099590006,45.565406314609106
1,http://publications.europa.eu/resource/authori...,26111,1,100.0,67.99816169430508,59.51514687296542,25.7018114970702,2.17532840565279
2,http://eurovoc.europa.eu/100156,11005,1,100.0,39.05497501135847,100.0,99.77283053157656,25.615629259427532
3,http://eurovoc.europa.eu/5079,27511,1,100.0,0.574315728254153,100.0,99.92003198720512,47.94445858020428
4,http://inspire.ec.europa.eu/theme/cp,27513,1,100.0,0.599716497655654,99.98909606367897,99.93821102751426,47.94097335804892
5,http://publications.europa.eu/resource/authori...,35656,1,100.0,72.79279784608481,66.24971954229302,5.056652456809513,0.098160197442226
6,http://eurovoc.europa.eu/100142,26727,1,99.99625846522243,3.303775208590564,99.98877539566729,99.92142776967114,93.5533355782542
7,http://eurovoc.europa.eu/100160,25726,1,100.0,2.483868459923812,100.0,99.96890305527482,96.11676902744304
8,http://publications.europa.eu/resource/authori...,14263,1,100.0,99.32692981841129,99.97195540910047,0.02804459089953,0.0
9,http://eurovoc.europa.eu/100154,26154,1,100.0,2.840865641966812,100.0,99.9273533685096,94.84591267110191


In [76]:
# Interoperability metrics for the same themes, using the same graph-pattern
# fragment as the findability call.
# NOTE(review): 'theme' is presumably the variable the helper groups by -
# confirm in FAIR.interoperability.
df_interoperability = interoperability.interoperability_aggregated_properties(connection, query, PREFIXES, 'theme')

In [77]:
# Display the interoperability table (one row per selected theme).
df_interoperability

Unnamed: 0,theme,totalDatasets,avgReferences,percentageWithLanguage,percentageWithAccessRights,percentageBeingVersionOf,percentageWithIdentifier,percentageWithRightsHolder
0,http://publications.europa.eu/resource/authori...,26111,1.030829918425185,64.78112672819884,32.93631036727816,8.123013289418253,86.19355827046073,3.967676458197694
1,http://publications.europa.eu/resource/authori...,57074,1.011896835687003,83.8069874198409,19.646423940848724,0.417002487998038,94.7822125661422,8.152573851491047
2,http://eurovoc.europa.eu/100156,11005,1.0,98.23716492503408,0.0,0.0,98.88232621535664,37.2557928214448
3,http://publications.europa.eu/resource/authori...,35656,1.069581557101189,75.68712138209558,38.018846757908904,9.14292124747588,89.39589409916984,9.586044424500784
4,http://eurovoc.europa.eu/100142,26727,1.0,99.71190182212744,0.0,0.0,99.36393908781382,2.660231226849254
5,http://inspire.ec.europa.eu/theme/cp,27513,1.0,99.98182677279831,0.0,0.0,99.89822992767056,4.136226511103842
6,http://eurovoc.europa.eu/5079,27511,1.0,99.96365090327504,0.018174548362473,0.0,99.89822252917016,4.118352658936425
7,http://eurovoc.europa.eu/100160,25726,1.0,99.84062815828345,0.0,0.0,99.90670916582445,1.492653346808676
8,http://publications.europa.eu/resource/authori...,14263,1.0,64.97230596648672,99.5933534319568,0.0,99.97195540910047,99.14463997756432
9,http://eurovoc.europa.eu/100154,26154,1.0,99.90441232698632,0.0,0.0,99.88911829930413,1.953812036399786


In [78]:
# Join the interoperability and findability tables on the theme URI.
# validate='one_to_one' makes pandas raise a MergeError if a theme occurs more
# than once in either frame, instead of silently multiplying rows.
# Both inputs carry a `totalDatasets` column, so the result keeps two copies
# named totalDatasets_x (interoperability) and totalDatasets_y (findability).
# The default inner join drops any theme that is missing from one of the frames.
df_both_attributes = df_interoperability.merge(
    df_findability,
    on='theme',
    validate='one_to_one',
)

In [79]:
# Display the combined table of interoperability and findability metrics.
df_both_attributes

Unnamed: 0,theme,totalDatasets_x,avgReferences,percentageWithLanguage,percentageWithAccessRights,percentageBeingVersionOf,percentageWithIdentifier,percentageWithRightsHolder,totalDatasets_y,avgKeywords,percentageWithTitle,percentageWithIssuedDate,percentageWithLocation,percentageWithType,percentageWithPartOf
0,http://publications.europa.eu/resource/authori...,26111,1.030829918425185,64.78112672819884,32.93631036727816,8.123013289418253,86.19355827046073,3.967676458197694,26111,1,100.0,67.99816169430508,59.51514687296542,25.7018114970702,2.17532840565279
1,http://publications.europa.eu/resource/authori...,57074,1.011896835687003,83.8069874198409,19.646423940848724,0.417002487998038,94.7822125661422,8.152573851491047,57074,1,100.0,44.10239338402775,90.60167501839717,71.23734099590006,45.565406314609106
2,http://eurovoc.europa.eu/100156,11005,1.0,98.23716492503408,0.0,0.0,98.88232621535664,37.2557928214448,11005,1,100.0,39.05497501135847,100.0,99.77283053157656,25.615629259427532
3,http://publications.europa.eu/resource/authori...,35656,1.069581557101189,75.68712138209558,38.018846757908904,9.14292124747588,89.39589409916984,9.586044424500784,35656,1,100.0,72.79279784608481,66.24971954229302,5.056652456809513,0.098160197442226
4,http://eurovoc.europa.eu/100142,26727,1.0,99.71190182212744,0.0,0.0,99.36393908781382,2.660231226849254,26727,1,99.99625846522243,3.303775208590564,99.98877539566729,99.92142776967114,93.5533355782542
5,http://inspire.ec.europa.eu/theme/cp,27513,1.0,99.98182677279831,0.0,0.0,99.89822992767056,4.136226511103842,27513,1,100.0,0.599716497655654,99.98909606367897,99.93821102751426,47.94097335804892
6,http://eurovoc.europa.eu/5079,27511,1.0,99.96365090327504,0.018174548362473,0.0,99.89822252917016,4.118352658936425,27511,1,100.0,0.574315728254153,100.0,99.92003198720512,47.94445858020428
7,http://eurovoc.europa.eu/100160,25726,1.0,99.84062815828345,0.0,0.0,99.90670916582445,1.492653346808676,25726,1,100.0,2.483868459923812,100.0,99.96890305527482,96.11676902744304
8,http://publications.europa.eu/resource/authori...,14263,1.0,64.97230596648672,99.5933534319568,0.0,99.97195540910047,99.14463997756432,14263,1,100.0,99.32692981841129,99.97195540910047,0.02804459089953,0.0
9,http://eurovoc.europa.eu/100154,26154,1.0,99.90441232698632,0.0,0.0,99.88911829930413,1.953812036399786,26154,1,100.0,2.840865641966812,100.0,99.9273533685096,94.84591267110191
