# Configuration

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import SPARQL_to_df
from FAIR import findability, accessibility, interoperability, reusability
import pandas as pd

In [2]:
connection = SPARQLWrapper("https://data.europa.eu/sparql")

In [3]:
PREFIXES = """
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX void: <http://rdfs.org/ns/void#>
"""

## Sample query

In [4]:
query = """
    SELECT DISTINCT ?p WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
        ?d ?p ?o .
      }
      LIMIT 5000
    }
    """

df = SPARQL_to_df(connection, query, PREFIXES)

In [5]:
df.head()

Unnamed: 0,p
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1,http://purl.org/dc/terms/created
2,http://purl.org/dc/terms/modified
3,http://www.w3.org/2000/01/rdf-schema#seeAlso
4,http://www.w3.org/ns/dqv#hasQualityMeasurement


In [6]:
accessibility.is_url_ok(df.p[0])

True

In [7]:
df[df.p.str.startswith("http://rdfs.org/ns/void#")]

Unnamed: 0,p


No void vocabulary in the dataset

In [8]:
datasets_query = """
    SELECT ?d WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
      }
      LIMIT 5000
    }
    """

distr_links = accessibility.distribution_links(connection, datasets_query, PREFIXES)

In [9]:
distr_links.head()

Unnamed: 0,dataset,distribution,accessURL,downloadURL
0,http://data.europa.eu/88u/dataset/22984271-bun...,http://data.europa.eu/88u/distribution/1265c36...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...
1,http://data.europa.eu/88u/dataset/22984271-bun...,http://data.europa.eu/88u/distribution/882c468...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...
2,http://data.europa.eu/88u/dataset/22984271-bun...,http://data.europa.eu/88u/distribution/8bf3489...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...
3,http://data.europa.eu/88u/dataset/22984271-bun...,http://data.europa.eu/88u/distribution/d31eb28...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...,https://dam-api.bfs.admin.ch/hub/api/dam/asset...
4,http://data.europa.eu/88u/dataset/https-dane-g...,http://data.europa.eu/88u/distribution/e627608...,https://dane.gov.pl/pl/dataset/2783/resource/3...,"https://api.dane.gov.pl/resources/39657,umowy-..."


In [10]:
distr_agg = distr_links.groupby("distribution").agg({"accessURL": "count", "downloadURL": "count"}).sort_values(["accessURL", "downloadURL"], ascending=False)
distr_agg['ratio'] = distr_agg.downloadURL / distr_agg.accessURL
distr_agg

Unnamed: 0_level_0,accessURL,downloadURL,ratio
distribution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http://data.europa.eu/88u/distribution/c2a8034e-6e4c-4193-855f-69f18d6bd41d,222,222,1.0
http://data.europa.eu/88u/distribution/92faea4e-8a42-4253-b22c-fea0413bc9d1,23,23,1.0
http://data.europa.eu/88u/distribution/a723e418-eb26-4f19-8708-2442b2b4680f,23,23,1.0
http://data.europa.eu/88u/distribution/e8f585b1-96a1-413b-a6a0-1ba6dd8080b2,4,0,0.0
http://data.europa.eu/88u/distribution/bd5ea88f-112b-497d-8e86-e40bd747aed7,2,2,1.0
...,...,...,...
http://data.europa.eu/88u/distribution/fff8b4c6-e771-4c90-bb38-58ece19b2aff,1,0,0.0
http://data.europa.eu/88u/distribution/fffc3456-c3b4-4e85-b861-eff9c006fd30,1,0,0.0
http://data.europa.eu/88u/distribution/01a33f69-b46f-49b4-b4d0-a3611c699d21,0,0,
http://data.europa.eu/88u/distribution/0b4cb8a8-df48-43c6-8bda-523c85e024f5,0,0,


In [11]:
distr_agg = distr_links.groupby("dataset").agg({"accessURL": "count", "downloadURL": "count", "distribution": "count"}).sort_values(["accessURL", "downloadURL"], ascending=False)
distr_agg['ratio'] = distr_agg.downloadURL / (distr_agg.accessURL.apply(lambda x: x or 1))
distr_agg

Unnamed: 0_level_0,accessURL,downloadURL,distribution,ratio
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://data.europa.eu/88u/dataset/https-www-dov-vlaanderen-be-dataset-e5a46140-137f-4f67-87cd-9c359ebbdfd2,225,225,225,1.0
http://data.europa.eu/88u/dataset/organogram-hm-revenue-and-customs,73,0,73,0.0
http://data.europa.eu/88u/dataset/39c7626b-44f6-4ce2-a12c-5a1438237c7a,46,46,46,1.0
http://data.europa.eu/88u/dataset/dc9159e7-88d1-43b4-a952-1b75dfb1041f,23,23,23,1.0
http://data.europa.eu/88u/dataset/https-opendata-ugr-es-dataset-3adf9739-092d-4ed8-8643-87a48e2453d0,21,0,21,0.0
...,...,...,...,...
http://data.europa.eu/88u/dataset/spaicv0202_ortomacval,0,0,0,0.0
http://data.europa.eu/88u/dataset/spasitnaadmloc_pol_redesreser-xml~~1,0,0,0,0.0
http://data.europa.eu/88u/dataset/spasitnaagricu_pol_regviticola_f13t14-xml~~1,0,0,0,0.0
http://data.europa.eu/88u/dataset/urn-x-wmo-md-int-wmo-wis-issa02edzw,0,0,0,0.0


In [12]:
distr_agg[distr_agg.accessURL != distr_agg.distribution]

Unnamed: 0_level_0,accessURL,downloadURL,distribution,ratio
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://data.europa.eu/88u/dataset/250cc292-efee-4df4-a74d-ec7d84464bea,0,0,1,0.0
http://data.europa.eu/88u/dataset/8a72e27f-c065-5365-af42-6b05fffa0d63,0,0,1,0.0
http://data.europa.eu/88u/dataset/99602f78-f90a-5f19-b168-799e2d371ea2,0,0,1,0.0


In [13]:
datasets_query = """
    SELECT ?d WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
      }
      LIMIT 5000
    }
    """

author_info = reusability.publisher_provenance_contant(connection, datasets_query, PREFIXES)

In [22]:
(~author_info.drop_duplicates("dataset").isnull()).mean()

dataset       1.0000
publisher     0.3480
provenance    0.3804
contact       0.8432
dtype: float64