# Configuration

In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import SPARQL_to_df
from FAIR import findability, accessibility, interoperability, reusability
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Shared SPARQL client pointed at the data.europa.eu endpoint; every query
# helper in this notebook receives this object.
connection = SPARQLWrapper(
    endpoint="https://data.europa.eu/sparql",
)

In [4]:
# Namespace declarations passed alongside every query in this notebook.
# rdfs: and skos: are declared explicitly because the findability and
# interoperability queries use rdfs:label and skos:prefLabel; without them
# the queries only parse on endpoints that predefine those prefixes.
PREFIXES = """
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX void: <http://rdfs.org/ns/void#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
"""

## Sample query

In [4]:
# Distinct predicates (?p) found among a sample of dataset triples.
# The inner sub-select's LIMIT caps the number of (?d, ?p, ?o) rows at 5000,
# not the number of datasets, and there is no ORDER BY, so which rows are
# sampled is up to the endpoint.
query = """
    SELECT DISTINCT ?p WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
        ?d ?p ?o .
      }
      LIMIT 5000
    }
    """

# SPARQL_to_df comes from the local utils module (not shown here); presumably
# it runs the query with PREFIXES and returns a pandas DataFrame - confirm.
df = SPARQL_to_df(connection, query, PREFIXES)

In [5]:
# Preview the first rows of the predicate sample (column `p`).
df.head()

In [6]:
# Spot-check the first sampled predicate URI with the accessibility helper.
# .iloc[0] selects the first row by position, so the check still works if the
# frame's index does not contain the label 0 (e.g. after filtering/sorting).
accessibility.is_url_ok(df["p"].iloc[0])

In [7]:
# Keep only the predicates whose URI lies in the VoID namespace.
df.loc[df["p"].str.startswith("http://rdfs.org/ns/void#")]

No predicates from the VoID vocabulary (`http://rdfs.org/ns/void#`) appear in the sampled dataset triples.

In [8]:
# Select up to 5000 dcat:Dataset URIs as ?d. There is no ORDER BY, so which
# datasets are returned is up to the endpoint.
datasets_query = """
    SELECT ?d WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
      }
      LIMIT 5000
    }
    """

# distribution_links is defined in the FAIR.accessibility module (not shown);
# later cells use its result as a DataFrame with the columns dataset,
# distribution, accessURL and downloadURL.
distr_links = accessibility.distribution_links(connection, datasets_query, PREFIXES)

In [9]:
# Preview the first rows of the distribution-link table.
distr_links.head()

In [10]:
# Count the non-null access and download URLs per distribution, largest first.
distr_agg = (
    distr_links
    .groupby("distribution")
    .agg({"accessURL": "count", "downloadURL": "count"})
    .sort_values(["accessURL", "downloadURL"], ascending=False)
)
# Share of download URLs relative to access URLs. A zero accessURL count is
# replaced by 1 in the denominator (same guard as the per-dataset aggregation
# below) so the ratio never becomes inf/NaN.
distr_agg['ratio'] = distr_agg.downloadURL / distr_agg.accessURL.replace(0, 1)
distr_agg

In [11]:
# Per-dataset counts of non-null access URLs, download URLs and distributions,
# ordered by access URLs and then download URLs, both descending.
per_dataset_counts = {
    "accessURL": "count",
    "downloadURL": "count",
    "distribution": "count",
}
distr_agg = (
    distr_links
    .groupby("dataset")
    .agg(per_dataset_counts)
    .sort_values(["accessURL", "downloadURL"], ascending=False)
)
# Download-to-access ratio; a zero access count is treated as 1 in the
# denominator to avoid dividing by zero.
distr_agg["ratio"] = distr_agg["downloadURL"] / distr_agg["accessURL"].replace(0, 1)
distr_agg

In [12]:
# Datasets whose access-URL count differs from their distribution count.
distr_agg.loc[distr_agg["accessURL"].ne(distr_agg["distribution"])]

In [13]:
# Same dataset selection as for the distribution links: up to 5000
# dcat:Dataset URIs as ?d. Without ORDER BY the endpoint is free to return a
# different subset on each run.
datasets_query = """
    SELECT ?d WHERE {
        SELECT * WHERE {
        ?d a dcat:Dataset .
      }
      LIMIT 5000
    }
    """

# Helper from the FAIR.reusability module (not shown); the next cell uses its
# result as a DataFrame with a `dataset` column.
# NOTE(review): "contant" looks like a typo for "contact" in the helper's
# name - it is defined outside this notebook, so it is kept as-is here.
author_info = reusability.publisher_provenance_contant(connection, datasets_query, PREFIXES)

In [14]:
# Share of non-null values per column, using one row per dataset.
author_info.drop_duplicates("dataset").notnull().mean()

### Findability properties

In [18]:
# Up to 1500 rows of (dataset URI, publisher label) for datasets tagged with
# the ENVI data theme. The publisher label is read via rdfs:label or
# skos:prefLabel and kept only when it has no language tag or is English.
# The rdfs: and skos: prefixes must be resolvable by the endpoint or declared
# in the prefix block sent with the query.
findability_query = """
SELECT * WHERE{
    ?datasetURI a dcat:Dataset;
    dct:publisher/rdfs:label|dct:publisher/skos:prefLabel ?publisher;
    dcat:theme <http://publications.europa.eu/resource/authority/data-theme/ENVI>.
    FILTER (LANG(?publisher) = "" || LANG(?publisher) = "en") .
} LIMIT 1500
"""

In [19]:
# Helper from the FAIR.findability module (not shown). The following cells
# read the columns keywords, issuedDate, englishTitle, datasetType,
# datasetLocation and publisher from its result.
# NOTE: this rebinds `df`, replacing the predicate sample loaded earlier.
df = findability.findability_properties(connection, findability_query, PREFIXES)

In [20]:
# Store the keyword counts as 64-bit integers so they can be compared and
# averaged numerically below.
df["keywords"] = df["keywords"].astype("int64")

In [21]:
# Preview the first rows of the findability table.
df.head()

In [22]:
# Fraction of rows with no issuedDate value.
int(df["issuedDate"].isnull().sum()) / len(df)

In [23]:
# Fraction of rows whose englishTitle is the empty string.
int((df["englishTitle"] == "").sum()) / len(df)

In [24]:
# Fraction of rows with no datasetType value.
int(df["datasetType"].isnull().sum()) / len(df)

In [25]:
# Fraction of rows with a keyword count of zero.
int((df["keywords"] == 0).sum()) / len(df)

In [26]:
# Fraction of rows with no datasetLocation value.
int(df["datasetLocation"].isnull().sum()) / len(df)

In [12]:
# Mean keyword count per publisher label.
grouped_by_publisher = df.groupby("publisher")[["keywords"]].mean()

In [13]:
# Display the per-publisher mean keyword counts.
grouped_by_publisher

### Interoperability properties

In [5]:
# Up to 1500 rows of (dataset URI, publisher label) for datasets tagged with
# the ENVI data theme; the publisher label is read via rdfs:label or
# skos:prefLabel. Unlike the findability query there is no language filter,
# so a publisher with labels in several languages may yield several rows per
# dataset - confirm whether that is intended.
# The rdfs: and skos: prefixes must be resolvable by the endpoint or declared
# in the prefix block sent with the query.
interoperability_query = """
SELECT * WHERE{
    ?datasetURI a dcat:Dataset;
    dct:publisher/rdfs:label|dct:publisher/skos:prefLabel ?publisher;
    dcat:theme <http://publications.europa.eu/resource/authority/data-theme/ENVI>.
} LIMIT 1500
"""

In [6]:
# Helper from the FAIR.interoperability module (not shown). The following
# cells read the columns referencesNumber, datasetLanguage and accessRights
# from its result.
# NOTE: this rebinds `df`, replacing the findability table loaded earlier.
df = interoperability.interoperability_properties(connection, interoperability_query, PREFIXES)

In [7]:
# Preview the first rows of the interoperability table.
df.head()

In [9]:
# Store the reference counts as 64-bit integers so they can be binned and
# compared numerically below.
df["referencesNumber"] = df["referencesNumber"].astype("int64")

In [11]:
# Histogram of the referencesNumber column in 50 bins. The title and axis
# labels let the figure be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(df['referencesNumber'], bins=50)
ax.set(
    title="Distribution of referencesNumber",
    xlabel="referencesNumber",
    ylabel="Number of rows",
)
plt.show()

In [12]:
# Fraction of rows with a reference count of zero.
int((df["referencesNumber"] == 0).sum()) / len(df)

In [14]:
# Fraction of rows that have a datasetLanguage value.
int(df["datasetLanguage"].notnull().sum()) / len(df)

In [15]:
# Fraction of rows that have an accessRights value.
int(df["accessRights"].notnull().sum()) / len(df)