Import toolboxes, mapping files and RDF prefixes

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 02 10:42:14 2018

@author: rthomas

V6 updates:
    Added qb:codeList for eg:station_id and eg:statistic.
    Added qb:CodedProperty as type for eg:station_id and eg:statistic
    Changed rdfs:seeAlso to qb:concept for P07 terms linked to qb:MeasuredProperty(s)
"""
import datetime
import os
import pandas as pd
      
# Root URL of the repository that holds the mapping tables and the source data
working = 'https://raw.githubusercontent.com/robthomas-marine/ogibindertest/master/'

# Locations of the mapping tables, each built from the repository root
mapfile = working + 'mappings/measures.csv'               # vocab mappings for measured properties
stnfile = working + 'mappings/dimension_station_id.csv'   # details of the station dimension
statfile = working + 'mappings/dimension_statistic.csv'   # details of the statistic dimension
dsetfile = working + 'mappings/dsets2QB.csv'              # datasets to be QB'd

# Read each mapping table into its own DataFrame
mapping = pd.read_csv(mapfile)
stations = pd.read_csv(stnfile)
statistics = pd.read_csv(statfile)
datasets = pd.read_csv(dsetfile)

# Identifiers of the datasets to process, in table order
dset_ids = datasets['dset_id'].tolist()

#%% Set global variables
# Date of this run
now = datetime.date.today()

# Output location of the combined vocabulary file. `working` is an https URL,
# so the joined value is a URL-like string rather than a local filesystem path.
ttl_vocabs = os.path.join(working, 'rdf/MI_QB_vocabs.ttl')

# Construct RDF Data Cube
# Turtle @prefix declarations, kept as one string so the same block can be
# placed at the top of every Turtle document this script assembles.
# `eg:` is the namespace used for the cube components, `mi-vcb:` the one used
# for the code lists.
prefices = """@prefix rdf:      <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs:     <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl:      <http://www.w3.org/2002/07/owl#> .
@prefix xsd:      <http://www.w3.org/2001/XMLSchema#> .
@prefix skos:     <http://www.w3.org/2004/02/skos/core#> .
@prefix void:     <http://rdfs.org/ns/void#> .
@prefix dcat:     <http://www.w3.org/ns/dcat#> .
@prefix dct:      <http://purl.org/dc/terms/> .
@prefix foaf:     <http://xmlns.com/foaf/0.1/> .
@prefix org:      <http://www.w3.org/ns/org#> .
@prefix geo:      <http://www.opengis.net/ont/geosparql#> .
@prefix prov:     <http://www.w3.org/ns/prov#> .

@prefix qb:       <http://purl.org/linked-data/cube#> .

@prefix sdmx-concept:    <http://purl.org/linked-data/sdmx/2009/concept#> .
@prefix sdmx-dimension:  <http://purl.org/linked-data/sdmx/2009/dimension#> .
@prefix sdmx-attribute:  <http://purl.org/linked-data/sdmx/2009/attribute#> .
@prefix sdmx-measure:    <http://purl.org/linked-data/sdmx/2009/measure#> .
@prefix sdmx-metadata:   <http://purl.org/linked-data/sdmx/2009/metadata#> .
@prefix sdmx-code:       <http://purl.org/linked-data/sdmx/2009/code#> .
@prefix sdmx-subject:    <http://purl.org/linked-data/sdmx/2009/subject#> .

@prefix eg:              <http://data.marine.ie/datacube#> .
@prefix mi-vcb:          <http://data.marine.ie/vocab#> .

"""

Define the SKOS collections to be used by the Data Cubes.

In [None]:
# Define SKOS collections for the dataset
# Station code list: a code-list class (mi-vcb:Station_id), its concept scheme
# (mi-vcb:station_id) and one concept with a point geometry per station row.
# NOTE(review): the scheme's skos:notation below reads "Summary statistic",
# the same text used for the statistic scheme -- looks like a copy/paste slip;
# confirm the intended notation before changing the published value.
skos_station = """
mi-vcb:Station_id a rdfs:Class, owl:Class ;
\trdfs:subClassOf skos:Concept ;
\trdfs:label "Code list for Station codelist class"@en ;
\trdfs:comment "This code list provides the station details."@en ;
\trdfs:seeAlso mi-vcb:station_id .
    
mi-vcb:station_id a skos:ConceptScheme;
\tskos:prefLabel "Code list for stations in the data cube - codelist scheme"@en ;
\trdfs:label "Code list for stations in the data cube - codelist scheme"@en ;
\tskos:notation "Summary statistic"@en ;
\tskos:note "This code list provides the attributes for the stations in the data cube."@en ;
\tskos:definition "This code list provides the attributes for the stations in the data cube."@en ;
\trdfs:seeAlso mi-vcb:Station_id ;
"""

# One skos:member line per station, then the full stop that closes the scheme
skos_station += "".join(
    "\tskos:member mi-vcb:%s ;\n" % station_label
    for station_label in stations['Label']
)
skos_station += "\t.\n"

# Template for a single station concept plus its WKT point geometry.
# Placeholders, in order: label, name, name, label, label, lon, lat.
# NOTE(review): skos:definition is fixed placeholder text; the 'Definition'
# column of the station table is not used -- confirm whether it should be.
station_concept = """mi-vcb:%s a skos:Concept, mi-vcb:Station_id, geo:Feature ;
    \tskos:topConceptOf mi-vcb:station_id ;
    \tskos:prefLabel "%s"@en ;
    \tskos:notation "%s"@en ;
    \tskos:definition "Narative description of the site here"@en ;
    \tskos:inScheme mi-vcb:station_id ;
    \tgeo:hasGeometry mi-vcb:%s-geom .
mi-vcb:%s-geom geo:asWKT "POINT(%s %s)"^^geo:wktLiteral .\n\n"""

# Build all concept entries in one pass and append them with a single join
skos_station += "".join(
    station_concept % (label, name, name, label, label, lon, lat)
    for label, name, lat, lon in zip(
        stations['Label'], stations['Name'], stations['Lat'], stations['Lon']
    )
)

# Statistic code list: a code-list class (mi-vcb:Statistic), its concept scheme
# (mi-vcb:statistic) and one concept per row of the statistics table.
skos_statistic = """
mi-vcb:Statistic a rdfs:Class, owl:Class ;
\trdfs:subClassOf skos:Concept ;
\trdfs:label "Code list for Statistic codelist class"@en ;
\trdfs:comment "This  code list provides the summary statistic."@en ;
\trdfs:seeAlso mi-vcb:statistic .
    
mi-vcb:statistic a skos:ConceptScheme ;
\tskos:prefLabel "Code list for statistic codelist scheme"@en ;
\trdfs:label "Code list for statistic codelist scheme"@en ;
\tskos:notation "Summary statistic"@en ;
\tskos:note "This code list provides the summary statistics provided in the dataset."@en ;
\tskos:definition <http://vocab.nerc.ac.uk/collection/S07/current/> ;
\trdfs:seeAlso mi-vcb:Statistic ;\n"""

# One skos:member line per statistic, then the full stop that closes the scheme
skos_statistic += "".join(
    "\tskos:member mi-vcb:%s ;\n" % stat_notation
    for stat_notation in statistics['notation']
)
skos_statistic += "\t.\n\n"

# One concept entry per statistic. The link to the external term is optional:
# it is skipped when the table says 'not applicable' or the cell is empty.
stat_entries = []
for notation, prefLabel, definition, sameAs in zip(
        statistics['notation'], statistics['prefLabel'],
        statistics['definition'], statistics['sameAs']):
    entry = """mi-vcb:%s a skos:Concept, mi-vcb:Statistic ;
\tskos:topConceptOf mi-vcb:statistic ;
\tskos:prefLabel "%s"@en ;
\tskos:notation "%s"@en ;
\tskos:definition "%s"@en ;
""" % (notation, prefLabel, notation, definition)
    if pd.notna(sameAs) and sameAs != 'not applicable':
        # SKOS defines no `sameAs` property; skos:exactMatch is its mapping
        # property for equivalent concepts in different schemes.
        entry += "\tskos:exactMatch <%s> ;\n" % sameAs
    entry += "\tskos:inScheme mi-vcb:statistic ;\n\t.\n\n"
    stat_entries.append(entry)

skos_statistic += "".join(stat_entries)

Define the dimensions and measures of the Data Cubes

In [None]:
# Dimensions
# Turtle definitions of the three cube dimensions: station, date and statistic.
# For the two coded dimensions qb:codeList names the SKOS concept scheme while
# rdfs:range names the code-list class (mi-vcb:Station_id / mi-vcb:Statistic)
# that every concept of that scheme is typed with. Using the scheme itself as
# the range would type each dimension value as a skos:ConceptScheme.

dimensions = """# -- Dimensions and measures  ----------------------------
 
eg:station_id  a rdf:Property, qb:DimensionProperty, qb:CodedProperty ;
\trdfs:label "station_id"@en ;
\trdfs:subPropertyOf sdmx-dimension:refArea ;
\tqb:codeList mi-vcb:station_id ;
\trdfs:range mi-vcb:Station_id ;
\tqb:concept sdmx-concept:refArea ;
\t.

eg:Date  a rdf:Property, qb:DimensionProperty ;
\trdfs:label "Date"@en ;
\trdfs:subPropertyOf sdmx-dimension:refPeriod ;
\trdfs:range xsd:date ;
\tqb:concept sdmx-concept:refPeriod ;
\t.
    
eg:statistic  a rdf:Property, qb:DimensionProperty, qb:CodedProperty ;
\trdfs:label "statistic"@en ;
\trdfs:subPropertyOf sdmx-dimension:statConcDef ;
\tqb:codeList mi-vcb:statistic ;
\trdfs:range mi-vcb:Statistic ;
\tqb:concept sdmx-concept:statConcDef ;
\t.
"""

# Measures
# One qb:MeasureProperty definition per row of the measures mapping table.
# The P01, P07, P06 and Comment cells are optional: an empty cell is read by
# pandas as NaN, and its triple is left out rather than written as <nan> / "nan".
measure_entries = []
for local, p01, p07, p06, note in zip(
        mapping['Local'], mapping['P01'], mapping['P07'],
        mapping['P06'], mapping['Comment']):
    meas = str(local)
    lines = [
        'eg:%s  a rdf:Property, qb:MeasureProperty;\n' % meas,
        '    \trdfs:label "%s"@en ;\n' % meas,
    ]
    if pd.notna(p01):
        lines.append('\trdfs:seeAlso <%s> ;\n' % p01)
    if pd.notna(p07):
        lines.append('\tqb:concept <%s> ;\n' % p07)
    if pd.notna(note):
        lines.append('\trdfs:comment "%s"@en ;\n' % note)
    lines.append('    \trdfs:subPropertyOf sdmx-measure:obsValue ;\n')
    if pd.notna(p06):
        lines.append('    \tsdmx-attribute:unitMeasure <%s> ;\n' % p06)
    lines.append('    \trdfs:range xsd:decimal ;\n')
    lines.append('    \t.\n\n')
    measure_entries.append(''.join(lines))

measures = ''.join(measure_entries)

# Add unit labels and short names
# Distinct (P06, Unit_label, Unit_note) combinations from the measures mapping.
units = mapping[['P06', 'Unit_label', 'Unit_note']].reset_index(drop=True)
units = units.drop_duplicates()
# A row without a P06 value has no IRI to describe; keeping it would emit a
# bogus <nan> subject, so such rows are dropped.
units = units.dropna(subset=['P06'])

# One labelled entry per unit IRI
unit_text = "".join(
    '<%s> skos:prefLabel "%s"@en ;\n    \tskos:altLabel "%s"@en ;\n    \t.\n'
    % (P06, ulabel, unote)
    for P06, ulabel, unote in zip(
        units['P06'], units['Unit_label'], units['Unit_note']
    )
)

# Show the generated Turtle fragments for inspection
print(dimensions)
print(measures)
print(unit_text)
    

Define the slices available for the Data Cube

In [None]:
# Data Cube slices
# Turtle definition of the single slice key used by the cubes: a slice fixes
# the station and statistic dimensions (listed under qb:componentProperty).

slice_keys = """eg:sliceByStationID_statistic a qb:SliceKey;
\trdfs:label "Slice by station-statistic combination"@en;
\trdfs:comment "Slice grouping by station and statistic"@en;
\tqb:componentProperty eg:station_id, eg:statistic; 
\t.
"""
    

Combine the vocabulary, dimension, measure and slice-key triples into one file and save it. Do not run this cell from the mybinder example, as saving files is not possible there.

In [None]:
# Assemble the shared vocabulary document: prefixes first, then the code lists,
# the dimension and measure definitions, the unit labels and the slice key.
header = prefices + skos_station + skos_statistic + dimensions + measures + unit_text + slice_keys

# Turtle is UTF-8 by specification, so the encoding is set explicitly instead
# of relying on the platform default. The context manager closes the handle
# even when the write raises.
with open(ttl_vocabs, "w", encoding="utf-8") as file:
    file.write(header)

In [None]:
#%% Define mappings from vocabs for observations in RDF
# Station name -> prefixed name of the matching station concept
stn_map = {
    stn_name: 'mi-vcb:%s' % stn_label
    for stn_name, stn_label in zip(stations.Name.tolist(), stations.Label.tolist())
}

# Statistic notation with every 'daily_' substring removed -> prefixed name of
# the matching statistic concept
stat_map = {
    stat_code.replace('daily_', ''): 'mi-vcb:%s' % stat_code
    for stat_code in statistics.notation.tolist()
}

#%% Work through building each cube
# For every dataset id two cubes are produced: 'daily' (summary statistics) and
# 'avail' (data availability). Each cube yields one Turtle file of slices and
# observations per station, plus one header file holding the qb:DataSet
# description and its qb:DataStructureDefinition.
for typed in dset_ids:
    # Load data from files
    # NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 --
    # confirm the target pandas version before dropping the argument.
    # Load daily data from file
    dailyfile = os.path.join(working,'%s/%s_daily_summary.csv' % (typed,typed))
    daily = pd.read_csv(dailyfile, index_col=[0, 1], header = [0, 1], parse_dates=True, infer_datetime_format=True)
    # Load availability data from file
    availfile = os.path.join(working,'%s/%s_data_availability.csv' % (typed,typed))
    avail = pd.read_csv(availfile, index_col=[0, 1], header = [0, 1], parse_dates=True, infer_datetime_format=True)
    
    # Dictionary of data cubes to add to RDF
    cube_list = {'daily': daily, 'avail': avail}
    
    # Metadata for this dataset, selected once instead of once per field
    dset_meta = datasets[datasets.dset_id==typed]
    dset_title = dset_meta['title'].values[0]
    
    for dset_type, dset_data in cube_list.items():
        short = dset_meta['short'].values[0] + "_%s" % (dset_type)
        edmed = dset_meta['edmed'].values[0]
        publisher = dset_meta['publisher'].values[0]
        if dset_type == 'avail':
            title = dset_title + " - Daily availability of data"
            description = """Availability as a percentage by day of year for the %s measurements.""" % (dset_title)
            unitattach = "DataSet"
            
        elif dset_type == 'daily':
            title = dset_title + " - Daily Summary Statistics"
            description = """Summary statistics (daily mean, standard deviation, minimun and maximum) by day of year for the %s measurements.""" % (dset_title)
            unitattach = "MeasureProperty"
            
    #%% Data structure definition
            
        dset_definition_pt1 = """\neg:dsd-%s a qb:DataStructureDefinition; 
    # The dimensions
    \tqb:component [ a qb:ComponentSpecification ; qb:componentProperty eg:station_id ; qb:dimension eg:station_id; qb:order 1; qb:componentAttachment qb:Slice ],
    \t             [ a qb:ComponentSpecification ; qb:componentProperty eg:Date ; qb:dimension eg:Date; qb:order 2 ],
    \t             [ a qb:ComponentSpecification ; qb:componentProperty eg:statistic ; qb:dimension eg:statistic; qb:order 3; qb:componentAttachment qb:Slice ];
        
    # The measure(s)\n""" % (short)
        # One measure component per first-level column label of the cube
        measure_names = dset_data.columns.levels[0].tolist()
        dset_definition_pt1 = dset_definition_pt1 + "".join(
            """\tqb:component [ a qb:ComponentSpecification ; qb:componentProperty eg:%s ; qb:measure eg:%s];\n""" % (meas, meas)
            for meas in measure_names
        )
        
        dset_definition_pt2 =  """  
    # The attributes
    \tqb:component [ a qb:ComponentSpecification ; qb:componentProperty sdmx-attribute:unitMeasure ; qb:attribute sdmx-attribute:unitMeasure ; qb:componentAttachment qb:%s; ] ;

    # The slices
    \tqb:sliceKey eg:sliceByStationID_statistic ;
    .
    \n# Attribute and slice types
    sdmx-attribute:unitMeasure a qb:AttributeProperty .    
    eg:sliceByStationID_statistic a qb:SliceKey .    
    """ % (unitattach)
        
        dset_definition = dset_definition_pt1 + dset_definition_pt2
    
    #%% Loop through slices and observations
        # cell_tot only sets the zero-padding width of the observation ids.
        # NOTE(review): it counts measures x rows, while one observation is
        # written per statistic x row -- left as is so existing ids do not change.
        cell_tot = len(measure_names) * dset_data.shape[0]
        id_width = len(str(cell_tot))
        cell_num = 0
        idx = pd.IndexSlice
        
        slice_names = []
        slce = 0
        for stnid in dset_data.index.levels[0].tolist():
            # Text of this station's file, collected as parts and joined once
            obs_parts = ["""\n"""]
            for stat in dset_data.columns.levels[1].tolist():
                # Rows of this station, restricted to the columns of this statistic
                data = dset_data.loc[stnid].loc[:, idx[:, stat]]
                obs_head = """\n# Column %s %s
eg:%s_slice%s a qb:Slice;
\tqb:sliceStructure  eg:sliceByStationID_statistic ;
\teg:station_id        %s ;
\teg:statistic         %s ;
\tqb:observation """ % (short, slce, short, slce, stn_map[stnid], stat_map[stat])
            
                # The slice lists the ids of the observations written just below
                obs_ids = [
                    """eg:%s_o%s""" % (short, str(i+1).zfill(id_width))
                    for i in range(cell_num, cell_num+len(data))
                ]
                obs_parts.append(obs_head + ", ".join(obs_ids) + " ;    .\n")
                
                # Kept under its own name so the module-level `measures` string
                # built for the vocabulary file is not overwritten.
                data_cols = data.columns.tolist()
                for i in range(0, len(data.index)):
                    date_lit = "'%s'^^xsd:date" % (data.index[i].strftime('%Y-%m-%d'))
                    granule = ["""\neg:%s_o%s a qb:Observation;
\tqb:dataSet          eg:%s ;
\teg:Date             %s ;
\teg:station_id       %s ;
\teg:statistic        %s ;\n""" % (short, str(cell_num+1).zfill(id_width), short, date_lit, stn_map[stnid], stat_map[stat])]
    
                    row = data.iloc[i]
                    for col in data_cols:
                        value = row[col]
                        # Missing values are written as the sentinel -9999
                        if str(value) == 'nan':
                            value = -9999
                        # col is a (measure, statistic) pair; the measure names the property
                        granule.append("""\teg:%s        %s ;\n""" % (col[0], value))
                    if unitattach == "DataSet":
                        granule.append("""\tsdmx-attribute:unitMeasure <http://vocab.nerc.ac.uk/collection/P06/current/UPCT/> ;\n\t.\n""")
                    else:
                        granule.append("""\t.\n""")
                    obs_parts.append("".join(granule))
                    cell_num = cell_num + 1
                slice_names.append("eg:%s_slice%s" % (short, slce))
                slce = slce + 1
            
            # Write the station file once, after all of its statistics are
            # collected. UTF-8 is set explicitly because Turtle requires it, and
            # the context manager closes the handle even if the write fails.
            ttl_stn = os.path.join(working,'rdf/%s_%s_%s_datacube.ttl' % (typed, dset_type, stnid.replace(' ','_')))
            with open(ttl_stn, "w", encoding="utf-8") as file:
                file.write(prefices + "".join(obs_parts))
        slce_txt = ", ".join(slice_names) + " ;"
    
    #%% Data Set description    
        dset_desc = """eg:%s a qb:DataSet, prov:Entity ;
    dct:title       "%s"@en;
    rdfs:label      "%s"@en;
    rdfs:comment    "%s"@en;
    dct:description "%s"@en;
    dct:publisher   <http://edmo.seadatanet.org/%s>;
    dct:issued      "%s"^^xsd:date;
    prov:wasDerivedFrom <http://edmed.seadatanet.org/%s> ;
    dct:subject
    \t<http://purl.org/linked-data/sdmx/2009/subject#3.1> ,
    \t<http://purl.org/linked-data/sdmx/2009/subject#2.4.2> ,
    \t<http://vocab.nerc.ac.uk/collection/C19/current/SVX00015/> ;
    qb:structure eg:dsd-%s ;
    qb:slice %s
    .
    
<http://edmo.seadatanet.org/%s> a org:Organization, foaf:Agent;
    rdfs:label "Marine Institute"@en .
        
<http://edmed.seadatanet.org/%s> a dcat:Dataset, prov:Entity . """ % (short, title, title, description, description, publisher, now.strftime('%Y-%m-%d'), edmed, short, slce_txt, publisher, edmed)
    
    #%% Combine all RDF sections for a data cube header    
        dset_rdf = prefices + dset_desc + dset_definition
    
    #%% Save RDF to file as ttl
        ttl = os.path.join(working,'rdf/%s_%s_datacube_header.ttl' % (typed, dset_type))
        with open(ttl, "w", encoding="utf-8") as file:
            file.write(dset_rdf)