# 1_Create_metadata_for_epc_data

# setup

In [1]:
import csvw_functions
import csv
import json
import rdflib

## functions

### get_glossary_info

In [2]:
with open('glossary_epc_domestic.txt') as f: glossary_text=f.read()

def get_glossary_info(col_name, st=glossary_text):
    ""
    a=st.find(col_name)
    b=st.find('\n',a+1)
    c=st.find('\n',b+1)
    
    variable_string=st[a+len(col_name):b].strip()
    
    description=st[b:c].strip()
    
    if not (variable_string.startswith('VARCHAR') 
            or variable_string.startswith('INT')
            or variable_string.startswith('DATE') 
            or variable_string.startswith('DECIMAL')
            or variable_string.startswith('DATETIME')
            or variable_string==''
           ):
    
        return get_glossary_info(col_name, st=st[a+len(col_name):])
    
    return col_name, variable_string, description

In [3]:
# test for single heading
get_glossary_info('LMK_KEY')

('LMK_KEY',
 'VARCHAR 64',
 'Individual lodgement identifier. Guaranteed to be unique and can be used to identify a certificate in the downloads and the API.')

In [4]:
# test for all headings
with open('certificates.csv') as f: 
    csvreader=csv.reader(f)
    headings=next(iter(csvreader))
for heading in headings:
    print(get_glossary_info(heading))

('LMK_KEY', 'VARCHAR 64', 'Individual lodgement identifier. Guaranteed to be unique and can be used to identify a certificate in the downloads and the API.')
('ADDRESS1', 'VARCHAR 84', 'First line of the address')
('ADDRESS2', 'VARCHAR 100', 'Second line of the address')
('ADDRESS3', 'VARCHAR 100', 'Third line of the address')
('POSTCODE', 'VARCHAR 8', 'The postcode of the property')
('BUILDING_REFERENCE_NUMBER', 'VARCHAR 12', 'Unique identifier for the property.')
('CURRENT_ENERGY_RATING', 'VARCHAR 8', "Current energy rating converted into a linear 'A to G' rating (where A is the most energy efficient and G is the least energy efficient)")
('POTENTIAL_ENERGY_RATING', 'VARCHAR 8', "Estimated potential energy rating converted into a linear 'A to G' rating (where A is the most energy efficient and G is the least energy efficient)")
('CURRENT_ENERGY_EFFICIENCY', 'INT', 'Based on cost of energy, i.e. energy required for space heating, water heating and lighting [in kWh/year] multiplied by 

### get_datatype

In [5]:
def get_datatype(variable_string):
    ""
    if variable_string=='':
        return 'string'
    elif variable_string.startswith('VARCHAR'):
        return 'string'
    elif variable_string.startswith('INT'):
        return 'integer'
    elif variable_string.startswith('DECIMAL'):
        return 'decimal'
    elif variable_string.startswith('DATETIME'):
        return {'base':'datetime','format':'yyyy-MM-dd HH:mm:ss'}
    elif variable_string.startswith('DATE'):
        return 'date'
    else:
        raise Exception

In [6]:
# check for all headings
for heading in headings:
    _, variable_string, description=get_glossary_info(heading)
    print(heading, get_datatype(variable_string))

LMK_KEY string
ADDRESS1 string
ADDRESS2 string
ADDRESS3 string
POSTCODE string
BUILDING_REFERENCE_NUMBER string
CURRENT_ENERGY_RATING string
POTENTIAL_ENERGY_RATING string
CURRENT_ENERGY_EFFICIENCY integer
POTENTIAL_ENERGY_EFFICIENCY integer
PROPERTY_TYPE string
BUILT_FORM string
INSPECTION_DATE date
LOCAL_AUTHORITY string
CONSTITUENCY string
COUNTY string
LODGEMENT_DATE date
TRANSACTION_TYPE string
ENVIRONMENT_IMPACT_CURRENT integer
ENVIRONMENT_IMPACT_POTENTIAL integer
ENERGY_CONSUMPTION_CURRENT integer
ENERGY_CONSUMPTION_POTENTIAL integer
CO2_EMISSIONS_CURRENT decimal
CO2_EMISS_CURR_PER_FLOOR_AREA decimal
CO2_EMISSIONS_POTENTIAL decimal
LIGHTING_COST_CURRENT integer
LIGHTING_COST_POTENTIAL integer
HEATING_COST_CURRENT integer
HEATING_COST_POTENTIAL integer
HOT_WATER_COST_CURRENT integer
HOT_WATER_COST_POTENTIAL integer
TOTAL_FLOOR_AREA decimal
ENERGY_TARIFF string
MAINS_GAS_FLAG string
FLOOR_LEVEL string
FLAT_TOP_STOREY string
FLAT_STOREY_COUNT integer
MAIN_HEATING_CONTROLS string
MU

## certificates

In [7]:
# get embedded metadata from csv file
embedded_metadata_dict=csvw_functions.get_embedded_metadata('certificates.csv', relative_path=True)

# include additional information on columns
columns=embedded_metadata_dict['tableSchema']['columns']
for i in range(len(columns)):
    column=columns[i]
    aboutUrl='http://example.com/epc_data#Certificate_{LMK_KEY}'
    heading, variable_string, description=get_glossary_info(column['name'])
    column['datatype']=get_datatype(variable_string)
    column['dc:description']=description
    column['propertyUrl']='http://example.com/epc_vocab#%s' % column['name']
    column['aboutUrl']=aboutUrl

# change some datatypes to fix errors in the glossart
def change_datatype(col_name,new_datatype):
    for x in embedded_metadata_dict['tableSchema']['columns']:
        if x['name']==col_name:
            x['datatype']=new_datatype
change_datatype('MULTI_GLAZE_PROPORTION','decimal')
change_datatype('EXTENSION_COUNT','decimal')
change_datatype('NUMBER_HABITABLE_ROOMS','decimal')
change_datatype('NUMBER_HEATED_ROOMS','decimal')
change_datatype('PHOTO_SUPPLY','decimal')

# create schema metadata dict
schema_metadata_dict=embedded_metadata_dict['tableSchema']
schema_metadata_dict["@context"]="http://www.w3.org/ns/csvw"
schema_metadata_dict["@type"]="Schema"

    
with open('certificates-schema-metadata.json','w') as f:
    json.dump(schema_metadata_dict,f,indent=4)
    
schema_metadata_dict

{'columns': [{'titles': {'und': ['LMK_KEY']},
   'name': 'LMK_KEY',
   'datatype': 'string',
   'dc:description': 'Individual lodgement identifier. Guaranteed to be unique and can be used to identify a certificate in the downloads and the API.',
   'propertyUrl': 'http://example.com/epc_vocab#LMK_KEY',
   'aboutUrl': 'http://example.com/epc_data#Certificate_{LMK_KEY}'},
  {'titles': {'und': ['ADDRESS1']},
   'name': 'ADDRESS1',
   'datatype': 'string',
   'dc:description': 'First line of the address',
   'propertyUrl': 'http://example.com/epc_vocab#ADDRESS1',
   'aboutUrl': 'http://example.com/epc_data#Certificate_{LMK_KEY}'},
  {'titles': {'und': ['ADDRESS2']},
   'name': 'ADDRESS2',
   'datatype': 'string',
   'dc:description': 'Second line of the address',
   'propertyUrl': 'http://example.com/epc_vocab#ADDRESS2',
   'aboutUrl': 'http://example.com/epc_data#Certificate_{LMK_KEY}'},
  {'titles': {'und': ['ADDRESS3']},
   'name': 'ADDRESS3',
   'datatype': 'string',
   'dc:description

In [8]:
annotated_table_group_dict=csvw_functions.create_annotated_table_group('certificates-metadata.json')
csvw_functions.display_annotated_table_group_dict(annotated_table_group_dict)

{'id': None,
 'notes': False,
 'tables': [{'columns': [{'table': '__recursion__',
     'number': 1,
     'sourceNumber': 1,
     'name': 'LMK_KEY',
     'titles': [{'@value': 'LMK_KEY', '@language': 'und'}],
     'virtual': False,
     'suppressOutput': False,
     'datatype': {'base': 'string'},
     'default': '',
     'lang': 'und',
     'null': '',
     'ordered': False,
     'required': False,
     'separator': None,
     'textDirection': 'auto',
     'aboutURL': 'http://example.com/epc_data#Certificate_{LMK_KEY}',
     'propertyURL': 'http://example.com/epc_vocab#LMK_KEY',
     'valueURL': None,
     'cells': [{'table': '__recursion__',
       'column': '__recursion__',
       'row': '__recursion__',
       'stringValue': '63ff1a3a9c341ac8c4bc081160d572dfc98e7fa98dc5244d3620cd22ada45045',
       'value': {'@value': '63ff1a3a9c341ac8c4bc081160d572dfc98e7fa98dc5244d3620cd22ada45045',
        '@type': 'http://www.w3.org/2001/XMLSchema#string',
        '@language': 'und'},
       'er

In [9]:
csvw_functions.get_errors(annotated_table_group_dict)

[]

### convert to rdf

In [10]:
rdf_ntriples=csvw_functions.create_rdf(annotated_table_group_dict,mode='minimal')
g=rdflib.Graph().parse(data=rdf_ntriples, format='ntriples')
g.serialize('certificates.ttl',format='ttl')
g.bind('sosa',rdflib.SOSA)
g.bind('epc_data',rdflib.URIRef('http://example.com/epc_data#'))
g.bind('epc_vocab',rdflib.URIRef('http://example.com/epc_vocab#'))
print(g.serialize(format='ttl'))

@prefix epc_data: <http://example.com/epc_data#> .
@prefix ns1: <http://example.com/epc_vocab#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

epc_data:Certificate_63ff1a3a9c341ac8c4bc081160d572dfc98e7fa98dc5244d3620cd22ada45045 ns1:ADDRESS "__REMOVED__"^^xsd:string ;
    ns1:ADDRESS1 "__REMOVED__"^^xsd:string ;
    ns1:ADDRESS2 "__REMOVED__"^^xsd:string ;
    ns1:ADDRESS3 "__REMOVED__"^^xsd:string ;
    ns1:BUILDING_REFERENCE_NUMBER "10002887813"^^xsd:string ;
    ns1:BUILT_FORM "Semi-Detached"^^xsd:string ;
    ns1:CO2_EMISSIONS_CURRENT 2.9 ;
    ns1:CO2_EMISSIONS_POTENTIAL 0.9 ;
    ns1:CO2_EMISS_CURR_PER_FLOOR_AREA 38.0 ;
    ns1:CONSTITUENCY "E14000625"^^xsd:string ;
    ns1:CONSTITUENCY_LABEL "Charnwood"^^xsd:string ;
    ns1:CONSTRUCTION_AGE_BAND "England and Wales: 1900-1929"^^xsd:string ;
    ns1:COUNTY "Leicestershire"^^xsd:string ;
    ns1:CURRENT_ENERGY_EFFICIENCY 66 ;
    ns1:CURRENT_ENERGY_RATING "D"^^xsd:string ;
    ns1:ENERGY_CONSUMPTION_CURRENT 239 ;
    ns1:EN