# Create_epc_vocab

This notebook creates an RDF vocabulary for use with the Energy Performance Certificate (EPC) data.

The vocabulary is saved in two formats: RDF (Turtle, `.ttl`) and CSVW (a `.csv` file plus its `-metadata.json` file).

**TO DO - The vocab for the recommendations.csv files.**


## Setup

In [24]:
import csv
import json
import rdflib
import csvw_functions

## Functions

## get_glossary_info

In [25]:
# Load the EPC glossary text once; get_glossary_info searches it by default.
with open('glossary_epc_domestic.txt') as f:
    glossary_text = f.read()

def get_glossary_info(col_name, st=None):
    """Look up a column heading in the glossary text.

    The glossary is searched for an occurrence of ``col_name`` whose line
    continues with a recognised type string (``VARCHAR``, ``INT``, ``DATE``,
    ``DATETIME`` or ``DECIMAL``) or with nothing at all. The line after that
    occurrence is taken as the description. Occurrences followed by anything
    else (for example ``ADDRESS`` found inside ``ADDRESS1``, or a heading
    mentioned in the middle of a description) are skipped.

    Parameters
    ----------
    col_name : str
        The column heading to look up.
    st : str, optional
        The glossary text to search. Defaults to the module-level
        ``glossary_text``, resolved at call time.

    Returns
    -------
    tuple of (str, str, str)
        ``(col_name, variable_string, description)``. ``variable_string`` is
        the text following the heading on its line (may be empty).
        ``('', '')`` is returned for the last two items when the heading has
        no acceptable occurrence in the glossary.
    """
    if st is None:
        st = glossary_text

    # 'DATE' also covers 'DATETIME', so it does not need its own entry.
    type_prefixes = ('VARCHAR', 'INT', 'DATE', 'DECIMAL')

    # An empty heading would match at every position and never advance.
    if not col_name:
        return col_name, '', ''

    search_from = 0
    while True:
        name_start = st.find(col_name, search_from)
        if name_start == -1:
            # Heading not in the glossary: report "no information" rather
            # than slicing unrelated text with a -1 index.
            return col_name, '', ''

        name_end = name_start + len(col_name)

        # End of the heading's line; the text may end without a newline.
        line_end = st.find('\n', name_end)
        if line_end == -1:
            line_end = len(st)

        variable_string = st[name_end:line_end].strip()

        if variable_string == '' or variable_string.startswith(type_prefixes):
            # The description is the line that follows the heading's line.
            description_end = st.find('\n', line_end + 1)
            if description_end == -1:
                description_end = len(st)
            description = st[line_end:description_end].strip()
            return col_name, variable_string, description

        # Not a definition line: keep searching after this occurrence.
        search_from = name_end

In [26]:
# Spot check: glossary lookup for one column heading
get_glossary_info(col_name='LMK_KEY')

('LMK_KEY',
 'VARCHAR 64',
 'Individual lodgement identifier. Guaranteed to be unique and can be used to identify a certificate in the downloads and the API.')

In [27]:
# Read the column headings from the first row of the certificates file, then
# check that every heading can be looked up in the glossary.
# newline='' is how the csv module expects files given to csv.reader to be opened.
with open('certificates.csv', newline='') as f:
    csvreader=csv.reader(f)
    headings=next(csvreader)  # csv.reader is already an iterator
for heading in headings:
    print(get_glossary_info(heading))

('LMK_KEY', 'VARCHAR 64', 'Individual lodgement identifier. Guaranteed to be unique and can be used to identify a certificate in the downloads and the API.')
('ADDRESS1', 'VARCHAR 84', 'First line of the address')
('ADDRESS2', 'VARCHAR 100', 'Second line of the address')
('ADDRESS3', 'VARCHAR 100', 'Third line of the address')
('POSTCODE', 'VARCHAR 8', 'The postcode of the property')
('BUILDING_REFERENCE_NUMBER', 'VARCHAR 12', 'Unique identifier for the property.')
('CURRENT_ENERGY_RATING', 'VARCHAR 8', "Current energy rating converted into a linear 'A to G' rating (where A is the most energy efficient and G is the least energy efficient)")
('POTENTIAL_ENERGY_RATING', 'VARCHAR 8', "Estimated potential energy rating converted into a linear 'A to G' rating (where A is the most energy efficient and G is the least energy efficient)")
('CURRENT_ENERGY_EFFICIENCY', 'INT', 'Based on cost of energy, i.e. energy required for space heating, water heating and lighting [in kWh/year] multiplied by 

## get_datatype

In [28]:
def get_datatype(variable_string):
    """Map a glossary type string to a CSVW datatype.

    Parameters
    ----------
    variable_string : str
        The type string as returned by ``get_glossary_info``, e.g.
        ``'VARCHAR 64'``, ``'INT'`` or ``''``.

    Returns
    -------
    str or dict
        ``'string'``, ``'integer'``, ``'decimal'`` or ``'date'``; for
        ``DATETIME`` a dict with ``'base'`` and ``'format'`` keys. An empty
        type string is treated as ``'string'``.

    Raises
    ------
    ValueError
        If the type string does not start with a recognised prefix.
    """
    if variable_string == '':
        return 'string'

    # Order matters: 'DATETIME' must be tested before its prefix 'DATE'.
    # The table is rebuilt on each call so callers get a fresh dict.
    prefix_to_datatype = (
        ('VARCHAR', 'string'),
        ('INT', 'integer'),
        ('DECIMAL', 'decimal'),
        ('DATETIME', {'base':'datetime','format':'yyyy-MM-dd HH:mm:ss'}),
        ('DATE', 'date'),
    )
    for prefix, datatype in prefix_to_datatype:
        if variable_string.startswith(prefix):
            return datatype

    raise ValueError(f'Unrecognised EPC datatype string: {variable_string!r}')

In [29]:
# Check that the glossary type string of every heading maps to a CSVW datatype
# (get_datatype raises for a type string it does not recognise).
for heading in headings:
    # Only the type string (the middle element) is used in this check.
    _, variable_string, description=get_glossary_info(heading)
    print(heading, get_datatype(variable_string))

LMK_KEY string
ADDRESS1 string
ADDRESS2 string
ADDRESS3 string
POSTCODE string
BUILDING_REFERENCE_NUMBER string
CURRENT_ENERGY_RATING string
POTENTIAL_ENERGY_RATING string
CURRENT_ENERGY_EFFICIENCY integer
POTENTIAL_ENERGY_EFFICIENCY integer
PROPERTY_TYPE string
BUILT_FORM string
INSPECTION_DATE date
LOCAL_AUTHORITY string
CONSTITUENCY string
COUNTY string
LODGEMENT_DATE date
TRANSACTION_TYPE string
ENVIRONMENT_IMPACT_CURRENT integer
ENVIRONMENT_IMPACT_POTENTIAL integer
ENERGY_CONSUMPTION_CURRENT integer
ENERGY_CONSUMPTION_POTENTIAL integer
CO2_EMISSIONS_CURRENT decimal
CO2_EMISS_CURR_PER_FLOOR_AREA decimal
CO2_EMISSIONS_POTENTIAL decimal
LIGHTING_COST_CURRENT integer
LIGHTING_COST_POTENTIAL integer
HEATING_COST_CURRENT integer
HEATING_COST_POTENTIAL integer
HOT_WATER_COST_CURRENT integer
HOT_WATER_COST_POTENTIAL integer
TOTAL_FLOOR_AREA decimal
ENERGY_TARIFF string
MAINS_GAS_FLAG string
FLOOR_LEVEL string
FLAT_TOP_STOREY string
FLAT_STOREY_COUNT integer
MAIN_HEATING_CONTROLS string
MU

## Create csv file

This CSV file holds the EPC vocabulary, one row per term. It is converted to RDF using the CSVW metadata file created in the next section.

In [30]:
# Headings whose CSVW datatype is forced to 'decimal' instead of the datatype
# derived from the glossary type string.
# NOTE(review): presumably the real data in these columns does not match the
# glossary type - confirm against the certificates data.
cleaned_datatypes={
    'MULTI_GLAZE_PROPORTION':'decimal',
    'EXTENSION_COUNT':'decimal',
    'NUMBER_HABITABLE_ROOMS':'decimal',
    'NUMBER_HEATED_ROOMS':'decimal',
    'PHOTO_SUPPLY':'decimal'
}

# Write UTF-8 explicitly: CSVW processors assume UTF-8 unless the metadata
# says otherwise, so the platform default encoding must not be relied on.
with open('epc_vocab.csv','w',newline='',encoding='utf-8') as f:

    csvwriter=csv.writer(f)

    # Header row
    csvwriter.writerow(
        ['id',
         'label',
         'comment',
         'original_epc_datatype',
         'csvw_datatype'
        ])

    # The two properties defined by the vocabulary itself. Their datatype
    # cells are left empty; both are written out so that every row has the
    # same five fields as the header.

    # epc_vocab:original_epc_datatype
    csvwriter.writerow(
        ['original_epc_datatype',
         'original_epc_datatype',
         'The datatype of a EPC variable, as given in the EPC guidance notes.',
         '',
         ''
        ])

    # epc_vocab:csvw_datatype
    csvwriter.writerow(
        ['csvw_datatype',
         'csvw_datatype',
         'The CSVW datatype of a epc_vocab Property.',
         '',
         ''
        ])

    # One row per column heading of the certificates file
    for heading in headings:

        heading, variable_string, description=get_glossary_info(heading)

        # get_datatype is always called, so an unrecognised type string
        # still raises even for headings that have an override.
        datatype=cleaned_datatypes.get(heading, get_datatype(variable_string))

        # NOTE(review): for DATETIME headings get_datatype returns a dict,
        # which csv.writer writes as its Python repr - confirm that readers
        # of this column expect that form.
        csvwriter.writerow(
            [heading,
             heading,
             description,
             variable_string,
             datatype
            ])
    

## Create metadata file

In [31]:
# Every term of the vocabulary lives in this namespace; the subject of each
# row is the namespace followed by the row's id cell.
vocab_namespace = 'http://purl.org/berg/epc_vocab#'
subject_template = vocab_namespace + '{id}'

# The id column produces no output of its own (suppressOutput); its value
# fills the {id} placeholder of the aboutUrl templates below.
metadata_columns = [
    {'name': 'id', 'suppressOutput': True, 'titles': 'id'},
]

# Columns that each become one property of the row's subject.
for column_name, property_url in [
    ('label', 'rdfs:label'),
    ('comment', 'rdfs:comment'),
    ('original_epc_datatype', vocab_namespace + 'original_epc_datatype'),
    ('csvw_datatype', vocab_namespace + 'csvw_datatype'),
]:
    metadata_columns.append({
        'name': column_name,
        'titles': column_name,
        'aboutUrl': subject_template,
        'propertyUrl': property_url,
    })

# Virtual column (no matching column in the CSV file): declares the subject
# of every row to be an rdf:Property.
metadata_columns.append({
    'name': 'type',
    'virtual': True,
    'aboutUrl': subject_template,
    'propertyUrl': 'rdf:type',
    'valueUrl': 'rdf:Property',
})

metadata_table_dict = {
    "@context": "http://www.w3.org/ns/csvw",
    "@type": "Table",
    'url': 'epc_vocab.csv',
    'tableSchema': {'columns': metadata_columns},
}

# Save the metadata next to the CSV file it describes.
with open('epc_vocab.csv-metadata.json', 'w') as f:
    json.dump(metadata_table_dict, f, indent=4)

## Create rdf file

In [32]:
# Process the CSVW metadata file (and, presumably, the CSV file it points to)
# with the csvw_functions helpers, then show the result and any errors.
# NOTE(review): csvw_functions is not defined in this notebook - confirm the
# exact behaviour of these helpers against its documentation.
annotated_table_group_dict=csvw_functions.create_annotated_table_group('epc_vocab.csv-metadata.json')
csvw_functions.display_annotated_table_group_dict(annotated_table_group_dict)
print('errors: ',csvw_functions.get_errors(annotated_table_group_dict))
# Convert to RDF; the returned text is parsed as N-Triples below.
rdf_ntriples=csvw_functions.create_rdf(annotated_table_group_dict,mode='minimal')
#print(rdf_ntriples)
# Load the triples into an rdflib graph so that prefixes can be bound before
# the graph is serialized as Turtle.
g=rdflib.Graph().parse(data=rdf_ntriples, format='ntriples')
g.bind('sosa',rdflib.SOSA)
g.bind('epc_data',rdflib.URIRef('http://purl.org/berg/epc_data#'))
g.bind('epc_vocab',rdflib.URIRef('http://purl.org/berg/epc_vocab#'))
# Save the vocabulary as a Turtle file, then print the same serialization.
g.serialize('epc_vocab.ttl',format='ttl')
print(g.serialize(format='ttl'))

errors:  []
@prefix epc_vocab: <http://purl.org/berg/epc_vocab#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

epc_vocab:ADDRESS a rdf:Property ;
    rdfs:label "ADDRESS"^^xsd:string ;
    epc_vocab:csvw_datatype "string"^^xsd:string ;
    rdfs:comment "Field containing the concatenation of address1, address2 and address3. Note that post code is recorded separately."^^xsd:string .

epc_vocab:ADDRESS1 a rdf:Property ;
    rdfs:label "ADDRESS1"^^xsd:string ;
    epc_vocab:csvw_datatype "string"^^xsd:string ;
    epc_vocab:original_epc_datatype "VARCHAR 84"^^xsd:string ;
    rdfs:comment "First line of the address"^^xsd:string .

epc_vocab:ADDRESS2 a rdf:Property ;
    rdfs:label "ADDRESS2"^^xsd:string ;
    epc_vocab:csvw_datatype "string"^^xsd:string ;
    epc_vocab:original_epc_datatype "VARCHAR 100"^^xsd:string ;
    rdfs:comment "Second line of the address"^^xs