In [1]:
import polars as pl
import os
import get_ca_data as get_ca # functions for retrieving CA \ common data


In [2]:
# Lookup table returned by the project helper for 2023 (presumably the
# reference year - confirm in get_ca_data); inc_ns=True asks the helper to
# include North Somerset (NS) as well.
ca_la_df = get_ca.get_ca_la_df(
    2023,
    inc_ns=True,
)
# The 'ladcd' column of that lookup (North Somerset included); it is passed
# on later as the list of codes to match against data folder names.
la_list = ca_la_df["ladcd"]

Test the lazy query to understand what's included

In [5]:
# Single certificates file (Hartlepool folder) used for trying out the lazy query.
file_path = (
    'data/all-domestic-certificates/'
    'domestic-E06000001-Hartlepool/certificates.csv'
)

In [14]:
# Column name -> Polars dtype for the non-domestic certificate files.
# The keys double as the list of columns to read, so their order is the
# order in which the columns are selected.
cols_schema_nondom = {
    # identifiers and location
    "LMK_KEY": pl.Utf8,
    "POSTCODE": pl.Utf8,
    "BUILDING_REFERENCE_NUMBER": pl.Int64,
    # rating
    "ASSET_RATING": pl.Int64,
    "ASSET_RATING_BAND": pl.Utf8,
    # descriptive attributes
    "PROPERTY_TYPE": pl.Utf8,
    "LOCAL_AUTHORITY": pl.Utf8,
    "CONSTITUENCY": pl.Utf8,
    "TRANSACTION_TYPE": pl.Utf8,
    # emissions figures
    "STANDARD_EMISSIONS": pl.Float64,
    "TYPICAL_EMISSIONS": pl.Float64,
    "TARGET_EMISSIONS": pl.Float64,
    "BUILDING_EMISSIONS": pl.Float64,
    "BUILDING_LEVEL": pl.Int64,
    "RENEWABLE_SOURCES": pl.Utf8,
    # columns the ingest step sorts and groups on
    "LODGEMENT_DATETIME": pl.Datetime,
    "UPRN": pl.Int64,
}

In [4]:
# Column name -> Polars dtype for the domestic certificate files.
# The keys double as the list of columns to read, so their order is the
# order in which the columns are selected.
cols_schema_dom = {
    "LMK_KEY": pl.Utf8,
    "POSTCODE": pl.Utf8,
    "CURRENT_ENERGY_RATING": pl.Utf8,
    "LOCAL_AUTHORITY": pl.Utf8,
    "PROPERTY_TYPE": pl.Utf8,
    "LODGEMENT_DATETIME": pl.Datetime,  # sort key in the ingest step
    "TRANSACTION_TYPE": pl.Utf8,
    "ENVIRONMENT_IMPACT_CURRENT": pl.Int64,
    "CO2_EMISSIONS_CURRENT": pl.Float64,
    "TENURE": pl.Utf8,
    "UPRN": pl.Int64,  # grouping key in the ingest step
}

In [6]:
def ingest_certs(la_list: list, cols_schema: dict, root_dir: str) -> "pl.DataFrame":
    """
    Build one DataFrame of EPC certificates from per-area CSV folders.

    Every sub-folder of ``root_dir`` whose name contains one of the entries
    of ``la_list`` is checked for a ``certificates.csv``. Each file found is
    scanned lazily, restricted to the columns named in ``cols_schema``,
    sorted by ``UPRN`` then ``LODGEMENT_DATETIME`` and reduced to the last
    row per ``UPRN``. The per-file results are concatenated.

    Parameters
    ----------
    la_list : list
        Strings to look for inside folder names (substring match). Any
        iterable of strings works, e.g. a Polars Series.
    cols_schema : dict
        Maps column name -> Polars dtype. The keys select which columns are
        read; the values are passed to ``pl.scan_csv`` as dtypes. Must
        contain ``'UPRN'`` and ``'LODGEMENT_DATETIME'``, which the query
        sorts and groups on.
    root_dir : str
        Directory holding one sub-folder per area.

    Returns
    -------
    pl.DataFrame
        One row per ``UPRN`` per ingested file.

    Raises
    ------
    ValueError
        If no matching ``certificates.csv`` is found, so there is nothing
        to concatenate.
    """
    cols_select_list = list(cols_schema.keys())

    # The directory listing does not depend on the item being matched, so
    # read it once instead of once per entry in la_list.
    folder_names = os.listdir(root_dir)

    all_dataframes = []
    for item in la_list:
        for folder_name in folder_names:
            # Substring match: the folder name only has to contain the item.
            if item not in folder_name:
                continue
            file_path = os.path.join(root_dir, folder_name, "certificates.csv")
            # Skip matching folders that do not actually hold the CSV.
            if not os.path.exists(file_path):
                continue
            # Lazy query: only the selected columns are needed, which lets
            # Polars avoid materialising the rest of each file.
            # NOTE(review): recent Polars releases deprecate `dtypes` in
            # favour of `schema_overrides` - check the installed version.
            q = (
                pl.scan_csv(file_path, dtypes=cols_schema)
                .select(pl.col(cols_select_list))
                # Sort so that last() keeps the latest lodgement per UPRN.
                .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
                # NOTE(review): rows with a null UPRN appear to end up in a
                # single group, i.e. one surviving row per file - confirm
                # that this is intended.
                .group_by('UPRN').last()
            )
            # Each file is collected on its own and gathered for the concat.
            all_dataframes.append(q.collect())

    if not all_dataframes:
        # pl.concat cannot combine an empty list; fail with a message that
        # says what was being looked for and where.
        raise ValueError(
            f"No certificates.csv found under {root_dir!r} "
            "for any entry in la_list"
        )

    # Concatenate the per-file frames into one consolidated DataFrame.
    return pl.concat(all_dataframes)

In [7]:
# Ingest the domestic certificates for every entry in la_list, reading only
# the columns listed in cols_schema_dom.
dom_certs_df = ingest_certs(
    la_list=la_list,
    cols_schema=cols_schema_dom,
    root_dir='data/all-domestic-certificates/',
)

In [15]:
# Ingest the non-domestic certificates for every entry in la_list, reading
# only the columns listed in cols_schema_nondom.
nondom_certs = ingest_certs(
    la_list=la_list,
    cols_schema=cols_schema_nondom,
    root_dir='data/all-non-domestic-certificates/',
)

In [16]:
# Quick look at the non-domestic frame: row/column counts plus each column's dtype and first values.
nondom_certs.glimpse()

Rows: 160684
Columns: 17
$ UPRN                               <i64> None, 10001240792, 10001240793, 10001240855, 10001240878, 10001240879, 10001240935, 10001240937, 10001241269, 10001241306
$ LMK_KEY                            <str> '987452fe0c52d188f32ec4c6ea2b17225abd4e7e3658086c173267a59e5e184f', 'ffb12f5ee354363af759b2fb244c27889e121ed19ca6f08f3b6f39ed8b477179', '3387d6d899e4a5a1783ebb5b264e560568070ea39eb557bf18568d7cfbd02c82', '87637770952014011609163407009139', 'bb6f0769a6f508b6fde6eb41fe032bf904b830b55261731a9563386e1c2df1a0', '144594841032019042611130857000497', '4fa1809006824d509debfedbf0203725df4c4f3de7e33f1a5491f08d695bc311', '0443186cc593abd547c9d3e954712af9f45df3e22984eb76c643f714b2f11d6c', '63020028052012042010543409009768', '0d6fcf659ff3badad4c0c131b86a7f76adff28dba92b65da92e38c1e51419f90'
$ POSTCODE                           <str> 'BL3 6JT', 'BL6 6QQ', 'BL6 6QQ', 'BL5 3XJ', 'BL5 3XH', 'BL5 3XH', 'BL3 6BT', 'BL3 6DH', 'BL1 4DH', 'BL6 6SY'
$ BUILDING_REFERENCE_NUMBER    