In [2]:
# use llm env - make sure to restart jupyter kernel. duckdb need python 3.10
import polars as pl
import duckdb
import get_ca_data as get_ca # functions for retrieving CA \ common data
from pathlib import Path

In [3]:
pl.DataFrame()
# get_ca.ingest_certs.

This notebook retrieves all the base data needed for comparison analysis with other Combined Authorities and loads it into a duckdb database.
SQLite was tried, but it is slow, not directly compatible with polars and does not work well with datasette because of the size of data.

In [4]:
ca_la_df = get_ca.get_ca_la_df(2023, inc_ns=True) # include NS
ca_la_df.glimpse()


Rows: 54
Columns: 4
$ ladcd   <str> E08000001, E08000002, E08000003, E08000004, E08000005, E08000006, E08000007, E08000008, E08000009, E08000010
$ ladnm   <str> Bolton, Bury, Manchester, Oldham, Rochdale, Salford, Stockport, Tameside, Trafford, Wigan
$ cauthcd <str> E47000001, E47000001, E47000001, E47000001, E47000001, E47000001, E47000001, E47000001, E47000001, E47000001
$ cauthnm <str> Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester, Greater Manchester



In [5]:
# retrieve the epc certs from the source csv s
la_list = (ca_la_df['ladcd']) #includes north somerset
f'There are {str(la_list.shape)[1:3]} Local Authorities in Combined Authorities'


'There are 54 Local Authorities in Combined Authorities'

Get the lookup table that relates DFT Local authority ID's in the Combined authorities to ONS LA codes

In [6]:
ca_la_dft_lookup_df = get_ca.get_ca_la_dft_lookup(
    dft_csv_path = 'https://storage.googleapis.com/dft-statistics/road-traffic/downloads/data-gov-uk/local_authority_traffic.csv',
    la_list = la_list)
ca_la_dft_lookup_df.glimpse()

Rows: 49
Columns: 3
$ dft_la_id <i64> 56, 62, 63, 75, 79, 84, 85, 86, 87, 88
$ ladcd     <str> E08000007, E08000018, E08000035, E08000010, E08000017, E08000015, E08000003, E08000002, E08000033, E08000036
$ year      <i64> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022



In [7]:
cols_schema_nondom = {
    'LMK_KEY': pl.Utf8,
    'POSTCODE': pl.Utf8,
    'BUILDING_REFERENCE_NUMBER': pl.Int64,
    'ASSET_RATING': pl.Int64,
    'ASSET_RATING_BAND': pl.Utf8,
    'PROPERTY_TYPE': pl.Utf8,
    'LOCAL_AUTHORITY': pl.Utf8,
    'CONSTITUENCY': pl.Utf8,
    'TRANSACTION_TYPE': pl.Utf8,
    'STANDARD_EMISSIONS': pl.Float64,
    'TYPICAL_EMISSIONS': pl.Float64,
    'TARGET_EMISSIONS': pl.Float64,
    'BUILDING_EMISSIONS': pl.Float64,
    'BUILDING_LEVEL': pl.Int64,
    'RENEWABLE_SOURCES': pl.Utf8,
    'LODGEMENT_DATETIME': pl.Utf8,
    'UPRN': pl.Utf8
    }

In [8]:

epc_non_domestic = get_ca.ingest_certs(la_list,
                                              cols_schema = cols_schema_nondom,
                                              root_dir = 'data/all-non-domestic-certificates')
epc_non_domestic.glimpse()

Rows: 160684
Columns: 17
$ UPRN                               <str> 10070926743, 200002544806, 100012552086, 10070921447, 10070929417, 200002560793, 100012555277, 100012562246, 200002559499, 100012561389
$ LMK_KEY                            <str> 125957590402017103112461189509700, 22707574052009090708511004000692, 8c225de542537f954b5eca17edcfa9025092ea349033c9547022fa54b527239c, 96442130542014102317572942609200, 154396610062020032416120001970160, 34614150032010042919154815000094, 16919593052009051119231403000989, 29e1dfa1becc02ea35249557037efecc19f79ad8ca041743faed058bee6ce61d, 135962200002018081509452114400980, 120366573032017031410360532000594
$ POSTCODE                           <str> BL5 1FH, BL1 2BZ, BL1 2HJ, BL5 3RZ, BL6 6GQ, BL3 6NR, BL1 7AF, BL6 7AX, BL1 8TU, BL5 3AZ
$ BUILDING_REFERENCE_NUMBER          <i64> 245049580000, 626444090000, 10001294401, 757153640000, 608615160000, 543507450000, 990433780000, 10001709451, 300340410000, 747250200000
$ ASSET_RATING                    

In [9]:
dates = ['2014-01-23 19:35:54', '2016-07-25 17:56:11', '2022-09-15 16:26:14']
index = [1, 2, 3]
date_df = pl.DataFrame({'dates':dates,
           'index': index})

date_df.with_columns(pl.col('dates').str.to_datetime(format='%Y-%m-%d %H:%M:%S'))

dates,index
datetime[μs],i64
2014-01-23 19:35:54,1
2016-07-25 17:56:11,2
2022-09-15 16:26:14,3


In [10]:
cols_schema_dom = {
                        'LMK_KEY':pl.Utf8,
                        'POSTCODE':pl.Utf8,
                        'CURRENT_ENERGY_RATING':pl.Utf8,
                        'LOCAL_AUTHORITY':pl.Utf8,
                        'PROPERTY_TYPE':pl.Utf8,
                        'LODGEMENT_DATETIME':pl.Utf8,
                        'TRANSACTION_TYPE': pl.Utf8,
                        'ENVIRONMENT_IMPACT_CURRENT':pl.Int64,
                        'CO2_EMISSIONS_CURRENT': pl.Float64,
                        'TENURE':pl.Utf8,
                        'UPRN':pl.Int64
                    }

In [11]:
epc_domestic = get_ca.ingest_certs(la_list = la_list,
                                   cols_schema = cols_schema_dom,
                                   root_dir = 'data/all-domestic-certificates')
epc_domestic.glimpse()


Rows: 4728852
Columns: 11
$ UPRN                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                             <str> 632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2, 8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c, 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505
$ POSTCODE                            <str> BL5 3WB, BL3 1PS, BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ CURRENT_ENERGY_RATING               <str> B, C, C, D, C, C, C, D, E, D
$ LOCAL_AUTHORITY                     <str> E0800000

In [12]:
# read from saved file
epc_clean = (pl.read_csv('data/epc_subset_polars_last.csv',
                          dtypes={'LODGEMENT_DATETIME': pl.Datetime}))
# epc_clean.glimpse()

In [13]:
epc_domestic_df = get_ca.wrangle_epc(epc_domestic)
epc_domestic_df.glimpse()

Rows: 4726616
Columns: 13
$ uprn                       <i64> 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716, 10001239717, 10001239718
$ lmk_key                    <str> 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505, 1006031609732013112914032764978690, de7a02fb12a4ef27399bd7054917b4fd2c166ac4a3d7da461f4f4b7876c8a664
$ postcode                   <str> BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ current_energy_rating      <str> C, D, C, C, C, D, E, D, D, C
$ local_authority            <str> E08000001, E08000001, E08000001, E08000001, E08000001, E08000001, E08000001

In [14]:
epc_non_domestic_df = get_ca.wrangle_epc(epc_non_domestic)
epc_non_domestic_df.glimpse()

Rows: 160550
Columns: 19
$ uprn                      <str> 10070926743, 200002544806, 100012552086, 10070921447, 10070929417, 200002560793, 100012555277, 100012562246, 200002559499, 100012561389
$ lmk_key                   <str> 125957590402017103112461189509700, 22707574052009090708511004000692, 8c225de542537f954b5eca17edcfa9025092ea349033c9547022fa54b527239c, 96442130542014102317572942609200, 154396610062020032416120001970160, 34614150032010042919154815000094, 16919593052009051119231403000989, 29e1dfa1becc02ea35249557037efecc19f79ad8ca041743faed058bee6ce61d, 135962200002018081509452114400980, 120366573032017031410360532000594
$ postcode                  <str> BL5 1FH, BL1 2BZ, BL1 2HJ, BL5 3RZ, BL6 6GQ, BL3 6NR, BL1 7AF, BL6 7AX, BL1 8TU, BL5 3AZ
$ building_reference_number <i64> 245049580000, 626444090000, 10001294401, 757153640000, 608615160000, 543507450000, 990433780000, 10001709451, 300340410000, 747250200000
$ asset_rating              <i64> 17, 133, 77, 83, 30, 61, 73, 64, 56,

In [15]:
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)
postcode_file = get_ca.get_zipped_csv_file(url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data",
                      file_folder_name = "postcode_lookup")
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

In [16]:
# this is hit by the 2000 record limit too so use DL file
# input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
#                       destination_directory = "data\\geojson")

In [17]:
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

In [18]:
url_imd_csv = "https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::index-of-multiple-deprivation-dec-2019-lookup-in-england.csv?where=1=1&outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"

In [19]:
imd_df_raw = pl.read_csv(url_imd_csv)

In [20]:
rename_dict_imd = get_ca.get_rename_dict(imd_df_raw, get_ca.remove_numbers, rm_numbers = True)

In [21]:
imd_df = (
    imd_df_raw
    .rename(rename_dict_imd)
    .select(pl.all().exclude('fid'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
)
n_unmatched_lsoas = len(ca_lsoa_codes) - imd_df.shape[0]

In [22]:
# https://geoportal.statistics.gov.uk/datasets/lsoa-dec-2021-pwc-for-england-and-wales/explore

In [23]:
# rename the LSOA features and return the path
cleaned_lsoa_pwc_path = get_ca.clean_lsoa_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107 (2).geojson',
                                                  lsoacd='LSOA21CD')

In [24]:
# filter for just the LSOA's in Combined Authorities
ca_lsoa_pwc_path = get_ca.filter_geojson(input_file = cleaned_lsoa_pwc_path,
                                         output_file='data/geojson/ca_lsoa_pwc.geojson',
                                         property_name ='lsoacd',
                                         ca_lsoa_codes = ca_lsoa_codes)

In [25]:
# reproject to WGS84:4326 as default from ONS is 27700
reproject_path = get_ca.reproject(ca_lsoa_pwc_path, output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson', lsoa_code = 'lsoacd')

Load the data into a duckDB data base

In [26]:
command_list = [
    'INSTALL spatial;',
    'LOAD spatial;',
    f'CREATE OR REPLACE TABLE lsoa_pwc_tbl AS SELECT * FROM ST_Read("{reproject_path}")',
    'CREATE UNIQUE INDEX lsoacd_idx ON lsoa_pwc_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_la_tbl AS SELECT * FROM ca_la_df',
    'CREATE OR REPLACE TABLE imd_tbl AS SELECT * FROM imd_df',
    'CREATE OR REPLACE TABLE postcodes_tbl AS SELECT * FROM postcodes_df',
    'CREATE UNIQUE INDEX postcode_idx ON postcodes_tbl (postcode)',
    'CREATE OR REPLACE TABLE epc_domestic_tbl AS SELECT * FROM epc_domestic_df',
    'CREATE UNIQUE INDEX uprn_idx ON epc_domestic_tbl (uprn)',
    'CREATE OR REPLACE TABLE epc_non_domestic_tbl AS SELECT * FROM epc_non_domestic_df',
    'CREATE UNIQUE INDEX uprn_nondom_idx ON epc_non_domestic_tbl (uprn)',
    'CREATE OR REPLACE TABLE ca_la_dft_lookup_tbl AS SELECT * FROM ca_la_dft_lookup_df',
    'CREATE UNIQUE INDEX ca_la_dft_lookup_idx ON ca_la_dft_lookup_tbl (ladcd)'
]

In [27]:
# con = duckdb.connect('data/ca_epc_test.duckdb')

# con.execute(command_list[5])

In [28]:
out = get_ca.load_data(command_list=command_list, db_path='data/ca_epc.duckdb', overwrite = True)

Introspect Database


In [29]:
con = duckdb.connect(out)

In [30]:
con.sql("SHOW ALL TABLES;")

┌──────────┬─────────┬──────────────────────┬──────────────────────┬───────────────────────────────────────┬───────────┐
│ database │ schema  │         name         │     column_names     │             column_types              │ temporary │
│ varchar  │ varchar │       varchar        │      varchar[]       │               varchar[]               │  boolean  │
├──────────┼─────────┼──────────────────────┼──────────────────────┼───────────────────────────────────────┼───────────┤
│ ca_epc   │ main    │ ca_la_dft_lookup_tbl │ [dft_la_id, ladcd,…  │ [BIGINT, VARCHAR, BIGINT]             │ false     │
│ ca_epc   │ main    │ ca_la_tbl            │ [ladcd, ladnm, cau…  │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR]  │ false     │
│ ca_epc   │ main    │ epc_domestic_tbl     │ [uprn, lmk_key, po…  │ [BIGINT, VARCHAR, VARCHAR, VARCHAR,…  │ false     │
│ ca_epc   │ main    │ epc_non_domestic_tbl │ [uprn, lmk_key, po…  │ [VARCHAR, VARCHAR, VARCHAR, BIGINT,…  │ false     │
│ ca_epc   │ main    │ imd_tbl  

In [31]:
con.sql('DESCRIBE epc_domestic_tbl')

┌────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│        column_name         │ column_type │  null   │   key   │ default │ extra │
│          varchar           │   varchar   │ varchar │ varchar │ varchar │ int32 │
├────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ uprn                       │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ lmk_key                    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ postcode                   │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ current_energy_rating      │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ local_authority            │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ property_type              │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ transaction_type           │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ environment_impact_current │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ co

In [32]:
con.sql('DESCRIBE epc_non_domestic_tbl')

┌───────────────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│        column_name        │ column_type │  null   │   key   │ default │ extra │
│          varchar          │   varchar   │ varchar │ varchar │ varchar │ int32 │
├───────────────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ uprn                      │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ lmk_key                   │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ postcode                  │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ building_reference_number │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ asset_rating              │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ asset_rating_band         │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ property_type             │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ local_authority           │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ constituency  

In [33]:
con.sql('SELECT COUNT(*) num_rows FROM lsoa_pwc_tbl')

┌──────────┐
│ num_rows │
│  int64   │
├──────────┤
│     9585 │
└──────────┘

In [34]:
con.close()

Section below is to load data into SQLite.

In [26]:
# dictionary of all the dfs to be imported to sqlite - for datasette
dfs_dict = {
    'ca_la_tbl':ca_la_df,
    'imd_tbl':imd_df,
    'postcodes_tbl':postcodes_df,
    'epc_domestic_tbl':epc_domestic_df,
    'epc_non_domestic_tbl': epc_non_domestic_df,
    'ca_la_dft_lookup_tbl':ca_la_dft_lookup_df

}

In [4]:
# %pip install adbc_driver_sqlite

In [23]:
[df.write_csv(f'data/holding/{table_name}.csv') for table_name, df in dfs_dict.items()]

[None, None, None, None, None, None]

In [3]:
def import_dfs(folder_path: str = 'data/holding'):
    # Looping through each CSV file in the folder
    for file in Path(folder_path).glob('*.csv'):
        # Getting the stem (file name without extension) of the file
        stem = file.stem

        # Reading the CSV file into a DataFrame
        df = pl.read_csv(file)

        # Storing the DataFrame in the dictionary with the stem as the key
        globals()[stem] = df
    # return dataframes

In [6]:
import_dfs() # to save running the import routines

In [28]:
get_ca.populate_sqlite(dfs_dict, db_path='data/sqlite/ca_epc.db', overwrite=True)

In [10]:
get_ca.populate_sqlite(tables_dict, uri)

NameError: name 'ca_la_df' is not defined

InternalError: INTERNAL: [SQLite] Failed to create table: table "ca_la_tbl" already exists (executed 'CREATE TABLE main . "ca_la_tbl" ("ladcd" TEXT, "ladnm" TEXT, "cauthcd" TEXT, "cauthnm" TEXT)')

In [9]:
# population weighted centroids for each LSOA - has to be downloaded as there is no straightforward way to query the CA LSOA's from
# open online datasets

lsoa_bng_file_path = get_ca.filter_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson',
 output_file='data/geojson/ca_lsoa_pwc.geojson',
 property_name = 'LSOA21CD',
 ca_lsoa_codes = ca_lsoa_codes)

In [10]:
get_ca.reproject(input_bng_file='data/geojson/ca_lsoa_pwc.geojson',
                  output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson',
                    lsoa_code='LSOA21CD')

'data/geojson/ca_lsoa_pwc_wgs84.geojson'

In [9]:

db = Database('data/ca_epc.db', recreate = True)
db.close()

In [62]:
# %pip install geojson-to-sqlite

In [60]:
# import the lsoa PWC file as geojson to the DB setting the primary key to lsoacd
!geojson-to-sqlite data/sqlite/ca_epc.db lsoa_pwc_tbl data/geojson/ca_lsoa_pwc_wgs84.geojson --pk=lsoacd


'geojson-to-sqlite' is not recognized as an internal or external command,
operable program or batch file.


In [11]:
db = Database('data/ca_epc.db', recreate = False)
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);


In [12]:
db['imd_tbl'].create(
{
 'lsoacd': str,
 'lsoanm': str,
 'ladcd': str,
 'ladnm': str,
 'imd': int
 },
 pk = 'lsoacd'
)

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [13]:
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [14]:
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [None]:
print(db.schema)

In [16]:
# db['epc_clean_tbl'].add_foreign_key('postcode', 'postcodes_tbl', 'pcds') # too big causes crash
db['postcodes_tbl'].add_foreign_key('lsoacd', 'imd_tbl', 'lsoacd')
db['postcodes_tbl'].add_foreign_key('ladcd', 'ca_la_tbl', 'ladcd')
db['postcodes_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')
db['imd_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [17]:
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
# db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)
db['imd_tbl'].insert_all(imd_tbl_payload)


<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [None]:
db.close()