In [1]:
# use duck env - make sure to restart jupyter kernel. duckdb need python 3.10
import polars as pl
import duckdb
import get_ca_data as get_ca # functions for retrieving CA \ common data

This notebook retrieves all the base data needed for comparison analysis with other Combined Authorities and loads it into a duckdb database.
SQLite was tried, but it is slow, not directly compatible with polars and does not work well with datasette because of the size of data.

In [2]:
ca_la_df = get_ca.get_ca_la_df(2023, inc_ns=True) # include NS
ca_la_df.glimpse()


Rows: 54
Columns: 4
$ ladcd   <str> 'E08000001', 'E08000002', 'E08000003', 'E08000004', 'E08000005', 'E08000006', 'E08000007', 'E08000008', 'E08000009', 'E08000010'
$ ladnm   <str> 'Bolton', 'Bury', 'Manchester', 'Oldham', 'Rochdale', 'Salford', 'Stockport', 'Tameside', 'Trafford', 'Wigan'
$ cauthcd <str> 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001', 'E47000001'
$ cauthnm <str> 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester', 'Greater Manchester'



In [3]:
# retrieve the epc certs from the source csv s
la_list = ca_la_df['ladcd'] #includes north somerset
f'There are {str(la_list.shape)[1:3]} Local Authorities in Combined Authorities'


'There are 54 Local Authorities in Combined Authorities'

In [4]:
epc_non_domestic = get_ca.ingest_nondom_certs(la_list, root_dir = 'data/all-non-domestic-certificates')
epc_non_domestic.glimpse()

Rows: 160684
Columns: 17
$ UPRN                               <i64> None, 10001240792, 10001240793, 10001240855, 10001240878, 10001240879, 10001240935, 10001240937, 10001241269, 10001241306
$ LMK_KEY                            <str> '987452fe0c52d188f32ec4c6ea2b17225abd4e7e3658086c173267a59e5e184f', 'ffb12f5ee354363af759b2fb244c27889e121ed19ca6f08f3b6f39ed8b477179', '3387d6d899e4a5a1783ebb5b264e560568070ea39eb557bf18568d7cfbd02c82', '87637770952014011609163407009139', 'bb6f0769a6f508b6fde6eb41fe032bf904b830b55261731a9563386e1c2df1a0', '144594841032019042611130857000497', '4fa1809006824d509debfedbf0203725df4c4f3de7e33f1a5491f08d695bc311', '0443186cc593abd547c9d3e954712af9f45df3e22984eb76c643f714b2f11d6c', '63020028052012042010543409009768', '0d6fcf659ff3badad4c0c131b86a7f76adff28dba92b65da92e38c1e51419f90'
$ POSTCODE                           <str> 'BL3 6JT', 'BL6 6QQ', 'BL6 6QQ', 'BL5 3XJ', 'BL5 3XH', 'BL5 3XH', 'BL3 6BT', 'BL3 6DH', 'BL1 4DH', 'BL6 6SY'
$ BUILDING_REFERENCE_NUMBER    

In [5]:
epc_domestic = get_ca.ingest_dom_certs(la_list, root_dir = 'data/all-domestic-certificates')
epc_domestic.glimpse()


Rows: 4728852
Columns: 11
$ UPRN                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                             <str> '632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2', '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505'
$ POSTCODE                            <str> 'BL5 3WB', 'BL3 1PS', 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ CURRENT_ENERGY_RATING               <str> 'B', 'C', 'C', 'D', 'C', 'C', 'C', 'D', '

In [6]:
# read from saved file
epc_clean = (pl.read_csv('data/epc_subset_polars_last.csv',
                          dtypes={'LODGEMENT_DATETIME': pl.Datetime}))
# epc_clean.glimpse()

In [6]:
epc_domestic_df = get_ca.wrangle_epc(epc_domestic)
epc_domestic_df.glimpse()

Rows: 4728852
Columns: 13
$ uprn                       <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ lmk_key                    <str> '632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2', '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505'
$ postcode                   <str> 'BL5 3WB', 'BL3 1PS', 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ current_energy_rating      <str> 'B', 'C', 'C', 'D', 'C', 'C', 'C', 'D', 'E', 'D'
$ local_authority           

In [7]:
epc_non_domestic_df = get_ca.wrangle_epc(epc_non_domestic)
epc_non_domestic_df.glimpse()

Rows: 160684
Columns: 19
$ uprn                      <i64> None, 10001240792, 10001240793, 10001240855, 10001240878, 10001240879, 10001240935, 10001240937, 10001241269, 10001241306
$ lmk_key                   <str> '987452fe0c52d188f32ec4c6ea2b17225abd4e7e3658086c173267a59e5e184f', 'ffb12f5ee354363af759b2fb244c27889e121ed19ca6f08f3b6f39ed8b477179', '3387d6d899e4a5a1783ebb5b264e560568070ea39eb557bf18568d7cfbd02c82', '87637770952014011609163407009139', 'bb6f0769a6f508b6fde6eb41fe032bf904b830b55261731a9563386e1c2df1a0', '144594841032019042611130857000497', '4fa1809006824d509debfedbf0203725df4c4f3de7e33f1a5491f08d695bc311', '0443186cc593abd547c9d3e954712af9f45df3e22984eb76c643f714b2f11d6c', '63020028052012042010543409009768', '0d6fcf659ff3badad4c0c131b86a7f76adff28dba92b65da92e38c1e51419f90'
$ postcode                  <str> 'BL3 6JT', 'BL6 6QQ', 'BL6 6QQ', 'BL5 3XJ', 'BL5 3XH', 'BL5 3XH', 'BL3 6BT', 'BL3 6DH', 'BL1 4DH', 'BL6 6SY'
$ building_reference_number <str> '10005205750', '10003935

In [8]:
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)
postcode_file = get_ca.get_zipped_csv_file(url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data",
                      file_folder_name = "postcode_lookup")
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

In [10]:
# this is hit by the 2000 record limit too so use DL file
# input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
#                       destination_directory = "data\\geojson")

In [9]:
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

In [10]:
url_imd_csv = "https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::index-of-multiple-deprivation-dec-2019-lookup-in-england.csv?where=1=1&outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"

In [11]:
imd_df_raw = pl.read_csv(url_imd_csv)

In [12]:
rename_dict_imd = get_ca.get_rename_dict(imd_df_raw, get_ca.remove_numbers, rm_numbers = True)

In [13]:
imd_df = (
    imd_df_raw
    .rename(rename_dict_imd)
    .select(pl.all().exclude('fid'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
)
n_unmatched_lsoas = len(ca_lsoa_codes) - imd_df.shape[0]

In [16]:
# https://geoportal.statistics.gov.uk/datasets/lsoa-dec-2021-pwc-for-england-and-wales/explore

In [14]:
# rename the LSOA features and return the path
cleaned_lsoa_pwc_path = get_ca.clean_lsoa_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107 (2).geojson',
                                                  lsoacd='LSOA21CD')

In [15]:
# filter for just the LSOA's in Combined Authorities
ca_lsoa_pwc_path = get_ca.filter_geojson(input_file = cleaned_lsoa_pwc_path,
                                         output_file='data/geojson/ca_lsoa_pwc.geojson',
                                         property_name ='lsoacd',
                                         ca_lsoa_codes = ca_lsoa_codes)

In [16]:
# reproject to WGS84:4326 as default from ONS is 27700
reproject_path = get_ca.reproject(ca_lsoa_pwc_path, output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson', lsoa_code = 'lsoacd')

Load the data into a duckDB data base

In [17]:
command_list = [
    'INSTALL spatial;',
    'LOAD spatial;',
    f'CREATE OR REPLACE TABLE lsoa_pwc_tbl AS SELECT * FROM ST_Read("{reproject_path}")',
    'CREATE UNIQUE INDEX lsoacd_idx ON lsoa_pwc_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_la_tbl AS SELECT * FROM ca_la_df',
    'CREATE OR REPLACE TABLE imd_tbl AS SELECT * FROM imd_df',
    'CREATE OR REPLACE TABLE postcodes_tbl AS SELECT * FROM postcodes_df',
    'CREATE UNIQUE INDEX postcode_idx ON postcodes_tbl (postcode)',
    'CREATE OR REPLACE TABLE epc_domestic_tbl AS SELECT * FROM epc_domestic_df',
    'CREATE OR REPLACE TABLE epc_non_domestic_tbl AS SELECT * FROM epc_non_domestic_df'
]

In [18]:
out = get_ca.load_data(command_list=command_list, db_path='data/ca_epc.duckdb', overwrite = True)

Introspect Database


In [19]:
con = duckdb.connect('data/ca_epc.duckdb')

In [20]:
con.sql("SHOW ALL TABLES;")

┌──────────┬─────────┬──────────────────────┬──────────────────────┬───────────────────────────────────────┬───────────┐
│ database │ schema  │         name         │     column_names     │             column_types              │ temporary │
│ varchar  │ varchar │       varchar        │      varchar[]       │               varchar[]               │  boolean  │
├──────────┼─────────┼──────────────────────┼──────────────────────┼───────────────────────────────────────┼───────────┤
│ ca_epc   │ main    │ ca_la_tbl            │ [ladcd, ladnm, cau…  │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR]  │ false     │
│ ca_epc   │ main    │ epc_domestic_tbl     │ [uprn, lmk_key, po…  │ [BIGINT, VARCHAR, VARCHAR, VARCHAR,…  │ false     │
│ ca_epc   │ main    │ epc_non_domestic_tbl │ [uprn, lmk_key, po…  │ [BIGINT, VARCHAR, VARCHAR, VARCHAR,…  │ false     │
│ ca_epc   │ main    │ imd_tbl              │ [lsoacd, lsoanm, l…  │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR…  │ false     │
│ ca_epc   │ main    │ lsoa_pwc_

In [21]:
con.sql('DESCRIBE lsoa_pwc_tbl')

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │  null   │   key   │ default │ extra │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ lsoacd      │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ geom        │ GEOMETRY    │ YES     │ NULL    │ NULL    │  NULL │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘

In [22]:
con.sql('SELECT COUNT(*) num_rows FROM lsoa_pwc_tbl')

┌──────────┐
│ num_rows │
│  int64   │
├──────────┤
│     9585 │
└──────────┘

In [23]:
con.close()

Section below is to load data into SQLite - legacy.

In [8]:
# epc_clean_tbl_payload = epc_clean_df.to_dicts()
ca_la_tbl_payload = ca_la_df.to_dicts()
postcodes_tbl_payload = postcodes_df.to_dicts()
imd_tbl_payload = imd_df.to_dicts()

In [9]:
# population weighted centroids for each LSOA - has to be downloaded as there is no straightforward way to query the CA LSOA's from
# open online datasets

lsoa_bng_file_path = get_ca.filter_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson',
 output_file='data/geojson/ca_lsoa_pwc.geojson',
 property_name = 'LSOA21CD',
 ca_lsoa_codes = ca_lsoa_codes)

In [10]:
get_ca.reproject(input_bng_file='data/geojson/ca_lsoa_pwc.geojson',
                  output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson',
                    lsoa_code='LSOA21CD')

'data/geojson/ca_lsoa_pwc_wgs84.geojson'

In [9]:

db = Database('data/ca_epc.db', recreate = True)
db.close()

In [10]:
# import the lsoa PWC file as geojson to the DB setting the primary key to lsoacd
!geojson-to-sqlite data/ca_epc.db lsoa_pwc_tbl data/geojson/ca_lsoa_pwc_wgs84.geojson --pk=lsoacd


In [11]:
db = Database('data/ca_epc.db', recreate = False)
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);


In [12]:
db['imd_tbl'].create(
{
 'lsoacd': str,
 'lsoanm': str,
 'ladcd': str,
 'ladnm': str,
 'imd': int
 },
 pk = 'lsoacd'
)

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [13]:
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [14]:
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [20]:
# db['epc_clean_tbl'].create(
# {'uprn': int,
#  'lmk_key': str,
#  'postcode': str,
#  'current_energy_rating': str,
#  'local_authority': str,
#  'property_type': str,
#  'transaction_type': str,
#  'environment_impact_current': int,
#  'co2_emissions_current': float,
#  'tenure': str,
#  'date': str,
#  'year': int,
#  'month': int},
#  pk = 'lmk_key'
# )

<Table epc_clean_tbl (uprn, lmk_key, postcode, current_energy_rating, local_authority, property_type, transaction_type, environment_impact_current, co2_emissions_current, tenure, date, year, month)>

In [15]:
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);
CREATE TABLE [imd_tbl] (
   [lsoacd] TEXT PRIMARY KEY,
   [lsoanm] TEXT,
   [ladcd] TEXT,
   [ladnm] TEXT,
   [imd] INTEGER
);
CREATE TABLE [ca_la_tbl] (
   [ladcd] TEXT PRIMARY KEY,
   [ladnm] TEXT,
   [cauthcd] TEXT,
   [cauthnm] TEXT
);
CREATE TABLE [postcodes_tbl] (
   [pcds] TEXT PRIMARY KEY,
   [lsoacd] TEXT,
   [msoacd] TEXT,
   [ladcd] TEXT,
   [ladnm] TEXT
);


In [16]:
# db['epc_clean_tbl'].add_foreign_key('postcode', 'postcodes_tbl', 'pcds') # too big causes crash
db['postcodes_tbl'].add_foreign_key('lsoacd', 'imd_tbl', 'lsoacd')
db['postcodes_tbl'].add_foreign_key('ladcd', 'ca_la_tbl', 'ladcd')
db['postcodes_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')
db['imd_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [17]:
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
# db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)
db['imd_tbl'].insert_all(imd_tbl_payload)


<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [19]:
for row in db['lsoa_pwc_tbl'].rows:
    print(row)

{'id': 7, 'FID': 7, 'GlobalID': '7870373f-0e3a-4c12-bc57-7dcd9a6be0c5', 'lsoacd': 'E01009311', 'geometry': '{"type": "Point", "coordinates": [-1.7922739589276733, 52.48516715035018]}'}
{'id': 20, 'FID': 20, 'GlobalID': '5e776c0d-c7f8-4502-977c-513085c2ddd2', 'lsoacd': 'E01014512', 'geometry': '{"type": "Point", "coordinates": [-2.5880359277956795, 51.47609023629021]}'}
{'id': 36, 'FID': 36, 'GlobalID': '7fbd7749-9219-4f4a-860a-46fd889078fa', 'lsoacd': 'E01014555', 'geometry': '{"type": "Point", "coordinates": [-2.6121562985959565, 51.455023667326465]}'}
{'id': 42, 'FID': 42, 'GlobalID': '51d53b99-19d5-4927-a97e-d4911fa41e78', 'lsoacd': 'E01014483', 'geometry': '{"type": "Point", "coordinates": [-2.352957896397478, 51.37686042258114]}'}
{'id': 57, 'FID': 57, 'GlobalID': '8c136428-69ed-4cd0-ab60-9e15b68f1bcb', 'lsoacd': 'E01014577', 'geometry': '{"type": "Point", "coordinates": [-2.5521929447872216, 51.484767867012316]}'}
{'id': 76, 'FID': 76, 'GlobalID': 'a52cc2bf-8bc6-4827-a7e1-cba3d64

In [20]:
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);
CREATE TABLE [ca_la_tbl] (
   [ladcd] TEXT PRIMARY KEY,
   [ladnm] TEXT,
   [cauthcd] TEXT,
   [cauthnm] TEXT
);
CREATE TABLE "postcodes_tbl" (
   [pcds] TEXT PRIMARY KEY,
   [lsoacd] TEXT REFERENCES [lsoa_pwc_tbl]([lsoacd]),
   [msoacd] TEXT,
   [ladcd] TEXT REFERENCES [ca_la_tbl]([ladcd]),
   [ladnm] TEXT
);
CREATE TABLE "imd_tbl" (
   [lsoacd] TEXT PRIMARY KEY REFERENCES [lsoa_pwc_tbl]([lsoacd]),
   [lsoanm] TEXT,
   [ladcd] TEXT,
   [ladnm] TEXT,
   [imd] INTEGER
);


In [23]:
db.close()

: 

In [22]:
!datasette data/ca_epc.db

^C
