In [11]:
# use .venv - make sure to restart jupyter kernel. duckdb need python 3.10
import polars as pl
import pyarrow
import duckdb
import get_ca_data as get_ca # functions for retrieving CA \ common data
from pathlib import Path
import requests
import os
import yaml
from tqdm import tqdm

This notebook retrieves all the base data needed for comparison analysis with other Combined Authorities and loads it into a duckdb database.
SQLite was tried, but it is slow, not directly compatible with polars and does not work well with datasette because of the size of data.

In [2]:
ca_la_df = get_ca.get_ca_la_df(2023, inc_ns=True) # include NS
# ca_la_df.glimpse()


In [3]:
# Series of ONS local authority codes for the Combined Authorities (includes North Somerset)
la_list = ca_la_df['ladcd']
# Local authority names, as a tuple
ladnm = tuple(ca_la_df['ladnm'].to_list())
# len() gives the real count (slicing str(shape) only worked for two-digit counts),
# and print() is needed because a bare f-string in the middle of a cell is never displayed.
print(f'There are {len(la_list)} Local Authorities in Combined Authorities')

Get the lookup table that relates the DfT local authority IDs of the authorities in the Combined Authorities to their ONS LA codes.

In [4]:
# Build the lookup between DfT local authority IDs and ONS LA codes from the DfT
# road-traffic CSV, passing la_list (the Combined Authority LA codes) to the helper.
ca_la_dft_lookup_df = get_ca.get_ca_la_dft_lookup(
    dft_csv_path = 'https://storage.googleapis.com/dft-statistics/road-traffic/downloads/data-gov-uk/local_authority_traffic.csv',
    la_list = la_list)
# ca_la_dft_lookup_df.glimpse()

In [5]:
# LA codes derived from the Combined Authority / LA table
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)
# Fetch the zipped postcode lookup CSV from ArcGIS
# (presumably downloaded and extracted into the "postcode_lookup" folder - confirm in get_ca)
postcode_file = get_ca.get_zipped_csv_file(url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data",
                      file_folder_name = "postcode_lookup")
# Postcode DataFrame built from that file and the CA LA codes
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

No files found in the directory.


In [6]:
# this is hit by the 2000 record limit too so use DL file
# input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
#                       destination_directory = "data\\geojson")

In [7]:
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

Run the cell below if you do not need to update the LSOA geodata (updating it is expensive and can crash the kernel).

In [8]:
# Paths of the WGS84 GeoJSON files that are loaded into DuckDB later on.
# The reprojection cells further down assign these same two names to the same paths,
# so this cell only matters when those cells are not run.
reproject_path = 'data/geojson/ca_lsoa_pwc_wgs84.geojson'
reproject_lsoa_poly_path = 'data/geojson/ca_lsoa_poly_wgs84.geojson'

In [16]:
# Rename the LSOA features (population weighted centroids) and return the path of the cleaned file.
# NOTE(review): this input sits directly under data/, whereas the polygon file and the
# legacy SQLite section read from data/geojson/ - confirm which location is correct.
cleaned_lsoa_pwc_path = get_ca.clean_lsoa_geojson('data/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson',
                                                  lsoacd='LSOA21CD')

In [19]:
# filter for just the LSOA's in Combined Authorities
ca_lsoa_pwc_path = get_ca.filter_geojson(input_file = cleaned_lsoa_pwc_path,
                                         output_file='data/geojson/ca_lsoa_pwc.geojson',
                                         property_name ='lsoacd',
                                         ca_lsoa_codes = ca_lsoa_codes)

In [20]:
cleaned_lsoa_poly_path = get_ca.clean_lsoa_geojson('data/geojson/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFE_V10_1289561450475266465.geojson',
                                                   lsoacd='LSOA21CD')

In [21]:
# reproject to WGS84:4326 as default from ONS is 27700
reproject_path = get_ca.reproject(ca_lsoa_pwc_path, output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson', lsoa_code = 'lsoacd')

In [22]:
# filter for just the LSOA's polys in Combined Authorities
ca_lsoa_poly_path = get_ca.filter_geojson(input_file = cleaned_lsoa_poly_path,
                                         output_file='data/geojson/ca_lsoa_poly.geojson',
                                         property_name ='lsoacd',
                                         ca_lsoa_codes = ca_lsoa_codes)

In [23]:
reproject_lsoa_poly_path = get_ca.reproject(ca_lsoa_poly_path, output_wgs84_file='data/geojson/ca_lsoa_poly_wgs84.geojson', lsoa_code = 'lsoacd')

The CSV download no longer works, so the IMD lookup is retrieved as JSON instead.

In [24]:
url_imd_json = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Index_of_Multiple_Deprivation_Dec_2019_Lookup_in_England_2022/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"

In [25]:
# Fetch the IMD lookup. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() surfaces an HTTP error here rather than as a
# confusing failure when the JSON is parsed in the next cell.
imd_json = requests.get(url_imd_json, timeout=60)
imd_json.raise_for_status()

In [26]:
features = imd_json.json()['features']

In [27]:
imd_df_raw = pl.DataFrame(features).unnest(columns = 'attributes')

In [28]:
rename_dict_imd = get_ca.get_rename_dict(imd_df_raw, get_ca.remove_numbers, rm_numbers = True)

In [29]:
# Rename the columns using rename_dict_imd, drop the 'fid' column and keep only the
# rows whose lsoacd is one of the Combined Authority LSOA codes.
imd_df = (
    imd_df_raw
    .rename(rename_dict_imd)
    .select(pl.all().exclude('fid'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
)
# Difference between the number of CA LSOA codes and the number of IMD rows retained.
# NOTE(review): this value is never displayed or asserted. A large value could mean the
# ArcGIS response was truncated by the service's per-query record limit - confirm.
n_unmatched_lsoas = len(ca_lsoa_codes) - imd_df.shape[0]

In [30]:
del imd_df_raw, imd_json

In [31]:
# https://geoportal.statistics.gov.uk/datasets/lsoa-dec-2021-pwc-for-england-and-wales/explore

In [32]:
# Column dtypes passed as the schema when scanning the non-domestic certificates CSV.
# LODGEMENT_DATETIME is read as text and parsed to a datetime after loading.
# NOTE(review): UPRN is text here but Int64 in the domestic schemas - confirm this is intended.
cols_schema_nondom = {
    'LMK_KEY': pl.Utf8,
    'POSTCODE': pl.Utf8,
    'BUILDING_REFERENCE_NUMBER': pl.Int64,
    'ASSET_RATING': pl.Int64,
    'ASSET_RATING_BAND': pl.Utf8,
    'PROPERTY_TYPE': pl.Utf8,
    'LOCAL_AUTHORITY': pl.Utf8,
    'CONSTITUENCY': pl.Utf8,
    'TRANSACTION_TYPE': pl.Utf8,
    'STANDARD_EMISSIONS': pl.Float64,
    'TYPICAL_EMISSIONS': pl.Float64,
    'TARGET_EMISSIONS': pl.Float64,
    'BUILDING_EMISSIONS': pl.Float64,
    'BUILDING_LEVEL': pl.Int64,
    'RENEWABLE_SOURCES': pl.Utf8,
    'LODGEMENT_DATETIME': pl.Utf8,
    'UPRN': pl.Utf8
    }

In [17]:
# Lazily scan the non-domestic certificates and reduce them to one row per UPRN:
#   1. keep only certificates whose LOCAL_AUTHORITY is in la_list
#   2. parse LODGEMENT_DATETIME (strict=False, so unparseable values become null)
#   3. sort by UPRN then LODGEMENT_DATETIME and take the last row of each UPRN group,
#      which is intended to keep the most recently lodged certificate
# Nothing is read until .collect() runs the whole query.
epc_non_domestic = (pl.scan_csv(
    'data/all-non-domestic-certificates-single-file/certificates.csv',
    schema = cols_schema_nondom,

)
.filter(pl.col('LOCAL_AUTHORITY').is_in(la_list.to_list()))
.with_columns(pl.col('LODGEMENT_DATETIME').str.to_datetime(format='%Y-%m-%d %H:%M:%S', strict=False))
                    .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
                    .group_by('UPRN').last()
).collect()

In [33]:
# Column dtypes for the domestic certificate CSVs whose headers are upper case with
# underscores. Referenced by the (currently commented-out) get_ca.ingest_certs call below.
# LODGEMENT_DATETIME and the *_ENERGY_EFFICIENCY columns are read as text.
cols_schema_dom = {
                        'LMK_KEY':pl.Utf8,
                        'POSTCODE':pl.Utf8,
                        'LOCAL_AUTHORITY':pl.Utf8,
                        'PROPERTY_TYPE':pl.Utf8,
                        'LODGEMENT_DATETIME':pl.Utf8,
                        'TRANSACTION_TYPE': pl.Utf8,
                        'TENURE':pl.Utf8,
                        'MAINS_GAS_FLAG':pl.Utf8,
                        'HOT_WATER_ENERGY_EFF':pl.Utf8,
                        'WINDOWS_DESCRIPTION':pl.Utf8,
                        'WINDOWS_ENERGY_EFF':pl.Utf8,
                        'WALLS_DESCRIPTION':pl.Utf8,
                        'WALLS_ENERGY_EFF':pl.Utf8,
                        'ROOF_DESCRIPTION':pl.Utf8,
                        'ROOF_ENERGY_EFF':pl.Utf8,
                        'MAINHEAT_DESCRIPTION':pl.Utf8,
                        'MAINHEAT_ENERGY_EFF':pl.Utf8,
                        'MAINHEAT_ENV_EFF':pl.Utf8,
                        'MAIN_HEATING_CONTROLS':pl.Utf8,
                        'MAINHEATCONT_DESCRIPTION':pl.Utf8,
                        'MAINHEATC_ENERGY_EFF':pl.Utf8,
                        'MAIN_FUEL':pl.Utf8,
                        'SOLAR_WATER_HEATING_FLAG':pl.Utf8,
                        'CONSTRUCTION_AGE_BAND':pl.Utf8,
                        'CURRENT_ENERGY_RATING':pl.Utf8,
                        'POTENTIAL_ENERGY_RATING':pl.Utf8,
                        'CURRENT_ENERGY_EFFICIENCY':pl.Utf8,
                        'POTENTIAL_ENERGY_EFFICIENCY':pl.Utf8,
                        'BUILT_FORM':pl.Utf8,
                        'CONSTITUENCY':pl.Utf8,
                        'FLOOR_DESCRIPTION':pl.Utf8,
                        'ENVIRONMENT_IMPACT_CURRENT':pl.Int64,
                        'ENVIRONMENT_IMPACT_POTENTIAL':pl.Int64,
                        'ENERGY_CONSUMPTION_CURRENT':pl.Int64,
                        'ENERGY_CONSUMPTION_POTENTIAL':pl.Int64,
                        'CO2_EMISS_CURR_PER_FLOOR_AREA':pl.Int64,
                        'CO2_EMISSIONS_CURRENT': pl.Float64,
                        'CO2_EMISSIONS_POTENTIAL':pl.Float64,
                        'LIGHTING_COST_CURRENT':pl.Int64,
                        'LIGHTING_COST_POTENTIAL':pl.Int64,
                        'HEATING_COST_CURRENT':pl.Int64,
                        'HEATING_COST_POTENTIAL':pl.Int64,
                        'HOT_WATER_COST_CURRENT':pl.Int64,
                        'HOT_WATER_COST_POTENTIAL':pl.Int64,
                        'TOTAL_FLOOR_AREA':pl.Float64,
                        'NUMBER_HABITABLE_ROOMS':pl.Int64,
                        'NUMBER_HEATED_ROOMS':pl.Int64,
                        'PHOTO_SUPPLY':pl.Float64,
                        'UPRN':pl.Int64,
                        'BUILDING_REFERENCE_NUMBER':pl.Int64
                    }

In [34]:
# cols_schema_adjusted: this schema is for the csv files retrieved using the EPC API.
# The API files carry the same columns, in the same order and with the same dtypes, as
# cols_schema_dom (defined in the previous cell); only the names differ: lower case
# with hyphens instead of underscores. Deriving the schema instead of repeating all
# fifty entries keeps the two definitions from drifting apart.
cols_schema_adjusted = {
    column_name.lower().replace('_', '-'): dtype
    for column_name, dtype in cols_schema_dom.items()
}

In [36]:
# Only run this if you are updating the EPC data from the opendatacommunities API
# THIS WILL TAKE AT LEAST 2 HOURS TO RUN!!!
get_ca.delete_all_csv_files('data/epc_csv')

# Read the API token inside a context manager so the config file handle is closed.
with open('../config.yml') as config_file:
    epc_key = yaml.safe_load(config_file)['epc']['auth_token']

# A plain loop: the downloads are wanted for their side effects only, so there is no
# reason to build (and display) a list of return values. tqdm reports overall progress.
for la in tqdm(la_list, desc='Downloading EPC CSVs'):
    get_ca.get_epc_csv(la, epc_key)

Written 4540681 bytes to data/epc_csv/epc_E08000001.csv
Written 4547719 bytes to data/epc_csv/epc_E08000001.csv
Written 4566051 bytes to data/epc_csv/epc_E08000001.csv
Written 4539925 bytes to data/epc_csv/epc_E08000001.csv
Written 4643859 bytes to data/epc_csv/epc_E08000001.csv
Written 4489296 bytes to data/epc_csv/epc_E08000001.csv
Written 4481119 bytes to data/epc_csv/epc_E08000001.csv
Written 4473265 bytes to data/epc_csv/epc_E08000001.csv
Written 4450053 bytes to data/epc_csv/epc_E08000001.csv
Written 4488283 bytes to data/epc_csv/epc_E08000001.csv
Written 4461222 bytes to data/epc_csv/epc_E08000001.csv
Written 4464468 bytes to data/epc_csv/epc_E08000001.csv
Written 4474019 bytes to data/epc_csv/epc_E08000001.csv
Written 4475073 bytes to data/epc_csv/epc_E08000001.csv
Written 4476302 bytes to data/epc_csv/epc_E08000001.csv
Written 4483704 bytes to data/epc_csv/epc_E08000001.csv
Written 4477335 bytes to data/epc_csv/epc_E08000001.csv
Written 4446718 bytes to data/epc_csv/epc_E08000

KeyboardInterrupt: 

In [20]:
epc_domestic = get_ca.ingest_dom_certs_csv(la_list, cols_schema_adjusted)
epc_domestic.glimpse()

Rows: 4834093
Columns: 50
$ uprn                                   <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ lmk_key                                <str> 'e6ad9006bb1a9f37a05272e0804964314bb806040d4e913b18557f87435a4e62', '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505'
$ postcode                               <str> 'BL3 3GR', 'BL3 1PS', 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ local_authority                        <str> 'E08000001', 'E08000001', 'E0

In [21]:
# epc_domestic = get_ca.ingest_certs(la_list = la_list,
#                                    cols_schema = cols_schema_dom,
#                                    root_dir = 'data/all-domestic-certificates')
# epc_domestic.glimpse()


In [None]:
# ingest_dom_certs_csv lives in the get_ca module (imported as get_ca); calling it
# unqualified raises NameError on a fresh kernel.
all_dom_epc_raw = get_ca.ingest_dom_certs_csv(la_list, cols_schema_adjusted)

In [None]:
epc_domestic_df = get_ca.wrangle_epc(certs_df = epc_domestic)
epc_domestic_df.glimpse()

Rows: 4805082
Columns: 52
$ uprn                          <i64> 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716, 10001239717, 10001239718
$ lmk_key                       <str> '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505', '1006031609732013112914032764978690', 'de7a02fb12a4ef27399bd7054917b4fd2c166ac4a3d7da461f4f4b7876c8a664'
$ postcode                      <str> 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ local_authority               <str> 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E0800000

In [23]:
# NOTE(review): this repeats the wrangle_epc call made in the previous cell on the same
# input, so the work is done twice on Run All - confirm whether one can be removed.
epc_domestic_df = get_ca.wrangle_epc(certs_df = epc_domestic)

In [25]:
# Preview of the frame with lower-cased column names. The renamed frame is only
# displayed, not assigned, so epc_non_domestic itself is left unchanged.
epc_non_domestic.rename(str.lower)

uprn,lmk_key,postcode,building_reference_number,asset_rating,asset_rating_band,property_type,local_authority,constituency,transaction_type,standard_emissions,typical_emissions,target_emissions,building_emissions,building_level,renewable_sources,lodgement_datetime
str,str,str,i64,i64,str,str,str,str,str,f64,f64,f64,f64,i64,str,datetime[μs]
"""100000547109""","""85808380802013…","""NE16 3AW""",585648960000,121,"""E""","""B1 Offices and…","""E08000037""","""E14000574""","""Mandatory issu…",34.56,49.76,18.66,83.75,3,,2013-11-08 14:11:42
"""100012801072""","""10454919006201…","""WA15 9SD""",504195120000,78,"""D""","""A3/A4/A5 Resta…","""E08000009""","""E14000532""","""Mandatory issu…",93.93,158.0,53.92,147.04,3,,2015-10-07 15:42:59
"""100051933318""","""37389190342010…","""BD13 2ES""",736659470000,97,"""D""","""Restaurant/pub…","""E08000032""","""E14000588""","""Mandatory issu…",64.83,82.12,47.91,125.73,3,,2010-07-01 18:08:32
"""100052089863""","""cb9629f0eddbc0…","""S2 4LT""",10002494537,49,"""B""","""A3/A4/A5 Resta…","""E08000019""","""E14000919""","""Mandatory issu…",113.45,197.23,67.3,112.16,3,,2022-02-22 19:56:58
"""100070480039""","""85272574252013…","""B30 2YH""",523649720000,76,"""D""","""A1/A2 Retail a…","""E08000025""","""E14000567""","""Mandatory issu…",76.17,107.7,40.39,115.45,3,,2013-10-24 14:07:52
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""100012740537""","""43492325201604…","""OL6 7TP""",440339720000,58,"""C""","""D1 Non-residen…","""E08000008""","""E14000537""","""Mandatory issu…",33.24,54.84,18.71,38.7,3,,2016-04-12 23:50:45
"""72724300""","""da7a6cfc960769…","""LS3 1BS""",10005205283,59,"""C""","""Storage or Dis…","""E08000035""","""E14000777""","""Mandatory issu…",12.1,28.5,7.11,14.28,3,,2023-10-02 15:54:03
"""100012853605""","""482bbc1bca8c7f…","""M41 7TE""",10001311654,81,"""D""","""B1 Offices and…","""E08000009""","""E14000979""","""Mandatory issu…",41.39,72.87,24.87,66.95,3,,2021-06-28 09:24:29
"""100052039593""","""37f09fb8ba124b…","""HX2 0AL""",10001300394,97,"""D""","""A3/A4/A5 Resta…","""E08000033""","""E14000723""","""Mandatory issu…",102.27,201.63,68.81,198.62,3,,2021-05-29 05:26:35


In [26]:
epc_non_domestic_df = get_ca.wrangle_epc(epc_non_domestic)
epc_non_domestic_df.glimpse()

Rows: 162851
Columns: 19
$ uprn                      <str> '100000547109', '100012801072', '100051933318', '100052089863', '100070480039', '10006629006', '100050815770', '10090245109', '10008051954', '100110739430'
$ lmk_key                   <str> '85808380802013110814114263909090', '104549190062015100715425901210850', '37389190342010070118083277400260', 'cb9629f0eddbc08f17211a5f1a55545e30c70e4c16410037862fc097b1ddb19d', '85272574252013102414075209909522', '132928202252018042410271003019240', '962340502018091521591684500910', '149624629052019101021563607909928', '141580320022019070212350652260840', 'd3ac70f4ac16dc5a74d8e627c46148c418da62f3309db574d49ce1ac585137fe'
$ postcode                  <str> 'NE16 3AW', 'WA15 9SD', 'BD13 2ES', 'S2 4LT', 'B30 2YH', 'DN4 0TG', 'S60 5NU', 'B33 0SJ', 'PE1 5TU', 'DH1 5LX'
$ building_reference_number <i64> 585648960000, 504195120000, 736659470000, 10002494537, 523649720000, 205623740001, 454674580000, 982497320000, 450620230000, 10002495346
$ asset_ra

In [27]:
del epc_domestic, epc_non_domestic

Collect the postcode centroids. This step is placed here to stop the kernel from crashing.

In [28]:
# Lazy scan of the postcode centroids CSV; nothing is read until .collect() is called.
# Dtypes are given explicitly for RU11IND (text) and the x / y columns (floats).
# NOTE(review): newer polars releases name this argument `schema_overrides` instead of
# `dtypes` - confirm against the installed version.
pc_centroids_q = pl.scan_csv('data/postcode_centroids.csv',
                             dtypes={
                                 'RU11IND': pl.Utf8,
                                 'x': pl.Float64,
                                 'y': pl.Float64
                                 })
# pc_centroids_df.head()

In [29]:
pc_min_df = pc_centroids_q.head(1).collect()

In [30]:
pc_rename_dict = get_ca.get_rename_dict(pc_min_df, get_ca.remove_numbers, rm_numbers=False)

In [31]:
pc_rename_dict

{'OBJECTID': 'objectid',
 'PCD': 'pcd',
 'PCD2': 'pcd2',
 'PCDS': 'pcds',
 'DOINTR': 'dointr',
 'DOTERM': 'doterm',
 'USERTYPE': 'usertype',
 'OSEAST1M': 'oseast1m',
 'OSNRTH1M': 'osnrth1m',
 'OSGRDIND': 'osgrdind',
 'OA11': 'oa11',
 'CTY': 'cty',
 'CED': 'ced',
 'LAUA': 'laua',
 'WARD': 'ward',
 'HLTHAU': 'hlthau',
 'NHSER': 'nhser',
 'CTRY': 'ctry',
 'RGN': 'rgn',
 'PCON': 'pcon',
 'EER': 'eer',
 'TECLEC': 'teclec',
 'TTWA': 'ttwa',
 'PCT': 'pct',
 'ITL': 'itl',
 'NPARK': 'npark',
 'LSOA11': 'lsoa11',
 'MSOA11': 'msoa11',
 'WZ11': 'wz11',
 'SICBL': 'sicbl',
 'BUA11': 'bua11',
 'BUASD11': 'buasd11',
 'RU11IND': 'ru11ind',
 'OAC11': 'oac11',
 'LAT': 'lat',
 'LONG': 'long',
 'LEP1': 'lep1',
 'LEP2': 'lep2',
 'PFA': 'pfa',
 'IMD': 'imd',
 'CALNCV': 'calncv',
 'ICB': 'icb',
 'OA21': 'oa21',
 'LSOA21': 'lsoa21',
 'MSOA21': 'msoa21',
 'BUA22': 'bua22',
 'x': 'x',
 'y': 'y'}

In [32]:
# Keep only the postcodes whose LAUA code is in la_list, rename the columns with
# pc_rename_dict, then materialise the lazy query.
pc_centroids_df = (
    pc_centroids_q
    .filter(pl.col('LAUA').is_in(la_list))
    .rename(mapping=pc_rename_dict)
    .collect()
)
# pc_centroids_df.glimpse()

Tenure: table TS054 from NOMIS, lightly cleaned (the CSV header was removed).

In [33]:
# Read the NOMIS tenure CSV lazily and tidy it up:
#   - normalise the column names in three passes (spaces -> underscores, lower case,
#     colons removed)
#   - take the first 9 characters of the 'lsoa' column as the LSOA code (lsoacd)
#   - keep only the LSOAs that are in the Combined Authorities
ca_tenure_lsoa = (pl.scan_csv('data/ts054_tenure_nomis.csv')
                  .select(pl.all().name.map(lambda col_name: col_name.replace(' ', '_')))
                  .select(pl.all().name.to_lowercase())
                  .select(pl.all().name.map(lambda col_name: col_name.replace(':', '')))
                  .with_columns(pl.col('lsoa').str.slice(0, 9).alias('lsoacd'))
                  .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
                  ).collect()

In [34]:
# ca_tenure_lsoa.glimpse()

In [35]:
del postcodes_df

Load the data into a DuckDB database

In [36]:
con = duckdb.connect('data/ca_epc.duckdb')

In [39]:
# Install / load the spatial extension before opening the transaction, so that a failure
# here is reported on its own and never leaves a transaction open.
con.execute('INSTALL spatial;')
con.execute('LOAD spatial;')

# Statements run in order inside one transaction. The names selected from
# (ca_tenure_lsoa, ca_la_df, imd_df, ...) are the DataFrames of the same name in this
# notebook. File paths are SQL string literals, so they are single-quoted
# (double quotes denote identifiers in SQL).
load_statements = [
    f"CREATE OR REPLACE TABLE lsoa_pwc_tbl AS SELECT * FROM ST_Read('{reproject_path}')",
    f"CREATE OR REPLACE TABLE lsoa_poly_tbl AS SELECT * FROM ST_Read('{reproject_lsoa_poly_path}')",
    'CREATE UNIQUE INDEX lsoacd_poly_idx ON lsoa_poly_tbl (lsoacd)',
    'CREATE UNIQUE INDEX lsoacd_pwc_idx ON lsoa_pwc_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_tenure_lsoa_tbl AS SELECT * FROM ca_tenure_lsoa',
    'CREATE UNIQUE INDEX lsoacd_tenure_idx ON ca_tenure_lsoa_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_la_tbl AS SELECT * FROM ca_la_df',
    'CREATE OR REPLACE TABLE imd_tbl AS SELECT * FROM imd_df',
    # 'CREATE OR REPLACE TABLE postcodes_tbl AS SELECT * FROM postcodes_df',
    # 'CREATE UNIQUE INDEX postcode_idx ON postcodes_tbl (postcode)',
    'CREATE OR REPLACE TABLE postcode_centroids_tbl AS SELECT * FROM pc_centroids_df',
    'CREATE UNIQUE INDEX postcode_centroids_idx ON postcode_centroids_tbl (PCDS)',
    'CREATE OR REPLACE TABLE epc_non_domestic_tbl AS SELECT * FROM epc_non_domestic_df',
    'CREATE UNIQUE INDEX uprn_nondom_idx ON epc_non_domestic_tbl (uprn)',
    'CREATE OR REPLACE TABLE ca_la_dft_lookup_tbl AS SELECT * FROM ca_la_dft_lookup_df',
    'CREATE UNIQUE INDEX ca_la_dft_lookup_idx ON ca_la_dft_lookup_tbl (ladcd)',
]

con.execute("BEGIN TRANSACTION;")
try:
    for statement in load_statements:
        con.execute(statement)
    con.execute("COMMIT;")
except Exception as e:
    # Undo the partial load and report it, then re-raise so that a failed load stops
    # "Run All" instead of letting the later cells carry on against a stale database.
    con.execute("ROLLBACK;")
    print(f"Transaction rolled back due to an error: {e}")
    raise

The domestic EPCs have to be loaded outside the transaction block; otherwise the load fails for lack of memory.

In [40]:
del epc_non_domestic_df, pc_centroids_df

In [41]:
# Load the wrangled domestic EPC DataFrame into its own table and add a unique index on
# uprn. This runs on its own, outside the transaction block, because of memory limits.
con.execute('CREATE OR REPLACE TABLE epc_domestic_tbl AS SELECT * FROM epc_domestic_df')
con.execute('CREATE UNIQUE INDEX uprn_idx ON epc_domestic_tbl (uprn)')

<duckdb.duckdb.DuckDBPyConnection at 0x13400337370>

In [44]:
del epc_domestic_df, ca_tenure_lsoa

In [45]:
con.execute("EXPORT DATABASE 'data/db_export' (FORMAT PARQUET);")

<duckdb.duckdb.DuckDBPyConnection at 0x13400337370>

In [46]:
con.close()

In [None]:
# con = duckdb.connect('data/ca_epc_test.duckdb')

# con.execute(command_list[5])

The cell below is superseded by the transaction above.

In [None]:
# out = get_ca.load_data(command_list=command_list, db_path='data/ca_epc.duckdb', overwrite = True)

Introspect Database


In [None]:
con = duckdb.connect('data/ca_epc.duckdb')

In [None]:
con.sql("SHOW ALL TABLES;")

┌──────────┬─────────┬──────────────────────┬──────────────────────┬───────────────────────────────────────┬───────────┐
│ database │ schema  │         name         │     column_names     │             column_types              │ temporary │
│ varchar  │ varchar │       varchar        │      varchar[]       │               varchar[]               │  boolean  │
├──────────┼─────────┼──────────────────────┼──────────────────────┼───────────────────────────────────────┼───────────┤
│ ca_epc   │ main    │ ca_la_dft_lookup_tbl │ [dft_la_id, ladcd,…  │ [BIGINT, VARCHAR, BIGINT]             │ false     │
│ ca_epc   │ main    │ ca_la_tbl            │ [ladcd, ladnm, cau…  │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR]  │ false     │
│ ca_epc   │ main    │ ca_tenure_lsoa_tbl   │ [lsoa, total, owne…  │ [VARCHAR, BIGINT, BIGINT, BIGINT, B…  │ false     │
│ ca_epc   │ main    │ epc_domestic_tbl     │ [uprn, lmk_key, po…  │ [BIGINT, VARCHAR, VARCHAR, VARCHAR,…  │ false     │
│ ca_epc   │ main    │ epc_non_d

In [None]:
con.sql('DESCRIBE epc_domestic_tbl')

┌───────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│        column_name        │ column_type │  null   │   key   │ default │  extra  │
│          varchar          │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ uprn                      │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ lmk_key                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ postcode                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ local_authority           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ property_type             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ transaction_type          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ tenure                    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ mains_gas_flag            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NU

In [None]:
con.sql('DESCRIBE epc_non_domestic_tbl')

┌───────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│        column_name        │ column_type │  null   │   key   │ default │  extra  │
│          varchar          │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ uprn                      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ lmk_key                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ postcode                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ building_reference_number │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ asset_rating              │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ asset_rating_band         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ property_type             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ local_authority           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NU

In [None]:
con.sql('SELECT COUNT(*) num_rows FROM lsoa_pwc_tbl')

┌──────────┐
│ num_rows │
│  int64   │
├──────────┤
│     9585 │
└──────────┘

In [None]:
con.close()

Section below is to load data into SQLite.

In [None]:
# Dictionary of all the DataFrames to be imported into SQLite (for datasette),
# keyed by the name of the table each one becomes.
# NOTE(review): postcodes_df, epc_domestic_df and epc_non_domestic_df are removed with
# `del` in earlier cells, so this cell raises NameError when the notebook is run top to
# bottom - confirm how these frames are meant to be restored first.
dfs_dict = {
    'ca_la_tbl':ca_la_df,
    'imd_tbl':imd_df,
    'postcodes_tbl':postcodes_df,
    'epc_domestic_tbl':epc_domestic_df,
    'epc_non_domestic_tbl': epc_non_domestic_df,
    'ca_la_dft_lookup_tbl':ca_la_dft_lookup_df

}

In [None]:
# %pip install adbc_driver_sqlite

In [None]:
# Write every DataFrame in dfs_dict to data/holding/<table_name>.csv.
holding_dir = Path('data/holding')
# Make sure the target folder exists before writing into it.
holding_dir.mkdir(parents=True, exist_ok=True)
# A plain loop: the writes are side effects, so there is no list of None worth displaying.
for table_name, df in dfs_dict.items():
    df.write_csv(holding_dir / f'{table_name}.csv')

[None, None, None, None, None, None]

In [None]:
def import_dfs(folder_path: str = 'data/holding'):
    """Load every CSV file in ``folder_path`` into a global DataFrame.

    Each ``*.csv`` file directly inside ``folder_path`` is read with
    ``pl.read_csv`` and bound to a global variable named after the file's stem
    (the file name without its extension). Nothing is returned.

    Args:
        folder_path: Folder to search for CSV files.
    """
    namespace = globals()
    for csv_path in Path(folder_path).glob('*.csv'):
        # e.g. data/holding/imd_tbl.csv becomes the global name `imd_tbl`
        namespace[csv_path.stem] = pl.read_csv(csv_path)

In [None]:
import_dfs() # to save running the import routines

In [None]:
get_ca.populate_sqlite(dfs_dict, db_path='data/sqlite/ca_epc.db', overwrite=True)

In [None]:
# NOTE(review): `tables_dict` and `uri` are not defined anywhere in the visible notebook,
# and the previous cell already calls populate_sqlite with dfs_dict - confirm whether
# this cell is still needed.
get_ca.populate_sqlite(tables_dict, uri)

NameError: name 'ca_la_df' is not defined

InternalError: INTERNAL: [SQLite] Failed to create table: table "ca_la_tbl" already exists (executed 'CREATE TABLE main . "ca_la_tbl" ("ladcd" TEXT, "ladnm" TEXT, "cauthcd" TEXT, "cauthnm" TEXT)')

In [None]:
# population weighted centroids for each LSOA - has to be downloaded as there is no straightforward way to query the CA LSOA's from
# open online datasets

lsoa_bng_file_path = get_ca.filter_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson',
 output_file='data/geojson/ca_lsoa_pwc.geojson',
 property_name = 'LSOA21CD',
 ca_lsoa_codes = ca_lsoa_codes)

In [None]:
get_ca.reproject(input_bng_file='data/geojson/ca_lsoa_pwc.geojson',
                  output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson',
                    lsoa_code='LSOA21CD')

'data/geojson/ca_lsoa_pwc_wgs84.geojson'

In [None]:

# Open data/ca_epc.db with recreate=True, then close it again straight away.
# NOTE(review): `Database` is not imported in the notebook's import cell (presumably
# sqlite_utils.Database - confirm), and this path differs from the data/sqlite/ca_epc.db
# used by populate_sqlite and geojson-to-sqlite.
db = Database('data/ca_epc.db', recreate = True)
db.close()

In [None]:
# %pip install geojson-to-sqlite

In [None]:
# import the lsoa PWC file as geojson to the DB setting the primary key to lsoacd
!geojson-to-sqlite data/sqlite/ca_epc.db lsoa_pwc_tbl data/geojson/ca_lsoa_pwc_wgs84.geojson --pk=lsoacd


'geojson-to-sqlite' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
db = Database('data/ca_epc.db', recreate = False)
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);


In [None]:
db['imd_tbl'].create(
{
 'lsoacd': str,
 'lsoanm': str,
 'ladcd': str,
 'ladnm': str,
 'imd': int
 },
 pk = 'lsoacd'
)

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [None]:
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [None]:
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [None]:
print(db.schema)

In [None]:
# db['epc_clean_tbl'].add_foreign_key('postcode', 'postcodes_tbl', 'pcds') # too big causes crash
db['postcodes_tbl'].add_foreign_key('lsoacd', 'imd_tbl', 'lsoacd')
db['postcodes_tbl'].add_foreign_key('ladcd', 'ca_la_tbl', 'ladcd')
db['postcodes_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')
db['imd_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [None]:
# Insert the rows for each table.
# NOTE(review): none of the *_payload names are defined anywhere in the visible
# notebook - confirm where they are built before running this cell.
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
# db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)
db['imd_tbl'].insert_all(imd_tbl_payload)


<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [None]:
db.close()