In [1]:
# Use the project .venv and restart the Jupyter kernel after switching to it; duckdb needs Python 3.10
import polars as pl
import pyarrow
import duckdb
import get_ca_data as get_ca # functions for retrieving CA \ common data
from pathlib import Path


This notebook retrieves all the base data needed for comparison analysis with other Combined Authorities and loads it into a DuckDB database.
SQLite was tried first, but it is slow, is not directly compatible with polars, and does not work well with Datasette because of the size of the data.

In [2]:
# Lookup of local authorities (ladcd) to Combined Authorities; the `ladcd` column drives
# every later filter. NS = North Somerset, which is added to the list by inc_ns=True.
# NOTE(review): 2023 is presumably the lookup/boundary year - confirm in get_ca.get_ca_la_df
ca_la_df = get_ca.get_ca_la_df(2023, inc_ns=True) # include NS
# ca_la_df.glimpse()


In [3]:
# Series of local-authority codes (ladcd) used to filter the downstream datasets
la_list = ca_la_df['ladcd']  # includes North Somerset
# len() gives the count directly; slicing the text of `.shape` ("(54,)"[1:3]) was only
# correct while the count happened to have exactly two digits
f'There are {len(la_list)} Local Authorities in Combined Authorities'


'There are 54 Local Authorities in Combined Authorities'

Get the lookup table that relates the DfT local authority IDs in the Combined Authorities to ONS LA codes.

In [4]:
# DfT road-traffic CSV that is used to build the DfT id <-> ladcd lookup
dft_traffic_csv_url = 'https://storage.googleapis.com/dft-statistics/road-traffic/downloads/data-gov-uk/local_authority_traffic.csv'
ca_la_dft_lookup_df = get_ca.get_ca_la_dft_lookup(dft_csv_path=dft_traffic_csv_url, la_list=la_list)
# ca_la_dft_lookup_df.glimpse()

In [5]:
# Local-authority codes derived from the CA lookup, used to restrict the postcode table
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)

# The postcode lookup is distributed as a zipped CSV; fetch it into the "postcode_lookup" folder
postcode_lookup_url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data"
postcode_file = get_ca.get_zipped_csv_file(
    url=postcode_lookup_url,
    file_folder_name="postcode_lookup",
)

# Postcodes limited to the Combined Authority local authorities
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

In [6]:
# this is hit by the 2000 record limit too so use DL file
# input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
#                       destination_directory = "data\\geojson")

In [7]:
# LSOA codes found in the Combined Authority postcode table; reused below to filter the
# geojson, IMD and tenure data down to Combined Authority LSOAs
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

Run the cell below if the LSOA geodata does not need updating (regenerating it is expensive and crashes); the geodata processing cells after it can then be skipped.

In [8]:
# Locations of the previously generated WGS84 geojson files that are loaded into DuckDB.
# The reproject cells further down assign these same two names when the geodata is regenerated.
reproject_path = 'data/geojson/ca_lsoa_pwc_wgs84.geojson'
reproject_lsoa_poly_path = 'data/geojson/ca_lsoa_poly_wgs84.geojson'

In [17]:
# rename the LSOA features and return the path
# Downloaded population-weighted-centroid (PWC) file for all LSOAs in England and Wales
raw_lsoa_pwc_file = 'data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107 (2).geojson'
# Rename the LSOA features and keep the path of the cleaned file
cleaned_lsoa_pwc_path = get_ca.clean_lsoa_geojson(raw_lsoa_pwc_file, lsoacd='LSOA21CD')

In [18]:
# filter for just the LSOA's in Combined Authorities
# Keep only the centroid features whose `lsoacd` belongs to a Combined Authority
ca_lsoa_pwc_path = get_ca.filter_geojson(
    input_file=cleaned_lsoa_pwc_path,
    output_file='data/geojson/ca_lsoa_pwc.geojson',
    property_name='lsoacd',
    ca_lsoa_codes=ca_lsoa_codes,
)

In [20]:
# Downloaded LSOA 2021 boundary polygons; cleaned the same way as the centroid file
raw_lsoa_poly_file = 'data/geojson/Lower_layer_Super_Output_Areas_2021_EW_BFC_V8_-8407643096148449625.geojson'
cleaned_lsoa_poly_path = get_ca.clean_lsoa_geojson(raw_lsoa_poly_file, lsoacd='LSOA21CD')

In [19]:
# reproject to WGS84:4326 as default from ONS is 27700
# The ONS default projection is EPSG:27700, so convert the filtered centroids to WGS84 (EPSG:4326)
reproject_path = get_ca.reproject(
    ca_lsoa_pwc_path,
    output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson',
    lsoa_code='lsoacd',
)

In [21]:
# filter for just the LSOA's polys in Combined Authorities
# Keep only the polygon features whose `lsoacd` belongs to a Combined Authority
ca_lsoa_poly_path = get_ca.filter_geojson(
    input_file=cleaned_lsoa_poly_path,
    output_file='data/geojson/ca_lsoa_poly.geojson',
    property_name='lsoacd',
    ca_lsoa_codes=ca_lsoa_codes,
)

In [22]:
# Reproject the filtered polygons to WGS84 and remember where the result was written
reproject_lsoa_poly_path = get_ca.reproject(
    ca_lsoa_poly_path,
    output_wgs84_file='data/geojson/ca_lsoa_poly_wgs84.geojson',
    lsoa_code='lsoacd',
)

In [9]:
# CSV export of the "Index of Multiple Deprivation (Dec 2019) lookup in England" dataset
# on the ONS Open Geography portal
url_imd_csv = "https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::index-of-multiple-deprivation-dec-2019-lookup-in-england.csv?where=1=1&outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"

In [10]:
imd_df_raw = pl.read_csv(url_imd_csv)

In [11]:
# Mapping of raw IMD column names to cleaned names, applied with .rename() in the next cell.
# NOTE(review): presumably lower-cases the names and, with rm_numbers=True, strips the
# digits (the next cell relies on `lsoacd` and `fid` existing) - confirm in get_ca.get_rename_dict
rename_dict_imd = get_ca.get_rename_dict(imd_df_raw, get_ca.remove_numbers, rm_numbers = True)

In [12]:
# Clean the column names, discard the `fid` column and keep Combined Authority LSOAs only
imd_renamed = imd_df_raw.rename(rename_dict_imd)
imd_df = (
    imd_renamed
    .select(pl.exclude('fid'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
)

# Number of Combined Authority LSOA codes that did not get a row in the IMD table
n_unmatched_lsoas = len(ca_lsoa_codes) - imd_df.height

In [13]:
del imd_df_raw

In [14]:
# https://geoportal.statistics.gov.uk/datasets/lsoa-dec-2021-pwc-for-england-and-wales/explore

In [15]:
# Column names and polars dtypes used when scanning the non-domestic EPC certificates CSV.
# LODGEMENT_DATETIME is read as text and parsed to a datetime in the scan cell.
# NOTE(review): UPRN is Utf8 here but Int64 in cols_schema_dom, so the two EPC tables end
# up with different uprn types - confirm this is intended.
cols_schema_nondom = {
    'LMK_KEY': pl.Utf8,
    'POSTCODE': pl.Utf8,
    'BUILDING_REFERENCE_NUMBER': pl.Int64,
    'ASSET_RATING': pl.Int64,
    'ASSET_RATING_BAND': pl.Utf8,
    'PROPERTY_TYPE': pl.Utf8,
    'LOCAL_AUTHORITY': pl.Utf8,
    'CONSTITUENCY': pl.Utf8,
    'TRANSACTION_TYPE': pl.Utf8,
    'STANDARD_EMISSIONS': pl.Float64,
    'TYPICAL_EMISSIONS': pl.Float64,
    'TARGET_EMISSIONS': pl.Float64,
    'BUILDING_EMISSIONS': pl.Float64,
    'BUILDING_LEVEL': pl.Int64,
    'RENEWABLE_SOURCES': pl.Utf8,
    'LODGEMENT_DATETIME': pl.Utf8,
    'UPRN': pl.Utf8
    }

In [16]:
# Lazy scan of the single-file non-domestic certificates using the explicit schema
nondom_certs_lazy = pl.scan_csv(
    'data/all-non-domestic-certificates-single-file/certificates.csv',
    schema=cols_schema_nondom,
)

# strict=False: lodgement values that do not match the format are not treated as errors
lodgement_as_datetime = pl.col('LODGEMENT_DATETIME').str.to_datetime(
    format='%Y-%m-%d %H:%M:%S',
    strict=False,
)

# Restrict to Combined Authority local authorities, then sort by UPRN and lodgement time
# so that taking the last row of each UPRN group is meant to keep its latest certificate
epc_non_domestic = (
    nondom_certs_lazy
    .filter(pl.col('LOCAL_AUTHORITY').is_in(la_list.to_list()))
    .with_columns(lodgement_as_datetime)
    .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
    .group_by('UPRN')
    .last()
    .collect()
)

In [17]:
# Column names and polars dtypes for the domestic EPC certificates
# (passed to get_ca.ingest_certs as `cols_schema`); key order is unchanged.
cols_schema_dom = {
    # identifiers, location and lodgement details
    'LMK_KEY': pl.Utf8,
    'POSTCODE': pl.Utf8,
    'LOCAL_AUTHORITY': pl.Utf8,
    'PROPERTY_TYPE': pl.Utf8,
    'LODGEMENT_DATETIME': pl.Utf8,
    'TRANSACTION_TYPE': pl.Utf8,
    'TENURE': pl.Utf8,
    # fabric, heating and rating descriptors (all read as text)
    'MAINS_GAS_FLAG': pl.Utf8,
    'HOT_WATER_ENERGY_EFF': pl.Utf8,
    'WINDOWS_DESCRIPTION': pl.Utf8,
    'WINDOWS_ENERGY_EFF': pl.Utf8,
    'WALLS_DESCRIPTION': pl.Utf8,
    'WALLS_ENERGY_EFF': pl.Utf8,
    'ROOF_DESCRIPTION': pl.Utf8,
    'ROOF_ENERGY_EFF': pl.Utf8,
    'MAINHEAT_DESCRIPTION': pl.Utf8,
    'MAINHEAT_ENERGY_EFF': pl.Utf8,
    'MAINHEAT_ENV_EFF': pl.Utf8,
    'MAIN_HEATING_CONTROLS': pl.Utf8,
    'MAINHEATCONT_DESCRIPTION': pl.Utf8,
    'MAINHEATC_ENERGY_EFF': pl.Utf8,
    'MAIN_FUEL': pl.Utf8,
    'SOLAR_WATER_HEATING_FLAG': pl.Utf8,
    'CONSTRUCTION_AGE_BAND': pl.Utf8,
    'CURRENT_ENERGY_RATING': pl.Utf8,
    'POTENTIAL_ENERGY_RATING': pl.Utf8,
    'CURRENT_ENERGY_EFFICIENCY': pl.Utf8,
    'POTENTIAL_ENERGY_EFFICIENCY': pl.Utf8,
    'BUILT_FORM': pl.Utf8,
    'CONSTITUENCY': pl.Utf8,
    'FLOOR_DESCRIPTION': pl.Utf8,
    # numeric measures
    'ENVIRONMENT_IMPACT_CURRENT': pl.Int64,
    'ENVIRONMENT_IMPACT_POTENTIAL': pl.Int64,
    'ENERGY_CONSUMPTION_CURRENT': pl.Int64,
    'ENERGY_CONSUMPTION_POTENTIAL': pl.Int64,
    'CO2_EMISS_CURR_PER_FLOOR_AREA': pl.Int64,
    'CO2_EMISSIONS_CURRENT': pl.Float64,
    'CO2_EMISSIONS_POTENTIAL': pl.Float64,
    'LIGHTING_COST_CURRENT': pl.Int64,
    'LIGHTING_COST_POTENTIAL': pl.Int64,
    'HEATING_COST_CURRENT': pl.Int64,
    'HEATING_COST_POTENTIAL': pl.Int64,
    'HOT_WATER_COST_CURRENT': pl.Int64,
    'HOT_WATER_COST_POTENTIAL': pl.Int64,
    'TOTAL_FLOOR_AREA': pl.Float64,
    'NUMBER_HABITABLE_ROOMS': pl.Int64,
    'NUMBER_HEATED_ROOMS': pl.Int64,
    'PHOTO_SUPPLY': pl.Float64,
    # property keys
    'UPRN': pl.Int64,
    'BUILDING_REFERENCE_NUMBER': pl.Int64,
}

In [18]:
# Ingest the domestic certificates found under root_dir for the Combined Authority
# local authorities, typed according to cols_schema_dom
epc_domestic = get_ca.ingest_certs(
    la_list=la_list,
    cols_schema=cols_schema_dom,
    root_dir='data/all-domestic-certificates',
)
epc_domestic.glimpse()


Rows: 4778169
Columns: 50
$ UPRN                                   <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                                <str> '75436a7d8ce6132957c7c841b1bcec12a5749b472666c674b982b87b7b78d975', '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505'
$ POSTCODE                               <str> 'BL5 3FS', 'BL3 1PS', 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ LOCAL_AUTHORITY                        <str> 'E08000001', 'E08000001', 'E0

In [19]:
# Derive date parts from the lodgement timestamp and keep only rows with a unique UPRN.
#
# The original chain ended with `.drop('lodgement_datetime')`. At this stage the column is
# still named 'LODGEMENT_DATETIME', so that drop targeted a column that does not exist: it
# was a silent no-op when the recorded outputs were produced and raises on polars versions
# that validate drop targets. It is removed so the frame handed to get_ca.wrangle_epc is
# the same as in the recorded run.
certs_df = (
    epc_domestic
    # calendar date of lodgement (time-of-day discarded)
    .with_columns([pl.col('LODGEMENT_DATETIME')
                   .dt.date()
                   .alias('date')])
    # year / month are taken from the date; `date` itself is stored as text
    .with_columns([pl.col('date').dt.year().alias('year'),
                   pl.col('date').dt.month().cast(pl.Int16).alias('month'),
                   pl.col('date').cast(pl.Utf8)])
    # is_duplicated() flags every row whose UPRN occurs more than once, so all copies of a
    # repeated UPRN (and repeated nulls) are removed, not just the extras
    .filter(~pl.col('UPRN').is_duplicated())
)

In [20]:
# Final domestic EPC frame that is loaded into DuckDB as epc_domestic_tbl.
# The glimpse output below shows the wrangled frame has lower-case column names.
epc_domestic_df = get_ca.wrangle_epc(certs_df = certs_df)
epc_domestic_df.glimpse()

Rows: 4776233
Columns: 52
$ uprn                          <i64> 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716, 10001239717, 10001239718
$ lmk_key                       <str> '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505', '1006031609732013112914032764978690', 'de7a02fb12a4ef27399bd7054917b4fd2c166ac4a3d7da461f4f4b7876c8a664'
$ postcode                      <str> 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ local_authority               <str> 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E08000001', 'E0800000

In [21]:
# Same wrangling routine as for the domestic certificates, applied to the non-domestic frame.
# Note that uprn stays a string column here because cols_schema_nondom reads UPRN as Utf8.
epc_non_domestic_df = get_ca.wrangle_epc(epc_non_domestic)
epc_non_domestic_df.glimpse()

Rows: 162851
Columns: 19
$ uprn                      <str> '10023053058', '100012687166', '32176581', '45159707', '100110747621', '100110779122', '10013314398', '39037016', '100071420623', '100110747674'
$ lmk_key                   <str> '140522430962019020918095246020310', '13905181032019101610321999900496', '93940850142014072316320817200230', '0b932a1e8f3229f5fab28caacf82aff6137b83c3982acaa923b23de5bd81ba86', '36262460022017092118521403780050', '97847160962014121515481567920260', '65627902752012060813421608009206', '45082700202010120109090701809990', '9275c9a341eaa5b199d86fa8a4c3c63dae746adbbf9c4d4c4399652d4cb13ea1', '112413790062016051718135917950950'
$ postcode                  <str> 'M90 5UJ', 'M24 1SN', 'B43 6PB', 'SR1 2NG', 'DL14 7EH', 'TS17 0RR', 'DL1 1YN', 'WA12 9BW', 'B11 1AX', 'DL14 7EQ'
$ building_reference_number <i64> 144536660000, 864949200000, 214985210000, 10004529300, 507466340000, 666265770000, 261428500000, 822210800000, 726283000001, 519497700000
$ asset_rating    

In [22]:
# The wrangled *_df frames are what gets loaded; drop the large intermediates to free memory
del epc_domestic, certs_df, epc_non_domestic

Collect the postcode centroids at this point (after the large intermediate EPC frames have been deleted) to avoid a crash.

In [23]:
# Lazy scan of the postcode centroids CSV; nothing is read until .collect() is called.
# Explicit dtypes are supplied for RU11IND (text) and the x / y coordinates (floats).
pc_centroids_q = pl.scan_csv('data/postcode_centroids.csv',
                             dtypes={
                                 'RU11IND': pl.Utf8,
                                 'x': pl.Float64,
                                 'y': pl.Float64
                                 })
# pc_centroids_df.head()

In [24]:
# Materialise a single row only: it is passed to get_rename_dict below to build the
# column-rename mapping without loading the whole file
pc_min_df = pc_centroids_q.head(1).collect()

In [27]:
# Old -> new column names for the centroids table. With rm_numbers=False the digits are
# kept (e.g. 'LSOA21' -> 'lsoa21' in the output shown below).
pc_rename_dict = get_ca.get_rename_dict(pc_min_df, get_ca.remove_numbers, rm_numbers=False)

In [28]:
pc_rename_dict

{'OBJECTID': 'objectid',
 'PCD': 'pcd',
 'PCD2': 'pcd2',
 'PCDS': 'pcds',
 'DOINTR': 'dointr',
 'DOTERM': 'doterm',
 'USERTYPE': 'usertype',
 'OSEAST1M': 'oseast1m',
 'OSNRTH1M': 'osnrth1m',
 'OSGRDIND': 'osgrdind',
 'OA11': 'oa11',
 'CTY': 'cty',
 'CED': 'ced',
 'LAUA': 'laua',
 'WARD': 'ward',
 'HLTHAU': 'hlthau',
 'NHSER': 'nhser',
 'CTRY': 'ctry',
 'RGN': 'rgn',
 'PCON': 'pcon',
 'EER': 'eer',
 'TECLEC': 'teclec',
 'TTWA': 'ttwa',
 'PCT': 'pct',
 'ITL': 'itl',
 'NPARK': 'npark',
 'LSOA11': 'lsoa11',
 'MSOA11': 'msoa11',
 'WZ11': 'wz11',
 'SICBL': 'sicbl',
 'BUA11': 'bua11',
 'BUASD11': 'buasd11',
 'RU11IND': 'ru11ind',
 'OAC11': 'oac11',
 'LAT': 'lat',
 'LONG': 'long',
 'LEP1': 'lep1',
 'LEP2': 'lep2',
 'PFA': 'pfa',
 'IMD': 'imd',
 'CALNCV': 'calncv',
 'ICB': 'icb',
 'OA21': 'oa21',
 'LSOA21': 'lsoa21',
 'MSOA21': 'msoa21',
 'BUA22': 'bua22',
 'x': 'x',
 'y': 'y'}

In [29]:
# Keep the centroids whose local authority (LAUA) is in a Combined Authority, apply the
# lower-case column names, and only then materialise the result
pc_centroids_df = (
    pc_centroids_q
    .filter(pl.col('LAUA').is_in(la_list))
    .rename(mapping=pc_rename_dict)
    .collect()
)
# pc_centroids_df.glimpse()

Tenure: table TS054 from NOMIS, slightly cleaned beforehand (the CSV header block was removed).

In [30]:
# Tenure counts per LSOA, restricted to Combined Authority LSOAs.
# Column names are normalised in three passes: spaces -> underscores, lower-case, colons removed.
tenure_lazy = pl.scan_csv('data/ts054_tenure_nomis.csv')
ca_tenure_lsoa = (
    tenure_lazy
    .select(pl.all().name.map(lambda col_name: col_name.replace(' ', '_')))
    .select(pl.all().name.to_lowercase())
    .select(pl.all().name.map(lambda col_name: col_name.replace(':', '')))
    # the first nine characters of the `lsoa` column are used as the LSOA code
    .with_columns(pl.col('lsoa').str.slice(0, 9).alias('lsoacd'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
    .collect()
)

In [31]:
# ca_tenure_lsoa.glimpse()

In [32]:
# postcodes_df is not loaded into DuckDB (its CREATE TABLE statement is commented out), so free it.
# NOTE(review): the SQLite section's dfs_dict still references postcodes_df and will raise
# NameError if it is run after this cell.
del postcodes_df

Load the data into a DuckDB database

In [33]:
con = duckdb.connect('data/ca_epc.duckdb')

In [34]:
# Statements that build every table except the domestic EPCs, each followed by the unique
# index on its key column where one is wanted. The `FROM <name>` sources that are not
# ST_Read(...) are the polars DataFrames of the same name defined earlier in the notebook.
# File paths are embedded as single-quoted SQL string literals (double quotes are
# identifier syntax in SQL).
load_statements = [
    f"CREATE OR REPLACE TABLE lsoa_pwc_tbl AS SELECT * FROM ST_Read('{reproject_path}')",
    f"CREATE OR REPLACE TABLE lsoa_poly_tbl AS SELECT * FROM ST_Read('{reproject_lsoa_poly_path}')",
    'CREATE UNIQUE INDEX lsoacd_poly_idx ON lsoa_poly_tbl (lsoacd)',
    'CREATE UNIQUE INDEX lsoacd_pwc_idx ON lsoa_pwc_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_tenure_lsoa_tbl AS SELECT * FROM ca_tenure_lsoa',
    'CREATE UNIQUE INDEX lsoacd_tenure_idx ON ca_tenure_lsoa_tbl (lsoacd)',
    'CREATE OR REPLACE TABLE ca_la_tbl AS SELECT * FROM ca_la_df',
    'CREATE OR REPLACE TABLE imd_tbl AS SELECT * FROM imd_df',
    # 'CREATE OR REPLACE TABLE postcodes_tbl AS SELECT * FROM postcodes_df',
    # 'CREATE UNIQUE INDEX postcode_idx ON postcodes_tbl (postcode)',
    'CREATE OR REPLACE TABLE postcode_centroids_tbl AS SELECT * FROM pc_centroids_df',
    'CREATE UNIQUE INDEX postcode_centroids_idx ON postcode_centroids_tbl (PCDS)',
    'CREATE OR REPLACE TABLE epc_non_domestic_tbl AS SELECT * FROM epc_non_domestic_df',
    'CREATE UNIQUE INDEX uprn_nondom_idx ON epc_non_domestic_tbl (uprn)',
    'CREATE OR REPLACE TABLE ca_la_dft_lookup_tbl AS SELECT * FROM ca_la_dft_lookup_df',
    'CREATE UNIQUE INDEX ca_la_dft_lookup_idx ON ca_la_dft_lookup_tbl (ladcd)',
]

# ST_Read comes from the spatial extension; installing/loading it is not part of the data
# load, so do it before the transaction is opened
con.execute('INSTALL spatial;')
con.execute('LOAD spatial;')

try:
    con.execute("BEGIN TRANSACTION;")
    for statement in load_statements:
        con.execute(statement)
    con.execute("COMMIT;")
except Exception as e:
    # If an error occurs, rollback the transaction
    con.execute("ROLLBACK;")
    print(f"Transaction rolled back due to an error: {e}")
    # Re-raise so the notebook stops here instead of carrying on with a database that is
    # missing these tables
    raise

The domestic EPCs have to be loaded outside the transaction block, otherwise the load fails through lack of memory.

In [35]:
# Load the domestic EPC frame (the largest table) on its own, then index it by uprn.
# The unique index relies on the duplicate UPRNs having been filtered out when certs_df was built.
con.execute('CREATE OR REPLACE TABLE epc_domestic_tbl AS SELECT * FROM epc_domestic_df')
con.execute('CREATE UNIQUE INDEX uprn_idx ON epc_domestic_tbl (uprn)')

<duckdb.duckdb.DuckDBPyConnection at 0x1e117ca23f0>

In [36]:
con.close()

In [44]:
# con = duckdb.connect('data/ca_epc_test.duckdb')

# con.execute(command_list[5])

The cell below is superseded by the transaction above.

In [88]:
# out = get_ca.load_data(command_list=command_list, db_path='data/ca_epc.duckdb', overwrite = True)

Introspect Database


In [85]:
# Reopen the database file written above; the cells that follow only run read queries against it
con = duckdb.connect('data/ca_epc.duckdb')

In [3]:
con.sql("SHOW ALL TABLES;")

┌──────────┬─────────┬──────────────────────┬──────────────────────┬───────────────────────────────────────┬───────────┐
│ database │ schema  │         name         │     column_names     │             column_types              │ temporary │
│ varchar  │ varchar │       varchar        │      varchar[]       │               varchar[]               │  boolean  │
├──────────┼─────────┼──────────────────────┼──────────────────────┼───────────────────────────────────────┼───────────┤
│ ca_epc   │ main    │ ca_la_dft_lookup_tbl │ [dft_la_id, ladcd,…  │ [BIGINT, VARCHAR, BIGINT]             │ false     │
│ ca_epc   │ main    │ ca_la_tbl            │ [ladcd, ladnm, cau…  │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR]  │ false     │
│ ca_epc   │ main    │ ca_tenure_lsoa_tbl   │ [lsoa, total, owne…  │ [VARCHAR, BIGINT, BIGINT, BIGINT, B…  │ false     │
│ ca_epc   │ main    │ epc_domestic_tbl     │ [uprn, lmk_key, po…  │ [BIGINT, VARCHAR, VARCHAR, VARCHAR,…  │ false     │
│ ca_epc   │ main    │ epc_non_d

In [4]:
con.sql('DESCRIBE epc_domestic_tbl')

┌───────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│        column_name        │ column_type │  null   │   key   │ default │  extra  │
│          varchar          │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ uprn                      │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ lmk_key                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ postcode                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ local_authority           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ property_type             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ transaction_type          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ tenure                    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ mains_gas_flag            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NU

In [33]:
# Column types of the non-domestic table; note uprn is VARCHAR here but BIGINT in epc_domestic_tbl
con.sql('DESCRIBE epc_non_domestic_tbl')

┌───────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│        column_name        │ column_type │  null   │   key   │ default │  extra  │
│          varchar          │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ uprn                      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ lmk_key                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ postcode                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ building_reference_number │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ asset_rating              │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ asset_rating_band         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ property_type             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ local_authority           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NU

In [6]:
con.sql('SELECT COUNT(*) num_rows FROM lsoa_pwc_tbl')

┌──────────┐
│ num_rows │
│  int64   │
├──────────┤
│     9585 │
└──────────┘

In [91]:
con.close()

The section below loads the data into SQLite.

In [26]:
# dictionary of all the dfs to be imported to sqlite - for datasette
# Maps each target SQLite table name to the polars DataFrame that populates it.
# NOTE(review): postcodes_df is deleted earlier in the notebook (before the DuckDB load),
# so on a top-to-bottom run this cell raises NameError unless the frames are first
# restored from data/holding with import_dfs().
dfs_dict = {
    'ca_la_tbl':ca_la_df,
    'imd_tbl':imd_df,
    'postcodes_tbl':postcodes_df,
    'epc_domestic_tbl':epc_domestic_df,
    'epc_non_domestic_tbl': epc_non_domestic_df,
    'ca_la_dft_lookup_tbl':ca_la_dft_lookup_df

}

In [4]:
# %pip install adbc_driver_sqlite

In [23]:
# Dump every frame to data/holding/<table_name>.csv so a later session can reload them
# with import_dfs() instead of re-running the import routines.
# A plain loop is used because the writes are pure side effects (a list comprehension only
# produced a list of None as cell output); the folder is created first so a fresh checkout works.
holding_dir = Path('data/holding')
holding_dir.mkdir(parents=True, exist_ok=True)
for table_name, table_df in dfs_dict.items():
    table_df.write_csv(holding_dir / f'{table_name}.csv')

[None, None, None, None, None, None]

In [3]:
def import_dfs(folder_path: str = 'data/holding'):
    """Reload every CSV in ``folder_path`` as a polars DataFrame in the notebook namespace.

    Each ``<name>.csv`` is read with ``pl.read_csv`` and bound to the global
    variable ``<name>``, replacing any existing global of that name.
    Nothing is returned.
    """
    namespace = globals()
    for csv_path in Path(folder_path).glob('*.csv'):
        # the file stem (file name without extension) becomes the variable name
        namespace[csv_path.stem] = pl.read_csv(csv_path)

In [6]:
import_dfs() # to save running the import routines

In [28]:
# Write every frame in dfs_dict to the SQLite database at db_path.
# NOTE(review): overwrite=True presumably replaces existing tables - confirm in get_ca.populate_sqlite
get_ca.populate_sqlite(dfs_dict, db_path='data/sqlite/ca_epc.db', overwrite=True)

In [10]:
# NOTE(review): stale cell - `tables_dict` and `uri` are not defined anywhere in this notebook
# and the recorded outputs below are errors (NameError / table already exists).
get_ca.populate_sqlite(tables_dict, uri)

NameError: name 'ca_la_df' is not defined

InternalError: INTERNAL: [SQLite] Failed to create table: table "ca_la_tbl" already exists (executed 'CREATE TABLE main . "ca_la_tbl" ("ladcd" TEXT, "ladnm" TEXT, "cauthcd" TEXT, "cauthnm" TEXT)')

In [9]:
# population weighted centroids for each LSOA - has to be downloaded as there is no straightforward way to query the CA LSOA's from
# open online datasets

# Earlier (SQLite-era) variant of the centroid filter: works on the raw download and
# therefore matches on the original 'LSOA21CD' property
raw_lsoa_pwc_download = 'data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson'
lsoa_bng_file_path = get_ca.filter_geojson(
    raw_lsoa_pwc_download,
    output_file='data/geojson/ca_lsoa_pwc.geojson',
    property_name='LSOA21CD',
    ca_lsoa_codes=ca_lsoa_codes,
)

In [10]:
# Reproject the filtered centroid file to WGS84; the returned output path is the cell's displayed value
get_ca.reproject(
    input_bng_file='data/geojson/ca_lsoa_pwc.geojson',
    output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson',
    lsoa_code='LSOA21CD',
)

'data/geojson/ca_lsoa_pwc_wgs84.geojson'

In [9]:

# Create the SQLite file from scratch (recreate = True) and close it again straight away.
# NOTE(review): `Database` is not imported in the notebook's import cell - presumably
# sqlite_utils.Database; add the import before running this section.
db = Database('data/ca_epc.db', recreate = True)
db.close()

In [62]:
# %pip install geojson-to-sqlite

In [60]:
# import the lsoa PWC file as geojson to the DB setting the primary key to lsoacd
!geojson-to-sqlite data/sqlite/ca_epc.db lsoa_pwc_tbl data/geojson/ca_lsoa_pwc_wgs84.geojson --pk=lsoacd


'geojson-to-sqlite' is not recognized as an internal or external command,
operable program or batch file.


In [11]:
# Reopen the existing SQLite file without recreating it and print its current schema
db = Database('data/ca_epc.db', recreate = False)
print(db.schema)

CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER,
   [FID] INTEGER,
   [GlobalID] TEXT,
   [lsoacd] TEXT PRIMARY KEY,
   [geometry] TEXT
);


In [12]:
# Create the empty IMD table with explicit column types, keyed on the LSOA code
db['imd_tbl'].create(
{
 'lsoacd': str,
 'lsoanm': str,
 'ladcd': str,
 'ladnm': str,
 'imd': int
 },
 pk = 'lsoacd'
)

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [13]:
# Create the empty local-authority / combined-authority lookup table, keyed on the LA code
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [14]:
# Create the empty postcode lookup table, keyed on the postcode (pcds)
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [None]:
print(db.schema)

In [16]:
# Link the lookup tables together: postcodes -> IMD / local authority / LSOA centroid,
# and IMD -> LSOA centroid. Each call is add_foreign_key(column, other_table, other_column).
# db['epc_clean_tbl'].add_foreign_key('postcode', 'postcodes_tbl', 'pcds') # too big causes crash
db['postcodes_tbl'].add_foreign_key('lsoacd', 'imd_tbl', 'lsoacd')
db['postcodes_tbl'].add_foreign_key('ladcd', 'ca_la_tbl', 'ladcd')
db['postcodes_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')
db['imd_tbl'].add_foreign_key('lsoacd', 'lsoa_pwc_tbl', 'lsoacd')

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [17]:
# Bulk-insert the rows for each table.
# NOTE(review): the *_payload variables are not defined anywhere in this notebook; they must
# be rebuilt (presumably as lists of row dicts from the matching DataFrames) before this runs.
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
# db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)
db['imd_tbl'].insert_all(imd_tbl_payload)


<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [None]:
db.close()