In [1]:
from sqlite_utils import Database
import polars as pl
from datetime import datetime
import os
# import json
import get_ca_data as get_ca # functions for retrieving CA \ common data

In [2]:
epc_clean = (pl.read_csv('data/epc_subset_polars_last.csv', n_rows=400))


In [3]:
epc_names = get_ca.get_rename_dict(epc_clean, get_ca.remove_numbers)

epc_clean_df = (
    epc_clean
    .with_columns([pl.col('LODGEMENT_DATETIME')
                   .str.to_datetime()
                   .dt.date()
                   .alias('date')])
    .with_columns([pl.col('date').dt.year().alias('year'),
                   pl.col('date').dt.month().cast(pl.Int16).alias('month'),
                   pl.col('date').cast(pl.Utf8)])
    .rename(epc_names)
    .select(pl.exclude('lodgement_datetime'))
    
)
# epc_clean_df.glimpse()

In [4]:
ca_la_df = get_ca.get_ca_la_df(year = 2023)
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)
postcode_file = get_ca.get_zipped_csv_file(url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data",
                      file_folder_name = "postcode_lookup")
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

In [5]:
# postcodes_df.glimpse()

In [6]:
# ca_la_df.select(pl.col('ladcd')).unique()

In [7]:
# this is hit by the 2000 record limit too so use DL file
# input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
#                       destination_directory = "data\\geojson")

In [8]:
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

In [9]:
url_imd_csv = "https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::index-of-multiple-deprivation-dec-2019-lookup-in-england.csv?where=1=1&outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"

In [10]:
imd_df_raw = pl.read_csv(url_imd_csv)

In [11]:
rename_dict_imd = get_ca.get_rename_dict(imd_df_raw, get_ca.remove_numbers, rm_numbers = True)

In [12]:
imd_df = (
    imd_df_raw
    .rename(rename_dict_imd)
    .select(pl.all().exclude('fid'))
    .filter(pl.col('lsoacd').is_in(ca_lsoa_codes))
)
n_unmatched_lsoas = len(ca_lsoa_codes) - imd_df.shape[0]

In [13]:

db = Database('data/ca_epc.db', recreate=False)

In [15]:
db['imd_tbl'].create(
{
 'lsoacd': str,
 'lsoanm': str,
 'ladcd': str,
 'ladnm': str,
 'imd': int
 },
 pk = 'lsoacd'
)

<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [16]:
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [17]:
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [18]:
db['epc_clean_tbl'].create(
{'uprn': int,
 'lmk_key': str,
 'postcode': str,
 'current_energy_rating': str,
 'local_authority': str,
 'property_type': str,
 'transaction_type': str,
 'environment_impact_current': int,
 'co2_emissions_current': float,
 'tenure': str,
 'date': str,
 'year': int,
 'month': int},
 pk = 'lmk_key'
)

<Table epc_clean_tbl (uprn, lmk_key, postcode, current_energy_rating, local_authority, property_type, transaction_type, environment_impact_current, co2_emissions_current, tenure, date, year, month)>

In [19]:
db['epc_clean_tbl'].add_foreign_key('postcode', 'postcodes_tbl', 'pcds')
db['postcodes_tbl'].add_foreign_key('lsoacd', 'imd_tbl', 'lsoacd')
db['postcodes_tbl'].add_foreign_key('ladcd', 'ca_la_tbl', 'ladcd')

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [22]:
print(db['postcodes_tbl'].schema)

CREATE TABLE "postcodes_tbl" (
   [pcds] TEXT PRIMARY KEY,
   [lsoacd] TEXT REFERENCES [imd_tbl]([lsoacd]),
   [msoacd] TEXT,
   [ladcd] TEXT REFERENCES [ca_la_tbl]([ladcd]),
   [ladnm] TEXT
)


In [23]:
db.close()


In [9]:
lsoa_bng_file_path = get_ca.filter_geojson('data/geojson/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022_-7534040603619445107.geojson',
 output_file='data/geojson/ca_lsoa_pwc.geojson',
 property_name = 'LSOA21CD',
 ca_lsoa_codes = ca_lsoa_codes)

In [14]:
get_ca.reproject(input_bng_file='data/geojson/ca_lsoa_pwc.geojson', output_wgs84_file='data/geojson/ca_lsoa_pwc_wgs84.geojson')

'data/geojson/ca_lsoa_pwc_wgs84.geojson'

In [None]:
# import the lsoa PWC file as geojson to the DB
# geojson-to-sqlite ca_epc.db lsoa_pwc_tbl geojson/ca_lsoa_pwc_wgs84.geojson


In [24]:
epc_clean_tbl_payload = epc_clean_df.to_dicts()
ca_la_tbl_payload = ca_la_df.to_dicts()
postcodes_tbl_payload = postcodes_df.to_dicts()
imd_tbl_payload = imd_df.to_dicts()

In [27]:
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)
db['imd_tbl'].insert_all(imd_tbl_payload)


<Table imd_tbl (lsoacd, lsoanm, ladcd, ladnm, imd)>

In [47]:
for row in db['epc_clean_tbl'].rows:
    print(row)

{'uprn': None, 'lmk_key': '632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2', 'postcode': 'BL5 3WB', 'current_energy_rating': 'B', 'local_authority': 'E08000001', 'property_type': 'Flat', 'transaction_type': 'marketed sale', 'environment_impact_current': 87, 'co2_emissions_current': 0.8, 'tenure': 'Owner-occupied', 'date': '2023-08-31', 'year': 2023, 'month': 8}
{'uprn': 4210003725, 'lmk_key': '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', 'postcode': 'BL3 1PS', 'current_energy_rating': 'C', 'local_authority': 'E08000001', 'property_type': 'House', 'transaction_type': 'marketed sale', 'environment_impact_current': 74, 'co2_emissions_current': 3.0, 'tenure': 'Owner-occupied', 'date': '2021-04-03', 'year': 2021, 'month': 4}
{'uprn': 10001239702, 'lmk_key': '69223609222016052017263804808356', 'postcode': 'BL1 5WA', 'current_energy_rating': 'C', 'local_authority': 'E08000001', 'property_type': 'House', 'transaction_type': 'marketed sale', 'environment_i

In [28]:
print(db.schema)

CREATE TABLE [imd_tbl] (
   [lsoacd] TEXT PRIMARY KEY,
   [lsoanm] TEXT,
   [ladcd] TEXT,
   [ladnm] TEXT,
   [imd] INTEGER
);
CREATE TABLE [ca_la_tbl] (
   [ladcd] TEXT PRIMARY KEY,
   [ladnm] TEXT,
   [cauthcd] TEXT,
   [cauthnm] TEXT
);
CREATE TABLE "epc_clean_tbl" (
   [uprn] INTEGER,
   [lmk_key] TEXT PRIMARY KEY,
   [postcode] TEXT REFERENCES [postcodes_tbl]([pcds]),
   [current_energy_rating] TEXT,
   [local_authority] TEXT,
   [property_type] TEXT,
   [transaction_type] TEXT,
   [environment_impact_current] INTEGER,
   [co2_emissions_current] FLOAT,
   [tenure] TEXT,
   [date] TEXT,
   [year] INTEGER,
   [month] INTEGER
);
CREATE TABLE "postcodes_tbl" (
   [pcds] TEXT PRIMARY KEY,
   [lsoacd] TEXT REFERENCES [imd_tbl]([lsoacd]),
   [msoacd] TEXT,
   [ladcd] TEXT REFERENCES [ca_la_tbl]([ladcd]),
   [ladnm] TEXT
);
CREATE TABLE [lsoa_pwc_tbl] (
   [id] INTEGER PRIMARY KEY,
   [FID] INTEGER,
   [LSOA21CD] TEXT,
   [GlobalID] TEXT,
   [geometry] TEXT
);


In [49]:
db.close()