In [1]:
from sqlite_utils import Database
import polars as pl
from datetime import datetime
import os
# import json
import get_ca_data as get_ca # functions for retrieving CA \ common data

In [2]:
epc_clean = (pl.read_csv('data/epc_subset_polars_last.csv', n_rows=400))


In [3]:
epc_names = get_ca.get_rename_dict(epc_clean)

epc_clean_df = (
    epc_clean
    .with_columns([pl.col('LODGEMENT_DATETIME')
                   .str.to_datetime()
                   .dt.date()
                   .alias('date')])
    .with_columns([pl.col('date').dt.year().alias('year'),
                   pl.col('date').dt.month().cast(pl.Int16).alias('month'),
                   pl.col('date').cast(pl.Utf8)])
    .rename(epc_names)
    .select(pl.exclude('lodgement_datetime'))
    
)
epc_clean_df.glimpse()

Rows: 400
Columns: 13
$ uprn                       <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ lmk_key                    <str> '632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2', '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', '69223609222016052017263804808356', '1099459139642014100812510823040588', '876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d', 'e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15', '1019963559062013100615514564448457', '1485207013512016100421133096069648', '1389662563632016072714480291278400', 'aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505'
$ postcode                   <str> 'BL5 3WB', 'BL3 1PS', 'BL1 5WA', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP', 'BL3 5NP'
$ current_energy_rating      <str> 'B', 'C', 'C', 'D', 'C', 'C', 'C', 'D', 'E', 'D'
$ local_authority            <st

In [4]:
ca_la_df = get_ca.get_ca_la_df(year = 2023)
ca_la_codes = get_ca.get_ca_la_codes(ca_la_df)
postcode_file = get_ca.get_postcode_file(url = "https://www.arcgis.com/sharing/rest/content/items/3770c5e8b0c24f1dbe6d2fc6b46a0b18/data",
                      destination_directory = "data\\postcode_lookup")
postcodes_df = get_ca.get_postcode_df(postcode_file, ca_la_codes)

In [5]:
postcodes_df.glimpse()

Rows: 582364
Columns: 5
$ pcds   <str> 'B1 1AA', 'B1 1AD', 'B1 1AG', 'B1 1AH', 'B1 1AQ', 'B1 1AT', 'B1 1AY', 'B1 1AZ', 'B1 1BA', 'B1 1BB'
$ lsoacd <str> 'E01033625', 'E01033625', 'E01033616', 'E01033625', 'E01033616', 'E01033625', 'E01033625', 'E01033625', 'E01033615', 'E01033620'
$ msoacd <str> 'E02006899', 'E02006899', 'E02006896', 'E02006899', 'E02006896', 'E02006899', 'E02006899', 'E02006899', 'E02006896', 'E02006899'
$ ladcd  <str> 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025', 'E08000025'
$ ladnm  <str> 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham', 'Birmingham'



In [19]:
# ca_la_df.select(pl.col('ladcd')).unique()

In [6]:
input_file = get_ca.get_geojson(url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/LLSOA_Dec_2021_PWC_for_England_and_Wales_2022/FeatureServer/0/query",
                      destination_directory = "data\\geojson")

In [7]:
ca_lsoa_codes = get_ca.get_ca_lsoa_codes(postcodes_df)

In [8]:
output_file = 'data\\geojson\\ca_lsoa.geojson'
property_name = 'LSOA21CD'

ca_lsoa_geojson = get_ca.filter_geojson(input_file, output_file, property_name, ca_lsoa_codes)


In [20]:
db.close()

In [21]:

db = Database('data/ca_epc.db', recreate=True)

In [22]:
db['ca_la_tbl'].create(
{
 'ladcd': str,
 'ladnm': str,
 'cauthcd': str,
 'cauthnm': str
 },
 pk = 'ladcd'
)

<Table ca_la_tbl (ladcd, ladnm, cauthcd, cauthnm)>

In [23]:
db['postcodes_tbl'].create(
    {"pcds": str,
     "lsoacd": str,
     "msoacd": str,
     "ladcd": str,
     "ladnm": str},
     pk = 'pcds'
    #  ,
    #  foreign_keys = [
    #      ('ladcd', 'ca_la_tbl', 'ladcd')
        #  ,
        #  ('lsoacd', 'ca_lsoa_tbl', 'LSOA21CD')
    #  ]
     )

<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [24]:
db['epc_clean_tbl'].create(
{'uprn': int,
 'lmk_key': str,
 'postcode': str,
 'current_energy_rating': str,
 'local_authority': str,
 'property_type': str,
 'transaction_type': str,
 'environment_impact_current': int,
 'co2_emissions_current': float,
 'tenure': str,
 'date': str,
 'year': int,
 'month': int},
 pk = 'lmk_key',
 foreign_keys=[
     ('postcode', 'postcodes_tbl', 'pcds')
 ]
)

<Table epc_clean_tbl (uprn, lmk_key, postcode, current_energy_rating, local_authority, property_type, transaction_type, environment_impact_current, co2_emissions_current, tenure, date, year, month)>

In [25]:
epc_clean_tbl_payload = epc_clean_df.to_dicts()
ca_la_tbl_payload = ca_la_df.to_dicts()
postcodes_tbl_payload = postcodes_df.to_dicts()

In [26]:
db['ca_la_tbl'].insert_all(ca_la_tbl_payload)
db['epc_clean_tbl'].insert_all(epc_clean_tbl_payload)
db['postcodes_tbl'].insert_all(postcodes_tbl_payload)


<Table postcodes_tbl (pcds, lsoacd, msoacd, ladcd, ladnm)>

In [47]:
for row in db['epc_clean_tbl'].rows:
    print(row)

{'uprn': None, 'lmk_key': '632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2', 'postcode': 'BL5 3WB', 'current_energy_rating': 'B', 'local_authority': 'E08000001', 'property_type': 'Flat', 'transaction_type': 'marketed sale', 'environment_impact_current': 87, 'co2_emissions_current': 0.8, 'tenure': 'Owner-occupied', 'date': '2023-08-31', 'year': 2023, 'month': 8}
{'uprn': 4210003725, 'lmk_key': '8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c', 'postcode': 'BL3 1PS', 'current_energy_rating': 'C', 'local_authority': 'E08000001', 'property_type': 'House', 'transaction_type': 'marketed sale', 'environment_impact_current': 74, 'co2_emissions_current': 3.0, 'tenure': 'Owner-occupied', 'date': '2021-04-03', 'year': 2021, 'month': 4}
{'uprn': 10001239702, 'lmk_key': '69223609222016052017263804808356', 'postcode': 'BL1 5WA', 'current_energy_rating': 'C', 'local_authority': 'E08000001', 'property_type': 'House', 'transaction_type': 'marketed sale', 'environment_i

In [48]:
print(db.schema)

CREATE TABLE [epc_ca_clean_tbl] (
   [UPRN] TEXT,
   [LMK_KEY] TEXT,
   [POSTCODE] TEXT,
   [CURRENT_ENERGY_RATING] TEXT,
   [LOCAL_AUTHORITY] TEXT,
   [PROPERTY_TYPE] TEXT,
   [LODGEMENT_DATETIME] TEXT,
   [TRANSACTION_TYPE] TEXT,
   [ENVIRONMENT_IMPACT_CURRENT] TEXT,
   [TENURE] TEXT
);
CREATE TABLE [epc_clean_tbl] (
   [uprn] INTEGER,
   [lmk_key] TEXT,
   [postcode] TEXT,
   [current_energy_rating] TEXT,
   [local_authority] TEXT,
   [property_type] TEXT,
   [transaction_type] TEXT,
   [environment_impact_current] INTEGER,
   [co2_emissions_current] FLOAT,
   [tenure] TEXT,
   [date] TEXT,
   [year] INTEGER,
   [month] INTEGER
);


In [49]:
db.close()