In [12]:
import polars as pl
import os
import re

In [54]:
root_dir = 'data/all-domestic-certificates'

In [57]:
bathbit = 'domestic-E06000055-Bedford'

In [61]:
path = os.path.join(root_dir, bathbit, 'domestic.csv')

In [62]:
os.path.exists(path)

False

In [3]:
ca_la_tbl = pl.read_csv('data/ca_la_tbl.csv')

In [74]:
la_list = ca_la_tbl['LAD22CD']

In [75]:
la_list

LAD22CD
str
"""E08000001"""
"""E08000002"""
"""E08000003"""
"""E08000004"""
"""E08000005"""
"""E08000006"""
"""E08000007"""
"""E08000008"""
"""E08000009"""
"""E08000010"""


In [76]:
all_dataframes = []

In [77]:
for item in la_list:
    for folder_name in os.listdir(root_dir):
        # Check if the folder name matches an item in la_vec
        if item in folder_name:
            file_path = os.path.join(root_dir, folder_name, "certificates.csv")
            # print(file_path)
            # Check if domestic.csv actually exists inside the folder
            if os.path.exists(file_path):
                q = (
                pl.scan_csv(file_path,
                 infer_schema_length=0) #all as strings
                    .select(pl.col(['LMK_KEY',
                    'POSTCODE',
                    'CURRENT_ENERGY_RATING',
                    'LOCAL_AUTHORITY',
                    'PROPERTY_TYPE',
                    'LODGEMENT_DATETIME',
                    'TRANSACTION_TYPE',
                    'ENVIRONMENT_IMPACT_CURRENT',
                    'TENURE',
                    'UPRN']))
                .with_columns([pl.col('LODGEMENT_DATETIME').str.to_datetime(),
                  pl.col('ENVIRONMENT_IMPACT_CURRENT').cast(pl.Int64),
                  pl.col('UPRN').cast(pl.Int64)])
                .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
                .groupby('UPRN').last()
                )
                df = q.collect()
                all_dataframes.append(df)

In [71]:
all_dataframes

[shape: (81_915, 10)
 ┌──────────┬──────────┬──────────┬────────────┬───┬────────────┬────────────┬────────────┬─────────┐
 │ UPRN     ┆ LMK_KEY  ┆ POSTCODE ┆ CURRENT_EN ┆ … ┆ LODGEMENT_ ┆ TRANSACTIO ┆ ENVIRONMEN ┆ TENURE  │
 │ ---      ┆ ---      ┆ ---      ┆ ERGY_RATIN ┆   ┆ DATETIME   ┆ N_TYPE     ┆ T_IMPACT_C ┆ ---     │
 │ i64      ┆ str      ┆ str      ┆ G          ┆   ┆ ---        ┆ ---        ┆ URRENT     ┆ str     │
 │          ┆          ┆          ┆ ---        ┆   ┆ datetime[μ ┆ str        ┆ ---        ┆         │
 │          ┆          ┆          ┆ str        ┆   ┆ s]         ┆            ┆ i64        ┆         │
 ╞══════════╪══════════╪══════════╪════════════╪═══╪════════════╪════════════╪════════════╪═════════╡
 │ null     ┆ 632d66ab ┆ BL5 3WB  ┆ B          ┆ … ┆ 2023-08-31 ┆ marketed   ┆ 87         ┆ Owner-o │
 │          ┆ dfbd4f70 ┆          ┆            ┆   ┆ 09:36:01   ┆ sale       ┆            ┆ ccupied │
 │          ┆ 9d718519 ┆          ┆            ┆   ┆         

In [78]:
cons_df = pl.concat(all_dataframes)

In [79]:
cons_df.glimpse()

Rows: 4660486
Columns: 10
$ UPRN                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                             <str> 632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2, 8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c, 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505
$ POSTCODE                            <str> BL5 3WB, BL3 1PS, BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ CURRENT_ENERGY_RATING               <str> B, C, C, D, C, C, C, D, E, D
$ LOCAL_AUTHORITY                     <str> E0800000

In [12]:
q = (
    pl.scan_csv('data/domestic_cert_ca_tbl.csv',
                 infer_schema_length=0) #all as strings
    .select(pl.col(['LMK_KEY',
                    'POSTCODE',
                    'CURRENT_ENERGY_RATING',
                    'LOCAL_AUTHORITY',
                    'PROPERTY_TYPE',
                    'LODGEMENT_DATETIME',
                    'TRANSACTION_TYPE',
                    'ENVIRONMENT_IMPACT_CURRENT',
                    'TENURE',
                    'UPRN']))
    .with_columns([pl.col('LODGEMENT_DATETIME').str.to_datetime(),
                  pl.col('ENVIRONMENT_IMPACT_CURRENT').cast(pl.Int64),
                  pl.col('UPRN').cast(pl.Int64)])
    .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
    .groupby('UPRN').last()
)

In [13]:
%%time
epc_latest_certs = q.collect()

CPU times: total: 1min 7s
Wall time: 28.4 s


In [14]:
epc_latest_certs.glimpse()

Rows: 4659361
Columns: 10
$ UPRN                                        <i64> None, 30, 97, 100, 103, 104, 105, 106, 107, 109
$ LMK_KEY                                     <str> 551348c87dcc73ed08d7f9b1aacf7548ff9940715eba79f4f30e61bf93f00e8e, 945145029922013062317101139638477, 1647265119642018071017301852989308, c59230c17d9b975cd1ba5457629112006a624d024894858dd75c712939ace92f, 248960a413bfe4ea746cd67d64f65b144b2343128715131a5e508aa02234fada, 883728034532015041312250532978807, 1240896913412015041413072693950436, 1679813059832018111912231061978999, 1542211424712017050910121296030754, 1280502619602015021114473434359198
$ POSTCODE                                    <str> B31 5FQ, BS10 7RZ, BS10 6LU, BS10 6LU, BS10 6LU, BS10 6LX, BS10 6LU, BS10 6LX, BS10 6LU, BS10 6LU
$ CURRENT_ENERGY_RATING                       <str> B, G, D, D, C, D, C, C, D, C
$ LOCAL_AUTHORITY                             <str> E08000025, E06000023, E06000023, E06000023, E06000023, E06000023, E06000023, E06000023, E060

In [15]:
epc_latest_certs.write_csv('data/epc_subset_polars_last.csv')