In [82]:
import polars as pl
import os

In [83]:
# Directory expected to hold one sub-folder of certificates per local authority.
root_dir = 'data/all-domestic-certificates'

# Per-folder result frames are accumulated here and concatenated later.
all_dataframes = list()

# LAD22CD column of the CA/LA lookup table: the LA codes that sit within a CA.
la_list = pl.read_csv('data/ca_la_tbl.csv')['LAD22CD']

Open the `certificates.csv` file in each folder whose name contains an LA code that is within a CA. Use lazy evaluation to select the columns of interest, convert data types, sort by UPRN and lodgement date, and group by UPRN, keeping only the most recent certificate for each UPRN. Collect the result and append the dataframe to the list.

In [86]:
# Columns kept from each certificates.csv; every other column is dropped.
epc_columns = [
    'LMK_KEY',
    'POSTCODE',
    'CURRENT_ENERGY_RATING',
    'LOCAL_AUTHORITY',
    'PROPERTY_TYPE',
    'LODGEMENT_DATETIME',
    'TRANSACTION_TYPE',
    'ENVIRONMENT_IMPACT_CURRENT',
    'TENURE',
    'UPRN',
]

# Start from an empty list so that re-running this cell does not append a
# second copy of every frame (which would duplicate rows after pl.concat).
all_dataframes = []

# The directory listing does not change between LA codes, so read it once
# instead of once per code.
folder_names = os.listdir(root_dir)

for item in la_list:
    for folder_name in folder_names:
        # Only folders whose name contains the LA code are of interest.
        if item not in folder_name:
            continue
        file_path = os.path.join(root_dir, folder_name, "certificates.csv")
        # Skip matching folders that do not actually contain certificates.csv.
        if not os.path.exists(file_path):
            continue
        q = (
            # infer_schema_length=0 reads every column as a string; the
            # columns that need another dtype are converted explicitly below.
            pl.scan_csv(file_path, infer_schema_length=0)
            .select(pl.col(epc_columns))
            .with_columns([
                pl.col('LODGEMENT_DATETIME').str.to_datetime(),
                pl.col('ENVIRONMENT_IMPACT_CURRENT').cast(pl.Int64),
                pl.col('UPRN').cast(pl.Int64),
            ])
            # Sort so that the last row of each UPRN group is the most
            # recently lodged certificate.
            .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
            # NOTE(review): rows whose UPRN is null appear to fall into a
            # single group, so at most one of them survives per file —
            # confirm this is intended.
            .groupby('UPRN').last()
        )
        # Execute the lazy query and keep the resulting frame.
        df = q.collect()
        all_dataframes.append(df)

In [88]:
# Stack the per-folder frames on top of one another into a single frame.
cons_df = pl.concat(all_dataframes, how="vertical")

In [79]:
# Quick sanity check of the combined frame: row/column counts plus the dtype
# and first few values of each column.
cons_df.glimpse()

Rows: 4660486
Columns: 10
$ UPRN                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                             <str> 632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2, 8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c, 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505
$ POSTCODE                            <str> BL5 3WB, BL3 1PS, BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ CURRENT_ENERGY_RATING               <str> B, C, C, D, C, C, C, D, E, D
$ LOCAL_AUTHORITY                     <str> E0800000

In [89]:
# Persist the combined latest-certificate-per-UPRN frame as a CSV file.
output_path = 'data/epc_subset_polars_last.csv'
cons_df.write_csv(output_path)