In [1]:
import polars as pl
import os

In [2]:
# Directory that is scanned for sub-folders containing a certificates.csv.
root_dir = 'data/all-domestic-certificates'

# Series of LAD22CD codes taken from the CA / LA lookup table.
la_list = pl.read_csv('data/ca_la_tbl.csv').get_column('LAD22CD')

# Accumulator for the DataFrames collected from each matching folder.
all_dataframes = list()

Open the certificates CSV in each folder whose name contains a local authority (LA) code that belongs to a CA. Use lazy evaluation to select the required columns, convert their data types, and group by UPRN, keeping only the most recent certificate for each UPRN. Collect the result and append the resulting DataFrame to the list.

In [3]:
# Start from an empty list so that re-running this cell does not append the
# same frames a second time on top of a previous run.
all_dataframes = []

# The directory listing is the same for every local authority code, so read it
# once instead of once per entry in la_list.
folder_names = os.listdir(root_dir)

for item in la_list:
    for folder_name in folder_names:
        # Only folders whose name contains the local authority code are read.
        if item not in folder_name:
            continue

        file_path = os.path.join(root_dir, folder_name, "certificates.csv")
        # Skip folders that do not actually contain a certificates.csv.
        if not os.path.exists(file_path):
            continue

        q = (
            # infer_schema_length=0 reads every column as a string; the
            # columns that need another type are converted explicitly below.
            pl.scan_csv(file_path, infer_schema_length=0)
            .select(pl.col(['LMK_KEY',
                            'POSTCODE',
                            'CURRENT_ENERGY_RATING',
                            'LOCAL_AUTHORITY',
                            'PROPERTY_TYPE',
                            'LODGEMENT_DATETIME',
                            'TRANSACTION_TYPE',
                            'ENVIRONMENT_IMPACT_CURRENT',
                            'TENURE',
                            'UPRN']))
            .with_columns([pl.col('LODGEMENT_DATETIME').str.to_datetime(),
                           pl.col('ENVIRONMENT_IMPACT_CURRENT').cast(pl.Int64),
                           pl.col('UPRN').cast(pl.Int64)])
            # Sort by lodgement time within each UPRN so that the last row of
            # every group is the most recent certificate.
            .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
            # NOTE(review): certificates with a null UPRN appear to be grouped
            # together, so only one of them survives per file - confirm that
            # this is intended.
            .groupby('UPRN').last()
        )
        # Execute the lazy query and keep the resulting frame.
        df = q.collect()
        all_dataframes.append(df)

In [4]:
# Stack the frames collected from the individual certificate files into one
# consolidated DataFrame.
cons_df = pl.concat(all_dataframes)

In [53]:
# cons_df.glimpse()

In [89]:
# Save the consolidated certificates frame to a CSV file.
cons_df.write_csv('data/epc_subset_polars_last.csv')

In [7]:
# Distinct postcodes found in the consolidated certificates, as a plain Python
# list; used further down to filter the postcode lookup.
postcodes_ca = (
    cons_df
    .select('POSTCODE')
    .unique()
    .to_series()
    .to_list()
)

In [26]:
# Lazy query over the postcode lookup file: keep the LSOA and LAD codes, expose
# the 'pcds' column under the name POSTCODE, and retain only the postcodes that
# occur in the certificates data.
q_pc_lsoa = (
    pl.scan_csv(
        'data/PCD_OA_LSOA_MSOA_LAD_FEB19_UK_LU.csv',
        infer_schema_length=0,  # read every column as a string
    )
    .select([
        pl.col('lsoa11cd'),
        pl.col('ladcd'),
        pl.col('pcds').alias('POSTCODE'),
    ])
    .filter(pl.col('POSTCODE').is_in(postcodes_ca))
)

In [27]:
# Execute the lazy postcode lookup query and materialise it as a DataFrame.
pc_lsoa_df = q_pc_lsoa.collect()

In [28]:
# Show the (rows, columns) size of the collected postcode lookup.
pc_lsoa_df.shape

(327000, 3)

In [30]:
# Distinct LSOA codes present in the filtered postcode lookup, as a Python
# list. The lookup has one row per postcode, so the same LSOA code occurs many
# times; the list is only used for membership filtering, so duplicates are
# dropped first (the same way postcodes_ca is built).
ca_lsoa = (
    pc_lsoa_df
    .select('lsoa11cd')
    .unique()
    .to_series()
    .to_list()
)

In [51]:
# Lazy query over the IMD lookup file: keep the LSOA and LAD codes plus IMD19
# converted to an integer, restricted to the LSOAs listed in ca_lsoa.
q_imd_lsoa = (
    pl.scan_csv(
        'data/Index_of_Multiple_Deprivation_(Dec_2019)_Lookup_in_England.csv',
        infer_schema_length=0,  # read every column as a string
    )
    .select([
        pl.col('LSOA11CD'),
        pl.col('LAD19CD'),
        pl.col('IMD19').cast(pl.Int64),
    ])
    .filter(pl.col('LSOA11CD').is_in(ca_lsoa))
)

In [35]:
# Execute the lazy IMD lookup query and materialise it as a DataFrame.
imd_lsoa_df = q_imd_lsoa.collect()

In [52]:
# Attach the LSOA / LAD codes to each certificate via its postcode, then attach
# the IMD columns via the LSOA code. Both joins are inner joins, so rows
# without a match in either lookup are dropped.
ca_pc_lsoa_imd_df = (
    cons_df
    .join(pc_lsoa_df, on='POSTCODE', how='inner')
    .join(
        imd_lsoa_df,
        left_on='lsoa11cd',
        right_on='LSOA11CD',
        how='inner',
    )
)

In [49]:

# Rename every column of the joined frame to its lower-case form.
ca_pc_lsoa_imd_df.columns = list(map(str.lower, ca_pc_lsoa_imd_df.columns))

In [50]:
# Preview the joined frame: row and column counts plus the first values of
# each column.
ca_pc_lsoa_imd_df.glimpse()

Rows: 4537641
Columns: 14
$ uprn                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ lmk_key                             <str> 632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2, 8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c, 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505
$ postcode                            <str> BL5 3WB, BL3 1PS, BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ current_energy_rating               <str> B, C, C, D, C, C, C, D, E, D
$ local_authority                     <str> E0800000