In [82]:
import polars as pl
import os

In [83]:
# Folder holding one sub-folder of certificates per local authority.
root_dir = 'data/all-domestic-certificates'

# Accumulates one collected DataFrame per certificates.csv that is read.
all_dataframes = []

# Local authority codes, taken from the LAD22CD column of the lookup table.
la_list = pl.read_csv('data/ca_la_tbl.csv').get_column('LAD22CD')

Open the certificates CSV in each folder whose LA code is within a CA. Use lazy evaluation to select columns, change data types and group by UPRN, keeping only the most recent certificate. Collect the data and append the resulting dataframe to the list.

In [86]:
# Start from an empty list so that re-running this cell does not append the
# same certificates a second time.
all_dataframes = []

# List the certificate folders once, rather than once per local authority code.
folder_names = os.listdir(root_dir)

# Columns kept from each certificates.csv.
epc_columns = [
    'LMK_KEY',
    'POSTCODE',
    'CURRENT_ENERGY_RATING',
    'LOCAL_AUTHORITY',
    'PROPERTY_TYPE',
    'LODGEMENT_DATETIME',
    'TRANSACTION_TYPE',
    'ENVIRONMENT_IMPACT_CURRENT',
    'TENURE',
    'UPRN',
]

for item in la_list:
    for folder_name in folder_names:
        # Only read folders whose name contains the local authority code.
        if item not in folder_name:
            continue
        file_path = os.path.join(root_dir, folder_name, "certificates.csv")
        # Skip folders that do not actually contain a certificates.csv.
        if not os.path.exists(file_path):
            continue
        q = (
            pl.scan_csv(file_path, infer_schema_length=0)  # all as strings
            .select(pl.col(epc_columns))
            .with_columns([
                pl.col('LODGEMENT_DATETIME').str.to_datetime(),
                pl.col('ENVIRONMENT_IMPACT_CURRENT').cast(pl.Int64),
                pl.col('UPRN').cast(pl.Int64),
            ])
            # Sorted by lodgement date within each UPRN, so that .last()
            # below keeps the most recent certificate.
            .sort(pl.col(['UPRN', 'LODGEMENT_DATETIME']))
            # NOTE(review): rows with a missing UPRN all fall into a single
            # null group, so only one of them survives per file - confirm
            # this is intended.
            # NOTE(review): newer polars releases spell this `group_by`;
            # kept as `groupby` to match the version this notebook ran on.
            .groupby('UPRN').last()
        )
        df = q.collect()
        all_dataframes.append(df)

In [88]:
# Stack the per-folder frames gathered in all_dataframes into one DataFrame.
cons_df = pl.concat(all_dataframes)

In [79]:
# Quick preview of the combined frame: row/column counts, dtypes and leading values.
cons_df.glimpse()

Rows: 4660486
Columns: 10
$ UPRN                                <i64> None, 4210003725, 10001239702, 10001239704, 10001239709, 10001239711, 10001239712, 10001239714, 10001239715, 10001239716
$ LMK_KEY                             <str> 632d66abdfbd4f709d71851927a2d79fce2fd90e4d67fe999bc22b5a9970bcc2, 8126d78434eb72180e023c7d522d811b8a11a9d6f206aa19a867f06df5d4599c, 69223609222016052017263804808356, 1099459139642014100812510823040588, 876abfcefe535b31e3e3e46264de08d97acec07eb72415f2fa967061afc6315d, e9c4c64a24ad61bbf2c4de0d3bd97725b8dde5d9b21a12e2cd0888e34ac6af15, 1019963559062013100615514564448457, 1485207013512016100421133096069648, 1389662563632016072714480291278400, aade8e478efe85ef7a8b090f1a4eedd4941217246ccb9019e35ef1d944af8505
$ POSTCODE                            <str> BL5 3WB, BL3 1PS, BL1 5WA, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP, BL3 5NP
$ CURRENT_ENERGY_RATING               <str> B, C, C, D, C, C, C, D, E, D
$ LOCAL_AUTHORITY                     <str> E0800000

In [89]:
# Persist the combined frame (one row per UPRN group from each file) as CSV.
cons_df.write_csv('data/epc_subset_polars_last.csv')

In [107]:
# Distinct postcodes present in the combined certificates, as a plain list.
postcode_col = cons_df.select(pl.col('POSTCODE'))
unique_postcodes = postcode_col.unique()
postcodes_ca = unique_postcodes.to_series().to_list()

In [109]:
# Split the postcode list into consecutive slices of at most 100 entries.
postcode_chunk_size = 100
chunk_starts = range(0, len(postcodes_ca), postcode_chunk_size)
postcode_chunks = [
    postcodes_ca[start:start + postcode_chunk_size] for start in chunk_starts
]

In [110]:
# Inspect the first chunk of postcodes. The list built in the previous cell is
# `postcode_chunks`; `chunks` is not defined anywhere in this notebook.
postcode_chunks[0]

['BD15 0LW',
 'L36 5XL',
 'WF10 5BU',
 'CB5 8QL',
 'B16 8HF',
 'LS1 5JW',
 'L21 1HS',
 'TS27 4RY',
 'L32 0SN',
 'B23 7DH',
 'HX3 5BJ',
 'DL13 4LN',
 'TS25 5QD',
 'SK8 5HH',
 'NE29 8LP',
 'M22 4LG',
 'NE37 1JS',
 'WS1 3RB',
 'S66 7QT',
 'DN2 6DA',
 'BL3 6TE',
 'OL6 8JW',
 'BS14 9RD',
 'BD3 7JU',
 'LS27 0DN',
 'WV12 5QJ',
 'BL1 3RE',
 'M35 9BP',
 'CH42 4QU',
 'B11 3QL',
 'B43 5QN',
 'WS9 8QX',
 'BS32 4JR',
 'HD3 3NY',
 'DL4 1AR',
 'SR7 9RS',
 'LS14 2AW',
 'CB4 2JA',
 'NE32 5QZ',
 'WV14 7NW',
 'BS16 7JF',
 'BL9 8DR',
 'BL5 2ND',
 'M41 8DZ',
 'LS17 9LA',
 'CH44 4EB',
 'M46 9NS',
 'SK14 3EB',
 'DH4 6EP',
 'WF9 4FR',
 'PE7 1XX',
 'NE34 9EN',
 'M22 9WW',
 'HD2 1SR',
 'TS10 5JN',
 'BS7 8HH',
 'L30 5SD',
 'BD5 0TJ',
 'BD13 5HG',
 'DL12 8NB',
 'BL9 7UN',
 'TS25 5AQ',
 'TS27 4DA',
 'M30 0YQ',
 'PE4 6ST',
 'BS4 3HH',
 'NE43 7RG',
 'B63 1JB',
 'L6 5JP',
 'TS3 9ES',
 'B42 1SJ',
 'SK7 2LF',
 'M4 4AL',
 'B92 7FG',
 'M24 2NZ',
 'CV2 5HE',
 'M35 9AJ',
 'WF8 1RP',
 'WF17 9BB',
 'LS15 8PZ',
 'CV5 6DN',
 '