In [None]:
import os
import zipfile
import pandas as pd
from tqdm import tqdm
from multiprocessing import Process, Queue, Lock, cpu_count

In [None]:
def process_csv_file(csv_file):
    """Load the selected EPC columns from one CSV and derive ``PAON``/``SAON``.

    Parameters
    ----------
    csv_file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. Every column named in
        ``wanted_columns`` must be present, otherwise ``read_csv`` raises
        ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus two derived text columns taken from
        ``ADDRESS1`` split on commas: ``PAON`` is the first piece and ``SAON``
        the second piece, both stripped of surrounding whitespace. A missing
        piece (no comma, or an empty ``ADDRESS1``) becomes ``''``.
    """
    wanted_columns = [
        'LMK_KEY', 'BUILDING_REFERENCE_NUMBER', 'LODGEMENT_DATE', 'ADDRESS1',
        'ADDRESS2', 'POSTCODE', 'NUMBER_HABITABLE_ROOMS', 'WINDOWS_DESCRIPTION',
        'FLOOR_LEVEL', 'PROPERTY_TYPE', 'INSPECTION_DATE',
        'CURRENT_ENERGY_EFFICIENCY', 'POTENTIAL_ENERGY_EFFICIENCY', 'BUILT_FORM',
        'TOTAL_FLOOR_AREA', 'MAIN_FUEL', 'ADDRESS',
    ]
    # Read ADDRESS1 as text: if pandas inferred a numeric dtype (all-numeric or
    # all-empty column) the .str accessor below would raise AttributeError.
    df = pd.read_csv(csv_file, usecols=wanted_columns, dtype={'ADDRESS1': str})

    # Split 'ADDRESS1' by comma and create 'PAON' and 'SAON' columns.
    # expand=True only creates as many columns as the longest split, so a file
    # in which no address contains a comma has no column 1; reindex guarantees
    # that columns 0 and 1 both exist before they are read.
    address_split = (
        df['ADDRESS1']
        .str.split(',', expand=True)
        .reindex(columns=[0, 1], fill_value='')
        .fillna('')
    )
    df['PAON'] = address_split[0].str.strip()
    df['SAON'] = address_split[1].str.strip()

    return df
def append_to_csv(df, output_csv, lock):
    """Append ``df`` to ``output_csv`` while holding ``lock``.

    The column header is written only when the file does not exist yet or
    exists but is empty, so repeated calls build a single CSV with one header
    row.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to append; the index is not written.
    output_csv : str or path-like
        Destination CSV path, opened in append mode.
    lock : context manager
        Lock held for the whole check-and-write so the header decision and the
        write cannot interleave with another writer using the same lock.
    """
    with lock:
        # An existing zero-byte file still needs a header; checking existence
        # alone would produce a headerless CSV in that case.
        needs_header = (
            not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
        )
        df.to_csv(output_csv, mode='a', header=needs_header, index=False)

def producer(zip_file, queue):
    """Parse every ``*/certificates.csv`` member of ``zip_file`` and queue it.

    Each matching archive member is read with ``process_csv_file`` and the
    resulting DataFrame is put on ``queue``. A member that fails to parse is
    reported with ``print`` and skipped. A single ``None`` is always put on the
    queue last, even when the archive itself cannot be opened or read, so the
    consumer is never left waiting for a sentinel that will not arrive.

    Parameters
    ----------
    zip_file : str, path-like or file-like
        Archive passed to ``zipfile.ZipFile``.
    queue : queue-like
        Object with a ``put`` method that receives the DataFrames and the
        final ``None`` sentinel.

    Raises
    ------
    Exception
        Whatever ``zipfile.ZipFile`` raises for a missing or invalid archive
        is propagated after the sentinel has been queued.
    """
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            for file in tqdm(zip_ref.namelist(), desc='Processing CSV files'):
                if not file.endswith('/certificates.csv'):
                    continue
                with zip_ref.open(file) as csv_file:
                    try:
                        df = process_csv_file(csv_file)
                        queue.put(df)
                    except Exception as e:
                        # Best effort: one bad member must not abort the run.
                        print(f"Error processing {file}: {e}")
    finally:
        # Signal to the consumer that the producer is done. This sits in
        # ``finally`` so a failure above cannot leave the consumer blocked.
        queue.put(None)

def consumer(queue, output_csv, lock):
    """Drain ``queue`` into ``output_csv`` until a ``None`` sentinel arrives.

    Every non-``None`` item taken from the queue is handed to
    ``append_to_csv`` together with ``output_csv`` and ``lock``. The sentinel
    is detected with an identity check (``is None``).
    """
    frame = queue.get()
    while frame is not None:
        append_to_csv(frame, output_csv, lock)
        frame = queue.get()

def combine_epc_csv_files_parallel(zip_file, output_csv):
    """Combine the certificates CSVs inside ``zip_file`` into ``output_csv``.

    One producer process parses the archive members and one consumer process
    appends the resulting frames to ``output_csv``; they communicate through a
    bounded queue (at most 10 pending frames). Rows are appended, so an
    ``output_csv`` that already exists is extended rather than replaced.

    Parameters
    ----------
    zip_file : str or path-like
        Archive handed to ``producer``.
    output_csv : str or path-like
        Destination CSV handed to ``consumer``.

    Raises
    ------
    RuntimeError
        If either worker process ends with a non-zero exit code. Previously
        such a failure either hung forever or was reported as success.

    Notes
    -----
    NOTE(review): with the ``spawn``/``forkserver`` start methods, child
    processes must be able to import the worker functions; functions defined
    only in a notebook cell may not be importable there. Confirm on the target
    platform, or move the workers into a module.
    """
    queue = Queue(maxsize=10)
    lock = Lock()

    producer_process = Process(target=producer, args=(zip_file, queue))
    consumer_process = Process(target=consumer, args=(queue, output_csv, lock))

    producer_process.start()
    consumer_process.start()

    # Poll instead of blocking in join(): if the consumer dies, the producer
    # would otherwise stay blocked forever on the full queue.
    while producer_process.is_alive():
        producer_process.join(timeout=1)
        if consumer_process.exitcode not in (None, 0):
            producer_process.terminate()
            producer_process.join()
            break

    # A producer that died may not have sent the end-of-stream sentinel; send
    # one on its behalf so the consumer can finish. An extra sentinel is
    # harmless because the consumer stops at the first one.
    if producer_process.exitcode != 0 and consumer_process.is_alive():
        queue.put(None)
    consumer_process.join()

    if producer_process.exitcode != 0 or consumer_process.exitcode != 0:
        raise RuntimeError(
            "Combining EPC data failed "
            f"(producer exit code {producer_process.exitcode}, "
            f"consumer exit code {consumer_process.exitcode}); "
            f"{output_csv} may be incomplete"
        )

    print(f"Combined data saved to {output_csv}")


In [None]:
zip_file = 'all-domestic-certificates.zip'
output_csv = 'combined_epc_data.csv'

# append_to_csv opens the output in append mode, so a file left over from an
# earlier run would receive a second copy of every row. Remove it first so
# re-running this cell (Restart & Run All) always produces the same result.
if os.path.exists(output_csv):
    os.remove(output_csv)

combine_epc_csv_files_parallel(zip_file, output_csv)