In [1]:
%matplotlib ipympl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import pickle # to access dataframe faster than csv
import glob, re
import os
import csv

### Get LCLids to ignore due to limited data

In [2]:
# Load the per-batch duration tables. Each file is opened in a `with` block so
# the handle is closed as soon as the frame has been unpickled.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('preprocessed_dfs/batch_1_durations.pkl', 'rb') as file:
    df_b1 = pickle.load(file)

with open('preprocessed_dfs/batch_2_durations.pkl', 'rb') as file:
    df_b2 = pickle.load(file)

with open('preprocessed_dfs/batch_3_durations.pkl', 'rb') as file:
    df_b3 = pickle.load(file)

with open('preprocessed_dfs/batch_4_durations.pkl', 'rb') as file:
    df_b4 = pickle.load(file)

# Stack the four batches and order the rows by ascending 'Duration'.
windows_df_all = pd.concat([df_b1, df_b2, df_b3, df_b4]).sort_values(by=['Duration'])

# Flat copy with a fresh 0..n-1 index; the old index lands in an 'index'
# column, which is dropped again below.
windows_ri_all = windows_df_all.reset_index()
# Keep only the digits of each LCLid string and store them as an integer.
# The pattern is a raw string: '\D' in a normal string is an invalid escape.
windows_ri_all['LCLid'] = windows_ri_all['LCLid'].apply(lambda x: int(re.sub(r'\D', '', x)))
windows_ri_all = windows_ri_all.drop(columns=['index'])

In [3]:
# LCLids (as integers) whose recorded 'Duration' is exactly zero.
ignored_ids = windows_ri_all.loc[windows_ri_all['Duration'].eq(pd.Timedelta(0)), 'LCLid']
# Rebuild the zero-padded string form of those ids, used later to filter the raw files.
ignored_MACs = [
    f'MAC{lclid:06d}'
    for lclid in ignored_ids
]

### Clean data 

In [4]:
# Raw partition files, ordered lexicographically by path.
filenames = glob.glob('uk-smart-meter-data/Partitioned LCL Data/Small LCL Data/LCL-June2015v2_*.csv')
filenames.sort()
# Alternative input (disabled): the already-cleaned partition files.
#filenames = sorted(glob.glob('uk_smart_meter_cleaned_imputed/LCL-June2015v2_*'))

# Folder that receives the cleaned files.
dest_path = 'uk_smart_meter_cleaned_imputed'

In [5]:
tot=0
tot_dropped=0
# First 'stdorToU' value seen for each LCLid across all partition files.
stdortou_dict={}
# The cleaning pass is switched off by default; flip the condition to re-run it.
if False:
    # Make sure the output folder exists before the first to_csv call.
    os.makedirs(dest_path, exist_ok=True)
    for filepath in filenames:
        # Bare file stem: strip any '/' or '\\' directory part, then everything
        # from the first '.' onwards.
        filename = filepath.split('/')[-1].split('\\')[-1].split('.')[0]
        data = pd.read_csv(filepath)
        data['DateTime']=pd.to_datetime(data['DateTime'])

        ## Remove readings not at exact 30 minute intervals.
        # Only the minute component is checked (must be 0 or 30).
        filtered_data=data[data['DateTime'].dt.minute.isin([0, 30])]

        ## Remove duplicates
        # When a (DateTime, LCLid) pair repeats within this file, keep the last row.
        filtered_data=filtered_data.drop_duplicates(subset=['DateTime', 'LCLid'], keep='last')

        ## Remove data outside 2012-2013
        filtered_data=filtered_data[filtered_data['DateTime'].dt.year.between(2012, 2013)]

        ## Ignore LCLids with limited data
        filtered_data=filtered_data[~filtered_data['LCLid'].isin(ignored_MACs)]

        # Record the 'stdorToU' value of the first remaining row of each LCLid;
        # setdefault keeps the value from an earlier file if the id was already seen.
        first_rows = filtered_data.drop_duplicates(subset=['LCLid'])
        for lclid, stdortou in zip(first_rows['LCLid'], first_rows['stdorToU']):
            stdortou_dict.setdefault(lclid, stdortou)

        ## Save to CSV
        # Written as "<stem>_cleaned" with no file extension.
        filtered_data=filtered_data.drop(columns=['stdorToU']).reset_index(drop=True)
        filtered_data.to_csv(os.path.join(dest_path,f"{filename}_cleaned"),index=False)
    # One row per LCLid with its 'stdorToU' value.
    stdortou_df=pd.DataFrame(list(stdortou_dict.items()),columns=['LCLid','stdorToU'])
    stdortou_df.to_csv(os.path.join(dest_path,'stdorTou_mapping.csv'),index=False)

### Aggregate over each hour

In [6]:
def get_filenum(filename):
    """Return the partition number embedded in a cleaned file name, for use as a sort key.

    The number is the run of digits between 'June2015v2_' and '_cleaned'.
    Names that do not follow that pattern yield ``float('inf')`` so they sort
    after every numbered file.
    """
    match=re.search(r'June2015v2_(\d+)_cleaned',filename)
    # int() is applied only to the captured digits: int(float('inf')) raises
    # OverflowError, so the fallback must stay outside the conversion.
    return int(match.group(1)) if match else float('inf')

In [7]:
# Collect the cleaned partition files: sort by path first, then stably by
# partition number so that e.g. 2 comes before 10.
filenames = glob.glob('uk_smart_meter_cleaned_imputed/LCL-June2015v2_*')
filenames.sort()
filenames.sort(key=get_filenum)

In [49]:
def process_chunk(chunk_data, prior_data=None, final_chunk=False):
    """Pivot a batch of half-hourly readings to one column per LCLid and sum them per hour.

    Parameters
    ----------
    chunk_data : pd.DataFrame
        Long-format readings with 'LCLid', 'DateTime' and
        'KWH/hh (per half hour) ' columns. Its 'DateTime' column is converted
        to datetime in place.
    prior_data : pd.DataFrame or None
        Rows held back by the previous call; when given they are placed in
        front of ``chunk_data`` before processing.
    final_chunk : bool
        False: the rows of the last LCLid appearing in the combined data are
        held back and returned instead of being aggregated, because that
        household's readings may continue in the next batch.
        True: nothing is held back.

    Returns
    -------
    tuple
        ``(carry, aggregated_chunk)`` -- ``carry`` holds the held-back rows
        (``None`` for the final chunk); ``aggregated_chunk`` is indexed by hour
        with one column per LCLid.

    Raises
    ------
    ValueError
        From ``DataFrame.pivot`` if a (DateTime, LCLid) pair occurs more than once.
    """
    chunk_data['DateTime']=pd.to_datetime(chunk_data['DateTime'])

    if prior_data is not None:
        chunk_data=pd.concat([prior_data,chunk_data])

    # `not`, not `~`: bitwise inversion of a bool gives -1 or -2, both truthy,
    # which would hold back the last household even on the final chunk.
    if not final_chunk:
        last_id=chunk_data['LCLid'].unique()[-1]
        is_last=chunk_data['LCLid']==last_id
        prior_data=chunk_data[is_last]
        chunk_data=chunk_data[~is_last]
    else:
        # Everything is aggregated below, so there is nothing to carry forward.
        prior_data=None

    pivoted_chunk = chunk_data.pivot(index='DateTime', columns='LCLid', values='KWH/hh (per half hour) ')
    # min_count=1 leaves an hour as NaN when it has no valid reading, rather than 0.
    aggregated_chunk = pivoted_chunk.resample('1h').sum(min_count=1)
    return prior_data,aggregated_chunk
    

In [50]:
# Aggregate the cleaned partition files to hourly totals, chunk_size files at a time.
if True:
    dest_path="uk_smart_meter_aggregated"
    # Make sure the output folder exists before the first to_csv call.
    os.makedirs(dest_path, exist_ok=True)
    chunk_size = 21
    data = pd.DataFrame()
    data_dict={}
    chunk_num=1
    # Rows held back by process_chunk. This must live outside the loop:
    # resetting it for every chunk would throw away the household that was
    # held back from the previous chunk instead of merging it into the next one.
    prior_data=None
    for chunk_start in range(0,len(filenames),chunk_size):
        chunk_files=filenames[chunk_start:chunk_start+chunk_size]
        # Read all files of the chunk first and concatenate once; growing a
        # frame with concat inside the loop re-copies the data on every step.
        chunk_data=pd.concat([pd.read_csv(filename) for filename in chunk_files])
        # The last chunk is the one whose slice reaches the end of the file list.
        final_chunk=len(filenames)<=chunk_start+chunk_size
        prior_data,agg_chunk=process_chunk(chunk_data,prior_data,final_chunk)
        agg_chunk.to_csv(os.path.join(dest_path,f'aggregated_chunk_{chunk_num}.csv'))
        print(f"Chunk {chunk_num} saved.")
        chunk_num+=1


Chunk 1 saved.
Chunk 2 saved.
Chunk 3 saved.
Chunk 4 saved.
Chunk 5 saved.
Chunk 6 saved.
Chunk 7 saved.
Chunk 8 saved.
