In [1]:
%matplotlib ipympl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import pickle # to access dataframe faster than csv
import glob, re
import os
import csv

### Get LCLids to ignore due to limited data

In [2]:
# Load the four pre-computed per-batch duration tables. Each file is opened in
# a `with` block so its handle is closed as soon as the pickle has been read
# (the handles were previously left open).
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project.
duration_frames = []
for batch_number in range(1, 5):
    with open(f'preprocessed_dfs/batch_{batch_number}_durations.pkl', 'rb') as file:
        duration_frames.append(pickle.load(file))
df_b1, df_b2, df_b3, df_b4 = duration_frames

# One table for all batches, ordered by ascending 'Duration'.
windows_df_all = pd.concat(duration_frames).sort_values(by=['Duration'])

# Renumber the rows 0..n-1 and turn each LCLid string into an integer by
# stripping every non-digit character (raw string: '\D' is a regex class, not
# a Python escape sequence).
windows_ri_all = windows_df_all.reset_index(drop=True)
windows_ri_all['LCLid'] = windows_ri_all['LCLid'].apply(lambda x: int(re.sub(r'\D', '', x)))

In [3]:
# Select the numeric ids of every row whose 'Duration' equals a zero Timedelta.
zero_duration = windows_ri_all['Duration'] == pd.Timedelta(0)
ignored_ids = windows_ri_all.loc[zero_duration, 'LCLid']

# Rebuild the string form of each id: 'MAC' followed by the number zero-padded
# to six digits.
ignored_MACs = [f'MAC{meter_number:06d}' for meter_number in ignored_ids]

### Clean data 

In [35]:
# Folder that receives the cleaned partitions.
dest_path = 'uk_smart_meter_cleaned_imputed'

# Raw partition files, in lexicographic path order.
# (To work from already-cleaned files instead, glob
# 'uk_smart_meter_cleaned_imputed/LCL-June2015v2_*'.)
filenames = glob.glob('uk-smart-meter-data/Partitioned LCL Data/Small LCL Data/LCL-June2015v2_*.csv')
filenames.sort()

In [46]:
tot=0
tot_dropped=0
stdortou_dict={}

# Set to True to regenerate the cleaned files. It is off by default, so
# running this cell only (re)defines the names above and the helper below.
RUN_CLEANING = False


def clean_partition(raw, ignored_macs):
    """Clean one raw partition of meter readings.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain the columns 'DateTime', 'LCLid' and 'stdorToU'.
        It is not modified.
    ignored_macs : list-like of str
        LCLid values whose rows are discarded.

    Returns
    -------
    cleaned : pandas.DataFrame
        The surviving rows without the 'stdorToU' column, indexed 0..n-1.
    tariffs : dict
        Maps each surviving LCLid to the 'stdorToU' value of its first
        surviving row.
    """
    readings = raw.assign(DateTime=pd.to_datetime(raw['DateTime']))
    stamps = readings['DateTime']

    ## Remove readings not at exact 30 minute intervals.
    # Comparing with the floored timestamp also rejects values such as
    # 00:30:07, which a check on the minute field alone would let through.
    readings = readings[stamps == stamps.dt.floor('30min')]

    ## Remove duplicates
    readings = readings.drop_duplicates(subset=['DateTime', 'LCLid'], keep='last')

    ## Remove data outside 2012-2013
    readings = readings[readings['DateTime'].dt.year.between(2012, 2013)]

    ## Ignore LCLids with limited data
    readings = readings[~readings['LCLid'].isin(ignored_macs)]

    # First surviving row per meter supplies its tariff label.
    first_rows = readings.drop_duplicates(subset=['LCLid'])
    tariffs = dict(zip(first_rows['LCLid'], first_rows['stdorToU']))

    cleaned = readings.drop(columns=['stdorToU']).reset_index(drop=True)
    return cleaned, tariffs


if RUN_CLEANING:
    # to_csv does not create missing directories.
    os.makedirs(dest_path, exist_ok=True)
    for filepath in filenames:
        # Base name of the file up to its first '.', for '/' or '\\' separated paths.
        filename = filepath.split('/')[-1].split('\\')[-1].split('.')[0]
        data = pd.read_csv(filepath)
        filtered_data, partition_tariffs = clean_partition(data, ignored_MACs)

        # The first file in which a meter appears decides its tariff label.
        for lclid, stdortou in partition_tariffs.items():
            stdortou_dict.setdefault(lclid, stdortou)

        ## Save to CSV
        # The output name deliberately has no '.csv' suffix: the later glob
        # and get_filenum() look for '<name>_cleaned'.
        filtered_data.to_csv(os.path.join(dest_path,f"{filename}_cleaned"),index=False)
    stdortou_df=pd.DataFrame(list(stdortou_dict.items()),columns=['LCLid','stdorToU'])
    stdortou_df.to_csv(os.path.join(dest_path,'stdorTou_mapping.csv'),index=False)

### Aggregate over each hour

In [6]:
def get_filenum(filename):
    """Return the partition number embedded in a cleaned-file name.

    Parameters
    ----------
    filename : str
        A name or path containing 'June2015v2_<digits>_cleaned'.

    Returns
    -------
    int or float
        The digits as an int, or float('inf') when the pattern is absent so
        that such names sort after every numbered file.
    """
    match = re.search(r'June2015v2_(\d+)_cleaned', filename)
    # Only the captured digits go through int(); wrapping the whole
    # conditional in int() made the no-match case raise OverflowError.
    return int(match.group(1)) if match else float('inf')

In [7]:
# Collect the cleaned partitions and order them by partition number.
# The lexicographic pre-sort fixes the relative order of any paths that
# get_filenum ranks equally, because list.sort is stable.
filenames = glob.glob('uk_smart_meter_cleaned_imputed/LCL-June2015v2_*')
filenames.sort()
filenames.sort(key=get_filenum)

In [19]:
# Gather the meter ids found in each file. Only the 'LCLid' column is parsed,
# which avoids loading every other column just to discard it.
# An id that occurs in several files is appended once per file.
lclids=[]
for filename in filenames:
    ids_frame=pd.read_csv(filename, usecols=['LCLid'])
    lclids.extend(ids_frame['LCLid'].unique())
    

In [None]:
# Stream the files row by row and group the readings per meter:
# data_dict maps an LCLid to a list of (timestamp, kwh-string) tuples in file order.
number_of_files = 21
data = pd.DataFrame()
data_dict = {}
# The slice end is number_of_files + 1, so up to 22 paths are visited.
for filename in filenames[:number_of_files + 1]:
    print(filename)
    with open(filename, mode='r') as file:
        for row in csv.DictReader(file):
            meter_id = row['LCLid']
            stamp = pd.to_datetime(row['DateTime'])
            # The consumption value is kept exactly as read (a string).
            reading = (stamp, row['KWH/hh (per half hour) '])
            data_dict.setdefault(meter_id, []).append(reading)


In [39]:
number_of_files = 20
# The slice end is number_of_files + 1, so up to 21 paths are read.
frames = [pd.read_csv(filename) for filename in filenames[:number_of_files + 1]]
# Concatenate once: growing a frame inside the loop re-copies all earlier rows
# on every iteration. pd.concat rejects an empty list, so fall back to an
# empty frame when there is nothing to read.
data = pd.concat(frames) if frames else pd.DataFrame()


In [41]:
# Display the consumption column; note that its name ends with a space.
data['KWH/hh (per half hour) ']

0         0.000
1         0.000
2         0.000
3         0.000
4         0.000
          ...  
999290    0.051
999291    0.020
999292    0.050
999293    0.033
999294    0.020
Name: KWH/hh (per half hour) , Length: 20984980, dtype: float64

In [42]:
# Parse the timestamps in place so `data` itself carries datetime values.
data['DateTime']=pd.to_datetime(data['DateTime'])
# Files written by the cleaning cell no longer contain 'stdorToU' (it is
# dropped before saving), so a missing column must not raise here.
df = data.drop(columns=['stdorToU'], errors='ignore')

In [43]:
# A reading is identified by its (DateTime, LCLid) pair.
reading_key = ['DateTime', 'LCLid']

repeated_before = df.duplicated(subset=reading_key).sum()
print('Number of duplicates: ', repeated_before)

# Keep the last occurrence of every repeated pair. (Averaging the repeats with
# a groupby over the same two columns would be the alternative.)
df_unique = df.drop_duplicates(subset=reading_key, keep='last')

# Count again on the de-duplicated frame.
repeated_after = df_unique.duplicated(subset=reading_key).sum()
print('Number of duplicates: ', repeated_after)

Number of duplicates:  0
Number of duplicates:  0


In [44]:
# Reshape to wide form: one row per timestamp, one column per meter, cells
# holding the consumption value. A copy of the row index is appended as a
# regular 'datetime' column.
pivoted = (
    df_unique
    .pivot(index='DateTime', columns='LCLid', values='KWH/hh (per half hour) ')
    .assign(datetime=lambda wide: wide.index)
)

In [45]:
# Show the first five column labels of the wide table.
pivoted.columns[:5]

Index(['MAC000002', 'MAC000003', 'MAC000004', 'MAC000006', 'MAC000007'], dtype='object', name='LCLid')

In [55]:
# Show the dtype of every column of the wide table.
pivoted.dtypes

LCLid
MAC000002           float64
MAC000003           float64
MAC000004           float64
MAC000006           float64
MAC000007           float64
                  ...      
MAC000750           float64
MAC000752           float64
MAC000753           float64
MAC000756           float64
datetime     datetime64[ns]
Length: 617, dtype: object

In [59]:
# Remove the helper 'datetime' column if it is still present; errors='ignore'
# keeps this cell re-runnable after the column has already been dropped.
pivoted=pivoted.drop(columns=['datetime'], errors='ignore')
# Resample the data into 1-hour intervals and sum the values.
# min_count=1 leaves an interval as NaN when it has no non-missing reading
# instead of reporting 0.
aggregated_data = pivoted.resample('1h').sum(min_count=1)