__DATA HANDLING__

This file is only used to create edited datasets. It allows for testing different sample sizes or resampling techniques. To run it, the original files from Hugging Face need to be downloaded first.

In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
#Data

# Need to download the original files from huggingface
demand_data = pd.read_parquet('data/demand.parquet')
metadata = pd.read_parquet('data/metadata.parquet')
weather_data = pd.read_parquet('data/weather.parquet')

# Normalise the join keys before merging: timestamps as datetimes and
# location ids as strings on both sides of the weather join.
demand_data['timestamp'] = pd.to_datetime(demand_data['timestamp'])
metadata['location_id'] = metadata['location_id'].astype(str)
weather_data['timestamp'] = pd.to_datetime(weather_data['timestamp'])
weather_data['location_id'] = weather_data['location_id'].astype(str)

# Sampling, increase if have more compute power or get rid of
demand_data_sampled = demand_data.sample(frac=0.8, random_state=42)
demand_weather_merged = pd.merge(demand_data_sampled, metadata, on='unique_id', how='inner')
# Left join: demand rows without an exact (timestamp, location_id) weather
# match are kept, with NaN in the weather columns.
demand_weather_merged = pd.merge(demand_weather_merged, weather_data, on=['timestamp', 'location_id'], how='left')

# Only want to see London Smart Meter
# NOTE(review): if 'dataset' comes from metadata, this filter could run before
# the weather merge to save memory - confirm the schema first.
is_london = demand_weather_merged['dataset'] == 'London Smart Meter Data'

# sample() returns rows in random order and the merges keep that order, so the
# rows must be put in chronological order per building before forward-filling.
# Otherwise gaps would be filled from unrelated buildings and timestamps.
# sort_values returns a new frame, so the assignments below do not write into
# a slice of the merged frame.
demand_weather_merged = demand_weather_merged[is_london].sort_values(['building_id', 'timestamp'])

# Forward-fill within each building only, so values never leak across buildings.
# groupby(...).ffill() returns every column except the grouping key, aligned on
# the original index.
filled = demand_weather_merged.groupby('building_id').ffill()
demand_weather_merged[filled.columns] = filled

# Dropping empty or unused columns
# Rows that still have gaps (nothing earlier to fill from) are removed.
demand_weather_merged = demand_weather_merged.dropna()
demand_weather_merged = demand_weather_merged.drop(['unique_id', 'location_id'], axis=1)

# Integer code per building, used as a model-friendly identifier.
label_encoder = LabelEncoder()
demand_weather_merged['building_id_encoded'] = label_encoder.fit_transform(demand_weather_merged['building_id'])

# Calendar features derived from the timestamp.
demand_weather_merged['hour'] = demand_weather_merged['timestamp'].dt.hour
demand_weather_merged['day_of_week'] = demand_weather_merged['timestamp'].dt.dayofweek
demand_weather_merged['month'] = demand_weather_merged['timestamp'].dt.month

# Resampling to be able to load more data
def resample_building_data(group):
    """Aggregate one building's rows into weekly records.

    The input frame is left untouched: indexing by timestamp is done on a
    derived frame instead of in place, because mutating the object handed
    over by ``groupby.apply`` can produce unexpected results.

    Args:
        group: DataFrame with a datetime ``timestamp`` column plus the
            weather, demand (``y``) and calendar columns aggregated below.

    Returns:
        DataFrame with one row per weekly bin (pandas ``'W'`` frequency) and
        ``timestamp`` restored as a regular column.

    Note:
        Weekly bins inside the group's time span that contain no rows are
        still emitted by ``resample``; their sums are 0 and their
        mean/first values are NaN.
    """
    weekly = (
        group.set_index('timestamp')
        .resample('W')
        .agg({
            'temperature_2m': 'mean',
            'precipitation': 'sum',
            'snow_depth': 'mean',
            'pressure_msl': 'mean',
            'cloud_cover': 'mean',
            'sunshine_duration': 'mean',
            'y': 'sum',
            'day_of_week': 'mean',
            'month': 'first',
            'building_id_encoded': 'first',
        })
    )
    return weekly.reset_index()

# Collapse each building to weekly rows, discard the index built by
# groupby/apply, and order the combined result chronologically.
demand_weather_merged = (
    demand_weather_merged
    .groupby('building_id')
    .apply(resample_building_data)
    .reset_index(drop=True)
    .sort_values(by='timestamp')
)

# Save parquet file
demand_weather_merged.to_parquet('data/demand_weather_merged_0.8.parquet', index=False)