This notebook merges half-hourly consumption and tariff data into a single hourly dataset, processes the weather data into a separate hourly dataset, and performs the necessary data cleaning and aggregation.

Data sources:
- Dynamic Tariff Data: 'tariff_d.csv'
- Consumption Data for Group D: 'consumption_d.csv'
- London Weather Data for 2013: csv files that start with 'London,UK'

Note: Ensure all data files are in the same directory as this script.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import glob

# Load and process consumption and tariff data

In [2]:
# Read the raw inputs, parsing the GMT column of each file as datetimes.
tariff_data = pd.read_csv('tariff_d.csv', parse_dates=['GMT'])
consumption_data = pd.read_csv('consumption_d.csv', parse_dates=['GMT'])

# Restrict the tariff data to calendar year 2013.
tariff_in_2013 = tariff_data['GMT'].dt.year == 2013
tariff_data_filtered = tariff_data.loc[tariff_in_2013].copy()

# Restrict the consumption data to 2013 as well, additionally dropping the
# reading stamped exactly at 2013-01-01 00:00:00.
# NOTE(review): presumably that reading covers the last interval of 2012 — confirm.
consumption_in_2013 = consumption_data['GMT'].dt.year == 2013
not_first_midnight = consumption_data['GMT'] != '2013-01-01 00:00:00'
consumption_data_filtered = consumption_data.loc[consumption_in_2013 & not_first_midnight].copy()

# Convert consumption data to long format and handle missing values

In [3]:
# Reshape from one column per household to one row per (GMT, household_id).
consumption_melted = consumption_data_filtered.melt(
    id_vars=['GMT'],
    var_name='household_id',
    value_name='consumption',
)

# Per-household totals: number of rows, and number of rows with no reading.
rows_per_household = consumption_melted.groupby('household_id').size()
missing_per_household = (
    consumption_melted.loc[consumption_melted['consumption'].isna()]
    .groupby('household_id')
    .size()
    .reindex(rows_per_household.index, fill_value=0)  # households without gaps get 0
)

all_households = pd.DataFrame(index=rows_per_household.index)
all_households['missing_values'] = missing_per_household
all_households['percentage_missing'] = (all_households['missing_values'] / rows_per_household) * 100

# Drop every household whose share of missing readings exceeds 5%.
too_sparse = all_households['percentage_missing'] > 5
households_to_remove = all_households.loc[too_sparse].index
keep_row = ~consumption_melted['household_id'].isin(households_to_remove)
consumption_melted_filtered = consumption_melted.loc[keep_row]

remaining_households = consumption_melted_filtered['household_id'].nunique()
print(f"Number of households removed due to >5% missing values: {len(households_to_remove)}")
print(f"Number of unique remaining household_ids: {remaining_households}")

Number of households removed due to >5% missing values: 18
Number of unique remaining household_ids: 1007


# Imputation

In [4]:
def _fill_household_gaps(readings):
    """Fill gaps in one household's series: time interpolation, then ffill/bfill for the edges."""
    return readings.interpolate(method='time').ffill().bfill()


# Time-based interpolation needs the timestamps on the index, so index by GMT,
# impute each household independently, then restore GMT as a regular column.
consumption_by_time = consumption_melted_filtered.set_index('GMT')
consumption_imputed = consumption_by_time.assign(
    consumption=consumption_by_time.groupby('household_id')['consumption'].transform(_fill_household_gaps)
).reset_index()

# Check for missing values after imputation
missing_values_after_imputation = consumption_imputed.isna().sum()
print("Missing values in consumption_imputed after imputation:")
print(missing_values_after_imputation)

Missing values in consumption_imputed after imputation:
GMT             0
household_id    0
consumption     0
dtype: int64


# Aggregate data to hourly granularity and merge datasets

In [5]:
# Label each record with the hourly bucket it is aggregated into. Shifting by
# 30 minutes before flooring puts the hh:30 reading and the following (hh+1):00
# reading into the same (hh+1):00 bucket.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# in recent pandas releases.
half_hour = pd.Timedelta(minutes=30)
consumption_imputed['hour'] = (consumption_imputed['GMT'] + half_hour).dt.floor('h')
tariff_data_filtered['hour'] = (tariff_data_filtered['GMT'] + half_hour).dt.floor('h')

# Consumption is summed per household and hour; the tariff is averaged per hour.
consumption_hourly = consumption_imputed.groupby(['hour', 'household_id'])['consumption'].sum().reset_index()
tariff_hourly = tariff_data_filtered.groupby('hour')['Price'].mean().reset_index()

# Left join keeps every consumption row even when no tariff exists for its hour.
merged_data = (
    pd.merge(consumption_hourly, tariff_hourly, on='hour', how='left')
    .rename(columns={'hour': 'timestamp', 'Price': 'tariff'})
    .sort_values(by=['timestamp', 'household_id'])
    .set_index('timestamp')
)

# Surface rows without a tariff value instead of letting them pass unnoticed.
rows_without_tariff = int(merged_data['tariff'].isna().sum())
if rows_without_tariff:
    print(f"Warning: {rows_without_tariff} hourly rows have no tariff value after the merge.")

In [6]:
# Write the hourly consumption/tariff table to disk, then report its dimensions.
consumption_tariff_path = '1a) consumption_and_tariff_data_hourly.csv'
merged_data.to_csv(consumption_tariff_path)

print("Consumption and tariff data processed and saved.")
print(f"Shape of merged data: {merged_data.shape}")
print(f"Columns in merged data: {list(merged_data.columns)}")

Consumption and tariff data processed and saved.
Shape of merged data: (8821320, 3)
Columns in merged data: ['household_id', 'consumption', 'tariff']


### Load and process weather data

In [7]:
# Collect every weather CSV whose file name starts with 'London,UK'.
weather_file_pattern = 'London,UK*.csv'
csv_files = glob.glob(weather_file_pattern)

def read_csv_with_datetime_index(file):
    """Load one CSV file and index it by its parsed 'datetime' column.

    Args:
        file: Path (or any other source accepted by ``pd.read_csv``) of the
            CSV to load. It must contain a 'datetime' column.

    Returns:
        DataFrame whose index is the 'datetime' column converted with
        ``pd.to_datetime``; all other columns are kept as read.
    """
    raw = pd.read_csv(file)
    parsed = raw.assign(datetime=pd.to_datetime(raw['datetime']))
    return parsed.set_index('datetime')

# Read and concatenate all CSV files, keep important columns
if not csv_files:
    # Fail with a clear message instead of pandas' generic concat error.
    raise FileNotFoundError("No weather files matching 'London,UK*.csv' were found in the working directory.")

# glob returns files in a file-system-dependent order and the default sort
# algorithm does not preserve the order of equal keys. Reading the files in
# sorted name order and sorting with a stable algorithm keeps rows that share a
# timestamp in a reproducible relative order from run to run.
df_list = [read_csv_with_datetime_index(file) for file in sorted(csv_files)]
merged_df = pd.concat(df_list).sort_index(kind='mergesort')
merged_df = merged_df[['solarradiation', 'windspeed', 'temp', 'precip', 'humidity']]

# Remove duplicate timestamps and handle the daylight saving time change

In [8]:
# Remove rows with duplicate timestamps, keeping only the first occurrence.
# An explicit copy is taken because rows are added to this frame below.
merged_df = merged_df[~merged_df.index.duplicated(keep='first')].copy()

# Find every step between consecutive timestamps that is not exactly one hour.
one_hour = pd.Timedelta(hours=1)
time_diffs = merged_df.index.to_series().diff().dropna()
unusual_entries = time_diffs[time_diffs != one_hour]

for timestamp, time_diff in unusual_entries.items():
    if time_diff == 2 * one_hour:
        # Exactly one hour is missing: fill it with the average of its two neighbours.
        prev_timestamp = timestamp - time_diff
        next_timestamp = timestamp
        interpolated_timestamp = prev_timestamp + one_hour
        interpolated_row = (
            merged_df.loc[prev_timestamp].astype(float) + merged_df.loc[next_timestamp].astype(float)
        ) / 2
        merged_df.loc[interpolated_timestamp] = interpolated_row
        print(f"Interpolated missing hour at: {interpolated_timestamp}")
    else:
        # Gaps of any other size are left as they are, but reported rather than
        # silently skipped so they can be inspected.
        print(f"Warning: unhandled gap of {time_diff} before {timestamp}; no rows were interpolated.")

# Sort the DataFrame again to ensure the interpolated row is in the correct position
merged_df = merged_df.sort_index()

Interpolated missing hour at: 2013-03-31 01:00:00


# Save weather data

In [9]:
# Write the cleaned hourly weather table to disk, then report its dimensions.
weather_output_path = '1b) merged_weather_data.csv'
merged_df.to_csv(weather_output_path)

print("Weather data processed and saved.")
print(f"Shape of weather data: {merged_df.shape}")
print(f"Columns in weather data: {list(merged_df.columns)}")

print("\nData preprocessing complete. Output files: '1a) consumption_and_tariff_data_hourly.csv' and '1b) merged_weather_data.csv'")

Weather data processed and saved.
Shape of weather data: (8760, 5)
Columns in weather data: ['solarradiation', 'windspeed', 'temp', 'precip', 'humidity']

Data preprocessing complete. Output files: '1a) consumption_and_tariff_data_hourly.csv' and '1b) merged_weather_data.csv'
