Process the raw Ausgrid solar data into a more usable format.<br>
Inspired by: https://github.com/pierre-haessig/ausgrid-solar-data

In [None]:
import os
import json
from collections import Counter

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Input file: the three yearly Ausgrid releases combined into a single CSV:
#   2010-2011 Solar home electricity data.csv
#   2011-2012 Solar home electricity data v2.csv
#   2012-2013 Solar home electricity data v2.csv
fname = 'solar.csv'

d_raw = pd.read_csv(
    fname,
    skiprows=1,                  # skip the first line; column names come from the next one
    parse_dates=['date'],
    dayfirst=True,               # interpret ambiguous dates as day/month
    na_filter=False,             # keep cells as read, no NaN detection
    dtype={'Row Quality': str},
)

In [None]:
# Derive boolean calendar flags from the parsed 'date' column.
date_parts = d_raw['date'].dt

# Weekend flag: dayofweek 5 (Saturday) or 6 (Sunday)
d_raw['Weekend'] = date_parts.dayofweek.isin([5, 6])

# Summer flag: November through March
d_raw['Summer'] = date_parts.month.isin([11, 12, 1, 2, 3])

# Winter flag: June through August
d_raw['Winter'] = date_parts.month.isin([6, 7, 8])

In [None]:
# Preview the first rows of the loaded table, including the flag columns added above.
d_raw.head()

#### Pick suitable customers for the Environment (Manual)

In [None]:
# Count rows per postcode (most frequent first); picking customers from one
# postcode keeps the selection consistent
postcode_counts = d_raw['Postcode'].value_counts()

print("Top 5 postcodes with most samples:")
print(postcode_counts.iloc[:5])

In [None]:
# Restrict the data to a single postcode and list the customers found there
postcode = 2262

in_postcode = d_raw['Postcode'] == postcode
d_post = d_raw.loc[in_postcode].copy()

# Distinct customer ids, in order of first appearance
d_post_cust = d_post['Customer'].unique().tolist()

print(f"Customers in postcode {postcode}: {len(d_post_cust)}")
print(d_post_cust)

In [None]:
# Print summary statistics for every customer in the selected postcode.
# The loop variable is called customer_id so the builtin id() is not shadowed.
for customer_id in d_post_cust:

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Columns 5:53 are taken as the 48 half-hourly readings of each daily row
    # (NOTE(review): positional slice - confirm against the CSV column order).
    # 'GG' rows are reported as generation, 'GC' rows as consumption.
    d_pv = d_customer[category == 'GG'].to_numpy()[:, 5:53].flatten()
    d_d = d_customer[category == 'GC'].to_numpy()[:, 5:53].flatten()

    print(f"Customer {customer_id}")
    print(f"Generator Capacity: {d_customer['Generator Capacity'].iloc[0]} kWp")
    print(f"Average Generation: {d_pv.mean():.2f} kWh")
    print(f"Average Consumption: {d_d.mean():.2f} kWh")
    print("\n")

In [None]:
# Find the largest installations ("big players"), in search of liquidity
by_capacity = d_raw.sort_values("Generator Capacity", ascending=False)

# First ten distinct customers met when walking the rows from largest capacity down
top_customers = by_capacity["Customer"].unique()[:10]

# Capacity of each of them, read from that customer's first row in d_raw
top_capacities = []
for cid in top_customers:
    customer_rows = d_raw[d_raw["Customer"] == cid]
    top_capacities.append(customer_rows["Generator Capacity"].iloc[0])

print("Top 10 customers with highest generator capacity:")
for cid, cap in zip(top_customers, top_capacities):
    print(f"Customer {cid}: {cap} kWp")

In [None]:
# Plot the mean hourly profile (with a +/-1 std band) for a specific customer
# Should aim to get prosumers that provide good liquidity?
customer_id = 119      # named customer_id so the builtin id() is not shadowed
is_prosumer = False    # True: demand minus PV generation; False: demand only

d_customer = d_raw[d_raw['Customer'] == customer_id]
category = d_customer['Consumption Category']

# Day filter applied to both series - change as needed
day_filter = d_customer['Summer'] & ~d_customer['Weekend']

# Columns 5:53 hold 48 half-hourly values per day; each consecutive pair is
# summed to obtain 24 hourly values per day.
d_pv = d_customer[(category == 'GG') & day_filter].to_numpy()[:, 5:53]
d_pv_hourly = d_pv.reshape(d_pv.shape[0], 24, 2).sum(axis=2)

d_d = d_customer[(category == 'GC') & day_filter].to_numpy()[:, 5:53]
d_d_hourly = d_d.reshape(d_d.shape[0], 24, 2).sum(axis=2)

# NOTE(review): the subtraction needs the same number of 'GG' and 'GC' days.
d_load = d_d_hourly - d_pv_hourly if is_prosumer else d_d_hourly

# Cast once, then take per-hour statistics across the selected days
d_load_f32 = d_load.astype(np.float32)
d_load_mean = d_load_f32.mean(axis=0)
d_load_std = d_load_f32.std(axis=0)

plt.figure(figsize=(10, 6))
plt.plot(range(24), d_load_mean, marker='o', label='Mean Net Demand')
plt.fill_between(range(24), d_load_mean - d_load_std, d_load_mean + d_load_std, color='gray', alpha=0.3, label='±1 Std Dev')
plt.xlabel('Hour of Day')
plt.ylabel('Net Demand (kWh)')
plt.title(f'Customer {customer_id} (as {"Prosumer" if is_prosumer else "Consumer"})')
plt.grid(True)
plt.xticks(range(24))
plt.legend()
plt.show()

#### Pick agents from the "Clean Dataset"
Taken from "Residential load and rooftop PV generation: an Australian distribution network dataset" (https://doi.org/10.1080/14786451.2015.1100196)

In [None]:
clean_agents = [2, 13, 14, 20, 33, 35, 38, 39, 56,
                69, 73, 74, 75, 82, 87, 88, 101, 104,
                106, 109, 110, 119, 124, 130, 137, 141, 144,
                152, 153, 157, 161, 169, 176, 184, 188, 189,
                193, 201, 202, 204, 206, 207, 210, 211, 212,
                214, 218, 244, 246, 253, 256, 273, 276, 297]

clean_agents.remove(2) # Some missing data
clean_agents.remove(161) # Anamolous CL

In [None]:
os.makedirs('results/consumer', exist_ok=True)
os.makedirs('results/prosumer', exist_ok=True)

# Plot the mean hourly profile of every clean agent, once as a consumer
# (demand only) and once as a prosumer (demand minus PV), and save as PNGs.
# The loop variable is called customer_id so the builtin id() is not shadowed.
for customer_id in tqdm(clean_agents, desc="Processing clean agents"):

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Day filter applied to both series - change as needed
    day_filter = d_customer['Winter'] & ~d_customer['Weekend']

    # Columns 5:53 hold 48 half-hourly values per day; each consecutive pair
    # is summed to obtain 24 hourly values per day.
    d_pv = d_customer[(category == 'GG') & day_filter].to_numpy()[:, 5:53]
    d_pv_hourly = d_pv.reshape(d_pv.shape[0], 24, 2).sum(axis=2)

    d_d = d_customer[(category == 'GC') & day_filter].to_numpy()[:, 5:53]
    d_d_hourly = d_d.reshape(d_d.shape[0], 24, 2).sum(axis=2)

    for is_prosumer in [False, True]:

        # Resolved up front: nesting the same quote type inside an f-string
        # is a SyntaxError on Python versions before 3.12.
        role = 'prosumer' if is_prosumer else 'consumer'

        # NOTE(review): the subtraction needs the same number of 'GG' and 'GC' days.
        d_load = d_d_hourly - d_pv_hourly if is_prosumer else d_d_hourly

        # Cast once, then take per-hour statistics across the selected days
        d_load_f32 = d_load.astype(np.float32)
        d_load_mean = d_load_f32.mean(axis=0)
        d_load_std = d_load_f32.std(axis=0)

        plt.figure(figsize=(10, 6))
        plt.plot(range(24), d_load_mean, marker='o', label='Mean Net Demand')
        plt.fill_between(range(24), d_load_mean - d_load_std, d_load_mean + d_load_std, color='gray', alpha=0.3, label='±1 Std Dev')
        plt.xlabel('Hour of Day')
        plt.ylabel('Net Demand (kWh)')
        plt.title(f'Customer {customer_id} (as {role.capitalize()})')
        plt.grid(True)
        plt.xticks(range(24))
        plt.legend()
        plt.savefig(f'results/{role}/customer_{customer_id}_net_demand.png')
        plt.close()  # release the figure so they do not pile up across iterations


#### Save processed data to a json file (for MARL environment)

In [None]:
# Customers exported for the MARL environment
consumers = [69, 184, 189, 212]
prosumers = [110, 119, 193, 256]

json_data = dict()

# Calendar metadata is derived from the 'GC' rows of the first prosumer.
# NOTE(review): this assumes every exported customer covers the same days in
# the same order - confirm.
d_sample = d_raw[(d_raw['Customer'] == prosumers[0]) &
                 (d_raw['Consumption Category'] == 'GC')]

# Re-index from 0 so the row index doubles as the day index used below
d_sample = d_sample.reset_index(drop=True)

d_weekend = d_sample['Weekend'].to_list()
d_summer = d_sample['Summer'].to_list()
d_winter = d_sample['Winter'].to_list()

start_date = d_sample['date'].min().strftime('%d-%m-%Y')
end_date = d_sample['date'].max().strftime('%d-%m-%Y')

# Randomly pick one day from each week as test day
np.random.seed(5)  # For reproducibility
week_of_day = d_sample['date'].dt.to_period('W')  # computed once, not once per week
unique_weeks = week_of_day.unique()
test_days = [int(d_sample[week_of_day == week].sample(1).index[0]) for week in unique_weeks]

# Every remaining day is a training day (set gives constant-time membership tests)
test_day_set = set(test_days)
train_days = [i for i in range(len(d_sample)) if i not in test_day_set]

json_data['meta'] = {'start_date': start_date,
                     'end_date': end_date,
                     'is_weekend': d_weekend,
                     'is_summer': d_summer,
                     'is_winter': d_winter,
                     'train_days': train_days,
                     'test_days': test_days}

# The loop variable is called customer_id so the builtin id() is not shadowed.
for customer_id in sorted(consumers + prosumers):

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Columns 5:53 are taken as the 48 half-hourly readings of each daily row;
    # 'GG' rows feed the PV series and 'GC' rows the demand series.
    d_pv = d_customer[category == 'GG'].to_numpy()[:, 5:53].flatten()
    d_d = d_customer[category == 'GC'].to_numpy()[:, 5:53].flatten()

    # Aggregate half-hourly data to hourly by summing consecutive pairs
    d_pv = d_pv.reshape(-1, 2).sum(axis=1)
    d_d = d_d.reshape(-1, 2).sum(axis=1)

    pv = d_pv.tolist()
    demand = d_d.tolist()

    # Scale each series independently to [0, 1]; the maxima are stored too
    pv_scaler = MinMaxScaler()
    demand_scaler = MinMaxScaler()

    pv_scaled = pv_scaler.fit_transform(np.array(pv).reshape(-1, 1)).flatten().tolist()
    demand_scaled = demand_scaler.fit_transform(np.array(demand).reshape(-1, 1)).flatten().tolist()

    json_data[customer_id] = {'prosumer': customer_id in prosumers,
                              'pv_capacity': d_customer['Generator Capacity'].iloc[0],
                              'pv': pv,
                              'demand': demand,
                              'pv_scaled': pv_scaled,
                              'demand_scaled': demand_scaled,
                              'pv_max': pv_scaler.data_max_[0],
                              'demand_max': demand_scaler.data_max_[0]}

# Save to JSON file (integer customer ids are written as string keys)
with open('group_4.json', 'w') as f:
    json.dump(json_data, f, indent=4)

In [None]:
# Tally how the chosen test days fall across the days of the week
# (author's note: seed=5 and seed=42 work well)
weekdays = d_sample['date'].dt.dayofweek.to_list()

# Day-of-week number (0 = Monday) of every test day
test_weekdays = [weekdays[day] for day in test_days]
day_distribution = Counter(test_weekdays)
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Report in calendar order, Monday first
for weekday_idx in sorted(day_distribution):
    print(f"{day_names[weekday_idx]}: {day_distribution[weekday_idx]}")