Process the raw Ausgrid solar data into a more usable format.<br>
Inspired by: https://github.com/pierre-haessig/ausgrid-solar-data

In [1]:
import os
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Input file: three yearly files combined into a single CSV:
#   2010-2011 Solar home electricity data.csv
#   2011-2012 Solar home electricity data v2.csv
#   2012-2013 Solar home electricity data v2.csv
fname = 'solar.csv'

# Parsing options, kept together so they are easy to review:
#   skiprows    - the first line of the file is not read (presumably a
#                 non-tabular line above the header row; TODO confirm)
#   parse_dates - the 'date' column is parsed to datetimes, day first
#   na_filter   - missing-value detection is switched off
#   dtype       - 'Row Quality' is read as text
csv_options = {
    'skiprows': 1,
    'parse_dates': ['date'],
    'dayfirst': True,
    'na_filter': False,
    'dtype': {'Row Quality': str},
}

d_raw = pd.read_csv(fname, **csv_options)

  d_raw = pd.read_csv(fname, skiprows=1,


In [3]:
# Preview the first five rows to check how the file was parsed.
d_raw.head(n=5)

Unnamed: 0,Customer,Generator Capacity,Postcode,Consumption Category,date,00:30,01:00,01:30,02:00,02:30,...,20:00,20:30,21:00,21:30,22:00,22:30,23:00,23:30,00:00,Row Quality
0,1,3.78,2076,GC,2010-07-01,0.303,0.471,0.083,0.121,0.361,...,0.54,0.406,0.543,0.495,0.216,0.378,0.128,0.078,0.125,
1,1,3.78,2076,CL,2010-07-01,1.25,1.244,1.256,0.744,0.019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.075,
2,1,3.78,2076,GG,2010-07-01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,1,3.78,2076,GC,2010-07-02,0.116,0.346,0.122,0.079,0.12,...,1.1,1.012,0.817,0.526,0.335,0.402,0.142,0.12,0.111,
4,1,3.78,2076,CL,2010-07-02,1.238,1.238,1.256,1.25,0.169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.088,


#### Pick suitable customers for the Environment (Manual)

In [None]:
# Group by postcode for some consistency.
# value_counts() counts rows per postcode and orders them from the most to
# the least frequent, so head(5) is the five postcodes with the most rows.
postcode_counts = d_raw['Postcode'].value_counts()
print("Top 5 postcodes with most samples:", postcode_counts.head(5), sep="\n")

In [None]:
# Restrict the data to a single postcode and list the customers found there.
postcode = 2262

in_postcode = d_raw['Postcode'].eq(postcode)
d_post = d_raw.loc[in_postcode].copy()

# Distinct customer ids, in order of first appearance, as plain Python values.
d_post_cust = pd.unique(d_post['Customer']).tolist()
print(f"Customers in postcode {postcode}: {len(d_post_cust)}", d_post_cust, sep="\n")

In [None]:
# Print some statistics: for every customer of the selected postcode, the
# generator capacity and the mean of all half-hourly generation ('GG') and
# consumption ('GC') readings.
for customer_id in d_post_cust:  # not `id`, which would shadow the builtin

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Column positions 5..52 are taken to be the 48 half-hourly reading
    # columns ('00:30' ... '00:00') -- TODO confirm if the CSV layout changes.
    # Only those columns are converted, directly to floats, instead of turning
    # the whole mixed-type frame into an object array first.
    d_pv = d_customer[category == 'GG'].iloc[:, 5:53].to_numpy(dtype=float)
    d_d = d_customer[category == 'GC'].iloc[:, 5:53].to_numpy(dtype=float)

    # A customer without rows in a category is reported as NaN rather than
    # aborting the loop.
    pv_mean = d_pv.mean() if d_pv.size else float('nan')
    d_mean = d_d.mean() if d_d.size else float('nan')

    print(f"Customer {customer_id}")
    print(f"Generator Capacity: {d_customer['Generator Capacity'].iloc[0]} kWp")
    print(f"Average Generation: {pv_mean:.2f} kWh")
    print(f"Average Consumption: {d_mean:.2f} kWh")
    print("\n")

In [None]:
# Get big players, in search of liquidity.
# Reduce to one capacity per customer (the largest value over that customer's
# rows) and rank on it, so the value printed is the value ranked on. The
# stable sort makes ties come out in a reproducible order (by customer id),
# and the frame is scanned once instead of once per listed customer.
capacity_by_customer = d_raw.groupby("Customer")["Generator Capacity"].max()
top = capacity_by_customer.sort_values(ascending=False, kind="stable").head(10)

top_customers = top.index.to_numpy()
top_capacities = top.tolist()

print("Top 10 customers with highest generator capacity:")
for cid, cap in zip(top_customers, top_capacities):
    print(f"Customer {cid}: {cap} kWp")

In [None]:
# Plot net demand for a specific customer
# Should aim to get prosumers that provide good liquidity
customer_id = 13  # named customer_id so the builtin id() is not shadowed
is_prosumer = True

d_customer = d_raw[d_raw['Customer'] == customer_id]
category = d_customer['Consumption Category']

# Column positions 5..52 are taken to be the 48 half-hourly reading columns
# ('00:30' ... '00:00') -- TODO confirm if the CSV layout changes. Only those
# columns are converted, directly to floats, so the arithmetic below runs on
# numeric arrays rather than object arrays.
d_pv = d_customer[category == 'GG'].iloc[:, 5:53].to_numpy(dtype=float)
d_d = d_customer[category == 'GC'].iloc[:, 5:53].to_numpy(dtype=float)

# (days, 48) -> (days, 24, 2) -> (days, 24): add each pair of half-hours.
d_pv_hourly = d_pv.reshape(d_pv.shape[0], 24, 2).sum(axis=2)
d_d_hourly = d_d.reshape(d_d.shape[0], 24, 2).sum(axis=2)

# A prosumer's load is demand minus own generation; a consumer's is demand.
d_load = d_d_hourly - d_pv_hourly if is_prosumer else d_d_hourly
d_load = d_load.astype(np.float32)

# Per-hour mean and standard deviation across all days.
d_load_mean = d_load.mean(axis=0)
d_load_std = d_load.std(axis=0)

hours = range(24)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hours, d_load_mean, marker='o', label='Mean Net Demand')
ax.fill_between(hours, d_load_mean - d_load_std, d_load_mean + d_load_std, color='gray', alpha=0.3, label='±1 Std Dev')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Net Demand (kWh)')
ax.set_title(f'Customer {customer_id} (as {"Prosumer" if is_prosumer else "Consumer"})')
ax.grid(True)
ax.set_xticks(hours)
ax.legend()
plt.show()

#### Pick agents from the "Clean Dataset"
Taken from "Residential load and rooftop PV generation: an Australian distribution network dataset" (https://doi.org/10.1080/14786451.2015.1100196)

In [4]:
# Customer ids listed as the "clean dataset".
clean_dataset_ids = [
    2, 13, 14, 20, 33, 35, 38, 39, 56,
    69, 73, 74, 75, 82, 87, 88, 101, 104,
    106, 109, 110, 119, 124, 130, 137, 141, 144,
    152, 153, 157, 161, 169, 176, 184, 188, 189,
    193, 201, 202, 204, 206, 207, 210, 211, 212,
    214, 218, 244, 246, 253, 256, 273, 276, 297,
]

# Ids dropped from that list, with the reason for each.
excluded_ids = {
    2: 'Some missing data',
    161: 'Anomalous CL',
}

clean_agents = [agent for agent in clean_dataset_ids if agent not in excluded_ids]

In [5]:
# One output folder per role.
os.makedirs('results/consumer', exist_ok=True)
os.makedirs('results/prosumer', exist_ok=True)

hours = range(24)

# Plot net demand for all agents (both as consumer and prosumer) and save as PNGs
for customer_id in tqdm(clean_agents, desc="Processing clean agents"):  # not `id`: keeps the builtin usable

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Column positions 5..52 are taken to be the 48 half-hourly reading
    # columns ('00:30' ... '00:00') -- TODO confirm if the CSV layout changes.
    # Only those columns are converted, directly to floats.
    d_pv = d_customer[category == 'GG'].iloc[:, 5:53].to_numpy(dtype=float)
    d_d = d_customer[category == 'GC'].iloc[:, 5:53].to_numpy(dtype=float)

    # (days, 48) -> (days, 24, 2) -> (days, 24): add each pair of half-hours.
    d_pv_hourly = d_pv.reshape(d_pv.shape[0], 24, 2).sum(axis=2)
    d_d_hourly = d_d.reshape(d_d.shape[0], 24, 2).sum(axis=2)

    for is_prosumer in (False, True):

        # Computed once and reused for the title and the output folder. This
        # also avoids reusing the same quote character inside an f-string,
        # which is a SyntaxError on Python versions before 3.12.
        role = 'prosumer' if is_prosumer else 'consumer'

        # A prosumer's load is demand minus own generation; a consumer's is demand.
        d_load = d_d_hourly - d_pv_hourly if is_prosumer else d_d_hourly
        d_load = d_load.astype(np.float32)

        # Per-hour mean and standard deviation across all days.
        d_load_mean = d_load.mean(axis=0)
        d_load_std = d_load.std(axis=0)

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(hours, d_load_mean, marker='o', label='Mean Net Demand')
        ax.fill_between(hours, d_load_mean - d_load_std, d_load_mean + d_load_std, color='gray', alpha=0.3, label='±1 Std Dev')
        ax.set_xlabel('Hour of Day')
        ax.set_ylabel('Net Demand (kWh)')
        ax.set_title(f'Customer {customer_id} (as {role.capitalize()})')
        ax.grid(True)
        ax.set_xticks(hours)
        ax.legend()
        fig.savefig(f'results/{role}/customer_{customer_id}_net_demand.png')
        # Close every figure so memory does not grow over the loop.
        plt.close(fig)


Processing clean agents: 100%|██████████| 52/52 [00:16<00:00,  3.13it/s]


#### Save processed data to a json file (for MARL environment)

In [6]:
# Agents exported for the environment, by role.
consumers = [14, 69, 101, 119]
prosumers = [13, 73, 110, 144]

json_data = dict()

for customer_id in sorted(consumers + prosumers):  # not `id`, which would shadow the builtin

    d_customer = d_raw[d_raw['Customer'] == customer_id]
    category = d_customer['Consumption Category']

    # Column positions 5..52 are taken to be the 48 half-hourly reading
    # columns ('00:30' ... '00:00') -- TODO confirm if the CSV layout changes.
    # Only those columns are converted, directly to floats.
    d_pv = d_customer[category == 'GG'].iloc[:, 5:53].to_numpy(dtype=float)
    d_d = d_customer[category == 'GC'].iloc[:, 5:53].to_numpy(dtype=float)

    # Both series are written side by side under one start/end date, so a
    # customer with a different number of days per category is rejected
    # rather than exported with series of unequal length.
    if d_pv.shape != d_d.shape:
        raise ValueError(
            f"Customer {customer_id}: generation has shape {d_pv.shape} "
            f"but consumption has shape {d_d.shape}"
        )

    # Aggregate half-hourly data to hourly by summing consecutive pairs
    d_pv = d_pv.reshape(-1, 2).sum(axis=1)
    d_d = d_d.reshape(-1, 2).sum(axis=1)

    json_data[customer_id] = {
        'prosumer': customer_id in prosumers,
        # Plain float, so serialisation does not depend on the column's
        # numpy dtype.
        'pv_capacity': float(d_customer['Generator Capacity'].iloc[0]),
        'start_date': d_customer['date'].min().strftime('%d-%m-%Y'),
        'end_date': d_customer['date'].max().strftime('%d-%m-%Y'),
        'pv': d_pv.tolist(),
        'demand': d_d.tolist(),
    }

# Save to JSON file
with open('group_3.json', 'w') as f:
    json.dump(json_data, f, indent=4)