In [None]:
# generate dummy customer smart meter data (15-minute readings with a daily usage cycle) using pandas

import pandas as pd
import numpy as np
import plotly.express as px

# Fixed seed so the dummy data (and every downstream anomaly result) is
# reproducible under Restart Kernel -> Run All.
rng = np.random.default_rng(42)

# generate 15-minute timestamps covering August 2024
date_range = pd.date_range(start='2024-08-01', end='2024-09-01', freq='15min')
date_range = date_range[:-1]  # drop the trailing 2024-09-01 00:00 timestamp

# create a dataframe with the date range
df = pd.DataFrame(index=date_range)

# baseline 15-minute usage: half-sine over the hour of day (amplitude 10)
# plus integer noise drawn from [0, 5)
df['usage'] = np.sin(np.pi * (df.index.hour)/24) * 10 + rng.integers(0, 5, len(df))

# add single anomaly days: same shape as the baseline, larger amplitude (12)
anomaly_days = ['2024-08-10', '2024-08-20']
for day in anomaly_days:
    day_index = df.loc[day].index
    df.loc[day, 'usage'] = np.sin(np.pi * (day_index.hour)/24) * 12 + rng.integers(0, 5, len(day_index))

# add prolonged anomaly period: five consecutive days with amplitude 11.5
prolonged_anomaly_days = ['2024-08-27', '2024-08-28', '2024-08-29', '2024-08-30', '2024-08-31']
for day in prolonged_anomaly_days:
    day_index = df.loc[day].index
    df.loc[day, 'usage'] = np.sin(np.pi * (day_index.hour)/24) * 11.5 + rng.integers(0, 5, len(day_index))

# plot using plotly
fig = px.line(df, x=df.index, y='usage')
fig.show()

# save to csv
df.to_csv('smart_meter_data.csv')

In [None]:
# generate weather data on a 15-minute grid covering August 2024
date_range = pd.date_range(start='2024-08-01', end='2024-09-01', freq='15min')
date_range = date_range[:-1]  # drop the trailing 2024-09-01 00:00 timestamp

df_weather = pd.DataFrame(index=date_range)

# baseline temperature: half-sine over the hour of day (amplitude 10) on a
# 15-degree offset, plus integer noise drawn from [0, 5)
df_weather['temperature'] = np.sin(np.pi * (df_weather.index.hour)/24) * 10 + 15 + np.random.randint(0, 5, len(df_weather))

# add single anomaly days: same shape, offset raised from 15 to 25
anomaly_days = ['2024-08-10', '2024-08-20']
for day in anomaly_days:
    # Size the noise from df_weather itself rather than the smart-meter frame,
    # so this cell does not depend on another cell's `df`.
    day_index = df_weather.loc[day].index
    df_weather.loc[day, 'temperature'] = np.sin(np.pi * (day_index.hour)/24) * 10 + 25 + np.random.randint(0, 5, len(day_index))

fig = px.line(df_weather, x=df_weather.index, y='temperature')
fig.show()

# save to csv
df_weather.to_csv('weather_data.csv')

In [80]:
from scipy.stats import zscore


def detect_daily_anomalies(df, zscore_threshold=2):
    """
    Detects single-day anomalies in energy usage from smart meter data.

    Readings are summed per calendar day, the daily totals are z-scored, and
    every day whose z-score is strictly greater than the threshold is flagged.
    Only unusually high days are flagged; low usage is not.

    Parameters:
    df (pd.DataFrame): DataFrame with a datetime index and a "usage" column.
    zscore_threshold (float): Z-score a day must exceed to be flagged (default 2).

    Returns:
    pd.DatetimeIndex or None: The flagged days, or None when no day is flagged.
    """

    # Aggregate the data to daily usage
    df_daily = df.resample("D").sum()

    # Calculate the Z-score for daily usage
    df_daily["zscore"] = zscore(df_daily["usage"])

    # Identify anomalies
    df_daily["anomalies"] = df_daily["zscore"] > zscore_threshold

    # None (rather than an empty index) is the "nothing found" signal
    if not df_daily["anomalies"].any():
        return None
    return df_daily[df_daily["anomalies"]].index
    
# Flag anomalous days in the smart meter frame `df`; the result is a
# DatetimeIndex of flagged days, or None when no day is flagged.
anomalies = detect_daily_anomalies(df)

In [None]:
from scipy.stats import zscore


def detect_prolonged_anomalies(df, min_consecutive_days=3, zscore_threshold=1.5):
    """
    Detects prolonged anomalies in energy usage from smart meter data.

    Readings are summed per calendar day and the daily totals are z-scored.
    For every window length from min_consecutive_days up to the number of days,
    the rolling mean of the z-score is computed; each window whose mean is
    strictly greater than zscore_threshold is marked as anomalous in full.
    The marked days are then merged into runs of consecutive days.

    Parameters:
    df (pd.DataFrame): DataFrame containing the energy usage data with a datetime index
        and a "usage" column.
    min_consecutive_days (int): Minimum number of consecutive days to consider as a prolonged anomaly.
    zscore_threshold (float): Threshold the window-averaged z-score must exceed.

    Returns:
    list of (start, end) tuples or None: One tuple of inclusive start/end dates per run of
        consecutive flagged days, in chronological order; None when no window is flagged.
    """

    # Aggregate the data to daily usage
    df_daily = df.resample("D").sum()

    # Calculate the Z-score for daily usage
    df_daily["zscore"] = zscore(df_daily["usage"])

    # For each day, the longest flagged window that ENDS on that day (0 = none)
    df_daily["prolonged_anomaly_length"] = 0

    # Sliding window to calculate the average Z-score over periods
    for window_size in range(min_consecutive_days, len(df_daily) + 1):
        avg_zscore = df_daily["zscore"].rolling(window=window_size).mean()
        prolonged_anomaly = (avg_zscore > zscore_threshold).astype(int) * window_size
        df_daily["prolonged_anomaly_length"] = np.maximum(
            df_daily["prolonged_anomaly_length"], prolonged_anomaly
        )

    # Mark every day covered by a flagged window. The rolling mean is NaN until a
    # full window is available, so the computed start position is never negative.
    df_daily["anomaly_window"] = False
    window_col = df_daily.columns.get_loc("anomaly_window")
    for i in range(len(df_daily)):
        window_length = df_daily["prolonged_anomaly_length"].iloc[i]
        if window_length > 0:
            start_idx = i - window_length + 1
            df_daily.iloc[start_idx : i + 1, window_col] = True

    flagged_days = df_daily[df_daily["anomaly_window"]].index
    if len(flagged_days) == 0:
        return None

    # Merge flagged days into runs of consecutive dates. The final run is closed
    # after the loop so that it is reported even when it is a single day.
    one_day = pd.Timedelta(days=1)
    prolonged_anomalies = []
    start_date = previous_day = flagged_days[0]
    for current_day in flagged_days[1:]:
        if current_day != previous_day + one_day:
            prolonged_anomalies.append((start_date, previous_day))
            start_date = current_day
        previous_day = current_day
    prolonged_anomalies.append((start_date, previous_day))
    return prolonged_anomalies
    
# Detect prolonged anomalies in the smart meter data.
# The result is a list of (start, end) date tuples, or None when nothing is flagged.
prolonged_anomalies = detect_prolonged_anomalies(df)
print(prolonged_anomalies)

In [None]:
import pandas as pd

# Load the weather readings from CSV; the first column becomes the index and is parsed as dates
weather_data = pd.read_csv("weather_data.csv", index_col=0, parse_dates=True)

def analyse_weather_data(df, anomalies, prolonged_anomalies):
    """
    Summarise temperatures on the days flagged as energy usage anomalies.

    The weather data is averaged per calendar day. The overall mean of those daily
    averages is reported alongside the daily average for each single-day anomaly
    and the mean daily average across each prolonged anomaly window.

    Parameters:
    df (pd.DataFrame): DataFrame containing weather data with a datetime index
        and a "temperature" column.
    anomalies (iterable of dates or None): Dates with energy usage anomalies.
        None is treated as "no anomalies".
    prolonged_anomalies (list of tuples or None): (start, end) dates of prolonged
        anomalies, both inclusive. None is treated as "no prolonged anomalies".

    Returns:
    tuple of three str: The average temperature line, the per-anomaly temperature
        line, and the per-prolonged-anomaly temperature line, in that order.
    """

    # The detection functions return None when nothing is flagged; treat that
    # as an empty collection instead of failing on iteration.
    if anomalies is None:
        anomalies = []
    if prolonged_anomalies is None:
        prolonged_anomalies = []

    # resample to daily data
    df_daily = df.resample("D").mean()

    # calculate average temperature
    average_temperature = df_daily["temperature"].mean()

    # daily average temperature on each single-day anomaly
    anomaly_temperatures = {
        anomaly: df_daily.loc[anomaly, "temperature"] for anomaly in anomalies
    }

    # mean daily temperature across each prolonged anomaly (label slice is inclusive)
    prolonged_anomaly_temperatures = {
        (start_date, end_date): df_daily.loc[start_date:end_date, "temperature"].mean()
        for start_date, end_date in prolonged_anomalies
    }

    average_temperature_str = "Average temperature: {:.2f}".format(average_temperature)
    anomaly_temperatures_str = "Temperature during anomalies: " + ", ".join(
        [f"[{date.strftime('%Y-%m-%d')}: {temperature:.2f}]" for date, temperature in anomaly_temperatures.items()]
    )
    prolonged_anomaly_temperatures_str = "Temperature during prolonged anomalies: " + ", ".join(
        [
            f"[{start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}: {temperature:.2f}]"
            for (start_date, end_date), temperature in prolonged_anomaly_temperatures.items()
        ]
    )
    return average_temperature_str, anomaly_temperatures_str, prolonged_anomaly_temperatures_str


# Analyse weather data during energy usage anomalies
# (uses `anomalies` and `prolonged_anomalies` assigned by the detection calls earlier in the notebook)
average_temperature_str, anomaly_temperatures_str, prolonged_anomaly_temperatures_str = analyse_weather_data(weather_data, anomalies, prolonged_anomalies)

# Print the three summary lines returned above
print(average_temperature_str)
print(anomaly_temperatures_str)
print(prolonged_anomaly_temperatures_str)

In [None]:
# `df_daily` is only ever a local variable inside the functions above, so it
# does not exist at notebook scope; rebuild the daily usage totals before displaying them.
df_daily = df.resample("D").sum()
df_daily