In [None]:
# generate dummy customer smart meter data (15-minute readings with a daily usage cycle) using pandas

import pandas as pd
import numpy as np
import plotly.express as px

# Seed the random generator so that a fresh "Restart & Run All" reproduces
# exactly the same dummy data (and therefore the same detected anomalies).
SEED = 42
rng = np.random.default_rng(SEED)

# 15-minute timestamps from 2024-08-01 up to 2024-09-01; the final stamp
# (2024-09-01 00:00) is dropped so the data ends at 2024-08-31 23:45.
date_range = pd.date_range(start='2024-08-01', end='2024-09-01', freq='15min')
date_range = date_range[:-1]

# create a dataframe with the date range
df = pd.DataFrame(index=date_range)


def _synthetic_usage(index, amplitude):
    """Return one synthetic usage value per timestamp in `index`.

    Each value is sin(pi * hour / 24) * amplitude (zero at hour 0, largest at
    hour 12) plus integer noise drawn uniformly from 0..4.

    Parameters:
    index (pd.DatetimeIndex): Timestamps to generate readings for.
    amplitude (float): Scale applied to the sine-shaped daily profile.

    Returns:
    np.ndarray: Array of floats with the same length as `index`.
    """
    daily_profile = np.sin(np.pi * index.hour.to_numpy() / 24)
    noise = rng.integers(0, 5, len(index))  # upper bound is exclusive
    return daily_profile * amplitude + noise


# baseline usage for every 15-minute interval
df['usage'] = _synthetic_usage(df.index, 10)

# add single anomaly days (larger amplitude than the baseline of 10)
anomaly_days = ['2024-08-10', '2024-08-20']
for day in anomaly_days:
    # a date string selects every reading of that calendar day
    df.loc[day, 'usage'] = _synthetic_usage(df.loc[day].index, 12)

# add prolonged anomaly period (five consecutive days, slightly raised amplitude)
prolonged_anomaly_days = ['2024-08-27', '2024-08-28', '2024-08-29', '2024-08-30', '2024-08-31']
for day in prolonged_anomaly_days:
    df.loc[day, 'usage'] = _synthetic_usage(df.loc[day].index, 11.5)

# plot using plotly
fig = px.line(df, x=df.index, y='usage', title='Synthetic smart meter usage (15-minute readings)')
fig.show()

# save to csv
df.to_csv('smart_meter_data.csv')

In [None]:
from scipy.stats import zscore

# Collapse the readings into one total per calendar day.
df_daily = df.resample('D').sum()

# Standardise the daily totals so each day is expressed as a Z-score.
df_daily['zscore'] = zscore(df_daily['usage'])

# A day is an outlier when its Z-score lies outside [-2, 2].
df_daily['outlier'] = (df_daily['zscore'] > 2) | (df_daily['zscore'] < -2)

# Midnight timestamps of the flagged days.
outlier_days = df_daily.loc[df_daily['outlier']].index

# Flag every reading in the original dataframe that falls on an outlier day.
df['outlier'] = df.index.floor('D').isin(outlier_days)

# Plot the original readings and shade each outlier day with a translucent red band.
fig = px.line(df, x=df.index, y='usage', title='Daily Usage with Outliers Highlighted')
one_day = pd.Timedelta(days=1)
for outlier in outlier_days:
    fig.add_vrect(
        x0=outlier,
        x1=outlier + one_day,
        fillcolor='red',
        opacity=0.25,
        line_width=0,
        annotation_text='Outlier',
    )
fig.show()

In [None]:
from scipy.stats import zscore


def detect_prolonged_anomalies(df, min_consecutive_days=3, zscore_threshold=1.5):
    """
    Detects prolonged anomalies in energy usage from smart meter data.

    The readings are summed per calendar day and converted to Z-scores. For every
    window length from `min_consecutive_days` up to the number of days, any window
    whose mean Z-score exceeds `zscore_threshold` is marked as anomalous.
    Overlapping or adjacent marked days are merged into contiguous windows.

    Parameters:
    df (pd.DataFrame): DataFrame with a datetime index and a "usage" column.
        Other columns are ignored. The input is not modified.
    min_consecutive_days (int): Smallest window length (in days) to consider.
    zscore_threshold (float): Mean Z-score a window must exceed to be flagged.

    Returns:
    list[tuple[pd.Timestamp, pd.Timestamp]] | None: One (start_day, end_day) pair
        per contiguous anomalous window, both ends inclusive, in chronological
        order. Returns None when no window is flagged.
    """

    # Aggregate to daily usage; only the "usage" column is needed, so unrelated
    # (possibly non-numeric) columns never reach the sum.
    daily_usage = df["usage"].resample("D").sum()

    # Z-score of each daily total
    daily_zscore = pd.Series(np.asarray(zscore(daily_usage)), index=daily_usage.index)

    n_days = len(daily_usage)
    in_anomaly_window = np.zeros(n_days, dtype=bool)

    # Slide windows of every allowed length over the Z-scores. The rolling mean at
    # position p summarises the window [p - window_size + 1, p]; the first
    # window_size - 1 positions are NaN, so the window start is never negative.
    for window_size in range(min_consecutive_days, n_days + 1):
        avg_zscore = daily_zscore.rolling(window=window_size).mean()
        exceeds = (avg_zscore > zscore_threshold).to_numpy()
        for end_pos in np.flatnonzero(exceeds):
            in_anomaly_window[end_pos - window_size + 1 : end_pos + 1] = True

    if not in_anomaly_window.any():
        return None

    # Merge flagged days into (start, end) runs of consecutive calendar days.
    # The last run is always emitted after the loop, so a single flagged day or a
    # trailing one-day run after a gap is not lost.
    flagged_days = daily_usage.index[in_anomaly_window]
    one_day = pd.Timedelta(days=1)
    prolonged_anomalies = []
    run_start = run_end = flagged_days[0]
    for day in flagged_days[1:]:
        if day != run_end + one_day:
            prolonged_anomalies.append((run_start, run_end))
            run_start = day
        run_end = day
    prolonged_anomalies.append((run_start, run_end))
    return prolonged_anomalies
    
# Run the prolonged-anomaly detector on the smart meter readings with its
# default settings and show what it returned.
outliers = detect_prolonged_anomalies(df=df)
print(outliers)

In [58]:
# Repeat the contents of `outliers` so every entry appears twice.
# NOTE(review): this rebinds `outliers` to a doubled copy each time the cell is
# run, so re-running it keeps growing the list; looks like a scratch step to
# exercise the multi-window formatting below - confirm it is still wanted.
outliers = outliers * 2

In [None]:
# Describe each (start, end) window as
# "prolonged anomaly N: YYYY-MM-DD to YYYY-MM-DD", numbering from 1,
# and join the descriptions with ", ".
prolonged_anomalies_str = ", ".join(
    f"prolonged anomaly {number}: "
    f"{window[0].strftime('%Y-%m-%d')} to {window[1].strftime('%Y-%m-%d')}"
    for number, window in enumerate(outliers, start=1)
)

prolonged_anomalies_str