In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.ensemble import IsolationForest

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
import seaborn as sns
from sklearn.metrics import confusion_matrix

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sktime.forecasting.model_selection import (
    CutoffSplitter,
    ExpandingWindowSplitter,
    SingleWindowSplitter,
    SlidingWindowSplitter,
    temporal_train_test_split,
)
from sktime.utils.plotting import plot_series
from sktime.forecasting.base import ForecastingHorizon

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from statsmodels.nonparametric.smoothers_lowess import lowess
from pyod.models.knn import KNN

from datetime import datetime, timedelta
import requests


# Functions

In [2]:

def _plot_outliers(df, outlier_mask, limits=None):
    """Show an interactive Plotly view of ``df`` with the flagged rows highlighted.

    Parameters
    ----------
    df : pandas.DataFrame
        The screened data; its first column is additionally drawn as a line.
    outlier_mask : numpy.ndarray of bool
        One flag per row of ``df``; flagged rows are drawn larger and in red.
    limits : iterable of float, optional
        y-values at which dashed red horizontal lines are added.
    """
    fig = px.scatter(df, title="Outliers Plot")
    # Vectorised per-point styling (a membership test per point would be quadratic).
    fig.update_traces(marker=dict(color=np.where(outlier_mask, 'red', 'blue').tolist(),
                                  size=np.where(outlier_mask, 5, 1).tolist(),
                                  line=dict(width=0)))
    fig.add_trace(go.Scatter(x=df.index, y=df[df.columns[0]], mode='lines',
                             line=dict(color='blue', width=1)))
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=[
                dict(count=1, label="Daily", step="day", stepmode="backward"),
                dict(count=1, label="1-Month", step="month", stepmode="backward"),
                dict(count=6, label="6-Month", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1-Year", step="year", stepmode="backward"),
                dict(step="all"),
            ]
        ),
    )
    fig.update_layout(showlegend=False)

    # NOTE(review): the limits are thresholds on the residual, yet they are
    # drawn at those y-values on the raw-value axis -- confirm this is intended.
    if limits is not None:
        for level in limits:
            fig.add_hline(y=level, line_dash="dash", line_color="red")

    fig.show()


def detect_outliers(df, window_size, starting_date=False, ending_date=False, method='Rolling_Median',
                    contamination=0.01, plot=False, random_state=None):
    """Flag outlying rows of a datetime-indexed frame and return it with an ``is_outlier`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to screen; rows are selected by label between the two dates.
    window_size : int
        Rolling window length; only read by the 'Rolling_Median' method.
    starting_date, ending_date : optional
        Bounds passed through ``pd.to_datetime``. A falsy value means the
        minimum / maximum of ``df.index`` respectively.
    method : {'Rolling_Median', 'STL', 'IsolationForest', 'KNN'}
        * 'Rolling_Median' - residual from the rolling median, flagged beyond
          +/-1.96 residual standard deviations.
        * 'STL' - residual from a LOWESS fit of the first column, flagged
          beyond +/-2 times the 0.1-0.9 quantile spread of the residual.
        * 'IsolationForest' / 'KNN' - rows the fitted detector labels anomalous.
    contamination : float
        Passed to the detector; only read by 'IsolationForest' and 'KNN'.
    plot : bool
        When truthy, show an interactive Plotly figure of the result.
    random_state : optional
        Passed to ``IsolationForest``; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with an added boolean ``is_outlier`` column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if not starting_date:
        starting_date = df.index.min()
    if not ending_date:
        ending_date = df.index.max()
    df = df.loc[pd.to_datetime(starting_date):pd.to_datetime(ending_date)]

    limits = None  # residual thresholds; only the residual-based methods set them
    if method == 'Rolling_Median':
        resid = df - df.rolling(window_size).median()
        upper_lim = 1.96 * resid.std().to_numpy()
        lower_lim = -upper_lim
        # A row is flagged when any of its columns breaches a limit.
        outlier_mask = ((resid > upper_lim) | (resid < lower_lim)).to_numpy().any(axis=1)
        limits = (upper_lim[0], lower_lim[0])
    elif method == 'STL':
        resid = df - lowess(df[df.columns[0]], df.index, return_sorted=False)[:, None]
        # Converted to ndarray so the positional [0] below does not rely on
        # integer-position lookups into a label-indexed Series.
        upper_lim = 2 * (resid.quantile(0.9) - resid.quantile(0.1)).to_numpy()
        lower_lim = -upper_lim
        outlier_mask = ((resid > upper_lim) | (resid < lower_lim)).to_numpy().any(axis=1)
        limits = (upper_lim[0], lower_lim[0])
    elif method == 'IsolationForest':
        detector = IsolationForest(contamination=contamination, random_state=random_state)
        outlier_mask = np.asarray(detector.fit_predict(df) == -1)
    elif method == 'KNN':
        outlier_mask = np.asarray(KNN(contamination=contamination).fit(df).labels_ == 1)
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected one of "
            "'Rolling_Median', 'STL', 'IsolationForest', 'KNN'"
        )

    if plot:
        _plot_outliers(df, outlier_mask, limits)

    res = df.copy()
    res['is_outlier'] = outlier_mask

    # Guard the empty selection so the ratio is reported as 0 rather than nan.
    anomaly_ratio = np.round(100 * outlier_mask.sum() / res.shape[0], 2) if res.shape[0] else 0.0
    print(f'The share of anomalies from the whole data is {anomaly_ratio}%' )
    return res

In [63]:
def plot_confusion_matrix(ground_truth, predictions):
    """Show a Plotly heatmap of the confusion matrix for a two-class problem.

    Parameters
    ----------
    ground_truth : array-like
        True labels, passed to ``sklearn.metrics.confusion_matrix``.
    predictions : array-like
        Predicted labels, passed to ``sklearn.metrics.confusion_matrix``.

    Notes
    -----
    The axes are labelled 'Normal' then 'Anomalous'; this presumably relies on
    the normal class sorting first in the matrix -- TODO confirm against callers.
    The annotations index a 2x2 matrix, so inputs that yield fewer than two
    distinct labels are not supported.
    """
    cm = confusion_matrix(ground_truth, predictions)
    labels = ['Normal', 'Anomalous']

    # Heatmap text must have the same 2-D shape as `z`, one string per cell.
    cell_text = [[str(count) for count in row] for row in cm]

    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=labels,
        y=labels,
        colorscale='Blues',
        text=cell_text,
        showscale=False
    ))

    # Cells darker than the midpoint of the colour scale get white text.
    midpoint = cm.max() / 2
    count_annotations = [
        go.layout.Annotation(
            x=col_label,
            y=row_label,
            text=str(cm[i][j]),
            showarrow=False,
            font=dict(color='white' if cm[i][j] > midpoint else 'black')
        )
        for i, row_label in enumerate(labels)
        for j, col_label in enumerate(labels)
    ]

    fig.update_layout(
        title='Confusion Matrix',
        xaxis=dict(title='Predicted Labels'),
        yaxis=dict(title='True Labels'),
        annotations=count_annotations
    )

    fig.show()



def lagged_df(df, column_names=['price'], lags = 24):
    """Return a copy of ``df`` extended with lag columns, dropping incomplete rows.

    For every name in ``column_names`` the columns ``<name>_lag(1)`` through
    ``<name>_lag(<lags>)`` are added, each holding that column shifted down by
    the given number of rows. Rows containing any NaN are then removed. The
    input frame is left untouched.
    """
    frame = df.copy()
    for name in column_names:
        source = frame[name]
        shifted = {f'{name}_lag({step})': source.shift(step) for step in range(1, lags + 1)}
        frame = frame.assign(**shifted)
    return frame.dropna()

def lagged_12timestep_df(df, column_names=['price'], lags = 24, horizon = 12):
    """Return a copy of ``df`` with only the lags older than ``horizon`` steps.

    For every name in ``column_names`` the columns ``<name>_lag(horizon+1)``
    through ``<name>_lag(<lags>)`` are added, each holding that column shifted
    down by the given number of rows. Rows containing any NaN are then removed.
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing every column listed in ``column_names``.
    column_names : list of str
        Columns to build lag features for.
    lags : int
        Largest lag to generate.
    horizon : int
        Number of most recent lags to skip. The default of 12 reproduces the
        original behaviour (lags 13..``lags``). When ``lags <= horizon`` no lag
        columns are added.
    """
    ddf = df.copy()
    for column in column_names:
        for lag in range(horizon + 1, lags + 1):
            ddf[f'{column}_lag({lag})'] = ddf[column].shift(lag)
    return ddf.dropna()

def anomaly_forecasting (labeled_df, model='IsolationForest', lags=24, test_share = 0.2, plot = False, output = 'prediction'):
    """Train a detector on lagged prices and predict the trailing test share.

    The frame is expanded with ``lagged_df`` and split chronologically: the
    first ``1 - test_share`` of the rows train the model, the rest are the test
    set. The 'price' and 'is_outlier' columns are excluded from the features;
    'is_outlier' is the target.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Frame with 'price' and 'is_outlier' columns.
    model : {'IsolationForest', 'LogisticRegression'}
        Final estimator, placed after a ``StandardScaler`` in a pipeline.
    lags : int
        Number of lag features passed to ``lagged_df``.
    test_share : float
        Fraction of rows, taken from the end, used as the test set.
    plot : str or bool
        'confusion_matrix' plots test labels against test predictions.
    output : {'prediction', 'scores'}
        'prediction' returns the test-set predictions as produced by the
        pipeline; 'scores' returns the ``cross_validate`` results on the
        training set as a DataFrame. Any other value returns ``None``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    NOTE(review): the cross-validation scorers compare the pipeline's raw
    predictions with the boolean labels; for 'IsolationForest' those
    predictions are -1/1, so the resulting scores are likely not meaningful --
    confirm before relying on them.
    """
    lagged_labeled_df = lagged_df(labeled_df, lags = lags)
    split_at = int((1 - test_share) * lagged_labeled_df.shape[0])
    features = lagged_labeled_df.drop(columns=['price', 'is_outlier'])
    target = lagged_labeled_df['is_outlier']
    X_train, X_test = features.iloc[:split_at, :], features.iloc[split_at:, :]
    y_train, y_test = target.iloc[:split_at], target.iloc[split_at:]

    if model == 'IsolationForest':
        pipe = make_pipeline(StandardScaler(), IsolationForest(contamination=0.04))
    elif model == 'LogisticRegression':
        pipe = make_pipeline(StandardScaler(), LogisticRegression())
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected 'IsolationForest' or 'LogisticRegression'"
        )

    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)

    if plot == 'confusion_matrix':
        # IsolationForest marks anomalies with -1; convert to booleans so they
        # are comparable with the boolean ground truth. The returned
        # predictions are left in the estimator's own encoding.
        comparable = (predictions == -1) if model == 'IsolationForest' else predictions
        plot_confusion_matrix(y_test.values, comparable)

    if output == 'scores':
        # Cross-validation refits the pipeline several times, so only run it
        # when its result is actually requested.
        scoring = [
            "accuracy",
            "f1",
            "recall",
            "precision",
        ]
        scores = cross_validate(
            pipe, X_train, y_train, return_train_score=True, scoring=scoring
        )
        return pd.DataFrame(scores)
    elif output == 'prediction':
        return predictions

In [4]:
def invoke_api(api_key=None, timeout=30):
    """Fetch the AESO pool price report from yesterday through today.

    Parameters
    ----------
    api_key : str, optional
        Value sent in the ``X-API-Key`` header. When omitted it is read from
        the ``AESO_API_KEY`` environment variable.
    timeout : float
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'Date' with a single 'price' column. Values are
        returned as delivered by the API (no numeric conversion is done here).

    Raises
    ------
    RuntimeError
        If no API key is supplied and ``AESO_API_KEY`` is not set.
    requests.HTTPError
        If the API answers with a status code other than 200.
    """
    import os

    # Credentials must not live in the notebook. The key that used to be
    # embedded here should be treated as compromised and rotated.
    if api_key is None:
        api_key = os.environ.get('AESO_API_KEY')
    if not api_key:
        raise RuntimeError(
            'No AESO API key: pass api_key or set the AESO_API_KEY environment variable'
        )

    url = 'https://api.aeso.ca/report/v1.1/price/poolPrice'
    today = datetime.today()
    start_date = (today - timedelta(days=1)).strftime('%Y-%m-%d')
    end_date = today.strftime('%Y-%m-%d')

    headers = {
        'accept': 'application/json',
        'X-API-Key': api_key
    }
    params = {
        'startDate': start_date,
        'endDate' : end_date
    }

    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    # Fail loudly: continuing after an error would hit an unbound `df`.
    if response.status_code != 200:
        raise requests.HTTPError(
            f'AESO pool price request failed with status {response.status_code}',
            response=response,
        )

    records = response.json()['return']['Pool Price Report']
    df = pd.DataFrame(records)
    df = df.drop(columns=['begin_datetime_utc', 'forecast_pool_price', 'rolling_30day_avg'])
    # The two remaining columns are renamed by position.
    df.columns = ['Date', 'price']
    return df.set_index('Date')

In [5]:
def is_new_price_outlier(estimator=None):
    """Classify the most recent fetched price using its 24 preceding prices.

    Prices are fetched with ``invoke_api``, coerced to numeric (unparseable
    values become NaN and are dropped), expanded with ``lagged_df(lags=24)``,
    and the last row's lag features are passed to the estimator.

    Parameters
    ----------
    estimator : optional
        Fitted object with a ``predict`` method. When omitted, the notebook's
        global ``model`` is used; note that this global is rebound by each
        training cell, so pass the estimator explicitly to avoid using one
        trained on a different feature set.

    Returns
    -------
    The estimator's prediction for the last available row.

    Raises
    ------
    ValueError
        If fewer than 25 valid prices are available, so no complete lag row exists.
    """
    clf = model if estimator is None else estimator
    df = invoke_api()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna()
    ldf = lagged_df(df, lags = 24)
    features = ldf.drop(columns=['price']).tail(1)
    if features.empty:
        raise ValueError('Not enough valid prices to build 24 lag features')
    result = clf.predict(features)
    return result[0]

In [74]:
def is_12_new_prices_outlier(estimator=None):
    """Classify the last 12 fetched rows using only lags 13 through 24.

    Prices are fetched with ``invoke_api``, coerced to numeric (unparseable
    values become NaN and are dropped), expanded with
    ``lagged_12timestep_df(lags=24)``, and the lag features of the last 12
    rows are passed to the estimator.

    Parameters
    ----------
    estimator : optional
        Fitted object with a ``predict`` method. When omitted, the notebook's
        global ``model`` is used; note that this global is rebound by each
        training cell, so pass the estimator explicitly to avoid using one
        trained on a different feature set.

    Returns
    -------
    The estimator's predictions, one per row (up to 12).

    Raises
    ------
    ValueError
        If too few valid prices are available to build any complete lag row.
    """
    clf = model if estimator is None else estimator
    df = invoke_api()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna()
    ldf = lagged_12timestep_df(df, lags = 24)
    features = ldf.drop(columns=['price']).tail(12)
    if features.empty:
        raise ValueError('Not enough valid prices to build lag features 13 through 24')
    result = clf.predict(features)
    return result

# Historical Data

In [51]:
# Load the historical series, dropping the first three CSV columns, and index
# it chronologically by the parsed 'Date' column.
ail_price = (
    pd.read_csv('~/Slalom/data/ail_price.csv')
    .iloc[:, 3:]
    .rename(columns={'date': 'Date'})
)
ail_price['Date'] = pd.to_datetime(ail_price['Date'])
ail_price = ail_price.set_index('Date').sort_index()
# Restrict to the study window (label-based, both ends inclusive).
ail_price = ail_price.loc[pd.to_datetime('2021-01-01 00:00:00'):pd.to_datetime('2023-03-31 22:00:00')]
# Single-column frame used by the outlier labelling below.
price = pd.DataFrame(ail_price['price'])

# Labeling Data

In [61]:
# Label each price with the 'STL' branch of detect_outliers (LOWESS residual
# thresholding). `window_size` and `contamination` are not read by that branch:
# `window_size` is only used by 'Rolling_Median' and `contamination` only by
# 'IsolationForest' / 'KNN', so they have no effect on this call.
labeled_price = detect_outliers(price, window_size = 24*7, method = 'STL', contamination=0.04)

The share of anomalies from the whole data is 4.37%


In [62]:
# Preview the labeled frame: the price column plus the boolean is_outlier flag.
labeled_price

Unnamed: 0_level_0,price,is_outlier
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01 00:00:00,29.92,False
2021-01-01 01:00:00,27.48,False
2021-01-01 02:00:00,28.62,False
2021-01-01 03:00:00,33.55,False
2021-01-01 04:00:00,35.36,False
...,...,...
2023-03-31 18:00:00,50.86,False
2023-03-31 19:00:00,49.35,False
2023-03-31 20:00:00,51.28,False
2023-03-31 21:00:00,48.39,False


# Strategy I: Predicting one time step ahead

In [64]:
# Strategy I training set: lags 1..24 of the price are the features and the
# is_outlier label of the same row is the target.
lagged_labeled_price = lagged_df(labeled_price, lags=24)
y_train = lagged_labeled_price['is_outlier']
X_train = lagged_labeled_price.drop(columns=['price', 'is_outlier'])

# `model` is the global that is_new_price_outlier() reads; the Strategy II cell
# rebinds it, so re-run this cell before calling is_new_price_outlier() again.
# The True weight of 15.87 presumably offsets class imbalance -- TODO confirm
# how it was derived.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500, class_weight={False: 1, True: 15.87}),
)

model.fit(X_train, y_train)

In [65]:
# Fetch the latest prices and classify the most recent one with the global `model`.
is_new_price_outlier()

False

# Strategy II: Predicting 12 time steps ahead

In [75]:
# Strategy II training set: only lags 13..24 of the price are used as features,
# with the is_outlier label of the same row as the target.
lagged_labeled_price = lagged_12timestep_df(labeled_price, lags=24)
y_train = lagged_labeled_price['is_outlier']
X_train = lagged_labeled_price.drop(columns=['price', 'is_outlier'])

# Rebinds the global `model` (previously the Strategy I classifier) to the one
# that is_12_new_prices_outlier() reads.
# The True weight of 15.87 presumably offsets class imbalance -- TODO confirm
# how it was derived.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500, class_weight={False: 1, True: 15.87}),
)

model.fit(X_train, y_train)

In [76]:
# Fetch the latest prices and classify the last 12 rows with the global `model`.
is_12_new_prices_outlier()

array([False, False, False, False, False, False, False, False, False,
       False, False, False])