## Imports

In [None]:
!rm -rf qber-forecasting
!rm -rf deep_qber
!git clone https://github.com/rmnigm/qber-forecasting.git
!cp -r qber-forecasting/deep_qber deep_qber

In [None]:
!pip install catboost

In [12]:
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score

import catboost
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from catboost import EShapCalcType, EFeaturesSelectionAlgorithm
import warnings
from collections import deque, defaultdict

In [13]:
from google.colab import output
output.enable_custom_widget_manager()

## Feature Extraction

In [14]:
def get_dataset(first_rows, rows):
    window = deque([dict(row) for i, row in first_rows.iterrows()])
    features = defaultdict(list)
    for i, row in tqdm(rows.iterrows(), total=len(rows)):
        current_state = row
        window.append(dict(current_state))
        features_row = {}
        for i, x in enumerate(reversed(window)):
            for k, v in x.items():
                features[f'{k}_{i}_lag'].append(v)
        window.popleft()
    for k, v in features.items():
        features[k] = np.array(v).astype(np.float32)
    return features


def split(dataset, train_size, look_back):
    train_size = int(len(dataset) * train_size)
    test_size = len(dataset) - train_size
    data_train, data_test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(f"Training set size = {train_size}, testing set size = {test_size}")

    first_rows, rows = data_train[:look_back], data_train[look_back:]
    x_train = get_dataset(first_rows, rows)
    first_rows, rows = data_test[:look_back], data_test[look_back:]
    x_test = get_dataset(first_rows, rows)
    return x_train, x_test

In [15]:
path = "/content/qber-forecasting/datasets/qber_with_outliers.csv"
raw_dataframe = pd.read_csv(path)

info_path = "/content/qber-forecasting/datasets/outliers_info.csv"
info_dataframe = pd.read_csv(info_path)

In [16]:
print(f'Anomaly clusters cnt = {(info_dataframe["steps_to_anomaly"] == 1).sum()}')

Anomaly clusters cnt = 30


In [17]:
flag = 1
cluster = 0
lst = []
for i, row in info_dataframe.iterrows():
    if row['steps_to_anomaly'] == 0 and flag <= 3:
        lst.append(1)
        flag += 1
    else:
        lst.append(0)
    if row['steps_to_anomaly'] == 10:
        flag = 1
info_dataframe['anomaly_cluster'] = lst

In [18]:
with_steps = (raw_dataframe
              .set_index('index')
              .join(info_dataframe.set_index('index')[['anomaly_cluster', 'steps_to_anomaly']], on='index', how='left', rsuffix='_info')
)

In [19]:
# with_steps['outliers'] = with_steps['anomaly_cluster'].fillna(0)
with_steps['outliers'] = (with_steps['steps_to_anomaly'] == 0).astype(int)
dataframe = with_steps.drop(columns='anomaly_cluster')

In [20]:
dataframe.head()

Unnamed: 0_level_0,e_mu_current,e_mu_estimated,e_nu_1,e_nu_2,q_mu,q_nu1,q_nu2,outliers,steps_to_anomaly
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.01298,0.01164,0.01904,0.17794,0.550377,0.164911,0.008094,0,
1,0.01283,0.00961,0.01672,0.20868,0.564295,0.167629,0.006639,0,
2,0.01268,0.0059,0.01337,0.20442,0.564179,0.16411,0.007052,0,
3,0.01129,0.00988,0.01637,0.18453,0.573555,0.167174,0.006663,0,
4,0.01169,0.01338,0.01783,0.11478,0.569296,0.169658,0.006823,0,


In [None]:
# train_size = 0.75
# look_back = 8
# target_index = 0
# x_train, x_test = get_features_df(dataframe, train_size=train_size, look_back=look_back)

In [None]:
del x_train
del x_test

In [21]:
train_size = 0.75
look_back = 8
x_train, x_test = split(dataframe, train_size=train_size, look_back=look_back)

x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

Training set size = 138637, testing set size = 46213


100%|██████████| 138629/138629 [00:32<00:00, 4275.26it/s]
100%|██████████| 46205/46205 [00:05<00:00, 8492.42it/s]


### Schema

In [22]:
schema = [
    'e_mu_current_8_lag',
    'e_mu_current_7_lag',
    'e_mu_current_6_lag',
    'e_mu_current_5_lag',
    'e_mu_current_4_lag',
    'e_mu_current_3_lag',
    'e_mu_current_2_lag',
    'e_mu_current_1_lag',
    'e_mu_estimated_8_lag',
    'e_mu_estimated_7_lag',
    'e_mu_estimated_6_lag',
    'e_mu_estimated_5_lag',
    'e_mu_estimated_4_lag',
    'e_mu_estimated_3_lag',
    'e_mu_estimated_2_lag',
    'e_mu_estimated_1_lag',
    'e_mu_estimated_0_lag',
    'e_nu_1_8_lag',
    'e_nu_1_7_lag',
    'e_nu_1_6_lag',
    'e_nu_1_5_lag',
    'e_nu_1_4_lag',
    'e_nu_1_3_lag',
    'e_nu_1_2_lag',
    'e_nu_1_1_lag',
    'e_nu_1_0_lag',
    'e_nu_2_8_lag',
    'e_nu_2_7_lag',
    'e_nu_2_6_lag',
    'e_nu_2_5_lag',
    'e_nu_2_4_lag',
    'e_nu_2_3_lag',
    'e_nu_2_2_lag',
    'e_nu_2_1_lag',
    'e_nu_2_0_lag',
    'q_mu_8_lag',
    'q_mu_7_lag',
    'q_mu_6_lag',
    'q_mu_5_lag',
    'q_mu_4_lag',
    'q_mu_3_lag',
    'q_mu_2_lag',
    'q_mu_1_lag',
    'q_mu_0_lag',
    'q_nu1_8_lag',
    'q_nu1_7_lag',
    'q_nu1_6_lag',
    'q_nu1_5_lag',
    'q_nu1_4_lag',
    'q_nu1_3_lag',
    'q_nu1_2_lag',
    'q_nu1_1_lag',
    'q_nu1_0_lag',
    'q_nu2_8_lag',
    'q_nu2_7_lag',
    'q_nu2_6_lag',
    'q_nu2_5_lag',
    'q_nu2_4_lag',
    'q_nu2_3_lag',
    'q_nu2_2_lag',
    'q_nu2_1_lag',
    'q_nu2_0_lag'
    ]

## Base Regressor

In [23]:
target_col_name = 'e_mu_current_0_lag'

train_pool = Pool(x_train[schema], x_train[target_col_name])
test_pool = Pool(x_test[schema], x_test[target_col_name])

In [24]:
base_regressor = CatBoostRegressor()

In [None]:
base_regressor.fit(train_pool, eval_set=test_pool)
preds = base_regressor.predict(test_pool)

In [26]:
print(f'MAPE value:')
print(f'{mean_absolute_percentage_error(x_test[target_col_name], preds):.8f}')
print(f'MSE value:')
print(f'{mean_squared_error(x_test[target_col_name], preds):.8f}')
print(f'RMSE value:')
print(f'{mean_squared_error(x_test[target_col_name], preds, squared=False):.8f}')
print(f'R^2 value:')
print(f'{r2_score(x_test[target_col_name], preds):.8f}')

MAPE value:
0.10381961
MSE value:
0.00002238
RMSE value:
0.00473099
R^2 value:
0.67428846


## Detector Training

### Training

In [27]:
target_col_name = 'outliers_0_lag'

train_pool = Pool(x_train[schema], x_train[target_col_name])
test_pool = Pool(x_test[schema], x_test[target_col_name])

In [28]:
classifier = CatBoostClassifier()

In [None]:
classifier.fit(train_pool, eval_set=test_pool)
preds = classifier.predict(test_pool)

In [None]:
# summary = model.select_features(
#     train_pool,
#     eval_set=test_pool,
#     features_for_select=f'0-{len(x_train.drop(columns=drop_cols).columns)-1}',
#     num_features_to_select=20,
#     steps=3,
#     algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
#     shap_calc_type=EShapCalcType.Regular,
#     train_final_model=True,
#     logging_level='Silent',
#     plot=True
# )

In [30]:
preds = classifier.predict(test_pool)

In [31]:
for metric in (f1_score, accuracy_score, roc_auc_score, recall_score, precision_score):
    print(f'{metric.__name__} value:')
    print(f'{metric(x_test[target_col_name], preds):.8f}')

f1_score value:
0.62520730
accuracy_score value:
0.99021751
roc_auc_score value:
0.82337400
recall_score value:
0.65224913
precision_score value:
0.60031847


In [32]:
plotting_data = {
    'predicted': preds,
    'actual': x_test[target_col_name],
    'steps_to_anomaly': x_test['steps_to_anomaly_0_lag'],
    'value': x_test['e_mu_current_0_lag']
    }

plotting_data = pd.DataFrame(plotting_data).reset_index()
plotting_data['missed'] = plotting_data['predicted'] != plotting_data['actual']

In [33]:
vals = []


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for i in range(11):
        subset = plotting_data[plotting_data['steps_to_anomaly'] == i]
        val = {}
        for metric in (f1_score, accuracy_score, recall_score, precision_score):
            val[f'{metric.__name__}'] = metric(subset['actual'], subset['predicted'])
        val['cnt_total'] = subset['actual'].sum()
        val['cnt_missed'] = subset['missed'].sum()
        vals.append(val)

metrics_by_step_to_anomaly = pd.DataFrame(vals)

In [34]:
metrics_by_step_to_anomaly

Unnamed: 0,f1_score,accuracy_score,recall_score,precision_score,cnt_total,cnt_missed
0,0.789529,0.652249,0.652249,1.0,578.0,201
1,0.0,0.857143,0.0,0.0,0.0,2
2,0.0,0.928571,0.0,0.0,0.0,1
3,0.0,0.928571,0.0,0.0,0.0,1
4,0.0,0.928571,0.0,0.0,0.0,1
5,0.0,0.928571,0.0,0.0,0.0,1
6,0.0,0.785714,0.0,0.0,0.0,3
7,0.0,0.785714,0.0,0.0,0.0,3
8,0.0,0.785714,0.0,0.0,0.0,3
9,0.0,0.857143,0.0,0.0,0.0,2


In [None]:
for i, p in enumerate(plotting_data['missed']):
    if p:
      subset = plotting_data[i - 20:i + 20]
      fig, ax = plt.subplots(figsize=(16, 5))
      sns.lineplot(data=subset, x='index', y='value', ax=ax, color='orange')
      sns.scatterplot(data=subset[(subset['actual']==1) & (subset['missed']==False)], x='index', y='value', ax=ax, color='red', label='true positive')
      sns.scatterplot(data=subset[(subset['actual']==1) & (subset['missed']==True)], x='index', y='value', ax=ax, color='blue', label='false negative')
      sns.scatterplot(data=subset[(subset['actual']==0) & (subset['missed']==True)], x='index', y='value', ax=ax, color='green', label='false positive')
      plt.show()
      time.sleep(3)
      output.clear()

In [None]:
feature_importance = classifier.get_feature_importance()
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(6, 10))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(x_train[schema].columns)[sorted_idx])
plt.title('Feature Importance')
plt.show()

## Regression

In [None]:
train_size = 0.75
look_back = 8
x_train, x_test = split(dataframe, train_size=train_size, look_back=look_back)

x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

Training set size = 138637, testing set size = 46213


100%|██████████| 138629/138629 [00:52<00:00, 2636.84it/s]
100%|██████████| 46205/46205 [00:13<00:00, 3381.47it/s]


In [35]:
normal_train, normal_test = x_train[x_train['outliers_0_lag'] == 0], x_test[x_test['outliers_0_lag'] == 0]
anomaly_train, anomaly_test = x_train[x_train['outliers_0_lag'] == 1], x_test[x_test['outliers_0_lag'] == 1]

### Normal

In [40]:
target_col_name = 'e_mu_current_0_lag'

train_pool = Pool(normal_train[schema], normal_train[target_col_name])
test_pool = Pool(normal_test[schema], normal_test[target_col_name])

In [41]:
regressor_normal = CatBoostRegressor()

regressor_normal.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.110152
0:	learn: 0.0016715	test: 0.0014580	best: 0.0014580 (0)	total: 65.3ms	remaining: 1m 5s
1:	learn: 0.0015929	test: 0.0013999	best: 0.0013999 (1)	total: 138ms	remaining: 1m 8s
2:	learn: 0.0015245	test: 0.0013508	best: 0.0013508 (2)	total: 201ms	remaining: 1m 6s
3:	learn: 0.0014663	test: 0.0013095	best: 0.0013095 (3)	total: 256ms	remaining: 1m 3s
4:	learn: 0.0014166	test: 0.0012735	best: 0.0012735 (4)	total: 312ms	remaining: 1m 2s
5:	learn: 0.0013753	test: 0.0012420	best: 0.0012420 (5)	total: 365ms	remaining: 1m
6:	learn: 0.0013410	test: 0.0012177	best: 0.0012177 (6)	total: 438ms	remaining: 1m 2s
7:	learn: 0.0013129	test: 0.0011984	best: 0.0011984 (7)	total: 522ms	remaining: 1m 4s
8:	learn: 0.0012884	test: 0.0011785	best: 0.0011785 (8)	total: 618ms	remaining: 1m 8s
9:	learn: 0.0012676	test: 0.0011630	best: 0.0011630 (9)	total: 735ms	remaining: 1m 12s
10:	learn: 0.0012507	test: 0.0011502	best: 0.0011502 (10)	total: 855ms	remaining: 1m 16s
11:	learn: 0.0012359	t

<catboost.core.CatBoostRegressor at 0x7de5d9d62770>

In [42]:
normal_preds = regressor_normal.predict(test_pool)

print(f'MAPE value:')
print(f'{mean_absolute_percentage_error(normal_test[target_col_name], normal_preds):.8f}')
print(f'MSE value:')
print(f'{mean_squared_error(normal_test[target_col_name], normal_preds):.8f}')
print(f'RMSE value:')
print(f'{mean_squared_error(normal_test[target_col_name], normal_preds, squared=False):.8f}')
print(f'R^2 value:')
print(f'{r2_score(normal_test[target_col_name], normal_preds):.8f}')

MAPE value:
0.10089694
MSE value:
0.00000111
RMSE value:
0.00105348
R^2 value:
0.49860160


In [None]:
plotting_data = {
    'predicted': normal_preds,
    'actual': normal_test[target_col_name],
    'steps_to_anomaly': normal_test['steps_to_anomaly_0_lag'],
    'value': normal_test['e_mu_current_0_lag']
    }

plotting_data = pd.DataFrame(plotting_data).reset_index()
plotting_data['missed'] = plotting_data['predicted'] != plotting_data['actual']

In [None]:
subset = plotting_data[0:300]
fig, ax = plt.subplots(figsize=(16, 5))
sns.lineplot(data=subset, x='index', y='value', ax=ax, color='orange')
sns.lineplot(data=subset, x='index', y='predicted', ax=ax, color='blue')
sns.scatterplot(data=subset[subset['actual']==1], x='index', y='value', ax=ax, color='red')
plt.show()

### Outlier

In [36]:
target_col_name = 'e_mu_current_0_lag'

train_pool = Pool(anomaly_train[schema], anomaly_train[target_col_name])
test_pool = Pool(anomaly_test[schema], anomaly_test[target_col_name])

In [37]:
regressor_outlier = CatBoostRegressor()
regressor_outlier.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.058289
0:	learn: 0.0266066	test: 0.0678649	best: 0.0678649 (0)	total: 34.3ms	remaining: 34.2s
1:	learn: 0.0256980	test: 0.0658767	best: 0.0658767 (1)	total: 64.8ms	remaining: 32.3s
2:	learn: 0.0249134	test: 0.0641812	best: 0.0641812 (2)	total: 100ms	remaining: 33.4s
3:	learn: 0.0240664	test: 0.0622692	best: 0.0622692 (3)	total: 137ms	remaining: 34.1s
4:	learn: 0.0233706	test: 0.0611741	best: 0.0611741 (4)	total: 170ms	remaining: 33.9s
5:	learn: 0.0225902	test: 0.0595120	best: 0.0595120 (5)	total: 208ms	remaining: 34.4s
6:	learn: 0.0218269	test: 0.0587102	best: 0.0587102 (6)	total: 240ms	remaining: 34s
7:	learn: 0.0210211	test: 0.0584063	best: 0.0584063 (7)	total: 271ms	remaining: 33.6s
8:	learn: 0.0204040	test: 0.0575686	best: 0.0575686 (8)	total: 302ms	remaining: 33.3s
9:	learn: 0.0197603	test: 0.0562785	best: 0.0562785 (9)	total: 335ms	remaining: 33.1s
10:	learn: 0.0190682	test: 0.0550296	best: 0.0550296 (10)	total: 369ms	remaining: 33.2s
11:	learn: 0.0185449	t

<catboost.core.CatBoostRegressor at 0x7de5c8e70190>

In [38]:
anomaly_preds = regressor_outlier.predict(test_pool)

In [39]:
print(f'MAPE value:')
print(f'{mean_absolute_percentage_error(anomaly_test[target_col_name], anomaly_preds):.8f}')
print(f'MSE value:')
print(f'{mean_squared_error(anomaly_test[target_col_name], anomaly_preds):.8f}')
print(f'RMSE value:')
print(f'{mean_squared_error(anomaly_test[target_col_name], anomaly_preds, squared=False):.8f}')
print(f'R^2 value:')
print(f'{r2_score(anomaly_test[target_col_name], anomaly_preds):.8f}')

MAPE value:
0.13701090
MSE value:
0.00136570
RMSE value:
0.03695542
R^2 value:
0.70099154


## Composed Model

In [43]:
total_preds = list(anomaly_preds) + list(normal_preds)
total_labels = list(anomaly_test[target_col_name]) + list(normal_test[target_col_name])

In [44]:
print(f'MAPE value:')
print(f'{mean_absolute_percentage_error(total_labels, total_preds):.8f}')
print(f'MSE value:')
print(f'{mean_squared_error(total_labels, total_preds):.8f}')
print(f'RMSE value:')
print(f'{mean_squared_error(total_labels, total_preds, squared=False):.8f}')
print(f'R^2 value:')
print(f'{r2_score(total_labels, total_preds):.8f}')

MAPE value:
0.10134870
MSE value:
0.00001818
RMSE value:
0.00426382
R^2 value:
0.73543820


In [45]:
from collections import deque

class MultiRegressor:
    def __init__(self, regressors, classifier, num_classes, target_column, schema):
        self.regressors = regressors
        self.classifier = classifier
        self.num_classes: int = num_classes  # only 2 yet
        self.window = deque()
        self.target_column = target_column
        self.schema = schema

    def predict_batch(self, data):
        labels = self.classifier.predict(data)
        data['labels'] = labels
        results = []
        for n_class in range(self.num_classes):
            subset = data[data['labels'] == n_class]
            index = subset.reset_index()['index']
            preds = self.regressors[n_class].predict(subset)
            results.append(pd.DataFrame({'index': index, 'preds': preds}))
        return results

    def initiate(self, first_rows):
        self.window = deque([dict(row) for i, row in first_rows.iterrows()])

    def update(self, actual_target):
        self.window[-1][self.target_column] = actual_target
        self.window.popleft()

    def predict(self, current_state):
        self.window.append(dict(current_state))
        features = {}
        for i, x in enumerate(reversed(self.window)):
            for k, v in x.items():
                features[f'{k}_{i}_lag'] = v
        features = pd.Series(features)[self.schema]
        label = int(self.classifier.predict(features))
        pred = self.regressors[label].predict(features)
        return pred, label

In [46]:
model = MultiRegressor(regressors=(regressor_normal, regressor_outlier),
                       classifier=classifier,
                       num_classes=2,
                       target_column='e_mu_current',
                       schema=schema
                       )

In [47]:
look_back = 8

train_size = 0.75
train_size = int(len(dataframe) * train_size)
test_size = len(dataframe) - train_size

In [48]:
target_col = 'e_mu_current'
clear = dataframe.drop(columns=['steps_to_anomaly', 'outliers'])

true_labels = dataframe['outliers'][train_size:train_size+5000]
X_past, X = clear[train_size-look_back:train_size], dataframe[train_size:train_size+5000]

model.initiate(X_past)

predictions = []
labels = []
for i, row in tqdm(X.iterrows(), total=len(X)):
    real_value = row[target_col]
    x = row.drop(columns=[target_col])
    tmp = model.predict(x)
    predicted, label = model.predict(x)
    predictions.append(predicted)
    labels.append(label)
    model.update(real_value)

100%|██████████| 5000/5000 [05:32<00:00, 15.06it/s]


In [49]:
model_stats = pd.DataFrame({
    'actual': X[target_col],
    'predicted': predictions,
    'classified': labels,
    'label': true_labels,
    }).reset_index()

model_stats['mistaken'] = model_stats['classified'] != model_stats['label']

In [50]:
misses = model_stats['mistaken'].sum()
mape = mean_absolute_percentage_error(model_stats['actual'], model_stats['predicted'])
mse = mean_squared_error(model_stats['actual'], model_stats['predicted'])
rmse = mean_squared_error(model_stats['actual'], model_stats['predicted'], squared=False)
r2 = r2_score(model_stats['actual'], model_stats['predicted'])


print(f'{misses} anomalies missed or falsely detected')
for metric in (f1_score, accuracy_score, roc_auc_score, recall_score, precision_score):
    val = metric(model_stats['label'], model_stats['classified'])
    print(f'{metric.__name__: <15} = {val:.8f}')
print()
print(f'{"MAPE": <4} = {mape:.8f}')
print(f'{"MSE": <4} = {mse:.8f}')
print(f'{"RMSE": <4} = {rmse:.8f}')
print(f'{"R^2": <4} = {r2:.8f}')

175 anomalies missed or falsely detected
f1_score        = 0.70881864
accuracy_score  = 0.96500000
roc_auc_score   = 0.96030822
recall_score    = 0.95515695
precision_score = 0.56349206

MAPE = 0.06766012
MSE  = 0.00004941
RMSE = 0.00702931
R^2  = 0.87696820


In [51]:
model_stats.to_csv('model_stat.csv')