In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from matplotlib import dates as md
import seaborn as sns
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

from sklearn.metrics import r2_score
from sklearn import metrics

import lightgbm as lgb

from tqdm import tqdm

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for file_path in input_files:
    print(file_path)

In [None]:
# Seed NumPy's global RNG and TensorFlow's global RNG with the same value (42)
# so that repeated runs draw the same random numbers from both libraries.
import tensorflow as tf
from numpy.random import seed

for set_seed in (seed, tf.random.set_seed):
    set_seed(42)

# Load dataset

In [None]:
# Root directory of the ASHRAE energy prediction dataset on Kaggle;
# train.csv and building_metadata.csv are read from here.
path_GEPIII = '/kaggle/input/ashrae-energy-prediction'

In [None]:
# Read the raw meter readings and the building metadata from the dataset directory.
df_power_meter, df_meta = (
    pd.read_csv(os.path.join(path_GEPIII, csv_name))
    for csv_name in ('train.csv', 'building_metadata.csv')
)

In [None]:
# Expand the metadata to one row per (building, meter) pair that appears in the
# readings, and give each pair a string key "<building_id>_<meter>".
building_meter_pairs = df_power_meter[['building_id', 'meter']].drop_duplicates()
df_meta = df_meta.merge(building_meter_pairs, on='building_id')

building_key = df_meta['building_id'].astype('str')
meter_key = df_meta['meter'].astype('str')
df_meta['merged_id'] = building_key + '_' + meter_key
df_meta

In [None]:
# Reshape the long readings table into a wide frame: one row per timestamp,
# one column per (building, meter) pair.
df_power_meter = df_power_meter.pivot_table(
    values='meter_reading',
    index='timestamp',
    columns=['building_id', 'meter'],
)
df_power_meter.index = pd.to_datetime(df_power_meter.index)

# Flatten the two column levels into the same "<building_id>_<meter>" key used in df_meta.
building_level = df_power_meter.columns.get_level_values(0).astype('str')
meter_level = df_power_meter.columns.get_level_values(1).astype('str')
df_power_meter.columns = building_level + '_' + meter_level
df_power_meter

In [None]:
# Number of non-missing readings per meter, attached to the metadata as 'count'.
df_count = (
    df_power_meter
    .count()
    .rename_axis('merged_id')
    .reset_index(name='count')
)
df_meta = df_meta.merge(df_count, on='merged_id')
df_meta

In [None]:
# Split the meters by site so that no site contributes to more than one split:
#   train: sites 0-9, validation: sites 10-12, test: sites 13 and above.
# Only meters with readings for more than 90% of the hours of 2016 are kept.
#
# The test set was previously built as "all qualifying meters minus the train
# columns", which also pulled in the validation sites 10-12. It is now selected
# by site_id so validation and test are disjoint.
HOURS_IN_2016 = 8784  # 366 days * 24 hours (leap year)

df_power_meter = df_power_meter.loc['2016']

has_enough_data = df_meta['count'] > HOURS_IN_2016 * 0.9
is_train_site = df_meta['site_id'] < 10
is_valid_site = (df_meta['site_id'] >= 10) & (df_meta['site_id'] < 13)
is_test_site = df_meta['site_id'] >= 13

train_data = df_power_meter.loc[:, df_meta.loc[is_train_site & has_enough_data, 'merged_id']].copy()
valid_data = df_power_meter.loc[:, df_meta.loc[is_valid_site & has_enough_data, 'merged_id']].copy()
test_data = df_power_meter.loc[:, df_meta.loc[is_test_site & has_enough_data, 'merged_id']].copy()

In [None]:
# Normalize meter readings for each meter
def normalize(df):
    """Standardise every column of ``df`` to zero mean and unit standard deviation.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per meter.

    Returns
    -------
    tuple
        ``(normalized, mean, std)`` where ``mean`` is the per-column mean of
        ``df`` and ``std`` the per-column standard deviation of the centred
        data. A column with zero standard deviation becomes all-NaN (0 / 0).
    """
    mean = df.mean()
    centered = df - mean
    std = centered.std()
    return centered / std, mean, std

# Standardise each split with its own per-meter statistics; the per-meter
# mean and std returned by normalize() are kept separately for every split.
train_value, train_mean, train_std = normalize(train_data)
valid_value, valid_mean, valid_std = normalize(valid_data)
test_value, test_mean, test_std = normalize(test_data)

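# Add noise
Synthetic anomalies are injected into every meter: offsets of −3, −2, −1, +1, +2 and +3 standard deviations are each added to a random 0.1% of the hourly readings.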

In [None]:
# Add noises to train data
train_value_noisy = train_value.copy()
train_value_labels = train_value.copy()

for meter_name in tqdm(train_value.columns):
    df_noisy_data = train_value_noisy[[meter_name]].copy()
    df_noisy_data['noise'] = 0
    
    #Add noises (+-1,2,3 std)
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3,-2,-1,1,2,3]:
        random_hours = df_noisy_data[df_noisy_data['noise']==0].sample(frac=0.001, random_state=42).index
        df_noisy_data.loc[random_hours, meter_name] = df_noisy_data.loc[random_hours, meter_name] + multiplier_std*std
        df_noisy_data.loc[random_hours, 'noise'] = abs(multiplier_std)
    
    df_noisy_data[meter_name] = df_noisy_data[meter_name].fillna(method='ffill').fillna(method='bfill') 
    
    train_value_noisy[meter_name] = df_noisy_data[meter_name].copy()    
    train_value_labels[meter_name] = df_noisy_data['noise'].copy()    

In [None]:
# Add noises to valid data
valid_value_noisy = valid_value.copy()
valid_value_labels = valid_value.copy()

for meter_name in tqdm(valid_value.columns):
    df_noisy_data = valid_value_noisy[[meter_name]].copy()
    df_noisy_data['noise'] = 0
    
    #Add noises (+-1,2,3 std)
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3,-2,-1,1,2,3]:
        random_hours = df_noisy_data[df_noisy_data['noise']==0].sample(frac=0.001, random_state=42).index
        df_noisy_data.loc[random_hours, meter_name] = df_noisy_data.loc[random_hours, meter_name] + multiplier_std*std
        df_noisy_data.loc[random_hours, 'noise'] = abs(multiplier_std)
    
    df_noisy_data[meter_name] = df_noisy_data[meter_name].fillna(method='ffill').fillna(method='bfill') 
    
    valid_value_noisy[meter_name] = df_noisy_data[meter_name].copy()     
    valid_value_labels[meter_name] = df_noisy_data['noise'].copy()        

In [None]:
# Add noises to test data
test_value_noisy = test_value.copy()
test_value_labels = test_value.copy()

for meter_name in tqdm(test_value.columns):
    df_noisy_data = test_value_noisy[[meter_name]].copy()
    df_noisy_data['noise'] = 0
    
    #Add noises (+-1,2,3 std)
    std = df_noisy_data[meter_name].std()
    for multiplier_std in [-3,-2,-1,1,2,3]:
        random_hours = df_noisy_data[df_noisy_data['noise']==0].sample(frac=0.001, random_state=42).index
        df_noisy_data.loc[random_hours, meter_name] = df_noisy_data.loc[random_hours, meter_name] + multiplier_std*std
        df_noisy_data.loc[random_hours, 'noise'] = abs(multiplier_std)
    
    df_noisy_data[meter_name] = df_noisy_data[meter_name].fillna(method='ffill').fillna(method='bfill') 
    
    test_value_noisy[meter_name] = df_noisy_data[meter_name].copy()      
    test_value_labels[meter_name] = df_noisy_data['noise'].copy()        

In [None]:
# Plot of before and after adding noises, for 10 randomly chosen train meters.
# Gaps are filled with ffill()/bfill(); fillna(method=...) is deprecated in pandas.
for meter_name in train_value.sample(n=10, axis=1, random_state=42).columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 3))

    # Shared y-range taken from the noisy series so both panels are comparable
    ymin = train_value_noisy[meter_name].min() * 1.05
    ymax = train_value_noisy[meter_name].max() * 1.05

    train_value[meter_name].ffill().bfill().plot(
        title=meter_name + ' (raw data)', ylim=(ymin, ymax), ax=axes[0], color='blue'
    )
    train_value_noisy[meter_name].ffill().bfill().plot(
        title=meter_name + ' (add noise)', ylim=(ymin, ymax), ax=axes[1], color='orange'
    )
    plt.show()

# Build an autoencoder
We will build a convolutional reconstruction autoencoder model. The model will take input of shape (batch_size, sequence_length, num_features) and return output of the same shape.

In [None]:
# Prepare train data for autoencoder
# Fill gaps (ffill()/bfill() replace the deprecated fillna(method=...)), then
# drop meters that are still entirely NaN.
train_value = train_value.ffill().bfill()
train_value = train_value.dropna(axis=1, how='all')

# One sample per meter: (n_meters, n_timestamps, 1)
x_train = np.expand_dims(train_value.T.values, axis=-1)
x_train.shape

In [None]:
# Prepare noised train data for autoencoder
# Keep exactly the meters that survived in train_value, in the same order.
train_value_noisy = train_value_noisy[train_value.columns]
train_value_noisy = train_value_noisy.ffill().bfill()  # fillna(method=...) is deprecated

# Keep the label frame aligned with the rows of x_train / x_train_noisy;
# later cells look labels up by position.
train_value_labels = train_value_labels[train_value.columns]

x_train_noisy = np.expand_dims(train_value_noisy.T.values, axis=-1)
x_train_noisy.shape

In [None]:
# Prepare test data for autoencoder
# Fill gaps (ffill()/bfill() replace the deprecated fillna(method=...)), then
# drop meters that are still entirely NaN.
test_value = test_value.ffill().bfill()
test_value = test_value.dropna(axis=1, how='all')

# One sample per meter: (n_meters, n_timestamps, 1)
x_test = np.expand_dims(test_value.T.values, axis=-1)
x_test.shape

In [None]:
# Prepare noised test data for autoencoder
# Keep exactly the meters that survived in test_value, in the same order.
test_value_noisy = test_value_noisy[test_value.columns]
test_value_noisy = test_value_noisy.ffill().bfill()  # fillna(method=...) is deprecated

# Keep the label frame aligned with the rows of x_test / x_test_noisy;
# later cells look labels up by position.
test_value_labels = test_value_labels[test_value.columns]

x_test_noisy = np.expand_dims(test_value_noisy.T.values, axis=-1)
x_test_noisy.shape

In [None]:
# Prepare valid data for autoencoder
# Fill gaps (ffill()/bfill() replace the deprecated fillna(method=...)), then
# drop meters that are still entirely NaN.
valid_value = valid_value.ffill().bfill()
valid_value = valid_value.dropna(axis=1, how='all')

# One sample per meter: (n_meters, n_timestamps, 1)
x_valid = np.expand_dims(valid_value.T.values, axis=-1)
x_valid.shape

In [None]:
# Prepare noised valid data for autoencoder
# Keep exactly the meters that survived in valid_value, in the same order.
valid_value_noisy = valid_value_noisy[valid_value.columns]
valid_value_noisy = valid_value_noisy.ffill().bfill()  # fillna(method=...) is deprecated

# Keep the label frame aligned with the rows of x_valid / x_valid_noisy;
# later cells look labels up by position.
valid_value_labels = valid_value_labels[valid_value.columns]

x_valid_noisy = np.expand_dims(valid_value_noisy.T.values, axis=-1)
x_valid_noisy.shape

In [None]:
# Build 1D CNN autoencoder
# Encoder: two stride-2 convolutions (32 then 16 filters) with dropout in between.
encoder_layers = [
    layers.Conv1D(filters=32, kernel_size=7, padding="same", strides=2, activation="relu"),
    layers.Dropout(rate=0.2),
    layers.Conv1D(filters=16, kernel_size=7, padding="same", strides=2, activation="relu"),
]
# Decoder: mirrors the encoder with stride-2 transposed convolutions, followed by
# a single-filter transposed convolution (no activation) as the output layer.
decoder_layers = [
    layers.Conv1DTranspose(filters=16, kernel_size=7, padding="same", strides=2, activation="relu"),
    layers.Dropout(rate=0.2),
    layers.Conv1DTranspose(filters=32, kernel_size=7, padding="same", strides=2, activation="relu"),
    layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
]

input_layer = layers.Input(shape=(x_train.shape[1], x_train.shape[2]))
model = keras.Sequential([input_layer, *encoder_layers, *decoder_layers])

adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss="mae")
model.summary()

In [None]:
# Render the architecture diagram to model_plot.png.
# plot_model is taken from the tensorflow.keras namespace the model was built with;
# the standalone `keras.utils.vis_utils` import path is not available in recent Keras.
keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# Train the model
- Input: noised train data
- Output: train data

In [None]:
# Denoising setup: noisy series in, clean series as target. Validation uses the
# held-out validation meters; training stops once val_loss has not improved for
# 5 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")

history = model.fit(
    x=x_train_noisy,
    y=x_train,
    validation_data=(x_valid_noisy, x_valid),
    epochs=50,
    batch_size=128,
    callbacks=[early_stopping],
)

In [None]:
# Training vs. validation loss per epoch, y-axis clipped to [0, 1].
for history_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[history_key], label=curve_label)
plt.ylim(0, 1)

plt.legend()

In [None]:
# Get train MAE loss
# Reconstruct from the *noisy* input, as the model was trained and as the
# validation/test cells do. Later cells compare x_train_pred against
# x_train_noisy and the injected-noise labels, which is only meaningful if the
# prediction was made from the noisy series.
x_train_pred = model.predict(x_train_noisy)
train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1)

plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

avg = np.mean(train_mae_loss)
print("Reconstruction error average: ", avg)

In [None]:
# Plot of reconstructed result (train data), every 100th meter up to index 1000.
# The range is capped at the number of train meters to avoid an IndexError.
for idx in range(0, min(1000, len(x_train)), 100):
    # Rows of x_train follow train_value.columns; labels are looked up by meter
    # name so they stay aligned even when all-NaN meters were dropped.
    merged_id = train_value.columns[idx]
    df_plot = pd.DataFrame({
        'x_train': x_train[idx].flatten(),
        'x_train_noisy': x_train_noisy[idx].flatten(),
        'x_train_pred': x_train_pred[idx].flatten(),
    })
    df_plot['labels'] = train_value_labels[merged_id].values
    # Squared reconstruction error, scaled by its own standard deviation
    df_plot['Squared error'] = (df_plot['x_train_pred'] - df_plot['x_train_noisy'])**2
    df_plot['Squared error'] = df_plot['Squared error']/df_plot['Squared error'].std()
    df_plot.iplot()

# Apply trained autoencoder on test data

In [None]:
# Get test MAE loss: reconstruct the noisy test series and compare with the clean ones.
x_test_pred = model.predict(x_test_noisy)
absolute_error = np.abs(x_test_pred - x_test)
test_mae_loss = np.mean(absolute_error, axis=1)

# Reconstructions in the same (timestamp x meter) layout as test_value.
test_pred = test_value.copy()
test_pred.loc[:,:] = x_test_pred[:,:,0].T

# Distribution of the per-meter MAE
ax = plt.gca()
ax.hist(test_mae_loss, bins=50)
ax.set_xlabel("test MAE loss")
ax.set_ylabel("No of samples")
plt.show()

avg = np.mean(test_mae_loss)
print("Reconstruction error average: ", avg)

In [None]:
# Inspect df_plot: at this point it still holds the frame built in the last
# iteration of the train-data plotting loop above.
df_plot

In [None]:
# Plot of reconstructed result (test data), every 20th meter up to index 300.
# The range is capped at the number of test meters to avoid an IndexError.
for idx in range(0, min(300, len(x_test)), 20):
    # Rows of x_test follow test_value.columns; labels are looked up by meter
    # name so they stay aligned even when all-NaN meters were dropped.
    merged_id = test_value.columns[idx]
    df_plot = pd.DataFrame({
        'x_test': x_test[idx].flatten(),
        'x_test_noisy': x_test_noisy[idx].flatten(),
        'x_test_pred': x_test_pred[idx].flatten(),
    })
    df_plot['labels'] = test_value_labels[merged_id].values
    # Squared reconstruction error, scaled by its own standard deviation
    df_plot['Squared error'] = (df_plot['x_test_pred'] - df_plot['x_test_noisy'])**2
    df_plot['Squared error'] = df_plot['Squared error']/df_plot['Squared error'].std()
    df_plot.iplot()

    # Error distribution of the noised hours only, split by noise magnitude
    sns.displot(df_plot[df_plot['labels']>0], x="Squared error", hue="labels", kind="kde")
    plt.xlim(-5,20)
    plt.show()

In [None]:
# Reconstruct the noisy validation series and compute the per-meter MAE
# against the clean validation series.
x_valid_pred = model.predict(x_valid_noisy)
valid_absolute_error = np.abs(x_valid_pred - x_valid)
valid_mae_loss = np.mean(valid_absolute_error, axis=1)

# Reconstructions in the same (timestamp x meter) layout as valid_value.
valid_pred = valid_value.copy()
valid_pred.loc[:,:] = x_valid_pred[:,:,0].T

In [None]:
error_train = []

# Collect per-hour reconstruction errors for every train meter into one long frame.
# Rows of x_train follow train_value.columns, so the meter id and its labels are
# taken by name; positional lookup in the label frame is misaligned whenever
# all-NaN meters were dropped from train_value.
for idx, merged_id in enumerate(tqdm(train_value.columns)):
    df_plot = pd.DataFrame({
        'x_train': x_train[idx].flatten(),
        'x_train_noisy': x_train_noisy[idx].flatten(),
        'x_train_pred': x_train_pred[idx].flatten(),
    })
    df_plot['labels'] = train_value_labels[merged_id].values
    # Squared reconstruction error, scaled by its own per-meter standard deviation
    df_plot['Squared error'] = (df_plot['x_train_pred'] - df_plot['x_train_noisy'])**2
    df_plot['Squared error'] = df_plot['Squared error']/df_plot['Squared error'].std()
    df_plot['merged_id'] = merged_id
    error_train.append(df_plot)

error_train = pd.concat(error_train,axis=0,ignore_index=True)
error_train

In [None]:
error_test = []

# Collect per-hour reconstruction errors for every test meter into one long frame.
# Rows of x_test follow test_value.columns, so the meter id and its labels are
# taken by name; positional lookup in the label frame is misaligned whenever
# all-NaN meters were dropped from test_value.
for idx, merged_id in enumerate(tqdm(test_value.columns)):
    df_plot = pd.DataFrame({
        'x_test': x_test[idx].flatten(),
        'x_test_noisy': x_test_noisy[idx].flatten(),
        'x_test_pred': x_test_pred[idx].flatten(),
    })
    df_plot['labels'] = test_value_labels[merged_id].values
    # Squared reconstruction error, scaled by its own per-meter standard deviation
    df_plot['Squared error'] = (df_plot['x_test_pred'] - df_plot['x_test_noisy'])**2
    df_plot['Squared error'] = df_plot['Squared error']/df_plot['Squared error'].std()
    df_plot['merged_id'] = merged_id
    error_test.append(df_plot)

error_test = pd.concat(error_test,axis=0,ignore_index=True)
error_test

In [None]:
error_valid = []

# Collect per-hour reconstruction errors for every validation meter into one long frame.
# Rows of x_valid follow valid_value.columns, so the meter id and its labels are
# taken by name; positional lookup in the label frame is misaligned whenever
# all-NaN meters were dropped from valid_value.
for idx, merged_id in enumerate(tqdm(valid_value.columns)):
    df_plot = pd.DataFrame({
        'x_valid': x_valid[idx].flatten(),
        'x_valid_noisy': x_valid_noisy[idx].flatten(),
        'x_valid_pred': x_valid_pred[idx].flatten(),
    })
    df_plot['labels'] = valid_value_labels[merged_id].values
    # Squared reconstruction error, scaled by its own per-meter standard deviation
    df_plot['Squared error'] = (df_plot['x_valid_pred'] - df_plot['x_valid_noisy'])**2
    df_plot['Squared error'] = df_plot['Squared error']/df_plot['Squared error'].std()
    df_plot['merged_id'] = merged_id
    error_valid.append(df_plot)

error_valid = pd.concat(error_valid,axis=0,ignore_index=True)
error_valid

In [None]:
# KDE of the scaled squared error for noised train hours, one curve per noise level.
noisy_train_rows = error_train.loc[error_train['labels'] > 0]
sns.displot(noisy_train_rows, x="Squared error", hue="labels", kind="kde")
plt.xlim(-5, 20)

In [None]:
# KDE of the scaled squared error for noised validation hours, one curve per noise level.
noisy_valid_rows = error_valid.loc[error_valid['labels'] > 0]
sns.displot(noisy_valid_rows, x="Squared error", hue="labels", kind="kde")
plt.xlim(-5, 20)

In [None]:
# KDE of the scaled squared error for noised test hours, one curve per noise level.
# At most 10000 rows are drawn; the sample is seeded so the plot is reproducible,
# and capped at the available row count so sample() cannot raise.
noisy_test_rows = error_test[error_test['labels']>0]
kde_sample = noisy_test_rows.sample(n=min(10000, len(noisy_test_rows)), random_state=42)
sns.displot(kde_sample, x="Squared error", hue="labels", kind="kde")
plt.xlim(-5,20)

In [None]:
# Binary ground truth (1 = hour received injected noise), then the ROC curve of
# the scaled squared error used as anomaly score on the validation set.
error_valid['noise'] = error_valid['labels'].abs().gt(0).astype('int')

valid_truth = error_valid['noise']
valid_score = error_valid['Squared error']
fpr, tpr, thresholds = metrics.roc_curve(valid_truth, valid_score, pos_label=1)

In [None]:
# Disabled alternative: choose the threshold that maximises F1 on the
# precision-recall curve. The code is wrapped in a string literal, so none of
# it is executed.
'''
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import RocCurveDisplay

precision, recall, thresholds = precision_recall_curve(error_valid['noise'], error_valid['Squared error'], pos_label=1)
pr_display = PrecisionRecallDisplay(precision=precision, recall=recall).plot()
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

optimal_idx = np.argmax(2*precision*recall/(precision+recall))
optimal_threshold = thresholds[optimal_idx]
print("AUC is:", metrics.auc(fpr, tpr))
print("Threshold value is:", optimal_threshold)
'''

In [None]:
def plot_roc_curve(fper, tper):
    """Draw a ROC curve on the current axes and show the figure.

    Parameters
    ----------
    fper : array-like
        False positive rates (x values).
    tper : array-like
        True positive rates (y values).

    The dashed green diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    ax = plt.gca()
    ax.plot(fper, tper, color='red', label='ROC')
    ax.plot([0, 1], [0, 1], color='green', linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic Curve')
    ax.legend()
    plt.show()
    
# Pick the ROC threshold where (TPR - FPR) is largest, then report AUC and plot.
tpr_minus_fpr = tpr - fpr
optimal_idx = np.argmax(tpr_minus_fpr)
optimal_threshold = thresholds[optimal_idx]

roc_auc = metrics.auc(fpr, tpr)
print("AUC is:", roc_auc)
print("Threshold value is:", optimal_threshold)
plot_roc_curve(fpr, tpr)

In [None]:
# Binary ground truth for the test set: 1 where noise was injected, 0 otherwise.
error_test['noise'] = error_test['labels'].abs().gt(0).astype('int')

In [None]:
# Mean and standard deviation of the scaled squared error per noise level (validation set).
(
    error_valid
    .groupby('labels')['Squared error']
    .agg(['mean', 'std'])
)

In [None]:
# Same summary on a seeded 10% sample of the test rows.
(
    error_test
    .sample(frac=0.1, random_state=42)
    .groupby('labels')['Squared error']
    .agg(['mean', 'std'])
)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

# For each candidate threshold between 3.5 and 5.5 (step 0.1): flag validation
# hours whose scaled squared error exceeds it, then show the confusion matrix,
# the F1 score and the detection counts per injected-noise level.
for optimal_threshold in np.arange(3.5,5.5,0.1):
    exceeds_threshold = error_valid['Squared error'] > optimal_threshold
    error_valid['detection'] = exceeds_threshold.astype('int')

    valid_truth = error_valid['noise']
    valid_detection = error_valid['detection']
    f1 = f1_score(valid_truth, valid_detection)
    cm = confusion_matrix(valid_truth, valid_detection, labels=[0,1])

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
    disp.plot()
    plt.title('threshold: '+str(round(optimal_threshold,1))+' // f1 score: '+ str(round(f1,5)))
    plt.show()

    counts_by_label = error_valid.pivot_table(index='detection',columns=['labels'],values='merged_id',aggfunc='count')
    display(counts_by_label)

In [None]:
record_merged = []

for optimal_threshold in np.arange(1.0,8.0,0.1):
    error_valid['detection'] = (error_valid['Squared error']>optimal_threshold).astype('int')
    f1 = f1_score(error_valid['noise'], error_valid['detection'])

    cm = confusion_matrix(error_valid['noise'], error_valid['detection'], labels=[0,1])  
    
    record = pd.DataFrame(data=cm.reshape(1,-4),columns=['TN', 'FN', 'FP', 'TP'])
    record['optimal_threshold'] = optimal_threshold
    record['f1'] = f1    
    
    record_merged.append(record)    

In [None]:
record_merged=pd.concat(record_merged,axis=0,ignore_index=True)
record_merged['Precision'] = record_merged['TP']/(record_merged['TP']+record_merged['FP'])
record_merged['Recall'] = record_merged['TP']/(record_merged['TP']+record_merged['FP'])
record_merged['FP + FN'] = record_merged['FP'] + record_merged['FN']

record_merged.set_index('optimal_threshold')[['FN','FP', 'FP + FN']].plot(ylabel='counts')
record_merged.set_index('optimal_threshold')[['f1']].plot(ylabel='f1 score')

In [None]:
# Apply a fixed threshold of 5.0 to the test set and report the confusion
# matrix, F1 score and detection counts per injected-noise level.
optimal_threshold = 5.0

exceeds_threshold = error_test['Squared error'] > optimal_threshold
error_test['detection'] = exceeds_threshold.astype('int')

test_truth = error_test['noise']
test_detection = error_test['detection']
f1 = f1_score(test_truth, test_detection)
cm = confusion_matrix(test_truth, test_detection, labels=[0,1])

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
disp.plot()
plt.title('threshold: '+str(round(optimal_threshold,1))+' // f1 score: '+ str(round(f1,4)))
plt.show()

counts_by_label = error_test.pivot_table(index='detection',columns=['labels'],values='merged_id',aggfunc='count')
display(counts_by_label)

In [None]:
# For every 20th test meter, plot the first 4000 hours of the noisy series, its
# reconstruction, and the hours flagged as anomalies.
for merged_id in error_test['merged_id'].unique()[::20]:
    meter_rows = error_test.loc[error_test['merged_id']==merged_id]
    df_plot = meter_rows[['x_test_noisy', 'x_test_pred']].copy()
    # Keep the noisy value where an anomaly was detected, NaN elsewhere. Unlike
    # the earlier pivot_table(...)[1] lookup, this also works for a meter with no detections.
    df_plot['detected anomalies'] = meter_rows['x_test_noisy'].where(meter_rows['detection']==1)
    df_plot = df_plot.iloc[:4000]
    plt.figure(figsize=(8,6))
    plt.plot(df_plot.index, df_plot['x_test_noisy'], '-b', label='x_test_noisy')
    plt.plot(df_plot.index, df_plot['x_test_pred'], '--g', label='x_test_pred')
    plt.plot(df_plot.index, df_plot['detected anomalies'], 'r*', label='detected anomalies')
    plt.title(merged_id)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Zoomed view of the same plot: hours 5000-5499 of every 20th test meter.
for merged_id in error_test['merged_id'].unique()[::20]:
    meter_rows = error_test.loc[error_test['merged_id']==merged_id]
    df_plot = meter_rows[['x_test_noisy', 'x_test_pred']].copy()
    # Keep the noisy value where an anomaly was detected, NaN elsewhere. Unlike
    # the earlier pivot_table(...)[1] lookup, this also works for a meter with no detections.
    df_plot['detected anomalies'] = meter_rows['x_test_noisy'].where(meter_rows['detection']==1)
    df_plot = df_plot.iloc[5000:5500]
    plt.figure(figsize=(8,6))
    plt.plot(df_plot.index, df_plot['x_test_noisy'], '-b', label='x_test_noisy')
    plt.plot(df_plot.index, df_plot['x_test_pred'], '--g', label='x_test_pred')
    plt.plot(df_plot.index, df_plot['detected anomalies'], 'r*', label='detected anomalies')
    plt.title(merged_id)
    plt.legend(loc='upper right')
    plt.show()