In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: `dirname` and `filenames` keep the values of the LAST loop iteration and
# are reused by the data-loading cell further down (`filenames[-1]` and
# `filenames[-2]`), so which CSVs get loaded depends on the order in which
# os.walk yields directories and files.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Context

In the solar energy industry it is common to have **misproduction problems** caused by issues such as dirty solar panels, inverter failures, sensor issues and more. In this notebook I will compare two approaches, the first using an **Isolation Forest** and the second an **LSTM Autoencoder**, to see which one is the most efficient at detecting anomalies in an AC power time series.

In [None]:
# Load the generation and weather CSVs.
# `dirname` / `filenames` are whatever the os.walk loop in the first cell left
# behind after its final iteration; the last entry is read as generation data
# and the second-to-last as weather data.
# NOTE(review): this depends on the directory listing order -- confirm that the
# intended files are picked up, or switch to explicit file names.
generation_path = os.path.join(dirname, filenames[-1])
weather_path = os.path.join(dirname, filenames[-2])
generation1 = pd.read_csv(generation_path)
weather1 = pd.read_csv(weather_path)

# Parse the timestamp column of both frames, preferring day-first
# interpretation for ambiguous dates.
for frame in (generation1, weather1):
    frame['DATE_TIME'] = pd.to_datetime(frame['DATE_TIME'], dayfirst=True)


In [None]:
# Display the `generation1` DataFrame loaded in the previous cell.
generation1

In [None]:
# Collect the distinct SOURCE_KEY values (one per inverter), in order of first
# appearance, and report how many there are.
inverters = generation1['SOURCE_KEY'].unique().tolist()
inverter_count = len(inverters)
print(f"total number of inverters {inverter_count}")


# Inverter-level anomaly detection

In [None]:
# Show the first SOURCE_KEY; it is the inverter selected as `inv_1` in the next cell.
inverters[0]

In [None]:
# Keep only the rows that belong to the first inverter.
inv_1 = generation1[generation1['SOURCE_KEY'] == inverters[0]]

# Restrict the weather readings to the time window covered by that inverter.
# Series.min()/max() are vectorised and skip missing timestamps, unlike the
# builtin min()/max(), which iterate element by element in Python.
window_start = inv_1["DATE_TIME"].min()
window_end = inv_1["DATE_TIME"].max()

# `between` is inclusive on both ends by default, matching `>= start & <= end`.
mask = weather1['DATE_TIME'].between(window_start, window_end)
weather_filtered = weather1.loc[mask]

In [None]:
# (rows, columns) of the weather readings kept by the time-window mask.
weather_filtered.shape

In [None]:
# Overlay AC power (left axis) and irradiation (right axis) for the selected
# inverter so both series can be compared over time.
ac_trace = go.Scatter(
    x=inv_1["DATE_TIME"],
    y=inv_1["AC_POWER"],
    mode='lines',
    name='AC Power',
)
irradiation_trace = go.Scatter(
    x=weather_filtered["DATE_TIME"],
    y=weather_filtered["IRRADIATION"],
    mode='lines',
    name='Irradiation',
    yaxis='y2',  # drawn against the secondary y axis configured below
)

left_axis = dict(title="AC Power in kW", side='left')
right_axis = dict(title="Irradiation index", side='right', anchor="x", overlaying="y")

fig = go.Figure(data=[ac_trace, irradiation_trace])
fig.update_layout(title_text="Irradiation vs AC POWER", yaxis1=left_axis, yaxis2=right_axis)
fig.show()

### Graph observations
We can see that on June 7th and June 14th there are some misproduction areas that could be considered anomalies, because energy production should behave linearly with respect to irradiation.

In [None]:
# Attach the weather readings to each inverter row by timestamp (left join, so
# every inverter row is kept), then keep only the columns used for modelling.
model_columns = ['DATE_TIME', 'AC_POWER', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
merged = inv_1.merge(weather_filtered, how='left', on="DATE_TIME")
df = merged.loc[:, model_columns]
df

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Chronological train/test split.
# The split point is computed as an integer position and applied with .iloc,
# so the two splits never overlap. (.loc with a float label is inclusive on
# both ends and would put the boundary row in both splits whenever
# len(df) * train_prp is a whole number.)
train_prp = .6
split_at = int(np.ceil(df.shape[0] * train_prp))
# .copy() makes the splits independent frames, so adding columns below does
# not raise SettingWithCopyWarning or touch `df`.
train = df.iloc[:split_at].copy()
test = df.iloc[split_at:].copy()

# Fit the Isolation Forest on the training window only.
features = ['AC_POWER', "IRRADIATION"]
clf = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=.03,
    max_features=2,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
clf.fit(train[features])
pred = clf.predict(test[features])

# `anomaly` holds the raw prediction; -1 is the label treated as an anomaly.
test['anomaly'] = pred
is_anomaly = test['anomaly'] == -1
anomalies = test.loc[is_anomaly, ['AC_POWER']].rename(columns={'AC_POWER': 'anomalies'})
# `anomalies` column: AC power on flagged rows, NaN elsewhere (used as the
# marker series in the plot).
test['anomalies'] = test['AC_POWER'].where(is_anomaly)
test

In [None]:
# Number of test rows per predicted label (-1 is the label the cell above treats as an anomaly).
test.anomaly.value_counts()

In [None]:
# Score the training window with the fitted model.
pred = clf.predict(train[features])

# Build the result with .assign so the cell is idempotent: re-running it
# overwrites the two columns instead of producing `anomalies_x` / `anomalies_y`
# (which a repeated merge would), and no column is set on a slice of `df`.
train = train.assign(anomaly=pred)
is_anomaly = train['anomaly'] == -1
anomalies = train.loc[is_anomaly, ['AC_POWER']].rename(columns={'AC_POWER': 'anomalies'})
# AC power on flagged rows, NaN elsewhere.
train = train.assign(anomalies=train['AC_POWER'].where(is_anomaly))
train

In [None]:
# Number of training rows per predicted label (-1 is the label the cell above treats as an anomaly).
train.anomaly.value_counts()

In [None]:
# AC power over the test window, with the rows flagged by the Isolation Forest
# drawn as red markers on top of the line.
ac_line = go.Scatter(
    x=test["DATE_TIME"],
    y=test["AC_POWER"],
    mode='lines',
    name='AC Power',
)

anomaly_marker = dict(color="red", size=11, line=dict(color="red", width=2))
anomaly_points = go.Scatter(
    x=test["DATE_TIME"],
    y=test["anomalies"],  # NaN on non-flagged rows, so only anomalies are drawn
    name='Anomaly',
    mode='markers',
    marker=anomaly_marker,
)

fig = go.Figure(data=[ac_line, anomaly_points])
fig.update_layout(
    title_text="Anomalies Detected using Isolation Forest",
    yaxis1=dict(title="AC Power in kW"),
)
fig.show()

### Observation after building model
We see that the model does detect the misproduction areas on June 7th and June 14th, but it also detects anomalies at the peaks of most days, which personally I wouldn't consider anomalies.

In [None]:
# Evaluate the Isolation Forest decision function on a regular grid spanning
# the training data, for the contour plot in the next cell.
x_min, x_max = (train['AC_POWER'].min(), train['AC_POWER'].max())
y_min, y_max = (train['IRRADIATION'].min(), train['IRRADIATION'].max())

# Extend each bound by 20% of its own value (a bound equal to 0 is unchanged).
xrange = np.linspace(x_min - (x_min*.2), x_max + (x_max*.2), 1000)
yrange = np.linspace(y_min - (y_min*.2), y_max + (y_max*.2), 1000)
xx, yy = np.meshgrid(xrange, yrange)

# The model was fitted on a DataFrame with columns ['AC_POWER', 'IRRADIATION'];
# score a DataFrame with the same column names and order so scikit-learn can
# validate the features instead of warning about missing feature names.
grid = pd.DataFrame({'AC_POWER': xx.ravel(), 'IRRADIATION': yy.ravel()})
Z = clf.decision_function(grid)
Z = Z.reshape(xx.shape)

In [None]:

# Split both datasets into normal points (prediction == 1) and anomalies
# (prediction == -1) for plotting.
plot_columns = ['AC_POWER', 'IRRADIATION']
n_train = train.loc[train['anomaly'] == 1, plot_columns]
a_train = train.loc[train['anomaly'] == -1, plot_columns]
n_test = test.loc[test['anomaly'] == 1, plot_columns]
a_test = test.loc[test['anomaly'] == -1, plot_columns]

# (data, legend label, split, marker symbol, colour).
# The n_* frames hold the normal points and the a_* frames the anomalies; the
# legend labels now say so (they were previously swapped).
trace_specs = [
    [n_train, 'not anomaly', 'Train', 'square', 'green'],
    [a_train, 'anomaly', 'Train', 'square', 'red'],
    [n_test, 'not anomaly', 'Test', 'circle', 'blue'],
    [a_test, 'anomaly', 'Test', 'circle', 'orange']]

fig = go.Figure(data=[
    go.Scatter(
        x=data["AC_POWER"], y=data['IRRADIATION'],
        name=f'{split} Split, Label {label}',
        mode='markers', marker_symbol=marker,
        marker=dict(color=color)
    )
    for data, label, split, marker, color in trace_specs
])

# Decision-function surface computed in the previous cell, drawn underneath
# the points as a semi-transparent contour.
fig.add_trace(
    go.Contour(
        x=xrange,
        y=yrange,
        z=Z,
        showscale=False,
        colorscale='RdBu',
        opacity=0.4,
        name='Score',
        hoverinfo='skip'
    )
)
fig.update_layout(title="Isolation Forest Contour and Scatter Plot",
                  yaxis=dict(title="Irradiation index"),
                  xaxis=dict(title="Power in kW"))

fig.show()

### Observations
Here we can see how the Isolation Forest model is behaving. The orange circles show the anomalies detected on the test dataset, and the red squares show the anomalies detected on the training dataset. These points do not follow the contour pattern of the graph, and we can clearly see that the orange circles on the far left are the points from June 7th and June 14th.

# LSTM Autoencoder approach

In [None]:
# Prepare the inputs for the LSTM autoencoder: the timestamps are kept aside in
# `df_timestamp`, and `df_` holds only the four numeric columns fed to the model.
df = df.loc[:, ["DATE_TIME", "AC_POWER", "AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE", "IRRADIATION"]]
df_timestamp = df.loc[:, ["DATE_TIME"]]
df_ = df.drop(columns=["DATE_TIME"])

In [None]:
# Chronological train/test split for the autoencoder.
# The split point is an integer position applied with .iloc, so the splits
# never overlap. (.loc with a float label is inclusive on both ends and would
# put the boundary row in both splits whenever len(df_) * train_prp is a whole
# number.)
train_prp = .6
split_at = int(np.ceil(df_.shape[0] * train_prp))
train = df_.iloc[:split_at]
test = df_.iloc[split_at:]


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale the features with a MinMaxScaler fitted on the training split only; the
# test split reuses the fitted scaler.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features), the
# 3-D layout passed to the LSTM model below.
X_train = train_scaled[:, np.newaxis, :]
X_test = test_scaled[:, np.newaxis, :]

for label, array in (("X_train", X_train), ("X_test", X_test)):
    print(f"{label} shape: {array.shape}")

In [None]:
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

In [None]:
def autoencoder_model(X):
    """Build an uncompiled LSTM autoencoder sized after ``X``.

    Args:
        X: 3-D array; only ``X.shape[1]`` and ``X.shape[2]`` are read, as the
            sequence length and the number of features of the model input.

    Returns:
        A Keras ``Model`` mapping an input of shape ``(X.shape[1], X.shape[2])``
        to an output with ``X.shape[2]`` values per time step.
    """
    timesteps, n_features = X.shape[1], X.shape[2]
    inputs = Input(shape=(timesteps, n_features))

    # Encoder: 16 -> 4 units; the second layer returns only its final state.
    # The L2 factor is 0.00, so this regularizer adds no penalty.
    encoded = LSTM(16, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = LSTM(4, activation='relu', return_sequences=False)(encoded)

    # Repeat the encoded vector once per time step so it can be decoded as a sequence.
    decoded = RepeatVector(timesteps)(encoded)

    # Decoder: mirror of the encoder, 4 -> 16 units.
    decoded = LSTM(4, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(16, activation='relu', return_sequences=True)(decoded)

    # Project every time step back to the original number of features.
    output = TimeDistributed(Dense(n_features))(decoded)
    return Model(inputs=inputs, outputs=output)

In [None]:
# Build the autoencoder for the training array's shape and compile it with the
# Adam optimizer and mean absolute error as the reconstruction loss.
model = autoencoder_model(X_train)
model.compile(loss='mae', optimizer='adam')
model.summary()

In [None]:
# Train the autoencoder to reproduce its own input (X_train is both input and
# target); 20% of the training data is held out for validation.
epochs, batch = 100, 10
fit_result = model.fit(
    X_train,
    X_train,
    epochs=epochs,
    batch_size=batch,
    validation_split=.2,
    verbose=0,
)
# Keep only the per-epoch metrics dictionary.
history = fit_result.history


In [None]:
# Training vs validation loss per epoch.
fig = go.Figure()
for history_key, legend_name in (('loss', 'loss'), ('val_loss', 'validation loss')):
    epoch_axis = list(range(len(history[history_key])))
    fig.add_trace(go.Scatter(x=epoch_axis, y=history[history_key],
                             mode='lines',
                             name=legend_name))

fig.update_layout(
    title="Autoencoder error loss over epochs",
    yaxis=dict(title="Loss"),
    xaxis=dict(title="Epoch"),
)
fig.show()

In [None]:
# Reconstruct the training window with the autoencoder and bring the values
# back to the original (unscaled) units.
X_pred = model.predict(X_train)
# Drop the middle axis: (rows, 1, features) -> (rows, features).
X_pred = X_pred.reshape(X_pred.shape[0], X_pred.shape[2])
X_pred = scaler.inverse_transform(X_pred)
# Carry over train's index (as the test reconstruction does with test.index) so
# the index-aligned comparison against `train` in the next cell pairs each
# prediction with its own row regardless of how `train` is indexed.
X_pred = pd.DataFrame(X_pred, columns=train.columns, index=train.index)

In [None]:
# Per-row absolute reconstruction error of AC power on the training window.
# Rows follow train's index; predictions are aligned to it by index label.
scores = train[['AC_POWER']].rename(columns={'AC_POWER': 'AC_train'})
scores = scores.assign(AC_predicted=X_pred["AC_POWER"])
abs_error = (scores['AC_train'] - scores['AC_predicted']).abs()
scores = scores.assign(loss_mae=abs_error)


In [None]:
# Histogram of the training reconstruction error.
error_histogram = go.Histogram(x=scores['loss_mae'])
fig = go.Figure(data=[error_histogram])
fig.update_layout(
    title="Error distribution",
    xaxis=dict(title="Error delta between predicted and real data [AC Power]"),
    yaxis=dict(title="Data point counts"),
)
fig.show()

In [None]:
# Reconstruct the test window and bring the values back to the original units.
X_pred = model.predict(X_test)
n_rows, _, n_feats = X_pred.shape
# Drop the middle axis before undoing the scaling.
unscaled = scaler.inverse_transform(X_pred.reshape(n_rows, n_feats))
# Index the frame like `test` so later index-aligned operations line up.
X_pred = pd.DataFrame(unscaled, columns=train.columns, index=test.index)

In [None]:
# Absolute AC-power reconstruction error above which a test row is flagged.
loss_threshold = 200

# Work on a copy so the reconstruction frame `X_pred` is not modified.
scores = X_pred.copy()
# Look the timestamps up through the frame's own index instead of a hard-coded
# row label, so the cell keeps working if the data size or split ratio changes.
scores['datetime'] = df_timestamp.loc[scores.index, 'DATE_TIME']
scores['real AC'] = test['AC_POWER']
# `AC_POWER` here is the reconstructed value; `real AC` is the measurement.
scores["loss_mae"] = (scores['real AC'] - scores['AC_POWER']).abs()
scores['Threshold'] = loss_threshold
# 1 = error above the threshold (anomaly), 0 = otherwise.
scores['Anomaly'] = np.where(scores["loss_mae"] > scores["Threshold"], 1, 0)


In [None]:
# Reconstruction error over time, together with the anomaly threshold line.
loss_trace = go.Scatter(x=scores['datetime'], y=scores['loss_mae'], name="Loss")
threshold_trace = go.Scatter(x=scores['datetime'], y=scores['Threshold'], name="Threshold")

fig = go.Figure(data=[loss_trace, threshold_trace])
fig.update_layout(
    title="Error Timeseries and Threshold",
    xaxis=dict(title="DateTime"),
    yaxis=dict(title="Loss"),
)
fig.show()

In [None]:
# Number of test rows flagged (1) vs not flagged (0) by the threshold rule.
scores['Anomaly'].value_counts()

In [None]:
# Add an `anomalies` column holding the measured AC power on flagged rows and
# NaN elsewhere (used as the marker series in the plot).
# Built with .assign instead of a merge so the cell is idempotent: re-running it
# overwrites the column rather than creating `anomalies_x` / `anomalies_y`.
is_anomaly = scores['Anomaly'] == 1
anomalies = scores.loc[is_anomaly, ['real AC']].rename(columns={'real AC': 'anomalies'})
scores = scores.assign(anomalies=scores['real AC'].where(is_anomaly))

In [None]:
# Measured AC power over the test window, with the rows flagged by the
# autoencoder drawn as red markers on top of the line.
ac_line = go.Scatter(
    x=scores["datetime"],
    y=scores["real AC"],
    mode='lines',
    name='AC Power',
)

anomaly_marker = dict(color="red", size=11, line=dict(color="red", width=2))
anomaly_points = go.Scatter(
    x=scores["datetime"],
    y=scores["anomalies"],  # NaN on non-flagged rows, so only anomalies are drawn
    name='Anomaly',
    mode='markers',
    marker=anomaly_marker,
)

fig = go.Figure(data=[ac_line, anomaly_points])
fig.update_layout(title_text="Anomalies Detected LSTM Autoencoder")
fig.show()

## Conclusion

We see that the LSTM Autoencoder approach is a more efficient way to detect anomalies than the Isolation Forest approach. Perhaps with a larger dataset the Isolation Forest could outperform the Autoencoder, offering a faster and still fairly good model for detecting anomalies.

We can see from the Isolation Forest graph how the model is detecting anomalies, highlighting the datapoints from June 7th and June 14th.
