In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!rm -f utils.py
!wget -nv https://github.com/minesh1291/stackoverflow/raw/master/machine_learning/utils.py
!pip install -q kneebow

In [None]:
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler 
from sklearn.cluster import DBSCAN 
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

from statsmodels.tsa.seasonal import seasonal_decompose

import keras
from keras import layers

import utils


In [None]:
# Shared random seed: passed to np.random.seed before training and used as
# random_state for the IsolationForest.
SEED = 1291

In [None]:
# Load the satellite dataset and report its dimensions.
csv_path = "/kaggle/input/statlog-landsat-satellite/satellite.mat.csv"
satellite = pd.read_csv(csv_path)
print(satellite.shape)

# Names of the columns whose label begins with "V".
features = list(filter(lambda col: col.startswith("V"), satellite.columns))
satellite.head()


In [None]:
# Without anomalies: the first 1400 rows whose label Y is 0.
idx = satellite["Y"]==0

# Select the rows once, positionally, so both panels show exactly the same rows.
# (A label-based .loc[:1400] on the filtered frame would pick a different subset.)
normal_head = satellite.loc[idx].iloc[:1400].reset_index(drop=True)

# Feature columns: everything except the first and last column.
fig, ax = plt.subplots(figsize=(24,5))
normal_head.iloc[:,1:-1].plot(ax=ax, legend=False)
plt.show()

# Matching label trace for the same rows.
fig, ax = plt.subplots(figsize=(24,3))
normal_head["Y"].plot(ax=ax)
plt.show()


In [None]:
# Last 1400 rows of the full dataset: feature columns on top, label below.
fig, ax = plt.subplots(figsize=(24,5))
satellite.iloc[-1400:,1:-1].plot(ax=ax, legend=False)
plt.show()

# Positional slice (.iloc): a label slice .loc[-1400:] would match every row of the
# default integer index and plot the whole column instead of the tail.
fig, ax = plt.subplots(figsize=(24,3))
satellite["Y"].iloc[-1400:].plot(ax=ax)
plt.show()


In [None]:
# Model inputs: every column except the first and the last (positional slice 1:-1).
train_org = satellite.iloc[:,1:-1]
# Labels come from the "Y" column.
train_y = satellite["Y"]

# Alternative dataset (disabled):
# train_org = satimage.iloc[:,1:-1]
# train_y = satimage["Y"]

In [None]:
# Preprocessing: robust-scale the inputs, then eyeball raw vs. scaled values.

scaler = RobustScaler()
train_org_scaled = scaler.fit_transform(train_org)

# Same slice (last 400 rows, columns 1..4) of the raw and the scaled data, one figure each.
for values in (train_org.values, train_org_scaled):
    fig, ax = plt.subplots(figsize=(24,4))
    tail_slice = pd.DataFrame(values[-400:,1:5])
    tail_slice.plot(ax=ax, legend=False)
    plt.show()

# Labels for the same last 400 rows.
fig, ax = plt.subplots(figsize=(24,2))
tail_labels = satellite["Y"].iloc[-400:]
tail_labels.plot(ax=ax)
plt.show()



In [None]:
# Wrap the scaled array back into a DataFrame (default integer row/column labels) so
# the pandas .apply / .rolling calls in later cells can be used on it.
# Note: this rebinds train_org to the *scaled* data.
train_org = pd.DataFrame(train_org_scaled)

In [None]:
def get_resid(x, period=9):
    """Return the residual of an additive seasonal decomposition of ``x``.

    Args:
        x: series to decompose.
        period: seasonal period handed to ``seasonal_decompose``.

    Returns:
        The ``resid`` component with missing values replaced by 0.
    """
    decomposition = seasonal_decompose(x, model='additive', period=period)
    residual = decomposition.resid
    return residual.fillna(0)

# Column-wise residuals using the default period.
train_resid = train_org.apply(get_resid)

In [None]:
TIME_STEPS = 150

def create_sequences(values, time_steps=TIME_STEPS):
    """Stack overlapping windows of ``values`` for use as model input.

    One window of ``time_steps`` consecutive rows is taken for every start
    position ``0 .. len(values) - time_steps - 1``; the result therefore has
    ``len(values) - time_steps`` windows stacked along a new leading axis.
    """
    n_windows = len(values) - time_steps
    windows = [values[start : start + time_steps] for start in range(n_windows)]
    return np.stack(windows)

# Build the windowed model input from the scaled data.
x_train = create_sequences(train_org)
print("Training input shape: ", x_train.shape) # the last TIME_STEPS rows do not start a window

In [None]:
# Keep only the rows that start a window in x_train so that row i of train_org /
# train_y lines up with x_train[i]. Slicing by len(x_train) (instead of dropping the
# last TIME_STEPS rows from whatever train_org currently is) keeps this cell
# idempotent: re-running it does not shrink the data a second time.
n_windows = len(x_train)
train_org = train_org.iloc[:n_windows]
train_y = satellite["Y"].iloc[:n_windows]

train_org.shape, train_y.shape, train_y.value_counts()

In [None]:
# L2 weight penalty shared by every regularised layer of the autoencoder.
l2 = keras.regularizers.L2(1e-3)

def get_AE(n_features=36, time_steps=None):
    """Build and compile the Conv1D + bidirectional-LSTM sequence autoencoder.

    Layer order: BatchNorm -> Conv1D(64, kernel 15) -> BatchNorm -> Dropout ->
    BiLSTM(32) (returns one vector per window) -> RepeatVector(time_steps) ->
    BiLSTM(32, return_sequences=True) -> BatchNorm -> Dropout ->
    Conv1D(64, kernel 15) -> BatchNorm -> Dropout -> TimeDistributed(Dense(n_features)).

    Args:
        n_features: number of channels per time step, for both input and output
            (defaults to 36, the width previously hard-coded).
        time_steps: window length; defaults to the module-level TIME_STEPS.

    Returns:
        A keras Sequential model compiled with Adam (learning rate 1e-3) and MSE loss.
    """
    if time_steps is None:
        time_steps = TIME_STEPS

    model = keras.models.Sequential([
        layers.Input(shape=(time_steps, n_features)),
        layers.BatchNormalization(),
        layers.Conv1D(filters=64, kernel_size=15, padding='same', data_format='channels_last',
            dilation_rate=1, activation="linear", kernel_regularizer=l2),
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        # Encoder: no return_sequences, so each window is summarised as one vector.
        layers.Bidirectional(layers.LSTM(32, activation="tanh", kernel_regularizer=l2)),
        # Decoder: repeat that vector for every time step and unroll it again.
        layers.RepeatVector(time_steps),
        layers.Bidirectional(layers.LSTM(32, return_sequences=True, activation="tanh", kernel_regularizer=l2)),
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        layers.Conv1D(filters=64, kernel_size=15, padding='same', data_format='channels_last',
            dilation_rate=1, activation="linear", kernel_regularizer=l2),
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        layers.TimeDistributed(layers.Dense(n_features, activation='linear', kernel_regularizer=l2))
    ])
    # `learning_rate` is the supported keyword (`lr` is rejected by current Keras).
    # The former decay=1e-11 is dropped: at that magnitude it leaves the learning
    # rate effectively unchanged, and newer optimizers no longer accept the argument.
    adam = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(optimizer=adam, loss="mse")
    return model

model = get_AE()
model.summary()

In [None]:
# Stop once the validation loss has not improved by at least 1e-5 for 4 epochs and
# roll back to the best weights seen.
es = keras.callbacks.EarlyStopping(patience=4, min_delta=1e-5, verbose=1, restore_best_weights=True)
# cp = keras.callbacks.ModelCheckpoint(filepath="best_model.ckp", verbose=1, save_best_only=True)
callbacks=[es] #, cp]

# Seed before the model is built. np.random.seed alone does not seed the Keras
# backend (weight initialisation, dropout), so also seed Keras where the helper
# is available.
np.random.seed(SEED)
if hasattr(keras.utils, "set_random_seed"):
    keras.utils.set_random_seed(SEED)

# Autoencoder: the windows are both input and target; the last 20% of the windows
# are held out for validation.
model = get_AE()
model.fit(x=x_train, y=x_train, batch_size=256, validation_split=0.2, epochs=20,
          callbacks=callbacks
         )

In [None]:
# model.save("best_model.ckp")

# model = keras.models.load_model("best_model.ckp")

In [None]:
# Spot-check a few windows. The same index list selects both the inputs that are
# reconstructed and the inputs that are plotted, so each figure compares a window
# with its own reconstruction.
sample_idx = [1, 201, 401]
pred_x_train = model.predict(x_train[sample_idx])

# Feature (channel) to display; also reused by later plotting cells.
feat_idx = 2

for row, win in enumerate(sample_idx):
    plt.plot(x_train[win,:,feat_idx])
    plt.plot(pred_x_train[row,:,feat_idx])
    plt.title(f"window {win}, feature {feat_idx}")
    plt.show()


In [None]:
# Reconstruct every training window with the trained autoencoder; later cells read
# time step 0 of each reconstructed window (x_train_recons[:, 0, :]).
x_train_recons = model.predict(x_train)

In [None]:
# def make_encoder(ae_model):
#     encoder = keras.models.Sequential(
#         ae_model.layers[:5]
#     )
#     return encoder

# encoder = make_encoder(model)

# latent_out = encoder.predict(x_train.astype("float32"))
# latent_out.shape

In [None]:
# X_train = np.concatenate([satellite.iloc[:-TIME_STEPS,1:-1],latent_out], axis=1)

# Each centred rolling median is needed twice (as a feature and as the baseline of a
# difference feature), so compute it once per window size. Windows that are not
# fully populated give NaN, which is replaced by 0.
roll_median = {
    window: train_org.rolling(window, center=True).median().fillna(0)
    for window in (6, 9, 12)
}
# One shared 15-row centred rolling view for the spread / quantile features.
roll_15 = train_org.rolling(15, center=True)

# Insertion order determines the column order of X_train.
feats_dict= {
    # Difference to the first time step of each reconstructed window.
    "train_diff": train_org - x_train_recons[:,0,:],

    # Seasonal-decomposition residuals at several periods.
    "train_resid_w9": train_org.apply(lambda x: get_resid(x, period=9)),
    "train_resid_w6": train_org.apply(lambda x: get_resid(x, period=6)),
    "train_resid_w3": train_org.apply(lambda x: get_resid(x, period=3)),

    # Deviation from the local median.
    "roll_diff_6": train_org - roll_median[6],
    "roll_diff_9": train_org - roll_median[9],
    "roll_diff_12": train_org - roll_median[12],

    # The local median itself.
    "roll_median_6":  roll_median[6],
    "roll_median_9":  roll_median[9],
    "roll_median_12":  roll_median[12],

    # Local spread and distance to the local 15% / 85% quantiles.
    "roll_std_15":  roll_15.std().fillna(0),
    "roll_diff_lq_15":  train_org - roll_15.quantile(0.15).fillna(0),
    "roll_diff_uq_15":  train_org - roll_15.quantile(0.85).fillna(0),
}

# Scaled inputs first, then every derived feature block, side by side.
data_ls = [train_org, *feats_dict.values()]

X_train = np.concatenate(data_ls, axis=1)
X_train.shape

In [None]:
# Ask the helper module for a neighbourhood radius, then cluster with DBSCAN.
loc, eps = utils.get_eps(X_train)
print("using eps:", eps)

dbscan = DBSCAN(eps=eps, n_jobs=-1)
dbscan.fit(X_train)
pred_y_out = dbscan.labels_

# (unique labels, how many samples carry each label)
freq = np.unique(pred_y_out, return_counts=True)
freq

In [None]:
# plt.plot(X_train[:600]);
# plt.show();
# plt.plot(train_y[:600]);
# plt.show();

In [None]:
# Share of samples that DBSCAN labelled as noise (-1). Selecting the -1 entry
# explicitly (rather than taking the first unique label) yields 0 when DBSCAN found
# no noise, instead of silently reporting the share of cluster 0.
labels, counts = freq
conta = counts[labels == -1].sum() / counts.sum()
conta

In [None]:
# Isolation forest over the engineered features; contamination is fixed at 2/6.
isof = IsolationForest(
    n_estimators=100,
    max_samples="auto",
    max_features=0.7,
    contamination=2/6,
    random_state=SEED,
)
# Fit, then label the same samples the forest was fitted on.
pred_y_out = isof.fit(X_train).predict(X_train)

In [None]:
# from statsmodels.tsa.seasonal import seasonal_decompose

# n_steps = 2300

# series = pd.DataFrame(x_train[:n_steps, 0, feat_idx])
# result = seasonal_decompose(series, model='additive', period=9)

# fig, ax = plt.subplots(5,1,figsize=(24,8), sharex=True)
# series.plot(ax=ax[0])
# result.trend.plot(ax=ax[1])
# result.seasonal.plot(ax=ax[2])
# result.resid.plot(style=".",ax=ax[3])
# ax[4].plot(train_y[:n_steps])

# plt.show()

In [None]:
n_steps = 1000

# One figure per panel: (figure height, series to draw, legend entries).
panels = [
    (4, [x_train[:n_steps, 0, feat_idx], x_train_recons[:n_steps, 0, feat_idx]], ["x-feat-i", "recons"]),
    (2, [train_y[:n_steps]], ["true_y"]),
    # Sign flipped before plotting.
    (2, [pred_y_out[:n_steps]*-1], ["pred_y"]),
]

for height, curves, legend_labels in panels:
    fig, ax = plt.subplots(figsize=(24,height))
    for curve in curves:
        ax.plot(curve)
    ax.legend(legend_labels)
    plt.show()


In [None]:
# Lookup table for re-encoding detector output: -1 becomes 1, 1 becomes 0.
d = {-1:1,1:0}

def map_values(x):
    """Return the re-encoded value for ``x`` from ``d``; unknown values map to 0."""
    return d.get(x, 0)

# Element-wise version of map_values for arrays.
vectorize = np.vectorize(map_values)

# Ground-truth labels cast to int8.
true_y = train_y.astype("int8")
# Detector output re-encoded element-wise with map_values (-1 -> 1, anything else -> 0).
pred_y = vectorize(pred_y_out)


In [None]:
# fig, ax = plt.subplots(2,1,figsize=(24,2))

# n_steps = 1600

# pd.Series(true_y)[:n_steps].plot(ax=ax[0])
# pd.Series(pred_y).rolling(9, center=True).median().fillna(0)[:n_steps].plot(ax=ax[1])

# plt.tight_layout()

In [None]:

# Per-class precision / recall / F1 on the thresholded predictions.
cr = classification_report(true_y ,pred_y)

print(cr)

# ROC AUC is a ranking metric, so feed it a continuous score rather than the 0/1
# predictions. IsolationForest.score_samples is lower for more abnormal samples, so
# negate it to make "higher" mean "more anomalous" (the class encoded as 1 in pred_y).
anomaly_score = -isof.score_samples(X_train)
print(f"AUC: {roc_auc_score(true_y, anomaly_score):0.4f}")
# The single-threshold value on hard labels, kept for comparison with earlier runs.
print(f"AUC (hard labels): {roc_auc_score(true_y, pred_y):0.4f}")
confusion_matrix(true_y, pred_y)