<center><h3 style='color:red'>Getting Started with Anomaly Detection and Keras</h3><br>KASSEM@ELCAISERI</center><hr>

# Introduction
This notebook introduces anomaly detection with Keras using autoencoders.

An autoencoder is a special type of neural network that is trained to copy its input to its output. For example, given an image of a handwritten digit, an autoencoder first encodes the image into a lower dimensional latent representation, then decodes the latent representation back to an image. An autoencoder learns to compress the data while minimizing the reconstruction error.

# Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model


import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Data Wrangling

In [None]:
# Location of the dataset CSV, relative to the notebook's working directory.
PATH = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(PATH)
# Display the first rows as a quick sanity check of the loaded frame.
df.head()

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
# Column dtypes and non-null counts, to spot missing values before cleaning.
df.info()

# Data Cleaning

Drop unnecessary columns

In [None]:
# Columns that are not used as model inputs.
drop_cols = ['id', 'Unnamed: 32']
# Rebind `df` to the trimmed frame rather than mutating it in place.
df = df.drop(columns=drop_cols)

Convert the diagnosis labels from strings to floats (B → 0.0, M → 1.0)

In [None]:
# Encode the diagnosis as a float: 'B' (benign) -> 0.0, 'M' (malignant) -> 1.0.
rep_dict = {'B': 0.0, 'M': 1.0}
# Assign the result back to the column instead of calling
# `df['diagnosis'].replace(..., inplace=True)`: an in-place method on a column
# selection is a chained assignment and is not guaranteed to update `df`
# (it does not under pandas Copy-on-Write). The explicit cast keeps the column
# numeric regardless of how `replace` infers the resulting dtype.
df['diagnosis'] = df['diagnosis'].replace(rep_dict).astype(float)

In [None]:
# Report the (rows, columns) shape of the cleaned frame.
print(f'Data size is {df.shape}.')

# EDA and Visualization

In [None]:
# `value_counts()` orders by frequency, so the bar order depends on which class
# happens to be the majority. Sort by label value so that bar 0 is always
# diagnosis 0.0 (benign) and bar 1 is always diagnosis 1.0 (malignant), which is
# what the hard-coded tick labels below assume.
diagnosis_counts = df['diagnosis'].value_counts().sort_index()
diagnosis_counts.plot(kind='bar', figsize=(8, 4));
plt.title('Diagnosis Value Counts');
plt.xlabel('Diagnosis');
plt.ylabel('Frequency');
plt.xticks([0.0, 1.0], ['Benign', 'Malignant'], rotation=45);

In [None]:
# Flag rows whose diagnosis is 1.0 (malignant) as anomalies, then split the
# frame into the anomalous and the normal rows using that flag.
df['anomaly'] = df['diagnosis'].eq(1.0)
is_anomaly = df['anomaly']
anomaly = df[is_anomaly]
normal = df[~is_anomaly]

In [None]:
# `sns.distplot` is deprecated and expects a single 1-D variable; the previous
# version passed the whole frames (label columns and every feature at once).
# Compare the two groups on one feature instead: column 0 holds the diagnosis,
# so column 1 is the first measurement. Change `feature` to inspect another one.
feature = normal.columns[1]
sns.kdeplot(normal[feature], label='normal');
sns.kdeplot(anomaly[feature], label='anomaly');

plt.title('normal vs  anomaly Dist.');
plt.xlabel(feature);
plt.ylabel('Dist.');
plt.legend();

# Normalize The Data

In [None]:
# Column 0 holds the diagnosis (0.0 = benign, 1.0 = malignant).
diagnosis = df.iloc[:, 0]

# Columns 1..30 are the 30 measurements used as model inputs (this slice also
# leaves out the helper 'anomaly' column appended during the EDA step).
data = df.iloc[:, 1:31]

# The rest of the notebook treats a True label as "normal": the autoencoder is
# trained on `train_data[train_labels]`, and `predict` returns True when the
# reconstruction error is below the threshold. Malignant samples are the
# anomalies here, so True must mean benign. A plain `astype(bool)` would mark
# malignant rows as True and train the model on the anomalies instead.
labels = ~diagnosis.astype(bool)

# NOTE(review): sklearn's `normalize` defaults to scaling each *row* (sample) to
# unit L2 norm, not each feature to a common range - confirm this is intended.
data = normalize(data)

# Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

In [None]:
# Boolean masks taken from the labels: rows labelled True form the "normal"
# subsets, rows labelled False form the "anomalous" subsets.
train_is_normal = np.asarray(train_labels)
test_is_normal = np.asarray(test_labels)

normal_train_data, anomalous_train_data = (
    train_data[train_is_normal],
    train_data[~train_is_normal],
)
normal_test_data, anomalous_test_data = (
    test_data[test_is_normal],
    test_data[~test_is_normal],
)

# Build the model

In [None]:
class AnomalyDetector(Model):
    """Fully connected autoencoder used to score samples by reconstruction error.

    The encoder compresses the input through Dense layers of 64, 32 and 16 units
    (the 16-unit layer is the bottleneck); the decoder mirrors it with 32 and 64
    units and ends in a sigmoid layer as wide as the input.

    Args:
        n_features: Width of the reconstructed output, i.e. the number of input
            features. When omitted it falls back to ``data.shape[1]`` from the
            notebook's global ``data`` array, which preserves the behaviour of
            calling ``AnomalyDetector()`` with no arguments.
    """

    def __init__(self, n_features=None):
        super().__init__()
        if n_features is None:
            # Backward-compatible default: size the output from the global array.
            n_features = data.shape[1]

        self.encoder = tf.keras.Sequential([
          layers.Dense(64, activation="relu"),
          layers.Dense(32, activation="relu"),
          layers.Dense(16, activation="relu")]) # bottleneck layer

        self.decoder = tf.keras.Sequential([
          layers.Dense(32, activation="relu"),
          layers.Dense(64, activation="relu"),
          layers.Dense(n_features, activation="sigmoid")])

    def call(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiate the autoencoder and configure training: Adam optimizer with
# mean absolute error as the reconstruction loss.
autoencoder = AnomalyDetector()
autoencoder.compile(optimizer='adam', loss='mae')

In [None]:
# Train the autoencoder to reproduce its own input using only the "normal"
# training rows. Validation uses the full test split as both input and target.
fit_options = dict(
    epochs=20,
    batch_size=8,
    validation_data=(test_data, test_data),
    shuffle=True,
)
history = autoencoder.fit(normal_train_data, normal_train_data, **fit_options)

In [None]:
# Training vs. validation loss per epoch, drawn on one axes.
fig, ax = plt.subplots(figsize=(12, 8))
for key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    ax.plot(history.history[key], label=curve_label)
ax.legend();

In [None]:
i = 0  # index of the sample to plot; any row of normal_test_data can be chosen
encoded_data = autoencoder.encoder(normal_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

# Take the x positions from the data itself instead of a hard-coded 30, so the
# plot keeps working if the number of input features changes.
feature_positions = np.arange(normal_test_data.shape[1])

plt.figure(figsize=(12, 8))
plt.plot(normal_test_data[i], 'b')
plt.plot(decoded_data[i], 'r')
# Shade the gap between input and reconstruction (the per-feature error).
plt.fill_between(feature_positions, decoded_data[i], normal_test_data[i], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

In [None]:
encoded_data = autoencoder.encoder(anomalous_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

# Take the x positions from the data itself instead of a hard-coded 30, so the
# plot keeps working if the number of input features changes.
feature_positions = np.arange(anomalous_test_data.shape[1])

# Plot the first anomalous test sample against its reconstruction.
plt.figure(figsize=(12, 8))
plt.plot(anomalous_test_data[0], 'b')
plt.plot(decoded_data[0], 'r')
# Shade the gap between input and reconstruction (the per-feature error).
plt.fill_between(feature_positions, decoded_data[0], anomalous_test_data[0], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

# Detect anomalies
Detect anomalies by checking whether the reconstruction loss is greater than a fixed threshold. In this tutorial, you will calculate the mean absolute error for normal examples from the training set, then classify future examples as anomalous if their reconstruction error is more than one standard deviation above the training mean.

In [None]:
# Reconstruction error (mean absolute error per sample) on the normal training rows.
reconstructions = autoencoder.predict(normal_train_data)
train_loss = tf.keras.losses.mae(reconstructions, normal_train_data)

# Histogram of those per-sample errors.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(train_loss[None, :], bins=50)
ax.set_xlabel("Train loss")
ax.set_ylabel("No of examples")
plt.show()

Choose a threshold value that is one standard deviation above the mean.

In [None]:
# Threshold = mean of the training reconstruction errors plus one standard deviation.
loss_mean, loss_std = np.mean(train_loss), np.std(train_loss)
threshold = loss_mean + loss_std
print("Threshold: ", threshold)

In [None]:
# Reconstruction error (mean absolute error per sample) on the anomalous test rows.
reconstructions = autoencoder.predict(anomalous_test_data)
test_loss = tf.keras.losses.mae(reconstructions, anomalous_test_data)

# Histogram of those per-sample errors, for comparison with the training histogram.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(test_loss[None, :], bins=50)
ax.set_xlabel("Test loss")
ax.set_ylabel("No of examples")
plt.show()

# Predict and Print Stats

In [None]:
def predict(model, data, threshold):
    """Flag which samples the model reconstructs well.

    Args:
        model: Callable model that maps ``data`` to its reconstruction.
        data: Batch of samples to score.
        threshold: Reconstruction-error cut-off.

    Returns:
        Boolean tensor that is True where a sample's mean absolute
        reconstruction error is strictly below ``threshold``.
    """
    reconstruction_error = tf.keras.losses.mae(model(data), data)
    return tf.math.less(reconstruction_error, threshold)

def print_stats(predictions, labels):
    """Print accuracy, precision and recall of ``predictions`` against ``labels``.

    Args:
        predictions: Predicted boolean labels.
        labels: Ground-truth boolean labels.
    """
    report = (
        ("Accuracy = {}", accuracy_score),
        ("Precision = {}", precision_score),
        ("Recall = {}", recall_score),
    )
    for template, metric in report:
        print(template.format(metric(labels, predictions)))

In [None]:
# Score the whole test split against the threshold and report the metrics.
preds = predict(model=autoencoder, data=test_data, threshold=threshold)
print_stats(predictions=preds, labels=test_labels)

# What to do to get better results:
* Use a deeper network 
* Use LSTM in the network
* Feature importance and selection
* Try different batch sizes
* Train for more epochs

# Reference:
* https://www.tensorflow.org/tutorials/generative/autoencoder
* https://www.youtube.com/watch?v=2K3ScZp1dXQ&ab_channel=TensorFlow
* https://www.anodot.com/blog/what-is-anomaly-detection/ (theory)

<h4>If you find it interesting, I am waiting for your feedback in the comments section. <span style='color:red'>UPVOTE</span> if you like it.</h4> <br>

Notebook still under modifications