<a href="https://colab.research.google.com/github/saradhasarah12/Deep_Learning_project_Showcase/blob/main/Outlier_Detection_using_Autoencoder_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, losses
from sklearn.metrics import classification_report

# Generate a synthetic two-class dataset in which the second class is rare
# (requested class weights 99.5% / 0.5%); the rare class stands in for outliers.
X, y = make_classification(
    n_samples=100000,
    n_features=32,
    n_informative=32,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.995, 0.005],
    class_sep=0.5,
    random_state=0,
)

# Hold out 20% of the records for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition
n_train_records = X_train.shape[0]
n_test_records = X_test.shape[0]
print('The number of records in the training dataset is', n_train_records)
print('The number of records in the test dataset is', n_test_records)

# (label, count) pairs ordered by label: entry 0 is reported as the majority
# class and entry 1 as the minority class.
label_counts = sorted(Counter(y_train).items())
majority_count = label_counts[0][1]
minority_count = label_counts[1][1]
print(f"The training dataset has {majority_count} records for the majority class and {minority_count} records for the minority class.")

# Create Autoencoder model for training
# Only label-0 ("normal") records are used for fitting, so the network learns
# to reconstruct normal data and should reconstruct other records less well.
X_train_normal = X_train[y_train == 0]

# Derive the layer width from the data instead of hard-coding it, so the model
# keeps working if the number of features changes.
n_features = X_train_normal.shape[1]

input_layer = tf.keras.layers.Input(shape=(n_features,))

# Encoder: compress n_features -> 16 -> 8 -> 4 (bottleneck)
encoder = tf.keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu'),
])(input_layer)

# Decoder: expand 4 -> 8 -> 16 -> n_features.
# The output layer is linear on purpose: the features are fed in unscaled (no
# normalisation step precedes training), and a sigmoid output is bounded to
# (0, 1), so it could never reproduce negative values or values above 1.
decoder = tf.keras.Sequential([
    layers.Dense(8, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(n_features, activation="linear"),
])(encoder)

autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoder)

# Compile the autoencoder (mean absolute reconstruction error as the loss)
autoencoder.compile(optimizer='adam', loss='mae')

# Fit the autoencoder: input and target are the same array.
# NOTE(review): validation uses X_test, which still contains both classes, so
# the validation loss is not measured on the same population as the training
# loss. It is only monitored here (no early stopping / checkpointing).
history = autoencoder.fit(X_train_normal, X_train_normal,
                          epochs=20,
                          batch_size=64,
                          validation_data=(X_test, X_test),
                          shuffle=True)

# Draw one curve per loss series recorded during fitting (one point per epoch)
loss_curves = (
    ("loss", "Training Loss"),
    ("val_loss", "Validation Loss"),
)
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

# Setting Threshold to identify the anomalies
# Records whose reconstruction error is at or above this percentile of the
# test-set errors are flagged as outliers.
outlier_percentile = 98

prediction = autoencoder.predict(X_test)

# Per-record mean absolute reconstruction error, as a plain NumPy array so the
# steps below can use vectorised NumPy operations. Arguments follow the
# conventional (y_true, y_pred) order; MAE is symmetric, so the value is the
# same either way.
prediction_loss = np.asarray(tf.keras.losses.mae(X_test, prediction))

# NOTE(review): the threshold is derived from the same X_test records that are
# evaluated below; consider deriving it from training data instead.
loss_threshold = np.percentile(prediction_loss, outlier_percentile)
print(f'The prediction loss threshold for {100 - outlier_percentile}% of outliers is {loss_threshold:.2f}')

# Start a fresh figure so the histogram is not drawn on top of whatever figure
# is currently active (e.g. the loss curves plotted earlier in this script).
plt.figure()
sns.histplot(prediction_loss, bins=30, alpha=0.8)
plt.axvline(x=loss_threshold, color='orange')

# Performance evaluation of the encoder model
# Label 0 when the error is strictly below the threshold, otherwise 1.
# Vectorised instead of a Python-level loop over every element; the result is
# converted back to a list of ints, as before.
threshold_prediction = np.where(prediction_loss < loss_threshold, 0, 1).tolist()
print(classification_report(y_test, threshold_prediction))
