<a href="https://colab.research.google.com/github/reshmaessudhakaran/AnomalyDetection_python/blob/main/anomalyAutoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt


In [2]:
!pip install statsmodels




In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): this cell only works on Google Colab (google.colab is not
# available elsewhere) and prompts for authorisation the first time it runs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
class DataConsolidation:
    """Aggregate raw order rows into revenue totals per time bucket.

    Supported ``frequency`` codes:
        'H' - hourly buckets (timestamps floored to the hour)
        'D' - daily buckets (timestamps floored to midnight)
        'W' - weekly buckets labelled by the start of a week ending on Sunday
    """

    def __init__(self, frequency):
        self.frequency = frequency

    def consolidate(self, data):
        """Sum ``total_price`` per time bucket.

        Args:
            data: DataFrame with an ``order_time`` column formatted as
                ``'%d-%m-%Y %H:%M'`` and a ``total_price`` column. The frame
                is NOT modified; all intermediate columns live on a copy.

        Returns:
            A DataFrame with columns ``Date`` and ``total_price`` (one row per
            bucket), or ``None`` if consolidation failed. Failures (missing
            columns, unparsable timestamps, unsupported frequency) are
            reported with ``print`` rather than raised.
        """
        try:
            # Parse into a local Series so the caller's column keeps its
            # original values.
            order_time = pd.to_datetime(data['order_time'], format='%d-%m-%Y %H:%M')

            if self.frequency == 'H':
                # Lower-case 'h' is the current pandas alias; upper-case 'H'
                # is deprecated as an offset alias in recent pandas releases.
                bucket = order_time.dt.floor('h')
            elif self.frequency == 'D':
                bucket = order_time.dt.floor('D')
            elif self.frequency == 'W':
                # Vectorised equivalent of mapping every Period to its
                # start_time one row at a time.
                bucket = order_time.dt.to_period('W-SUN').dt.start_time
            else:
                raise ValueError(f"Unsupported frequency: {self.frequency}")

            # assign() returns a new frame, so the input never gains a
            # 'Date' column as a side effect.
            consolidated = (
                data.assign(Date=bucket)
                .groupby('Date')['total_price']
                .sum()
                .reset_index()
            )

        except Exception as e:
            # Best-effort contract kept from the original: report and
            # signal failure with None.
            print(f"Error consolidating data: {e}")
            return None
        return consolidated


In [5]:
def get_current_week():
    """Return the ISO week number of the current local date."""
    _iso_year, iso_week, _iso_weekday = datetime.now().isocalendar()
    return iso_week

def get_last_retrained_week(file_path):
    """Read the week number recorded on the first line of the marker file.

    Args:
        file_path: Path of the text file whose first line holds the week
            number of the last retraining.

    Returns:
        The week number as an ``int``, or ``None`` when the file does not
        exist or its first line is not an integer (empty or corrupted file).
        ``None`` means "no usable record".
    """
    # Open directly instead of checking existence first: this avoids the race
    # between the check and the open.
    try:
        with open(file_path, 'r') as file:
            first_line = file.readline().strip()
    except FileNotFoundError:
        return None

    try:
        return int(first_line)
    except ValueError:
        # An unreadable marker is treated like a missing one rather than
        # aborting the caller.
        return None

def update_last_retrained_week(file_path, week):
    """Overwrite the marker file with ``week`` and the current timestamp.

    The first line holds the week value; the second line is a
    ``Retrained on: YYYY-MM-DD HH:MM:SS`` note using the local clock.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    contents = f"{week}\nRetrained on: {stamp}"
    with open(file_path, 'w') as handle:
        handle.write(contents)


In [6]:




def read_and_preprocess_data(filepath, frequency):
    """Load an orders CSV and consolidate it at the given frequency.

    Returns whatever ``DataConsolidation.consolidate`` returns for the
    loaded frame.
    """
    raw_orders = pd.read_csv(filepath)
    return DataConsolidation(frequency).consolidate(raw_orders)

def scale_data(data, scaler=None):
    """Scale the ``total_price`` column, reusing a fitted scaler if given.

    Args:
        data: DataFrame containing a ``total_price`` column.
        scaler: Optional scaler object. When omitted, a new
            ``MinMaxScaler(feature_range=(0, 1))`` is created and fitted on
            ``data``. When an already fitted scaler is supplied it is only
            applied (``transform``), so data scaled later shares the scale
            learned earlier. An unfitted scaler is fitted on ``data`` first.

    Returns:
        ``(scaled_data, scaler, dataset)`` where ``dataset`` is the raw
        ``total_price`` values as a 2-D array and ``scaled_data`` is its
        scaled counterpart. Values transformed with a previously fitted
        scaler may fall outside the scaler's feature range.
    """
    dataset = data[['total_price']].values

    if scaler is None:
        scaler = MinMaxScaler(feature_range=(0, 1))

    # scikit-learn convention: attributes learned by fit() end with a single
    # trailing underscore (e.g. ``scale_``). Re-fitting a fitted scaler here
    # would silently replace its statistics with those of the new data.
    already_fitted = any(
        name.endswith('_') and not name.startswith('__')
        for name in getattr(scaler, '__dict__', {})
    )

    if already_fitted:
        scaled_data = scaler.transform(dataset)
    else:
        scaled_data = scaler.fit_transform(dataset)
    return scaled_data, scaler, dataset

def create_train_test_datasets(scaled_data):
    """Append a trailing length-1 axis to ``scaled_data``.

    For example, an ``(n, 1)`` array becomes ``(n, 1, 1)``.
    """
    values = np.asanyarray(scaled_data)
    return values.reshape(values.shape + (1,))

def build_and_train_autoencoder(x_train, autoencoder_model_path, callbacks=None):
    """Build, train and save an LSTM autoencoder on ``x_train``.

    Args:
        x_train: 3-D training array; dimensions 1 and 2 define the model's
            input shape. The model is trained to reproduce ``x_train``.
        autoencoder_model_path: Destination path. Any ``.h5`` in the path is
            replaced by ``.keras`` before saving, so the file written to
            disk may differ from the path passed in.
        callbacks: Optional iterable of extra Keras callbacks, appended after
            the built-in early-stopping callback. Defaults to none.

    Returns:
        The trained Keras ``Model``.
    """
    # A ``None`` default avoids sharing one mutable list between calls.
    extra_callbacks = [] if callbacks is None else list(callbacks)

    timesteps, features = x_train.shape[1], x_train.shape[2]

    # Encoder: two stacked LSTMs; only the last state of the second is kept.
    input_layer_ae = Input(shape=(timesteps, features))
    encoded_ae = LSTM(32, activation='relu', return_sequences=True)(input_layer_ae)
    encoded_ae = LSTM(16, activation='relu', return_sequences=False)(encoded_ae)
    # Decoder: a single dense layer emitting one value per timestep.
    decoded_ae = Dense(timesteps, activation='linear')(encoded_ae)
    autoencoder = Model(input_layer_ae, decoded_ae)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

    # No validation data is passed to fit(), so early stopping watches the
    # training loss.
    early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1, restore_best_weights=True)
    all_callbacks = [early_stopping] + extra_callbacks

    autoencoder.fit(x_train, x_train, epochs=50, batch_size=32, callbacks=all_callbacks, verbose=1)
    autoencoder.save(autoencoder_model_path.replace('.h5', '.keras'))  # Save as .keras file
    return autoencoder

def calculate_reconstruction_errors(autoencoder, test_data):
    """Return the mean absolute reconstruction error of every sample.

    Args:
        autoencoder: Object with a ``predict(test_data)`` method.
        test_data: Array whose first axis indexes samples.

    Returns:
        1-D array with one error per sample: the mean of
        ``|reconstruction - input|`` over all non-sample axes.

    Raises:
        ValueError: If the prediction holds a different number of values
            than ``test_data``, so no per-sample comparison is possible.
    """
    reconstructed = np.asarray(autoencoder.predict(test_data))
    original = np.asarray(test_data)

    if reconstructed.shape != original.shape:
        if reconstructed.size != original.size:
            raise ValueError(
                f"Prediction shape {reconstructed.shape} is incompatible with "
                f"input shape {original.shape}"
            )
        # Align a flattened prediction such as (n, T) with an (n, T, 1) input.
        # Subtracting them directly would broadcast to (n, n, 1) and compare
        # every sample with every other sample's reconstruction.
        reconstructed = reconstructed.reshape(original.shape)

    per_sample_axes = tuple(range(1, original.ndim))
    return np.mean(np.abs(reconstructed - original), axis=per_sample_axes)

def calculate_stats(window):
    """Return ``(mean, standard deviation)`` of ``window`` as computed by NumPy."""
    return np.mean(window), np.std(window)

def calculate_peak_hours(data):
    """Find the hours of day whose average sales are unusually high.

    An hour is a peak hour when its mean ``total_price`` exceeds the mean of
    all hourly means by more than one standard deviation.

    Args:
        data: DataFrame with a datetime ``Date`` column and a ``total_price``
            column. The frame is left untouched (no helper column is added).

    Returns:
        List of peak hours as integers. It is empty when only one distinct
        hour is present, because the standard deviation is then NaN and no
        comparison succeeds.
    """
    # Group by a derived Series instead of writing an 'hour' column into the
    # caller's DataFrame.
    hour_of_day = data['Date'].dt.hour.rename('hour')
    hourly_sales = data['total_price'].groupby(hour_of_day).mean()

    cutoff = hourly_sales.mean() + hourly_sales.std()
    return hourly_sales[hourly_sales > cutoff].index.tolist()

def is_peak_hour(date, peak_hours):
    """Return True when the hour of ``date`` is contained in ``peak_hours``."""
    return date.hour in peak_hours

def is_weekend(date):
    """Return True when ``date`` falls on a Saturday or a Sunday."""
    saturday, sunday = 5, 6  # weekday() counts from Monday == 0
    return date.weekday() in (saturday, sunday)

def detect_anomalies_dynamic_threshold(data_points, reconstruction_errors, dates, window_size, peak_hours):
    """Flag points that fall outside a rolling, context-aware band.

    For every index ``i >= window_size`` the band is the mean of the previous
    ``window_size`` points plus/minus 3 standard deviations. The band is
    widened by a further 0.5 standard deviations on each side for each of
    these conditions that holds:

    * the date is on a weekend,
    * the date falls in one of ``peak_hours``,
    * the current value already occurs more than once in the window.

    Args:
        data_points: Sequence of observed values (list, array or Series).
        reconstruction_errors: Per-point errors, indexable by position. They
            are only attached to the result and do not affect the decision.
        dates: Date of each point, aligned with ``data_points``.
        window_size: Number of preceding points used for the statistics.
        peak_hours: Hours treated as peak hours.

    Returns:
        List of ``(index, value, reconstruction_error)`` tuples, one per
        anomalous point, in index order. The first ``window_size`` points are
        never evaluated.
    """
    # Work on plain lists so slicing, positional indexing and ``.count``
    # behave the same for lists, NumPy arrays and pandas Series.
    points = list(data_points)
    point_dates = list(dates)
    anomalies = []

    for i in range(window_size, len(points)):
        window = points[i - window_size:i]
        mean, std_dev = calculate_stats(window)

        # Initial threshold
        lower_threshold = mean - 3 * std_dev
        upper_threshold = mean + 3 * std_dev

        latest_point = points[i]
        current_date = point_dates[i]

        # Each satisfied condition widens the band by half a standard
        # deviation on both sides.
        widening_conditions = (
            is_weekend(current_date),
            is_peak_hour(current_date, peak_hours),
            window.count(latest_point) > 1,
        )
        for condition_holds in widening_conditions:
            if condition_holds:
                lower_threshold -= 0.5 * std_dev
                upper_threshold += 0.5 * std_dev

        if latest_point < lower_threshold or latest_point > upper_threshold:
            anomalies.append((i, latest_point, reconstruction_errors[i]))

        # NOTE(review): the earlier version also carried a "continuous high
        # sales" rule in the non-anomalous branch, guarded by
        # ``latest_point > upper_threshold``. That guard can never be true
        # there, so the rule never ran; it was removed as dead code. Add it
        # back deliberately if suppressing sustained highs is wanted.

    return anomalies


In [7]:
def decompose_series(data, frequency):
    """Split ``total_price`` into trend, seasonal and residual components.

    Args:
        data: DataFrame with ``Date`` and ``total_price`` columns. It is not
            modified; the date index is built on a derived Series.
        frequency: Seasonal period passed to ``seasonal_decompose``. A string
            (such as the 'H'/'D'/'W' codes used elsewhere in this file) is
            not a valid period, so ``period=None`` is passed instead.
            NOTE(review): statsmodels is then expected to infer the period
            from the date index - confirm this suits your data.

    Returns:
        ``(trend, seasonal, residual)`` Series with missing edge values
        back-filled and then forward-filled, or ``(None, None, None)`` when
        the decomposition fails (the error is printed).
    """
    try:
        # Not in place: the caller keeps its 'Date' column.
        series = data.set_index('Date')['total_price']
        period = None if isinstance(frequency, str) else frequency
        decomposition = seasonal_decompose(series, model='additive', period=period)

        # Fill NaN values (could be improved depending on the use case).
        # bfill()/ffill() replace the deprecated fillna(method=...) form.
        trend = decomposition.trend.bfill().ffill()
        seasonal = decomposition.seasonal.bfill().ffill()
        residual = decomposition.resid.bfill().ffill()

        return trend, seasonal, residual
    except Exception as e:
        print(f"Error in seasonal decomposition: {e}")
        return None, None, None


In [8]:
def plot_decomposition(trend, seasonal, residual, title_prefix=''):
    """Draw the three decomposition components as stacked line plots.

    Prints an error and returns without plotting when any component is None.
    ``title_prefix`` is prepended to each subplot title.
    """
    components = (
        ('Trend', trend),
        ('Seasonal', seasonal),
        ('Residual', residual),
    )
    if any(series is None for _, series in components):
        print("Error: One or more components are None. Cannot plot.")
        return

    plt.figure(figsize=(15, 10))

    # One row per component, in the order trend -> seasonal -> residual.
    for row, (label, series) in enumerate(components, start=1):
        plt.subplot(3, 1, row)
        plt.plot(series.index, series, label=label)
        plt.title(f'{title_prefix}{label} Component')
        plt.legend(loc='upper left')

    plt.tight_layout()
    plt.show()


In [10]:
def main(train_filepath, test_filepath, autoencoder_model_path, output_filepath, week_file_path, frequency):
    """Run the anomaly-detection pipeline and write the results to CSV.

    Steps: load and consolidate the train/test CSVs, scale them, train or
    load the autoencoder, compute reconstruction errors, flag anomalies with
    the dynamic threshold, then save and print the result table.

    Args:
        train_filepath: CSV with the training orders.
        test_filepath: CSV with the orders to check for anomalies.
        autoencoder_model_path: Model path handed to
            ``build_and_train_autoencoder``.
        output_filepath: Destination CSV for the result table.
        week_file_path: Marker file recording the week of the last training.
        frequency: Consolidation frequency code ('H', 'D' or 'W').

    Returns:
        DataFrame with columns ``Date``, ``Price``, ``Reconstruction Error``
        and ``Anomaly`` (the strings "TRUE"/"FALSE").

    Raises:
        ValueError: If either input file could not be consolidated.
    """
    current_week = get_current_week()
    # A missing record (None) never equals the current week, so it also
    # triggers retraining.
    retrain = get_last_retrained_week(week_file_path) != current_week

    train_data = read_and_preprocess_data(train_filepath, frequency)
    test_data = read_and_preprocess_data(test_filepath, frequency)
    if train_data is None or test_data is None:
        # Consolidation reports its own error and returns None; stop here
        # with a clear message instead of failing later on a None subscript.
        raise ValueError("Could not consolidate the training/test data; see the error reported above.")

    train_data['Date'] = pd.to_datetime(train_data['Date'])
    test_data['Date'] = pd.to_datetime(test_data['Date'])

    scaled_train_data, scaler, _ = scale_data(train_data)
    scaled_test_data, _, _ = scale_data(test_data, scaler)

    x_train = create_train_test_datasets(scaled_train_data)
    x_test = create_train_test_datasets(scaled_test_data)

    # build_and_train_autoencoder saves with '.h5' replaced by '.keras', so
    # look for that file first; an existing file at the literal path is still
    # honoured.
    saved_model_path = autoencoder_model_path.replace('.h5', '.keras')
    existing_model_path = next(
        (path for path in (saved_model_path, autoencoder_model_path) if os.path.exists(path)),
        None,
    )

    if not retrain and existing_model_path is not None:
        autoencoder = load_model(existing_model_path)
        print("Loaded existing autoencoder model.")
    else:
        # No extra EarlyStopping is passed: the training helper already adds
        # one that monitors 'loss', and a default EarlyStopping would watch
        # 'val_loss', which is unavailable without validation data.
        autoencoder = build_and_train_autoencoder(x_train, autoencoder_model_path)
        # Record the week only after training succeeded, so a failed run is
        # retried next time.
        update_last_retrained_week(week_file_path, current_week)
        if retrain:
            print("Autoencoder model retrained.")
        else:
            print("Autoencoder model trained and saved.")

    # Calculate reconstruction errors for anomaly detection
    reconstruction_errors = calculate_reconstruction_errors(autoencoder, x_test)

    # Detect anomalies with dynamic threshold
    data_points = test_data['total_price'].tolist()
    dates = pd.to_datetime(test_data['Date']).tolist()
    window_size = 10  # Number of preceding points used for the rolling statistics
    peak_hours = calculate_peak_hours(train_data)
    anomalies = detect_anomalies_dynamic_threshold(data_points, reconstruction_errors, dates, window_size, peak_hours)

    # Print and save the results
    test_dates = test_data['Date']
    test_prices = test_data['total_price']

    # Build the index set once instead of rebuilding a list for every row.
    anomaly_indices = {anomaly[0] for anomaly in anomalies}

    results = pd.DataFrame({
        'Date': test_dates,
        'Price': test_prices,
        'Reconstruction Error': reconstruction_errors,
        'Anomaly': ["TRUE" if i in anomaly_indices else "FALSE" for i in range(len(test_prices))]
    })

    results.to_csv(output_filepath, index=False)
    print(results)

    return results


if __name__ == "__main__":
    # Consolidation granularity: 'H' hourly, 'D' daily, 'W' weekly.
    frequency = 'W'

    # Input data on the mounted Google Drive.
    train_filepath = '/content/drive/MyDrive/DatasetXTG/order_train_data.csv'
    test_filepath = '/content/drive/MyDrive/DatasetXTG/order_test_data.csv'

    # Artefacts written relative to the working directory.
    autoencoder_model_path = 'autoencoder_model.h5'
    week_file_path = 'last_retrained_week.txt'
    output_filepath = 'anomalies_syntheticdata.csv'

    results = main(
        train_filepath=train_filepath,
        test_filepath=test_filepath,
        autoencoder_model_path=autoencoder_model_path,
        output_filepath=output_filepath,
        week_file_path=week_file_path,
        frequency=frequency,
    )
    print("Anomalies detected and saved to:", output_filepath)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 0.4818 - mae: 0.6793
Epoch 2/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 176ms/step - loss: 0.4589 - mae: 0.6667

  current = self.get_monitor_value(logs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.4696 - mae: 0.6696 
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.4623 - mae: 0.6594 
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.4574 - mae: 0.6568
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.4607 - mae: 0.6589
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.4499 - mae: 0.6517
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4561 - mae: 0.6597
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4546 - mae: 0.6583
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.4451 - mae: 0.6506
Epoch 10/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━