Financial Time-Series Anomaly Detection

In [6]:
# Cell 2: Import Libraries
# Import all necessary libraries for data processing, modeling, and visualization
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')

In [7]:
# Cell 3: Data Preprocessing
# Load and preprocess the Yahoo Finance dataset
def load_and_preprocess_data(file_path):
    """Load a price-history workbook and return a clean, date-sorted frame.

    The loader keeps the ``Date``, ``Open``, ``High``, ``Low``, closing price
    and ``Volume`` columns (whichever are present), parses the dates, coerces
    every other column to numbers and drops rows that are incomplete or
    unparseable.

    Args:
        file_path: Path of the Excel workbook; it is read with the
            ``openpyxl`` engine.

    Returns:
        pandas.DataFrame sorted by ascending ``Date`` with a contiguous
        0..n-1 index. The closing price column is named ``Close`` whether
        the workbook header was ``Close*`` or ``Close``.

    Raises:
        ValueError: If the workbook has no ``Date`` column.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    try:
        print(f"Loading data from {file_path}...")
        df = pd.read_excel(file_path, engine='openpyxl')
        print(f"Raw data shape: {df.shape}")
        print(f"Raw data columns: {list(df.columns)}")

        if 'Date' not in df.columns:
            raise ValueError("Expected 'Date' column not found in the dataset.")

        # Unparseable dates become NaT and are discarded right away
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date'])

        # The close is expected under the header 'Close*'; fall back to a plain
        # 'Close' header so such files do not silently lose their price column.
        if 'Close*' not in df.columns and 'Close' in df.columns:
            close_source = 'Close'
        else:
            close_source = 'Close*'

        expected_columns = ['Date', 'Open', 'High', 'Low', close_source, 'Volume']
        available_columns = [col for col in expected_columns if col in df.columns]
        if len(available_columns) < len(expected_columns):
            missing = set(expected_columns) - set(available_columns)
            print(f"Warning: Missing columns {missing}. Proceeding with available columns.")

        # rename() is a no-op when the source header is already 'Close'
        df = df[available_columns].rename(columns={'Close*': 'Close'}).dropna().copy()
        print(f"Data shape after dropping NA: {df.shape}")

        df = df.sort_values('Date').reset_index(drop=True)

        numeric_cols = [col for col in df.columns if col != 'Date']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Rows whose values failed numeric coercion are removed; re-index so the
        # result has no gaps in its positional index.
        df = df.dropna().reset_index(drop=True)
        print(f"Final data shape: {df.shape}")

        return df
    except Exception as e:
        print(f"Error in load_and_preprocess_data: {str(e)}")
        raise

# Execute data preprocessing.
# NOTE(review): the path below is absolute (it looks like a Colab-style
# location); point it at your own copy of the workbook when running elsewhere.
file_path = '/content/yahoo_data.xlsx'  # Update if necessary
# `df` is the notebook-wide frame that the following cells read and extend.
df = load_and_preprocess_data(file_path)

Loading data from /content/yahoo_data.xlsx...
Raw data shape: (1258, 7)
Raw data columns: ['Date', 'Open', 'High', 'Low', 'Close*', 'Adj Close**', 'Volume']
Data shape after dropping NA: (1258, 6)
Final data shape: (1258, 6)


In [8]:
# Cell 4: Calculate Financial Indicators
# Compute SMA, EMA, RSI, and Bollinger Bands
def calculate_indicators(df):
    """Attach moving-average, RSI and Bollinger Band columns to ``df``.

    The frame is modified in place and also returned. Columns are added in
    this order: ``SMA20``, ``EMA20``, ``RSI``, ``BB_Middle``, ``BB_Std``,
    ``BB_Upper``, ``BB_Lower``. Rows that do not yet have a full rolling
    window hold NaN in the rolling-based columns.

    Args:
        df: DataFrame with a numeric ``Close`` column.

    Returns:
        The same DataFrame object, with the indicator columns added.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        close = df['Close']

        # 20-period rolling statistics shared by the SMA and the bands
        rolling_mean = close.rolling(window=20).mean()
        rolling_std = close.rolling(window=20).std()
        exp_mean = close.ewm(span=20, adjust=False).mean()

        # RSI from 14-period simple averages of upward and downward moves
        change = close.diff()
        upward = change.where(change > 0, 0)
        downward = -change.where(change < 0, 0)
        avg_gain = upward.rolling(window=14).mean()
        avg_loss = downward.rolling(window=14).mean()
        relative_strength = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + relative_strength))

        indicator_columns = {
            'SMA20': rolling_mean,
            'EMA20': exp_mean,
            'RSI': rsi,
            'BB_Middle': rolling_mean,
            'BB_Std': rolling_std,
            'BB_Upper': rolling_mean + 2 * rolling_std,
            'BB_Lower': rolling_mean - 2 * rolling_std,
        }
        # dict order is the column insertion order
        for name, values in indicator_columns.items():
            df[name] = values

        return df
    except Exception as e:
        print(f"Error in calculate_indicators: {str(e)}")
        raise

# Execute indicator calculation.
# calculate_indicators adds its columns to the frame it is given and returns
# that same frame, so `df` now carries the indicator columns as well.
df = calculate_indicators(df)
print("Financial indicators calculated. Columns:", list(df.columns))

Financial indicators calculated. Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SMA20', 'EMA20', 'RSI', 'BB_Middle', 'BB_Std', 'BB_Upper', 'BB_Lower']


In [9]:
# Detect anomalies using Isolation Forest
def detect_anomalies(df):
    """Flag outlier rows with an Isolation Forest and store them in ``Anomaly``.

    The forest is fit on the rows where every feature column is present.
    Rows the forest labels ``-1`` get ``Anomaly == 1``; every other row,
    including rows skipped because a feature was missing, gets ``0``.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding the ``Close``, ``SMA20``, ``EMA20``, ``RSI``,
            ``BB_Upper``, ``BB_Lower`` and ``Volume`` columns.

    Returns:
        The same DataFrame object with an integer 0/1 ``Anomaly`` column.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    feature_columns = ['Close', 'SMA20', 'EMA20', 'RSI', 'BB_Upper', 'BB_Lower', 'Volume']
    try:
        # Only fully populated rows can be scored
        complete_rows = df[feature_columns].dropna()

        forest = IsolationForest(contamination=0.05, random_state=42)
        raw_labels = forest.fit_predict(complete_rows)
        df.loc[complete_rows.index, 'Anomaly'] = raw_labels

        # Map the forest's -1 label to 1; everything else (1 or NaN) becomes 0
        df['Anomaly'] = df['Anomaly'].eq(-1).astype('int64')

        return df
    except Exception as e:
        print(f"Error in detect_anomalies: {str(e)}")
        raise

# Execute anomaly detection.
# detect_anomalies stores a 0/1 flag in df['Anomaly'], so the column sum is
# the number of rows flagged as anomalous.
df = detect_anomalies(df)
print(f"Total anomalies detected: {df['Anomaly'].sum()}")

Total anomalies detected: 62


In [10]:
# Prepare data for LSTM model
def prepare_lstm_data(df, look_back=20):
    """Scale the close prices and cut them into supervised LSTM windows.

    Each sample is ``look_back`` consecutive scaled closes and its target is
    the scaled close that immediately follows the window.

    NOTE(review): the scaler is fit on the whole ``Close`` column, including
    the tail that ``build_and_train_lstm`` later holds out as test data, so
    the test range influences the scaling. Left as is to keep results
    unchanged; fit on the training rows only if that matters.

    Args:
        df: DataFrame with a numeric ``Close`` column.
        look_back: Number of past steps per sample; must be at least 1.

    Returns:
        Tuple ``(X, y, scaler)`` where ``X`` has shape
        ``(len(df) - look_back, look_back, 1)``, ``y`` has shape
        ``(len(df) - look_back,)`` and ``scaler`` is the fitted
        ``MinMaxScaler`` needed to map values back to prices.

    Raises:
        ValueError: If ``look_back`` is below 1 or ``df`` has too few rows to
            form a single window plus target.
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        if look_back < 1:
            raise ValueError(f"look_back must be at least 1, got {look_back}.")

        close_values = df[['Close']].values
        if len(close_values) <= look_back:
            # Without this check the reshape below fails with an opaque IndexError
            raise ValueError(
                f"Need more than {look_back} rows to build LSTM windows, "
                f"got {len(close_values)}."
            )

        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_close = scaler.fit_transform(close_values)[:, 0]

        # Window i covers scaled_close[i:i + look_back]; the final window has
        # no following value to predict, so it is dropped.
        windows = np.lib.stride_tricks.sliding_window_view(scaled_close, look_back)[:-1]

        # Copy out of the read-only view and add the trailing feature axis
        X = np.array(windows)[..., np.newaxis]
        y = np.array(scaled_close[look_back:])

        return X, y, scaler
    except Exception as e:
        print(f"Error in prepare_lstm_data: {str(e)}")
        raise

# Execute data preparation with the default look_back window.
# `scaler` is kept so later cells can convert scaled values back to prices.
X, y, scaler = prepare_lstm_data(df)
print(f"LSTM input shape: {X.shape}, Output shape: {y.shape}")

LSTM input shape: (1238, 20, 1), Output shape: (1238,)


In [11]:
# Build and train the LSTM model
def build_and_train_lstm(X, y, train_fraction=0.8, epochs=10, batch_size=32):
    """Build a two-layer LSTM regressor and train it on the leading samples.

    The samples are split chronologically: the first ``train_fraction`` of
    them are used for fitting and the remainder is returned untouched for
    evaluation.

    Args:
        X: Array of input windows; ``X.shape[1]`` is used as the sequence
            length and one feature per step is expected.
        y: Array of targets, one per window.
        train_fraction: Share of samples used for training, in ``(0, 1]``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        Tuple ``(model, X_test, y_test)``: the fitted model and the held-out
        tail of ``X`` and ``y``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, ``train_fraction`` is
            outside ``(0, 1]`` or the training split would be empty.
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}.")
        if not 0 < train_fraction <= 1:
            raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction}.")

        train_size = int(len(X) * train_fraction)
        if train_size == 0:
            raise ValueError("Not enough samples to form a training set.")

        # Stacked LSTMs: the first returns the full sequence for the second
        model = Sequential()
        model.add(LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)))
        model.add(LSTM(units=50))
        model.add(Dense(units=1))
        model.compile(optimizer='adam', loss='mean_squared_error')

        # Chronological split: no shuffling between train and test
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

        return model, X_test, y_test
    except Exception as e:
        print(f"Error in build_and_train_lstm: {str(e)}")
        raise

# Execute model training.
# X_test / y_test are the trailing samples that build_and_train_lstm did not
# train on; the forecasting cell evaluates the model on them.
model, X_test, y_test = build_and_train_lstm(X, y)
print(f"Test set size: {len(X_test)}")

Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - loss: 0.1118
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.0046
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 0.0022
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0016
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0018
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0016
Epoch 7/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 0.0018
Epoch 8/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0022
Epoch 9/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0018
Epoch 10/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0017

In [12]:
# Generate predictions and detect significant deviations
def forecast_and_detect_deviations(df, model, X_test, y_test, scaler, look_back=20, threshold=0.05):
    """Predict the held-out closes and flag large relative forecast errors.

    Predictions and targets are mapped back to price units with ``scaler``
    and paired with the last ``len(X_test)`` dates of ``df``. A row is a
    significant deviation when ``|predicted - actual| / actual`` exceeds
    ``threshold``.

    Args:
        df: DataFrame whose ``Date`` column ends with the test period.
        model: Fitted model exposing ``predict(X, verbose=0)``.
        X_test: Held-out input windows.
        y_test: Held-out scaled targets, one per window.
        scaler: Fitted scaler exposing ``inverse_transform``.
        look_back: Unused; kept so existing calls keep working.
        threshold: Relative error above which a row is flagged.

    Returns:
        pandas.DataFrame with columns ``Date``, ``Actual``, ``Predicted``,
        ``Deviation`` and ``Significant_Deviation``; empty when ``X_test``
        is empty.

    Raises:
        ValueError: If ``X_test`` has more samples than ``df`` has rows.
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        n_test = len(X_test)
        if n_test > len(df):
            raise ValueError(
                f"X_test has {n_test} samples but df only has {len(df)} rows."
            )

        # Slice from an explicit start: iloc[-0:] would return the whole column
        dates = df['Date'].iloc[len(df) - n_test:].values

        if n_test == 0:
            print("Warning: empty test set; returning an empty forecast frame.")
            return pd.DataFrame({
                'Date': dates,
                'Actual': np.array([], dtype=float),
                'Predicted': np.array([], dtype=float),
                'Deviation': np.array([], dtype=float),
                'Significant_Deviation': np.array([], dtype=bool),
            })

        # The scaler was fit on one feature, so both arrays are passed as
        # (n_samples, 1) columns rather than relying on broadcasting.
        raw_predictions = np.asarray(model.predict(X_test, verbose=0)).reshape(-1, 1)
        predictions = scaler.inverse_transform(raw_predictions).ravel()
        actual = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1)).ravel()

        deviations = np.abs(predictions - actual) / actual
        significant_deviations = deviations > threshold

        print(f"Length of predictions: {len(predictions)}")
        print(f"Length of actual: {len(actual)}")
        print(f"Length of deviations: {len(deviations)}")
        print(f"Length of significant_deviations: {len(significant_deviations)}")
        print(f"Expected Date length: {len(dates)}")

        forecast_df = pd.DataFrame({
            'Date': dates,
            'Actual': actual,
            'Predicted': predictions,
            'Deviation': deviations,
            'Significant_Deviation': significant_deviations
        })

        return forecast_df
    except Exception as e:
        print(f"Error in forecast_and_detect_deviations: {str(e)}")
        raise

# Execute forecasting on the held-out samples.
# 'Significant_Deviation' is a boolean column, so its sum counts the rows whose
# relative forecast error exceeded the threshold.
forecast_df = forecast_and_detect_deviations(df, model, X_test, y_test, scaler)
print(f"Significant deviations detected: {forecast_df['Significant_Deviation'].sum()}")



Length of predictions: 248
Length of actual: 248
Length of deviations: 248
Length of significant_deviations: 248
Expected Date length: 248
Significant deviations detected: 25


In [13]:
# Generate and save visualizations
def _render_chart(path, figsize, title, ylabel, draw):
    """Draw one chart on a fresh figure, save it to ``path`` and always close it."""
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw(ax)
        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(ylabel)
        ax.legend()
        fig.savefig(path)
    finally:
        # Close even when drawing or saving fails so no figure is left open
        plt.close(fig)


def visualize_results(df, forecast_df, output_dir='plots'):
    """Render the analysis charts and save them as PNG files.

    Five files are written into ``output_dir``: ``price_indicators.png``,
    ``rsi.png``, ``anomalies.png``, ``forecast_deviations.png`` and
    ``volume.png``. The directory is created when it does not exist.

    Args:
        df: DataFrame with ``Date``, ``Close``, ``SMA20``, ``EMA20``,
            ``BB_Upper``, ``BB_Lower``, ``RSI``, ``Anomaly`` and ``Volume``.
        forecast_df: DataFrame with ``Date``, ``Actual``, ``Predicted`` and a
            boolean ``Significant_Deviation`` column.
        output_dir: Directory that receives the image files.

    Returns:
        None.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(output_dir, exist_ok=True)

        sns.set(style='whitegrid')

        def draw_price(ax):
            ax.plot(df['Date'], df['Close'], label='Close Price', color='blue')
            ax.plot(df['Date'], df['SMA20'], label='SMA (20-day)', color='green')
            ax.plot(df['Date'], df['EMA20'], label='EMA (20-day)', color='orange')
            ax.plot(df['Date'], df['BB_Upper'], label='Bollinger Upper', color='red', linestyle='--')
            ax.plot(df['Date'], df['BB_Lower'], label='Bollinger Lower', color='red', linestyle='--')

        def draw_rsi(ax):
            ax.plot(df['Date'], df['RSI'], label='RSI', color='purple')
            ax.axhline(70, color='red', linestyle='--', label='Overbought (70)')
            ax.axhline(30, color='red', linestyle='--', label='Oversold (30)')

        def draw_anomalies(ax):
            ax.plot(df['Date'], df['Close'], label='Close Price', color='blue', alpha=0.5)
            anomalies = df[df['Anomaly'] == 1]
            ax.scatter(anomalies['Date'], anomalies['Close'], color='red', label='Anomalies', marker='o')

        def draw_forecast(ax):
            ax.plot(forecast_df['Date'], forecast_df['Actual'], label='Actual Price', color='blue')
            ax.plot(forecast_df['Date'], forecast_df['Predicted'], label='Predicted Price', color='orange')
            deviations = forecast_df[forecast_df['Significant_Deviation']]
            ax.scatter(deviations['Date'], deviations['Actual'], color='red',
                       label='Significant Deviations', marker='o')

        def draw_volume(ax):
            ax.plot(df['Date'], df['Volume'], label='Volume', color='gray')

        # (file name, figure size, title, y-axis label, drawing routine)
        charts = [
            ('price_indicators.png', (14, 7), 'Stock Price and Indicators', 'Price', draw_price),
            ('rsi.png', (14, 4), 'Relative Strength Index (RSI)', 'RSI', draw_rsi),
            ('anomalies.png', (14, 7), 'Detected Anomalies', 'Price', draw_anomalies),
            ('forecast_deviations.png', (14, 7), 'LSTM Forecast and Significant Deviations',
             'Price', draw_forecast),
            ('volume.png', (14, 4), 'Trading Volume', 'Volume', draw_volume),
        ]
        for file_name, figsize, title, ylabel, draw in charts:
            _render_chart(os.path.join(output_dir, file_name), figsize, title, ylabel, draw)

        print(f"Visualizations saved in '{output_dir}' directory.")
    except Exception as e:
        print(f"Error in visualize_results: {str(e)}")
        raise

# Execute visualization.
# No output_dir is passed, so the images go to the default 'plots' directory.
visualize_results(df, forecast_df)

Visualizations saved in 'plots' directory.


In [14]:
# Assemble the final analysis report line by line, then print it.
# Only the date range and the two counts are computed from the data; the
# observations and conclusion are fixed text.
first_day = df['Date'].min().strftime('%Y-%m-%d')
last_day = df['Date'].max().strftime('%Y-%m-%d')
anomaly_total = df['Anomaly'].sum()
deviation_total = forecast_df['Significant_Deviation'].sum()

report_lines = [
    "\n=== Stock Anomaly Detection Report ===",
    "Dataset: Dow Jones Industrial Average",
    f"Date Range: {first_day} to {last_day}",
    f"Total Anomalies Detected: {anomaly_total}",
    f"Significant Deviations in Forecast: {deviation_total}",
    "\nKey Observations:",
    "- Anomalies are often associated with extreme RSI values or prices outside Bollinger Bands.",
    "- Significant deviations in LSTM forecasts may indicate unexpected market movements.",
    "- High volatility periods (e.g., March 2020) show clusters of anomalies.",
    "\nVisualizations have been saved in the 'plots' directory:",
    "- price_indicators.png: Stock price with SMA, EMA, and Bollinger Bands",
    "- rsi.png: Relative Strength Index",
    "- anomalies.png: Detected anomalies",
    "- forecast_deviations.png: LSTM forecast with significant deviations",
    "- volume.png: Trading volume",
    "\nConclusion:",
    "The analysis highlights periods of potential market manipulation or unusual activity, particularly during volatile periods. Traders should investigate these anomalies further and exercise caution.",
]
for report_line in report_lines:
    print(report_line)


=== Stock Anomaly Detection Report ===
Dataset: Dow Jones Industrial Average
Date Range: 2018-05-01 to 2023-04-28
Total Anomalies Detected: 62
Significant Deviations in Forecast: 25

Key Observations:
- Anomalies are often associated with extreme RSI values or prices outside Bollinger Bands.
- Significant deviations in LSTM forecasts may indicate unexpected market movements.
- High volatility periods (e.g., March 2020) show clusters of anomalies.

Visualizations have been saved in the 'plots' directory:
- price_indicators.png: Stock price with SMA, EMA, and Bollinger Bands
- rsi.png: Relative Strength Index
- anomalies.png: Detected anomalies
- forecast_deviations.png: LSTM forecast with significant deviations
- volume.png: Trading volume

Conclusion:
The analysis highlights periods of potential market manipulation or unusual activity, particularly during volatile periods. Traders should investigate these anomalies further and exercise caution.
