In [1]:
from pyspark.sql import SparkSession
import glob
import numpy as np
import pandas as pd
import os
import concurrent.futures
from tqdm import tqdm

In [2]:
# Point PySpark at the local Java and Spark installations.
# These paths are machine-specific: edit them to match your own setup.
os.environ.update({
    'JAVA_HOME': "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/",
    'SPARK_HOME': "/Users/simran/Downloads/spark-3.5.5-bin-hadoop3/",
})

In [2]:
# Folder holding the per-stock history files, and the glob pattern that selects them.
directory_path, file_pattern = "./data/full_history", "*.csv"

In [3]:
# Load every history CSV into a single pandas DataFrame for the visualisations below.

# Build the glob from the configured directory and pattern instead of repeating the
# path, and sort the matches: glob returns files in arbitrary, filesystem-dependent
# order, which would otherwise make the concatenated row order vary between runs.
all_files = sorted(glob.glob(os.path.join(directory_path, file_pattern)))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# report the actual problem (wrong folder / missing data) instead.
if not all_files:
    raise FileNotFoundError(
        f"No files matching {file_pattern!r} found in {directory_path!r}"
    )

history_df_pd = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)


In [4]:
# Parse the raw date strings once and expose them twice: as a 'Date' column (kept
# for anything that already reads it) and as the index. The index keeps the name
# 'date' so lookups by that name, e.g. sort_values(by='date'), still resolve.
# Previously the index was built from the unparsed string column, so it never
# held real datetimes.
# The guard makes the cell safe to re-run once 'date' has already become the index.
if history_df_pd.index.name != 'date':
    parsed_dates = pd.to_datetime(history_df_pd['date'])
    history_df_pd['Date'] = parsed_dates
    history_df_pd['date'] = parsed_dates
    history_df_pd = history_df_pd.set_index('date')

In [5]:
#Compute Z-Score for Anomaly Detection

from scipy.stats import zscore

def detect_anomalies(df, column, threshold=2.0):
    """Return the rows of ``df`` whose ``column`` value is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    column : str
        Name of the numeric column to score.
    threshold : float, optional
        A row is reported when ``abs(z) > threshold``. Defaults to 2, a common
        rule-of-thumb cut-off for anomalies.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows, with an extra 'Z-score' column.
    """
    df_copy = df.copy()
    values = df_copy[column].to_numpy(dtype=float, na_value=np.nan)

    # nan_policy='omit' scores the available values and leaves NaN where data is
    # missing. The default ('propagate') turns *every* score into NaN as soon as a
    # single value is missing, which silently hides all anomalies.
    # Empty input is passed through untouched to avoid mean-of-empty warnings.
    df_copy['Z-score'] = zscore(values, nan_policy='omit') if values.size else values

    # NaN scores compare as False, so rows with missing data are never reported.
    return df_copy[df_copy['Z-score'].abs() > threshold]



In [6]:
#Data Visualisations

#Anomaly Visualisation

import matplotlib.pyplot as plt
import ipywidgets as widgets
import matplotlib.dates as mdate
from IPython.display import display


# Stock selector: one option per distinct 'StockName', in sorted order
dropdown = widgets.Dropdown(
    description='StockNames:',
    options=sorted(history_df_pd['StockName'].unique()),
)

# Function to update the plot based on dropdown selection
def update_plot(category):
    """Draw price and volume charts, with z-score anomalies, for one stock.

    Parameters
    ----------
    category : str
        Value of the 'StockName' column selecting which stock to plot.

    Reads the module-level ``history_df_pd`` frame and calls
    ``detect_anomalies`` on the 'adj close' and 'volume' columns. A new
    two-panel figure is created and shown on every call; nothing is returned.
    """
    filtered_df = history_df_pd[history_df_pd['StockName'] == category]

    # Work on a copy with a genuine datetime index, ordered chronologically, so
    # the x axis carries dates (not strings) and the year locator below is valid.
    sorted_filtered_df = filtered_df.copy()
    sorted_filtered_df.index = pd.to_datetime(sorted_filtered_df.index)
    sorted_filtered_df = sorted_filtered_df.sort_index()

    adj_close_anomalies = detect_anomalies(sorted_filtered_df, 'adj close')
    volume_anomalies = detect_anomalies(sorted_filtered_df, 'volume')

    # A fresh figure per call. No plt.clf() beforehand: with no current figure it
    # would create, and later display, an extra empty one.
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    # adjusted close price
    ax1.plot(sorted_filtered_df.index, sorted_filtered_df['adj close'], label='adj close', color='blue')
    ax1.scatter(adj_close_anomalies.index, adj_close_anomalies['adj close'], color='red', label='Anomalies')
    ax1.set_title(f'{category} Adjusted Close Price and Anomalies')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Adjusted Close Price')
    ax1.legend()

    # volume
    ax2.plot(sorted_filtered_df.index, sorted_filtered_df['volume'], label='volume', color='green')
    ax2.scatter(volume_anomalies.index, volume_anomalies['volume'], color='orange', label='Anomalies')
    ax2.set_title(f'{category} Trading Volume and Anomalies')
    ax2.set_xlabel('Date')
    ax2.set_ylabel('Volume')
    ax2.legend()

    # One tick per year on BOTH panels. Each axis needs its own locator instance;
    # plt.gca() would only reach the last-created axes.
    for ax in (ax1, ax2):
        ax.xaxis.set_major_locator(mdate.YearLocator())

    fig.autofmt_xdate()
    plt.show()


# Re-run update_plot whenever the dropdown selection changes, capturing what it draws
out = widgets.interactive_output(update_plot, dict(category=dropdown))

# Show the selector followed by the plot area
display(dropdown, out)

Dropdown(description='StockNames:', options=('A', 'AA', 'AAAU', 'AACG', 'AADR', 'AAL', 'AAMC', 'AAME', 'AAN', …

Output()