### **Stock Price Prediction and Sentiment Analysis Web Application using TFT and LSTM Models**

#### Installing Required Libraries

In [1]:
!pip install lightning pytorch_forecasting pyngrok flask keras praw

Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting pytorch_forecasting
  Downloading pytorch_forecasting-1.2.0-py3-none-any.whl.metadata (13 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading lightning-2.4.0-py3-none-any.whl (810 kB)
[2K

#### Setting Up Ngrok for Secure Public Tunneling

In [2]:
from pyngrok import ngrok
# Run Ngrok directly with the authentication key and create a tunnel on port 5000
!ngrok authtoken 2oiCs48qXHjKVyjOKVxbdDwd1xm_6iaRD6qpPkWJA4XjjUed4

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


#### Imporing Required Libraries

In [3]:
# General-purpose utilities for data manipulation, file handling, and warnings
import copy
from flask import Flask, render_template, request
from pathlib import Path
import warnings
import numpy as np
import pandas as pd

# Libraries for financial data retrieval, date handling, API usage, and NLP tasks
import yfinance as yf
from datetime import datetime, timedelta
import praw
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import requests

# PyTorch Lightning and Pytorch Forecasting for model training, hyperparameter tuning, and evaluation
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_forecasting import (
    Baseline,
    TemporalFusionTransformer,
    TimeSeriesDataSet,
)
from pytorch_forecasting.data import GroupNormalizer, NaNLabelEncoder
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# TensorFlow/Keras for building LSTM models and managing training workflows
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping as ESLSTM

# Visualization libraries for interactive data exploration and presentation
import plotly.graph_objects as go
import plotly.express as px

# Sklearn for preprocessing and calculating evaluation metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
)

import warnings
import logging
warnings.filterwarnings("ignore", category=UserWarning, module="praw")
logging.getLogger("praw").setLevel(logging.ERROR)

#### Configuring Hardware Preferences

In [4]:
import torch
if torch.cuda.is_available():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

Number of GPUs available: 1
GPU 0: NVIDIA A100-SXM4-40GB


In [5]:
while True:
    gpu = input("Do you have a GPU (yes or no): ").strip().lower()  # Accept user input and normalize case
    if gpu in ["yes", "no", "y", "n"]:  # Validate input
        break  # Exit the loop if valid input is provided
    print("Invalid input. Please enter 'yes' or 'no'.")  # Display error message for invalid input

# Set GPU or CPU configurations based on user input
if gpu == "yes" or "y":
    gpu_info = "gpu"
    gpu_device = 0  # Use GPU for computations
    print("Great! You can use GPU for acceleration.")
else:
    gpu_info = "cpu"
    gpu_device = -1  # Default to CPU
    print("No problem, the code will run on CPU.")

Do you have a GPU (yes or no): y
Great! You can use GPU for acceleration.


### **Integrating Flask with Machine Learning Models for Stock Price Prediction and Analytics:**



* Initializing a Flask web application and establishing a public URL via Ngrok.
*   Loading and using machine learning models (TFT and LSTM) for stock predictions.
*   Fetching, cleaning, and preprocessing data from Reddit, news, and Yahoo Finance.
*   Implementing sentiment analysis using FinBERT.
*   Providing multiple application routes for predictions, model comparison, and trading strategy simulations.


### **Note:** Before running the cell below, ensure that all files from the `Stock_Web_UI` folder are uploaded to the session storage.

In [6]:
# Initialize Flask application
app = Flask(__name__, template_folder='/content')

# Open a tunnel on port 5000 for Flask app
public_url = ngrok.connect(5000)

# Load the FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=gpu_device)

#api keys
REDDIT_CLIENT_ID = '4PxXlcgsrXHwGGrpaOfD-g'
REDDIT_CLIENT_SECRET = 'Tsrv0kQt3sqT5s0qHKrJa_I-CnYkLw'
NEWS_API_KEY = "8a333f2dc4af46fc9a5200bbded71e61"

# Initialize the Reddit API client
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID,
                     client_secret=REDDIT_CLIENT_SECRET,
                     user_agent='stocks_data')

# Function to fetch latest news
def fetch_latest_news(query, from_date, to_date, language="en", page_size=100):
    url = "https://newsapi.org/v2/everything"
    headers = {"Authorization": f"Bearer {NEWS_API_KEY}"}
    params = {
        "q": query,
        "from": from_date,
        "to": to_date,
        "language": language,
        "pageSize": page_size,
    }
    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        articles = data.get("articles", [])
        if articles:
            return pd.DataFrame(articles)
        else:
            print(f"No articles found for {query}.")
            return pd.DataFrame()
    else:
        print(f"Error {response.status_code}: {response.json().get('message', 'Unknown error')}")
        return pd.DataFrame()

# Function to fetch news for a specific ticker
def get_stock_news(ticker):
    start_date = "2024-12-01"
    query = ticker  # Using ticker directly as a query keyword
    to_date = datetime.now().strftime("%Y-%m-%d")
    date_limit = "2024-12-30"
    if to_date > date_limit:
        to_date = date_limit
    news_df = fetch_latest_news(query, start_date, to_date)
    return news_df

def scrape_subreddit_time_window(subreddit_name, search_query, start_time, end_time, limit=10000):
    subreddit = reddit.subreddit(subreddit_name)
    posts = subreddit.search(search_query, sort='new', limit=limit)
    posts_data = []

    for post in posts:
        post_time = datetime.utcfromtimestamp(post.created_utc)
        if start_time <= post_time <= end_time:
            posts_data.append({
                'title': post.title,
                'created_utc': post_time,
                'selftext': post.selftext,
                'subreddit': subreddit_name
            })

    return posts_data

def get_latest_reddit_data(ticker):
    subreddits = ['stocks', 'WallStreetBets', 'investing', 'finance', 'StockMarket', 'Nasdaq']

    # Multiple search queries
    if ticker == "TSLA":
      search_queries = [
          '(Tesla OR $TSLA OR Elon Musk OR electric vehicle OR autonomous OR SpaceX)',
          '(Tesla Stocks OR TeslaMarket OR Tesla Finance OR Tesla closing price)'
      ]
    elif ticker == "AAPL":
      search_queries = [
          '(Apple OR $AAPL OR iPhone OR Tim Cook)',
          '(Apple Stocks OR AppleMarket OR Apple Finance OR Apple closing price)',
          '(Apple Inc OR AAPL stock OR Apple investment OR Apple trading OR Apple Nasdaq)'
      ]

    # Defining a time window for scraping
    start_date = datetime(2024, 12, 1)  # Starting date
    end_date = datetime.today()  # Ending date
    date_limit = datetime(2024, 12, 30)
    if end_date > date_limit:
        end_date = date_limit
    window_size = timedelta(days=7)  # 3-month window

    # Initializing an empty list to store all the posts data
    all_posts_data = []

    # Loop through each search query, date range, and subreddit to collect data
    for search_query in search_queries:
        current_start = start_date
        while current_start < end_date:
            current_end = current_start + window_size
            for subreddit in subreddits:
                posts_data = scrape_subreddit_time_window(subreddit, search_query, current_start, current_end)
                all_posts_data.extend(posts_data)
            current_start = current_end

    # Convert the collected data into a DataFrame
    df_reddit = pd.DataFrame(all_posts_data)
    return df_reddit

def get_latest_data(ticker):
    # Fetch new data for predictions from Yahoo Finance
    start_date = '2024-12-01'
    print(f"\n ---> Preparing data for {ticker} stock: ")
    print(f"Fetching  Reddit and news data from {start_date} \n")
    end_date = pd.Timestamp.now().strftime('%Y-%m-%d')  # Current date
    date_limit = '2024-12-30'
    if end_date > date_limit:
        end_date = date_limit
    data = yf.download(ticker, start=start_date, end=end_date, interval='1d')
    data.reset_index(inplace=True)
    data['Date'] = pd.to_datetime(data['Date']).dt.date
    data.columns = data.columns.get_level_values(0)
    # Drop the 'Ticker' column if it exists
    data = data.drop(columns=['Ticker'], errors='ignore')
    print("\nFetching................")
    if ticker == "TSLA":
      news_data = get_stock_news(ticker)
      reddit_data = get_latest_reddit_data(ticker)
    elif ticker == "AAPL":
      news_data = get_stock_news(ticker)
      reddit_data = get_latest_reddit_data(ticker)

    print(f"Fetched Reddit and news data from {start_date} to {end_date} \n")
    print(f"Started sentiment analysis on Reddit and news data \n")

    # Fill missing values in the relevant columns with empty strings
    news_data['title'] = news_data['title'].fillna("")
    news_data['description'] = news_data['description'].fillna("")
    news_data['content'] = news_data['content'].fillna("")
    # Combine the columns into a single column for sentiment analysis
    news_data['text'] = news_data['title'] + " " + news_data['description'] + " " + news_data['content']
    # Apply sentiment analysis to the combined column
    news_data[['sentiment', 'sentiment_score']] = news_data['text'].apply(lambda x: pd.Series(get_sentiment(x)))
    # Drop the intermediate combined text column
    news_data.drop(columns=['text'], inplace=True)

    reddit_data['body'] = reddit_data['title'] + reddit_data['selftext']
    reddit_data['body'] = reddit_data['body'].fillna("")
    reddit_data[['sentiment', 'sentiment_score']] = reddit_data['body'].apply(lambda x: pd.Series(get_sentiment(x)))

    merged_data = data_cleaning_and_preprocessing(data, reddit_data, news_data)

    print(f"Sentiment Analysis is Done and merged with stock data from {start_date} to {end_date} for {ticker}")
    print("\n")

    return merged_data

# Function to apply sentiment analysis
def get_sentiment(text):
    # Apply sentiment analysis and return the label and score
    result = sentiment_analyzer(text[:512])[0]  # Limiting to 512 characters for BERT model
    return result['label'], result['score']

def data_cleaning_and_preprocessing(data, reddit_df, news_df):
    # Convert 'Date' columns to datetime for each dataframe
    reddit_df['created_utc'] = pd.to_datetime(reddit_df['created_utc'])
    news_df['seendate'] = pd.to_datetime(news_df['publishedAt'])
    news_df.set_index('seendate', inplace=True)
    reddit_df.set_index('created_utc', inplace=True)

    reddit_data_aggregated = reddit_df.groupby(pd.Grouper(freq='B')).agg({
        'sentiment': 'max',
    }).rename(columns={'sentiment': 'reddit_sentiment'})

    news_data_aggregated = news_df.groupby(pd.Grouper(freq='B')).agg({
        'sentiment': 'max',
    }).rename(columns={'sentiment': 'news_sentiment'})

    # Convert 'seendate' to date only by removing the time component
    news_data_aggregated.index = news_data_aggregated.index.date
    news_data_aggregated.index = pd.to_datetime(news_data_aggregated.index).normalize()

    merged_data = data.merge(reddit_data_aggregated, how='left', left_index=True, right_index=True)
    merged_data = merged_data.merge(news_data_aggregated, how='left', left_index=True, right_index=True)

    merged_data.reset_index(inplace=True)
    return merged_data
#############################################################################################################################

# Load data for Tesla
def get_tsla_predictions(ticker, have_lastest_data=False):
    fp = "/content/tsla_data_with_sentiment_analysis_from_18_to_24.csv"
    df = pd.read_csv(fp)

    columns_to_keep = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']
    df = df[columns_to_keep]
    if have_lastest_data:
      tesla_data = get_latest_data(ticker)
      tesla_data = tesla_data[columns_to_keep]
      merged_df = pd.concat([df, tesla_data]).reset_index(drop=True)
      data = merged_df.copy()
    else:
      data = df.copy()

    # Preprocess data
    data['Date'] = pd.to_datetime(data['Date'])
    data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']] = data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']].ffill()

    # Prepare the data for TFT
    data = data.merge(
        data[['Date']].drop_duplicates(ignore_index=True).rename_axis('time_idx').reset_index(),
        on='Date',
        how='left'
    )
    max_prediction_length = 30
    max_encoder_length = 180
    data['group_id'] = 0  # Assigning a constant group
    training_cutoff = data['time_idx'].max() - max_prediction_length

    # Define the TimeSeriesDataSet for the TFT model
    training = TimeSeriesDataSet(
        data[lambda x: x.time_idx <= training_cutoff],
        time_idx="time_idx",
        target="Close",
        group_ids=["group_id"],
        max_encoder_length=max_encoder_length,
        min_encoder_length=max_encoder_length // 2,
        max_prediction_length=max_prediction_length,
        min_prediction_length=max_prediction_length,
        time_varying_known_categoricals=[],
        time_varying_known_reals=['Open', 'Volume'],
        time_varying_unknown_categoricals=['reddit_sentiment', 'news_sentiment'],
        time_varying_unknown_reals=['Close'],
        target_normalizer=GroupNormalizer(groups=["group_id"], transformation="softplus"),
        lags={'Close': [7, 15, 30, 45, 60, 90]},
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        categorical_encoders={
            "reddit_sentiment": NaNLabelEncoder(),
            "news_sentiment": NaNLabelEncoder()
        }
    )
    # Load TFT model
    model_path = '/content/tesla_trained_tft_model.pth'

    # Define the Temporal Fusion Transformer
    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=0.0007693367817925835,
        hidden_size=24,
        attention_head_size=3,
        dropout=0.19608882013889972,
        hidden_continuous_size=24,
        loss=QuantileLoss(),
        log_interval=10,
        optimizer="Adam",
        reduce_on_plateau_patience=4,
    )

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        tft.load_state_dict(state_dict)
    else:
        print(f"Model file not found at {model_path}. Please check the path.")

    validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)
    val_dataloader = validation.to_dataloader(train=False, batch_size=128, num_workers=8)
    predictions = tft.predict(val_dataloader, return_y=True, trainer_kwargs=dict(accelerator=gpu_info))

    return data, predictions, tft, val_dataloader

###########################################################################################################################

# Load data for Apple
def get_aapl_predictions(ticker, have_lastest_data=False):
    fp = "/content/aapl_data_with_sentiment_analysis_from_18_to_24.csv"
    df = pd.read_csv(fp)

    columns_to_keep = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']
    df = df[columns_to_keep]
    if have_lastest_data:
      apple_data = get_latest_data(ticker)
      apple_data = apple_data[columns_to_keep]
      merged_df = pd.concat([df, apple_data]).reset_index(drop=True)
      data = merged_df.copy()
    else:
      data = df.copy()

    # Preprocess data
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)
    data = data.resample('B').asfreq()
    data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']] = data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']].ffill()
    data.reset_index(inplace=True)

    data['Date'] = pd.to_datetime(data['Date'])
    # Set 'Date' as the index to enable resampling for missing dates
    data.set_index('Date', inplace=True)

    # Fill missing values in numeric columns with interpolation
    data['Open'] = data['Open'].interpolate(method='linear')
    data['High'] = data['High'].interpolate(method='linear')
    data['Low'] = data['Low'].interpolate(method='linear')
    data['Close'] = data['Close'].interpolate(method='linear')
    data['Adj Close'] = data['Adj Close'].interpolate(method='linear')
    data['Volume'] = data['Volume'].interpolate(method='linear')

    # Forward fill missing values for specified numerical columns
    data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']] = data[['Open', 'High', 'Low', 'Close',
                                    'Adj Close', 'Volume', 'reddit_sentiment', 'news_sentiment']].ffill()

    # Reset the index if you need 'Date' as a column again
    data.reset_index(inplace=True)

    data = data.merge(
    data[['Date']].drop_duplicates(ignore_index=True).rename_axis('time_idx').reset_index(),
    on='Date',
    how='left')

    # Define the dataset for the TFT model
    max_prediction_length = 30
    min_encoder_length = 0
    max_encoder_length = 180

    data['group_id'] = 0  # Assigning a constant group

    # Define the TimeSeriesDataSet object
    training_cutoff = data['time_idx'].max() - max_prediction_length

    # Instantiate and fit encoders with add_nan=True to handle unknown categories
    reddit_sentiment_encoder = NaNLabelEncoder(add_nan=True)
    news_sentiment_encoder = NaNLabelEncoder(add_nan=True)

    # Fit encoders on the relevant columns with NaN values filled
    reddit_sentiment_encoder.fit(data['reddit_sentiment'].fillna("Unknown"))
    news_sentiment_encoder.fit(data['news_sentiment'].fillna("Unknown"))

    # Create training set
    training = TimeSeriesDataSet(
        data[lambda x: x.time_idx <= training_cutoff],
        time_idx="time_idx",
        target="Close", # target variable
        group_ids=["group_id"], # static covariates
        max_encoder_length=max_encoder_length, # maximum size of lookup window
        min_encoder_length=max_encoder_length//2,
        max_prediction_length=max_prediction_length, # maximum size of horizon window
        min_prediction_length=max_prediction_length,
        time_varying_known_categoricals=[],
        time_varying_known_reals=['Open', 'Volume'],
        time_varying_unknown_categoricals=['reddit_sentiment', 'news_sentiment'],
        time_varying_unknown_reals=['Close'],
        target_normalizer=GroupNormalizer(
            groups=["group_id"], transformation="softplus"
        ),  # use softplus transformation and normalize by group
        lags={'Close': [1,2,4,7,15,30,45,60]}, # add lagged values of target variable
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        categorical_encoders={
            # Properly handle unknown categories
            "reddit_sentiment": reddit_sentiment_encoder,
            "news_sentiment": news_sentiment_encoder
        }
    )

    # Define the Temporal Fusion Transformer
    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=0.000760235330401722,
        hidden_size=41,
        attention_head_size=3,
        dropout=0.1383927412069838,
        hidden_continuous_size=22,
        loss=QuantileLoss(),
        log_interval=10,
        optimizer="Adam",
        reduce_on_plateau_patience=4,
    )

    model_path = '/content/aapl_trained_tft_model.pth'

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        tft.load_state_dict(state_dict)
    else:
        print(f"Model file not found at {model_path}. Please check the path.")

    validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)
    val_dataloader = validation.to_dataloader(train=False, batch_size=128, num_workers=8)
    predictions = tft.predict(val_dataloader, return_y=True, trainer_kwargs=dict(accelerator=gpu_info))

    return data, predictions, tft, val_dataloader

###########################################################################################################################

def get_lstm_model(stock_data, ticker):
    print(f"\n ---> Started Training LSTM Model for {ticker}")
    # Parse dates and set the index
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])
    stock_data.set_index('Date', inplace=True)

    # Use only the 'Close' column for prediction
    close_prices = stock_data[['Close']].values

    # Scale the data to be between 0 and 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(close_prices)

    # Prepare the dataset for the LSTM
    sequence_length = 90  # Using 90 days of historical data to predict the next day
    X = []
    y = []

    for i in range(sequence_length, len(scaled_data)):
        X.append(scaled_data[i-sequence_length:i, 0])
        y.append(scaled_data[i, 0])

    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))

    # Split the data into training and testing sets
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(units=100, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.25))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.20))
    model.add(Dense(units=25))
    model.add(Dense(units=1))

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

    # Set up EarlyStopping callback
    early_stopping = ESLSTM(
        monitor='val_loss',   # Monitor the validation loss
        patience=10,
        verbose=1,
        restore_best_weights=True
    )

    # Train the model with early stopping
    history = model.fit(
        X_train, y_train,
        epochs=200,
        batch_size=128,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping], # Add the early stopping callback
        verbose=1
    )

    # Make predictions
    lstm_predictions = model.predict(X_test)
    predictions = scaler.inverse_transform(lstm_predictions)  # Invert scaling to get original values

    # Invert scaling for y_test as well
    y_test_scaled = scaler.inverse_transform(y_test.reshape(-1, 1))

    return stock_data, y_test, y_test_scaled, lstm_predictions, predictions

#####################################################################################################################

def get_actuals_and_preds(predictions):
    # Move tensors to CPU before converting to NumPy
    actuals = predictions.y[0].cpu().detach().numpy().flatten()
    preds = predictions.output.cpu().detach().numpy().flatten()
    return actuals, preds

#####################################################################################################################

tsla_data, tft_tsla_predictions, tsla_tft_model, tsla_val_dataloader = get_tsla_predictions("TSLA", True)
aapl_data, tft_aapl_predictions, aapl_tft_model, aapl_val_dataloader = get_aapl_predictions("AAPL", True)
lstm_tsla_data, tsla_y_test, tsla_y_test_scaled, lstm_tsla_predictions, tsla_predictions = get_lstm_model(tsla_data, "TSLA")
lstm_aapl_data, aapl_y_test, aapl_y_test_scaled, lstm_aapl_predictions, aapl_predictions = get_lstm_model(aapl_data, "AAPL")

to_date = datetime.now().strftime("%Y-%m-%d")
date_limit = "2024-12-30"
if to_date > date_limit:
  print(f"It can fetch till {date_limit} due to rate limit for historical data of news and Reddit data. We can stock predictions till 30th Dec 2024.")
print("Financial Insights & Predictions Web app URL:", public_url)
######################################################################################################################

# Home route for user input
@app.route('/')
def home():
    return render_template('home.html')

# Prediction route
@app.route('/predict', methods=['POST'])
def predict():
    ticker = request.form['ticker']
    model_choice = request.form['model']
    if ticker.upper() not in ['AAPL', 'TSLA']:
        return render_template('error.html', ticker=ticker, model=model_choice, error="Only 'TSLA' and 'AAPL' stocks are supported at this time.")

    if ticker.upper() == 'TSLA':
        data = tsla_data.copy()
        actuals, preds = get_actuals_and_preds(tft_tsla_predictions)

    elif ticker.upper() == 'AAPL':
        data = aapl_data.copy()
        actuals, preds = get_actuals_and_preds(tft_aapl_predictions)

    if model_choice == 'TFT':
        data.reset_index(inplace=True)
        validation_dates = data['Date'][-len(actuals):].values
        rmse = np.sqrt(mean_squared_error(actuals, preds))
        r2 = r2_score(actuals, preds)
        mae = mean_absolute_error(actuals, preds)

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=validation_dates, y=actuals, mode='lines', name='Actual Values', line=dict(color='blue')))
        fig.add_trace(go.Scatter(x=validation_dates, y=preds, mode='lines', name='Predicted Values', line=dict(color='green')))
        fig.update_layout(title= f"Predictions vs Actuals Closing prices for {ticker}", xaxis_title="Date", yaxis_title="Stock Price", legend_title="Legend", template="plotly_dark")
        fig_html = fig.to_html(full_html=False)

        return render_template('predict.html', ticker=ticker, model=model_choice, mae=mae, r2=r2, rmse=rmse, plot=fig_html)

    elif model_choice == 'LSTM':
      if ticker.upper() == 'TSLA':
          lstm_data, y_test, y_test_scaled, lstm_predictions, predictions = lstm_tsla_data, tsla_y_test, tsla_y_test_scaled, lstm_tsla_predictions, tsla_predictions
      elif ticker.upper() == 'AAPL':
          lstm_data, y_test, y_test_scaled, lstm_predictions, predictions = lstm_aapl_data, aapl_y_test, aapl_y_test_scaled, lstm_aapl_predictions, aapl_predictions

      # Calculate the Mean Absolute Error (MAE) for the last 30 days
      days = 30
      lstm_mae = mean_absolute_error(y_test_scaled[-days:], predictions[-days:])
      lstm_rmse = np.sqrt(mean_squared_error(y_test_scaled[-days:], predictions[-days:]))
      lstm_r2 = r2_score(y_test_scaled[-days:], predictions[-days:])

      lstm_data.reset_index(inplace=True)
      lstm_data['Date'] = pd.to_datetime(lstm_data['Date'])

      # Prepare the plot with Plotly for the last 30 days
      fig = go.Figure()

      # Use Date column for x-axis values
      fig.add_trace(go.Scatter(
          x=lstm_data['Date'][-len(y_test):][-days:],
          y=y_test_scaled.flatten()[-days:],
          mode='lines',
          name='Actual Prices'
      ))
      fig.add_trace(go.Scatter(
          x=lstm_data['Date'][-len(y_test):][-days:],
          y=predictions.flatten()[-days:],
          mode='lines',
          name='Predicted Prices'
      ))

      # Update the layout
      fig.update_layout(
          title= f"Last 30 Days Stock Price Prediction for {ticker}",
          xaxis_title="Date",
          yaxis_title="Stock Price",
          legend_title="Legend",
          template="plotly"
      )
      fig_html = fig.to_html(full_html=False)

      return render_template('predict_lstm.html', ticker=ticker, model=model_choice, lstm_mae=lstm_mae, lstm_r2=lstm_r2, lstm_rmse=lstm_rmse, plot=fig_html)


# Trading Strategy Simulation Route
@app.route('/strategy', methods=['POST'])
def strategy():
    ticker = request.form['ticker']
    investment_amount = float(request.form['investment'])
    model_choice = "TFT"
    data = tsla_data.copy()
    actuals, preds = get_actuals_and_preds(tft_tsla_predictions)
    if ticker.upper() == 'AAPL':
        data = aapl_data.copy()
        actuals, preds = get_actuals_and_preds(tft_aapl_predictions)

    data.reset_index(inplace=True)
    dates = data['Date'][-len(actuals):].values
    trading_df = pd.DataFrame({
        'Date': dates,
        'Actual': actuals[:30],
        'Predicted': preds[:30]
    })
    trading_df.set_index('Date', inplace=True)
    trading_df['Daily Return'] = trading_df['Actual'].pct_change()
    trading_df['Signal'] = np.where(trading_df['Predicted'].diff() > 0, 1, -1)
    trading_df['Strategy Return'] = trading_df['Signal'].shift(1) * trading_df['Daily Return']
    trading_df['Cumulative Strategy Return'] = (1 + trading_df['Strategy Return']).cumprod() - 1

    final_return = trading_df['Cumulative Strategy Return'].iloc[-1]
    final_balance = investment_amount * (1 + final_return)
    earnings = final_balance - investment_amount
    profit_percentage = (earnings / investment_amount) * 100

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=trading_df.index, y=trading_df['Actual'], mode='lines', name='Actual Prices', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=trading_df.index, y=trading_df['Predicted'], mode='lines', name='Predicted Prices', line=dict(color='green')))
    buy_signals = trading_df[trading_df['Signal'] == 1]
    fig.add_trace(go.Scatter(x=buy_signals.index, y=buy_signals['Predicted'], mode='markers', name='Buy Signal', marker=dict(color='green', symbol='triangle-up', size=10)))
    sell_signals = trading_df[trading_df['Signal'] == -1]
    fig.add_trace(go.Scatter(x=sell_signals.index, y=sell_signals['Predicted'], mode='markers', name='Sell Signal', marker=dict(color='red', symbol='triangle-down', size=10)))

    fig.update_layout(title= f"Trading Strategy with Buy and Sell Signals for {ticker}", xaxis_title="Date", yaxis_title="Stock Price", template="plotly_dark")
    fig_html = fig.to_html(full_html=False)

    strategy_results = {
        'initial_investment': f"${investment_amount:,.2f}",
        'final_balance': f"${final_balance:,.2f}",
        'earnings': f"${earnings:,.2f}",
        'profit_percentage': f"{profit_percentage:.2f}%"
    }

    return render_template('strategy.html', strategy_results=strategy_results, plot=fig_html)

# Compare Models Route
@app.route('/compare_models', methods=['POST'])
def compare_models():
    ticker = request.form['ticker']
    if ticker.upper() not in ['AAPL', 'TSLA']:
        return render_template('error.html', ticker=ticker, error="Only 'TSLA' and 'AAPL' stocks are supported at this time.")

    # Initialize variables for the selected ticker
    if ticker.upper() == 'TSLA':
        tft_predictions = tft_tsla_predictions
        lstm_data, y_test, y_test_scaled, lstm_predictions, predictions = lstm_tsla_data, tsla_y_test, tsla_y_test_scaled, lstm_tsla_predictions, tsla_predictions
    elif ticker.upper() == 'AAPL':
        tft_predictions = tft_aapl_predictions
        lstm_data, y_test, y_test_scaled, lstm_predictions, predictions = lstm_aapl_data, aapl_y_test, aapl_y_test_scaled, lstm_aapl_predictions, aapl_predictions

    # Process TFT predictions
    actuals, preds = get_actuals_and_preds(tft_predictions)
    tft_mae = mean_absolute_error(actuals, preds)
    tft_rmse = np.sqrt(mean_squared_error(actuals, preds))
    tft_r2 = r2_score(actuals, preds)

    # Process LSTM predictions (last 30 days only)
    days = 30
    lstm_mae = mean_absolute_error(y_test_scaled[-days:], predictions[-days:])
    lstm_rmse = np.sqrt(mean_squared_error(y_test_scaled[-days:], predictions[-days:]))
    lstm_r2 = r2_score(y_test_scaled[-days:], predictions[-days:])

    # Plotly figure for comparison
    fig = go.Figure()

    # Add TFT metrics
    fig.add_trace(go.Bar(
        x=["MAE", "RMSE", "R² Score"],
        y=[tft_mae, tft_rmse, tft_r2],
        name="TFT Model",
        marker=dict(color='blue')
    ))

    # Add LSTM metrics
    fig.add_trace(go.Bar(
        x=["MAE", "RMSE", "R² Score"],
        y=[lstm_mae, lstm_rmse, lstm_r2],
        name="LSTM Model",
        marker=dict(color='green')
    ))

    # Update layout for clarity
    fig.update_layout(
        title=f"Model Comparison of {ticker.upper()}",
        xaxis_title="Metric",
        yaxis_title="Value",
        barmode='group',
        template="plotly_dark"
    )

    # Generate Plotly HTML
    fig_html = fig.to_html(full_html=False)

    # Prepare results to display
    comparison_results = {
        'tft': {
            'mae': f"{tft_mae:.2f}",
            'rmse': f"{tft_rmse:.2f}",
            'r2': f"{tft_r2:.2f}"
        },
        'lstm': {
            'mae': f"{lstm_mae:.2f}",
            'rmse': f"{lstm_rmse:.2f}",
            'r2': f"{lstm_r2:.2f}"
        }
    }

    return render_template(
        'compare_models.html',
        ticker=ticker.upper(),
        comparison_results=comparison_results,
        plot=fig_html
    )

if __name__ == '__main__':
    app.run(port=5000)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

[*********************100%***********************]  1 of 1 completed


 ---> Preparing data for TSLA stock: 
Fetching  Reddit and news data from 2024-12-01 







Fetching................
Fetched Reddit and news data from 2024-12-01 to 2024-12-11 

Started sentiment analysis on Reddit and news data 



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sentiment Analysis is Done and merged with stock data from 2024-12-01 to 2024-12-11 for TSLA




/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  state_dict = torch.load(model_path)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: You are using a CUDA device ('NVIDIA 


 ---> Preparing data for AAPL stock: 
Fetching  Reddit and news data from 2024-12-01 


Fetching................





Fetched Reddit and news data from 2024-12-01 to 2024-12-11 

Started sentiment analysis on Reddit and news data 

Sentiment Analysis is Done and merged with stock data from 2024-12-01 to 2024-12-11 for AAPL




/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  state_dict = torch.load(model_path)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES:


 ---> Started Training LSTM Model for TSLA


  super().__init__(**kwargs)


Epoch 1/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - loss: 0.2215 - val_loss: 0.0035
Epoch 2/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0106 - val_loss: 0.0034
Epoch 3/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0056 - val_loss: 0.0041
Epoch 4/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0047 - val_loss: 0.0031
Epoch 5/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0034 - val_loss: 0.0021
Epoch 6/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0032 - val_loss: 0.0022
Epoch 7/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0031 - val_loss: 0.0018
Epoch 8/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0027 - val_loss: 0.0020
Epoch 9/200
[1m11/11[0m [32m━━━━━━━━━

  super().__init__(**kwargs)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - loss: 0.1425 - val_loss: 0.0691
Epoch 2/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0116 - val_loss: 0.0040
Epoch 3/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0032 - val_loss: 0.0013
Epoch 4/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0019 - val_loss: 0.0019
Epoch 5/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0013 - val_loss: 0.0038
Epoch 6/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0012 - val_loss: 0.0019
Epoch 7/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 9.5109e-04 - val_loss: 0.0017
Epoch 8/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 8.9583e-04 - val_loss: 8.6559e-04
Epoch 9/200
[1m11/11[0m [32m━━━━━━━━━

 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:24:46] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:24:47] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:25:01] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:25:27] "POST /strategy HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:25:46] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:25:53] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:26:05] "POST /strategy HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:26:14] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:26:18] "POST /compare_models HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:26:28] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Dec/2024 00:26:33] "POST /compare_models HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.

#### Comprehensive Stock Data Analysis and Visualization

In [7]:
def analyze_stock_data(data, stock_name):
    data['Date'] = pd.to_datetime(data['Date'])

    # General Overview
    print(f"Dataset Overview for {stock_name}:\n")
    print(f"Total Rows and Columns: {data.shape}")
    print(f"\nColumn Data Types:\n{data.dtypes}")
    print(f"\nMissing Values:\n{data.isnull().sum()}")

    # Descriptive Statistics
    print(f"\nDescriptive Statistics:\n{data.describe()}")

    # Time Range
    print(f"\nTime Range for {stock_name}:")
    print(f"Start Date: {data['Date'].min()}")
    print(f"End Date: {data['Date'].max()}")

    # Highest and Lowest Prices
    highest_price = data['High'].max()
    highest_price_date = data.loc[data['High'].idxmax(), 'Date']
    lowest_price = data['Low'].min()
    lowest_price_date = data.loc[data['Low'].idxmin(), 'Date']
    print(f"\nHighest Price: {highest_price} on {highest_price_date}")
    print(f"Lowest Price: {lowest_price} on {lowest_price_date}")

    # Highest and Lowest Volumes
    highest_volume = data['Volume'].max()
    highest_volume_date = data.loc[data['Volume'].idxmax(), 'Date']
    lowest_volume = data['Volume'].min()
    lowest_volume_date = data.loc[data['Volume'].idxmin(), 'Date']
    print(f"\nHighest Volume: {highest_volume} on {highest_volume_date}")
    print(f"Lowest Volume: {lowest_volume} on {lowest_volume_date}")

    # Sentiment Distributions
    reddit_sentiment_counts = data['reddit_sentiment'].value_counts()
    news_sentiment_counts = data['news_sentiment'].value_counts()
    print(f"\nReddit Sentiment Distribution:\n{reddit_sentiment_counts}")
    print(f"\nNews Sentiment Distribution:\n{news_sentiment_counts}")

    # Visualizations

    # 1. Closing Price Trend
    fig_close = go.Figure()
    fig_close.add_trace(go.Scatter(x=data['Date'], y=data['Close'], mode='lines', name='Close Price', line=dict(color='green')))
    fig_close.update_layout(title=f'{stock_name} Closing Prices Over Time', xaxis_title='Date', yaxis_title='Price (USD)')
    fig_close.show()

    # 2. Volume Trend
    fig_volume = go.Figure()
    fig_volume.add_trace(go.Scatter(x=data['Date'], y=data['Volume'], mode='lines', name='Volume', line=dict(color='green')))
    fig_volume.update_layout(title=f'{stock_name} Volume Over Time', xaxis_title='Date', yaxis_title='Volume')
    fig_volume.show()

    # 3. Sentiment Distribution
    sentiment_counts = pd.DataFrame({
        'Sentiment': reddit_sentiment_counts.index,
        'Reddit': reddit_sentiment_counts.values,
        'News': news_sentiment_counts.reindex(reddit_sentiment_counts.index, fill_value=0).values
    })
    fig_sentiment = px.bar(sentiment_counts, x='Sentiment', y=['Reddit', 'News'], barmode='group',
                           title=f'Sentiment Distribution for {stock_name}', labels={'value': 'Count', 'variable': 'Source'})
    fig_sentiment.show()

#### Visualizing Feature Importance in Temporal Fusion Transformer (TFT) Model

In [8]:
def get_feature_imp_graph(tft, val_dataloader, stock_name):
  # Using `mode="raw"` to get all outputs for interpretation
  raw_predictions = tft.predict(val_dataloader, mode="raw", return_x=True)

  # Unpack only what is necessary from raw_predictions
  pred = raw_predictions[0]  # This contains the prediction values
  x = raw_predictions[1]  # This contains input data used for interpretation

  interpretation = tft.interpret_output(pred, reduction="mean")

  # Replace these lists with your actual column names used during training
  static_features = ['group_id']  # Example static features
  encoder_features = ['Open', 'Volume', 'Close']  # Replace with your encoder feature names
  decoder_features = ['reddit_sentiment', 'news_sentiment']  # Replace with your decoder feature names

  # Verify lengths
  print("Static Variables:", len(interpretation["static_variables"]))
  print("Static Features:", len(static_features))

  print("Encoder Variables:", len(interpretation["encoder_variables"]))
  print("Encoder Features:", len(encoder_features))

  print("Decoder Variables:", len(interpretation["decoder_variables"]))
  print("Decoder Features:", len(decoder_features))

  # Adjust features_mapping to match the actual lengths
  features_mapping = {
      "Static Features": static_features,
      "Encoder Features": encoder_features,
      "Decoder Features": decoder_features,
  }

  # Extract feature importance
  features = {
      "Static Features": interpretation["static_variables"].cpu().numpy(),
      "Encoder Features": interpretation["encoder_variables"].cpu().numpy(),
      "Decoder Features": interpretation["decoder_variables"].cpu().numpy(),
  }

  # Flatten the data for plotting with column names
  feature_names = []
  feature_importance = []

  for category, values in features.items():
      for i, value in enumerate(values):
          # Ensure the index does not exceed the length of the features list
          if i < len(features_mapping[category]):
              feature_names.append(f"{category}: {features_mapping[category][i]}")
              feature_importance.append(value)

  # Create DataFrame for feature importance
  importance_df = pd.DataFrame({
      "Feature": feature_names,
      "Importance": feature_importance
  }).sort_values(by="Importance", ascending=False)

  # Plot Feature Importance with Plotly
  fig = px.bar(importance_df, x="Importance", y="Feature", orientation='h', title=f"{stock_name} TFT model Feature Importance", template="plotly_dark")
  fig.update_layout(xaxis_title="Importance", yaxis_title="Feature")
  fig.show()

#### Visualizations

In [9]:
tsla_data.reset_index(inplace=True)

In [21]:
tsla_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,reddit_sentiment,news_sentiment,time_idx,group_id
0,2018-01-02,20.799999,21.474001,20.733334,21.368668,21.368668,65283000,Neutral,Positive,0,0
1,2018-01-03,21.400000,21.683332,21.036667,21.150000,21.150000,67822500,Positive,Neutral,1,0
2,2018-01-04,20.858000,21.236668,20.378668,20.974667,20.974667,149194500,Positive,Positive,2,0
3,2018-01-05,21.108000,21.149332,20.799999,21.105333,21.105333,68868000,Neutral,Positive,3,0
4,2018-01-08,21.066668,22.468000,21.033333,22.427334,22.427334,147891000,Neutral,Positive,4,0
...,...,...,...,...,...,...,...,...,...,...,...
1742,2024-12-04,353.000000,358.100006,348.600006,357.929993,357.929993,50810900,Positive,Positive,1742,0
1743,2024-12-05,359.869995,375.429993,359.500000,369.489990,369.489990,81403600,Positive,Positive,1743,0
1744,2024-12-06,377.420013,389.489990,370.799988,389.220001,389.220001,81455800,Positive,Positive,1744,0
1745,2024-12-09,397.609985,404.799988,378.010010,389.790009,389.790009,96359200,Positive,Positive,1745,0


In [11]:
analyze_stock_data(data=tsla_data, stock_name="Tesla Inc")

Dataset Overview for Tesla Inc:

Total Rows and Columns: (1747, 12)

Column Data Types:
index                        int64
Date                datetime64[ns]
Open                       float64
High                       float64
Low                        float64
Close                      float64
Adj Close                  float64
Volume                       int64
reddit_sentiment            object
news_sentiment              object
time_idx                     int64
group_id                     int64
dtype: object

Missing Values:
index               0
Date                0
Open                0
High                0
Low                 0
Close               0
Adj Close           0
Volume              0
reddit_sentiment    0
news_sentiment      0
time_idx            0
group_id            0
dtype: int64

Descriptive Statistics:
             index                           Date         Open         High  \
count  1747.000000                           1747  1747.000000  1747.000000   
m

In [12]:
get_feature_imp_graph(tsla_tft_model, tsla_val_dataloader, "Tesla")

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Static Variables: 3
Static Features: 1
Encoder Variables: 12
Encoder Features: 3
Decoder Variables: 7
Decoder Features: 2


In [13]:
aapl_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,reddit_sentiment,news_sentiment,time_idx,group_id
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02,42.540001,43.075001,42.314999,43.064999,40.524345,102223600.0,Positive,Positive,0,0
2018-01-03,43.132500,43.637501,42.990002,43.057499,40.517288,118071600.0,Neutral,Positive,1,0
2018-01-04,43.134998,43.367500,43.020000,43.257500,40.705486,89738400.0,Neutral,Neutral,2,0
2018-01-05,43.360001,43.842499,43.262501,43.750000,41.168930,94640000.0,Positive,Positive,3,0
2018-01-08,43.587502,43.902500,43.482498,43.587502,41.016026,82271200.0,Positive,Positive,4,0
...,...,...,...,...,...,...,...,...,...,...
2024-12-04,242.869995,244.110001,241.250000,243.009995,243.009995,44383900.0,Neutral,Positive,1806,0
2024-12-05,243.990005,244.539993,242.130005,243.039993,243.039993,40033900.0,Neutral,Positive,1807,0
2024-12-06,242.910004,244.630005,242.080002,242.839996,242.839996,36870600.0,Neutral,Positive,1808,0
2024-12-09,241.830002,247.240005,241.750000,246.750000,246.750000,44649200.0,Neutral,Positive,1809,0


In [14]:
aapl_data.reset_index(inplace=True)

In [15]:
analyze_stock_data(data=aapl_data, stock_name="Apple Inc")

Dataset Overview for Apple Inc:

Total Rows and Columns: (1811, 11)

Column Data Types:
Date                datetime64[ns]
Open                       float64
High                       float64
Low                        float64
Close                      float64
Adj Close                  float64
Volume                     float64
reddit_sentiment            object
news_sentiment              object
time_idx                     int64
group_id                     int64
dtype: object

Missing Values:
Date                0
Open                0
High                0
Low                 0
Close               0
Adj Close           0
Volume              0
reddit_sentiment    0
news_sentiment      0
time_idx            0
group_id            0
dtype: int64

Descriptive Statistics:
                                Date         Open         High          Low  \
count                           1811  1811.000000  1811.000000  1811.000000   
mean   2021-06-21 14:24:19.083379200   123.129478   124.46

In [16]:
get_feature_imp_graph(aapl_tft_model, aapl_val_dataloader, "Apple")

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Static Variables: 3
Static Features: 1
Encoder Variables: 14
Encoder Features: 3
Decoder Variables: 6
Decoder Features: 2


In [17]:
def compare_models(ticker, tft_predictions, lstm_data, y_test, y_test_scaled, lstm_predictions, predictions):
    # Process TFT predictions
    actuals, preds = get_actuals_and_preds(tft_predictions)  # Assuming this function is defined elsewhere
    tft_mae = mean_absolute_error(actuals, preds)
    tft_rmse = np.sqrt(mean_squared_error(actuals, preds))
    tft_r2 = r2_score(actuals, preds)

    # Process LSTM predictions (last 30 days only)
    days = 30
    lstm_mae = mean_absolute_error(y_test_scaled[-days:], predictions[-days:])
    lstm_rmse = np.sqrt(mean_squared_error(y_test_scaled[-days:], predictions[-days:]))
    lstm_r2 = r2_score(y_test_scaled[-days:], predictions[-days:])

    # Plotly figure for comparison
    fig = go.Figure()

    # Add TFT metrics
    fig.add_trace(go.Bar(
        x=["MAE", "RMSE", "R² Score"],
        y=[tft_mae, tft_rmse, tft_r2],
        name="TFT Model",
        marker=dict(color='blue')
    ))

    # Add LSTM metrics
    fig.add_trace(go.Bar(
        x=["MAE", "RMSE", "R² Score"],
        y=[lstm_mae, lstm_rmse, lstm_r2],
        name="LSTM Model",
        marker=dict(color='green')
    ))

    # Update layout for clarity
    fig.update_layout(
        title=f"Model Comparison of {ticker.upper()}",
        xaxis_title="Metric",
        yaxis_title="Value",
        barmode='group',
        template="plotly_dark"
    )

    fig.show()

In [18]:
compare_models("TSLA", tft_tsla_predictions, lstm_tsla_data, tsla_y_test, tsla_y_test_scaled, lstm_tsla_predictions, tsla_predictions)

In [19]:
compare_models("AAPL", tft_tsla_predictions, lstm_aapl_data, aapl_y_test, aapl_y_test_scaled, lstm_aapl_predictions, aapl_predictions)