In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from scipy.stats import norm
from scipy.linalg import svd
import matplotlib.pyplot as plt
import os
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch stock data from Yahoo Finance
def fetch_stock_data(ticker, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date)
    return data

# Compute technical indicators
def compute_technical_indicators(data):
    data['SMA30'] = data['Close'].rolling(window=1).mean()
    data['SMA100'] = data['Close'].rolling(window=1).mean()
    vol_window = 1
    data['Volatility'] = data['Close'].pct_change().rolling(window=vol_window).std() * np.sqrt(252)
    return data.dropna()

# Black-Scholes option pricing model
def black_scholes(S, K, T, r, sigma, option_type='call'):
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    if option_type == 'call':
        return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)
    elif option_type == 'put':
        return K * np.exp(-r * T) * norm.cdf(-d2) - S * norm.cdf(-d1)

# Add option pricing features
def add_option_pricing_features(data):
    strike_price = data['Close'] * 1.05
    time_to_maturity = 30 / 252
    risk_free_rate = 0.01
    option_type = 'call'
    data['Option_Price'] = black_scholes(data['Close'], strike_price, time_to_maturity, risk_free_rate, data['Volatility'], option_type)
    data['Implied_Volatility'] = data['Volatility']
    return data

# Normalize and preprocess data
def preprocess_data(data):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    return data_scaled, scaler

# Create sequences for time-series prediction
def create_sequences(data, seq_len, pred_len):
    sequences = []
    for i in range(len(data) - seq_len - pred_len + 1):
        sequences.append(data[i:i + seq_len + pred_len])
    return np.array(sequences)

# Create a Hankel matrix
def create_hankel_matrix(time_series, window_size):
    return np.array([time_series[i:i + window_size] for i in range(len(time_series) - window_size + 1)])

# Perform SVD on the Hankel matrix
def hankel_svd(hankel_matrix):
    U, Sigma, Vt = svd(hankel_matrix, full_matrices=False)
    return U, Sigma, Vt

# Build LSTM model
def build_model(input_shape, output_len):
    model = Sequential([
        LSTM(180, activation='relu', return_sequences=False),
        Dense(180), LeakyReLU(),
        Dense(360), LeakyReLU(),
        Dense(360), LeakyReLU(),
        Dense(output_len)
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

# Fetch news and perform sentiment analysis
def fetch_news_and_sentiment(ticker, start_date, end_date):
    headlines, dates = [], []
    analyzer = SentimentIntensityAnalyzer()
    curr_page = 1

    while len(headlines) < 1000:
        api_url = f"https://newsapi.org/v2/everything?q={ticker}&from={start_date}&to={end_date}&apiKey=7e9676fa277d423f8deff9a4894bd717&page={curr_page}&language=en"
        response = requests.get(api_url)
        news_data = response.json()
        print(news_data)
        if "articles" in news_data:
            for article in news_data['articles']:
                headlines.append(article['title'])
                dates.append(article['publishedAt'][:10])
        curr_page += 1
        if len(news_data['articles']) == 0: break

    sentiment_data = []
    for headline, date in zip(headlines, dates):
        score = analyzer.polarity_scores(headline)
        sentiment_data.append({'date': date, 'sentiment': score['compound']})
    return sentiment_data

# Predict and plot results

def predict_and_plot(m, ticker, data, s, model, seq_len, pred_len, scaler, cut_off):
    plt.figure(figsize=(14, 7))

    rmse_values = []  # To store RMSE values for each section
    section_start_idx = []  # To store the start indices of each 30-day section for placement of RMSE labels
    #s.shape[0]
    pred_count = 0
    red_x,red_y, black_y, blue_x, blue_y, green_y = [],[],[],[],[],[]

    for idx in range(0,s.shape[0], pred_len):  # Loop through each 30-day block
        print("IDX",idx)
    # for idx in range(1): # only first 30 after being trained
        # Input remains in 3D for predictio
        future_input = s[idx, :seq_len, :].reshape(1, seq_len, -1)  # Keep the 3D shape
        future = model.predict(future_input)[0]  # Predict future "Close" prices
        # Pad the prediction with zeros for the "Volume" and "SMA" columns
        future_padded = np.hstack((future.reshape(-1, 1), np.zeros((future.shape[0], data.shape[1] - 1))))  # Adjust for PCA features

        # Inverse transform the padded prediction
        future_transform = scaler.inverse_transform(future_padded)[:, 0]  # Extract "Close"

        # Extract and inverse transform the true "Close" values
        future_true = s[idx, seq_len:, 0].reshape(-1, 1)  # Use only "Close"
        future_true_padded = np.hstack((future_true, np.zeros((future_true.shape[0], data.shape[1] - 1))))  # Adjust for PCA features
        future_true_transform = scaler.inverse_transform(future_true_padded)[:, 0]  # Extract "Close"

        # Calculate RMSE for this 30-day section
        rmse = np.sqrt(mean_squared_error(future_true_transform, future_transform))
        rmse_values.append(rmse)
        section_start_idx.append(idx)  # Store the start index for the RMSE label
        if idx< cut_off:
            red_x +=[idx]
            red_y+=[future_transform]
            black_y+=[future_true_transform]
        else:
            blue_x +=[idx]
            blue_y+=[future_transform]
            green_y+=[future_true_transform]

        # # Plot the predictions
        # plt.scatter(idx, future_transform, 
        # # label="Train Prediction" if idx < cut_off else "Test Prediction", 
        #     color="red" if idx < cut_off else "blue")


        # # plt.plot(np.arange(idx, idx + pred_len), future_transform, 
        # #         # label="Train Prediction" if idx < cut_off else "Test Prediction", 
        # #          color="red" if idx < cut_off else "blue")

        # # Plot the true values
        # plt.scatter(idx, future_true_transform, 
        #          #label="Train True" if idx < cut_off else "Test True", 
        #          color="black" if idx < cut_off else "green")

        
        if idx >= cut_off:
            pred_count+=1
    plt.plot(red_x, red_y, color = "red")
    plt.plot(red_x, black_y, color = "black")
    plt.plot(blue_x, blue_y, color = "blue")
    plt.plot(blue_x, green_y, color = "green")
    # Add legend and grid
    # plt.legend(["Train Prediction", "Train True", "Test Prediction", "Test True"])
    plt.plot([], [], color="red", label="Train Prediction")  # Dummy plot for legend
    plt.plot([], [], color="black", label="Train True")      # Dummy plot for legend
    plt.plot([], [], color="blue", label="Test Prediction")  # Dummy plot for legend
    plt.plot([], [], color="green", label="Test True")      # Dummy plot for legend
    plt.legend()
    # for idx in range(0, s.shape[0], pred_len): 
    # Add vertical line to separate each 30-day section
        # plt.axvline(x=idx + pred_len, color='gray', linestyle='--', linewidth=1)
    # Title and labels
    
    plt.title(f"{seq_len}-{pred_len} Predictions for {ticker}, model {m}")
    plt.xlabel("Trading Days")
    plt.ylabel("Price")

    # Add RMSE labels below the x-axis at the start of each 30-day section
    # for i, rmse in zip(section_start_idx, rmse_values):
    #     plt.text(i + pred_len / 2, min(plt.ylim()), f"{rmse:.0f}", color="teal", fontsize=10, ha='center', va='top')
    # plt.text(- 5- pred_len / 2, min(plt.ylim()), f"RMSE", color="teal", fontsize=10, ha='center', va='top')
    # print(rmse_values)
    # plt.grid(True)

    if m == "LSTM w/ 2 features (close price, sentiment)":
        dir = f"{seq_len}-{pred_len}/sentLSTM_2"
        if not os.path.exists(dir):
                os.makedirs(dir)
        plt.savefig(f"{dir}/{ticker}.png")
    return np.mean(rmse_values[-pred_count]) # testing rmse


# Main workflow
# def main(tickers):
#     m = "sentLSTM w/ 1 feature"
#     test_rmse = []
#     for ticker in tickers:
#         tsla = fetch_stock_data(ticker, start_date="2024-11-03", end_date="2024-12-01")
#         tsla = compute_technical_indicators(tsla)
#         tsla = add_option_pricing_features(tsla)

#         sentiment_data = fetch_news_and_sentiment(ticker, "2024-11-03", "2024-12-01")
#         sentiment_df = pd.DataFrame(sentiment_data)
#         sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])
#         tsla.reset_index(inplace=True)  # Make the index a regular column
#         tsla['Date'] = pd.to_datetime(tsla['Date'])  # Ensure Date is a proper datetime type
#         sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])  # Ensure date compatibility

#         # Merge on the Date and date columns
#         tsla = pd.merge(tsla, sentiment_df, left_on='Date', right_on='date', how='left')
#         tsla['sentiment'] = tsla['sentiment'].fillna(0)  # Fill missing sentiment with 0
#         #'Volume', 'SMA30', 'SMA100', 'Volatility', 

#         features = ['Close', 'sentiment']
#         data = tsla[features].copy()
#         processed_data, scaler = preprocess_data(data)
        
def main(tickers):
    m = "LSTM w/ 2 features (close price, sentiment)"
    test_rmse = []
    for ticker in tickers:
        tsla = fetch_stock_data(ticker, start_date="2024-11-03", end_date="2024-12-01")
        if tsla.empty:
            print(f"No data fetched for {ticker}")
            continue

        tsla = compute_technical_indicators(tsla)
        if tsla.empty:
            print(f"Technical indicators computation resulted in empty data for {ticker}")
            continue

        tsla = add_option_pricing_features(tsla)
        if tsla.empty:
            print(f"Option pricing features resulted in empty data for {ticker}")
            continue

        sentiment_data = fetch_news_and_sentiment(ticker, "2024-11-03", "2024-12-01")
        sentiment_df = pd.DataFrame(sentiment_data)
        # Group by date and average sentiment
        sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])  # Ensure 'date' is in datetime format
        sentiment_df = sentiment_df.groupby('date', as_index=False)['sentiment'].mean()

        print("SLA,", sentiment_df)
        sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])
        tsla.reset_index(inplace=True)
        tsla['Date'] = pd.to_datetime(tsla['Date'])
        sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

        tsla = pd.merge(tsla, sentiment_df, left_on='Date', right_on='date', how='left')
        tsla['sentiment'] = tsla['sentiment'].fillna(0)
        print(tsla.shape)
        features = ['Close', 'sentiment']
        # features = ['Close', 'Volume', 'SMA30', 'SMA100', 'Volatility', 'Option_Price']
        data = tsla[features].copy()
        if data.empty:
            print(f"No valid data after filtering for {ticker}")
            continue

        print(f"Data shape before preprocessing: {data.shape}")
        processed_data, scaler = preprocess_data(data)

        window_size = 2
        hankel_matrix = create_hankel_matrix(data['Close'].values, window_size)
        U, Sigma, Vt = hankel_svd(hankel_matrix)
        dmd_modes = U[:, :2]
        processed_data_trimmed = processed_data[-dmd_modes.shape[0]:]
        data_combined = np.hstack((processed_data_trimmed, dmd_modes))

        data_combined = processed_data_trimmed

        seq_len, pred_len = 1, 1
        sequences = create_sequences(data_combined, seq_len, pred_len)

        cut_off = int(0.8 * sequences.shape[0])
        print("cutoff",cut_off)
        print("Data.shape", data_combined.shape)

        X_train = sequences[:cut_off, :seq_len, :]
        X_test = sequences[cut_off:, :seq_len, :]
        y_train = sequences[:cut_off, seq_len:, 0]
        y_test = sequences[cut_off:, seq_len:, 0]

        model = build_model(input_shape=(seq_len, X_train.shape[2]), output_len=pred_len)
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

        test_rmse += [predict_and_plot(m, ticker, data_combined, sequences, model, seq_len, pred_len, scaler, cut_off)]
    print(f"Average Test RMSE: {np.mean(test_rmse)}")



2024-12-03 19:33:51.908369: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-03 19:33:59.631184: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733276041.046626    1190 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733276041.900399    1190 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 19:34:04.910726: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# Example usage
main(["TSLA"])


[*********************100%***********************]  1 of 1 completed

Technical indicators computation resulted in empty data for TSLA
Average Test RMSE: nan



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
