## Imports

In [30]:
import os
import warnings

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.layers import Dense, LSTM
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
warnings.filterwarnings('ignore')


## Helper functions

In [31]:
def preprocess_data(prices):
    """Scale a price series into the [0, 1] range.

    Parameters
    ----------
    prices : object exposing ``.values`` (e.g. a pandas Series)
        The raw prices; they are reshaped into a single column.

    Returns
    -------
    tuple
        ``(scaled_prices, scaler)``: the scaled values as an ``(n, 1)`` array
        and the fitted ``MinMaxScaler``, which callers can reuse to transform
        other data or to invert predictions.
    """
    # MinMaxScaler expects 2-D input, so present the series as one column.
    price_column = prices.values.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(price_column)
    return scaler.transform(price_column), scaler

def create_sequences(data, dates, look_back):
    """Cut a series into fixed-length input windows with next-step targets.

    For every position ``i`` from ``look_back`` to ``len(data) - 1`` the
    window ``data[i - look_back:i]`` becomes one input sample, ``data[i]``
    its target and ``dates[i]`` the date attached to that target.

    Returns
    -------
    tuple
        ``(X, y, y_dates)`` where ``X`` and ``y`` are NumPy arrays and
        ``y_dates`` is a plain list. All three are empty when ``data`` has
        no more than ``look_back`` elements.
    """
    target_positions = range(look_back, len(data))
    windows = [data[pos - look_back:pos] for pos in target_positions]
    targets = [data[pos] for pos in target_positions]
    target_dates = [dates[pos] for pos in target_positions]
    return np.array(windows), np.array(targets), target_dates

def build_lstm_model(input_shape):
    """Build and compile the single-layer LSTM regressor used per ticker.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the LSTM layer as
        ``input_shape``.

    Returns
    -------
    A compiled ``Sequential`` model: one 50-unit LSTM layer that emits only
    its final state, followed by a one-unit dense output, trained with Adam
    on mean squared error.
    """
    network = Sequential()
    network.add(LSTM(units=50, return_sequences=False, input_shape=input_shape))
    network.add(Dense(units=1))
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

def generate_signals(predictions, dates, threshold):
    """Convert a series of predicted prices into buy/sell/hold signals.

    Predictions are ordered by date and each one is compared with the
    prediction for the following date. The relative change
    ``(next - current) / current`` decides the signal for the current date:
    'buy' when it is above ``threshold``, 'sell' when it is below
    ``-threshold``, otherwise 'hold'. The last date has no successor and is
    dropped.

    Parameters
    ----------
    predictions : array-like
        Predicted prices, one per date; any shape that flattens to one value
        per date (e.g. ``(n, 1)`` or ``(n,)``).
    dates : sequence
        Dates matching ``predictions``; anything ``pd.to_datetime`` accepts.
    threshold : float
        Relative change that must be exceeded to leave 'hold'.

    Returns
    -------
    pandas.DataFrame
        Columns ``['date', 'signal']``. Empty (but with both columns) when
        fewer than two predictions are supplied.
    """
    df_predictions = pd.DataFrame({
        'date': dates,
        # reshape(-1) keeps a 1-D array even for a single prediction,
        # where squeeze() would collapse it to a 0-d scalar.
        'prediction': np.asarray(predictions).reshape(-1)
    })
    df_predictions['date'] = pd.to_datetime(df_predictions['date'])
    df_predictions = df_predictions.sort_values('date').reset_index(drop=True)

    next_day = df_predictions['prediction'].shift(-1)
    df_predictions['next_day_prediction'] = next_day
    df_predictions['predicted_change'] = next_day - df_predictions['prediction']
    df_predictions['pct_change'] = df_predictions['predicted_change'] / df_predictions['prediction']
    # copy() so the column assignment below never targets a view.
    df_predictions = df_predictions.dropna(subset=['next_day_prediction']).copy()

    # Vectorised classification. Unlike a row-wise apply, this also works on
    # an empty frame. NaN changes compare False on both sides and fall
    # through to 'hold'.
    pct = df_predictions['pct_change']
    df_predictions['signal'] = np.select(
        [pct > threshold, pct < -threshold],
        ['buy', 'sell'],
        default='hold',
    )
    return df_predictions[['date', 'signal']]


## Read the List of S&P 500 Stocks

In [32]:
# Directory containing one CSV file per ticker
csv_directory = 'all_sp500_csvs_2010-24'

# Ticker symbols are the CSV filenames without their extension, sorted
# alphabetically. splitext removes only the trailing extension, whereas
# str.replace('.csv', '') would strip every '.csv' substring and could yield
# a ticker that no longer matches its file when the path is rebuilt later.
tickers = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir(csv_directory)
    if filename.endswith('.csv')
)


## Define Parameters

In [33]:
# Run parameters shared by the cells below
# First date (inclusive) of the price history kept for each ticker
start_date = '2010-01-01'
# Train/test boundary: sequences whose target date is on or after this date form the test set
split_date = '2022-12-31'  # Adjust as needed
# Last date (inclusive) of the price history kept for each ticker
end_date = '2023-10-20'     # Adjust to the latest available date
# Number of consecutive past observations in each LSTM input window
look_back = 10
# Relative change between consecutive predictions that must be exceeded to emit 'buy' or 'sell' (0.01 = 1%)
threshold = 0.01  # Adjust the threshold as needed


## Automate the LSTM Model for Each Stock

In [None]:
# Train one LSTM per ticker and collect its buy/sell/hold signals.
# signals_dict maps ticker -> single-column DataFrame (column named after the
# ticker) indexed by date.
signals_dict = {}

# The date boundaries are the same for every ticker, so parse them once.
start_ts = pd.to_datetime(start_date)
split_ts = pd.to_datetime(split_date)
end_ts = pd.to_datetime(end_date)

for ticker in tqdm(tickers):
    try:
        print(f"\nProcessing ticker: {ticker}")
        # Read data from CSV, skipping the first three rows
        csv_path = os.path.join(csv_directory, f"{ticker}.csv")
        data = pd.read_csv(csv_path, skiprows=3, header=None)
        # NOTE(review): these labels are assigned purely by position; confirm
        # against the skipped header rows that 'Close' really is the close.
        data.columns = ['Date', 'Price', 'Adj Close', 'Close', 'High', 'Low', 'Volume']
        data['Date'] = pd.to_datetime(data['Date'])
        data = data.set_index('Date').sort_index()

        # Convert the index to timezone-naive
        data.index = data.index.tz_localize(None)

        # Filter data based on start and end dates (both inclusive)
        data = data.loc[(data.index >= start_ts) & (data.index <= end_ts)]

        # Check if data is sufficient to build at least one sequence
        if data.empty or len(data) < look_back + 1:
            print(f"Skipping {ticker}: insufficient data after filtering")
            continue  # Skip if insufficient data

        # Print date range for debugging
        print(f"{ticker} data from {data.index.min().date()} to {data.index.max().date()}")

        close = data['Close']
        train_close = close.loc[close.index < split_ts]

        # At most look_back rows before split_date means no training sequence
        # can be formed; skip explicitly rather than let model.fit fail.
        if len(train_close) <= look_back:
            print(f"Skipping {ticker}: insufficient training data before split_date")
            continue

        # Fit the scaler on the training period only, then apply it to the
        # full history. Fitting on all prices would leak the test period's
        # min/max into training (look-ahead bias).
        _, scaler = preprocess_data(train_close)
        scaled_prices = scaler.transform(close.values.reshape(-1, 1))
        dates_all = data.index

        # Create sequences
        X, y, y_dates = create_sequences(scaled_prices, dates_all, look_back)

        # Check if sequences are created
        if len(X) == 0:
            print(f"Skipping {ticker}: no sequences created")
            continue

        # Split into train and test based on the target dates
        split_indices = np.where(pd.to_datetime(y_dates) >= split_ts)[0]
        if len(split_indices) == 0:
            print(f"Skipping {ticker}: no data after split_date")
            continue
        split_index = split_indices[0]
        X_train, X_test = X[:split_index], X[split_index:]
        y_train, y_test = y[:split_index], y[split_index:]
        y_dates_train, y_dates_test = y_dates[:split_index], y_dates[split_index:]

        if len(X_test) == 0:
            print(f"Skipping {ticker}: no test data after split_date")
            continue  # Skip if no test data

        # Build and train model. Clear Keras' global state first: building
        # hundreds of models in one process otherwise keeps accumulating
        # graph/layer state and memory.
        print(f"Training model for {ticker}")
        K.clear_session()
        model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
        early_stopping = EarlyStopping(monitor='loss', patience=5)
        model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1, callbacks=[early_stopping])

        # Predict and map the outputs back to price units
        predictions = model.predict(X_test)
        predictions_unscaled = scaler.inverse_transform(predictions)

        # Generate signals; chained calls return new frames instead of
        # mutating the slice returned by generate_signals in place.
        signals = (
            generate_signals(predictions_unscaled, y_dates_test, threshold)
            .set_index('date')
            .rename(columns={'signal': ticker})
        )

        # Store signals
        signals_dict[ticker] = signals[[ticker]]
        print(f"Added signals for {ticker}")

    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next
        # ticker instead of aborting the whole loop.
        print(f"Error processing {ticker}: {e}")
        continue


  0%|          | 0/503 [00:00<?, ?it/s]


Processing ticker: A
A data from 2010-01-04 to 2023-10-20
Training model for A
Epoch 1/20


## Aggregate the Signals into a DataFrame

In [27]:
# Place the per-ticker signal frames side by side (one column per ticker),
# then order the rows by date
per_ticker_frames = list(signals_dict.values())
signal_df = pd.concat(per_ticker_frames, axis=1).sort_index()


## Filter the DataFrame for the Desired Date Range

In [28]:
# Reporting window for the signals (both bounds inclusive)
start_signal_date = '2023-01-01'
end_signal_date = '2023-01-31'

# Keep only the rows whose date falls inside the window
window_start = pd.to_datetime(start_signal_date)
window_end = pd.to_datetime(end_signal_date)
in_window = (signal_df.index >= window_start) & (signal_df.index <= window_end)
signal_df = signal_df.loc[in_window]


In [29]:
# Print the date-filtered signal table as plain text
print(signal_df)
# Write the same table to a CSV file; the relative path resolves against the
# current working directory
signal_df.to_csv('signals_test_output.csv')

# Confirm where the file was written
print("Signals saved to 'signals_test_output.csv'")


               A  AAPL  ABBV  ABNB   ABT  ACGL   ACN  ADBE   ADI   ADM
date                                                                  
2023-01-03  hold  hold  hold  hold  hold  hold  hold  hold  hold  hold
2023-01-04  hold  hold  hold  hold  hold  hold  hold  hold  hold  sell
2023-01-05  hold  hold  hold  hold  hold  hold  hold  hold  hold  sell
2023-01-06  hold  hold  hold  hold  hold  hold  hold  hold  hold  sell
2023-01-09  hold  hold  hold  hold  hold  hold  hold  hold  hold  sell
2023-01-10  hold  hold  hold  hold  hold  hold  hold  hold  hold  hold
2023-01-11  hold  hold  hold  hold  hold  hold  hold  hold  hold  hold
2023-01-12  hold  hold  sell   buy  hold  hold   buy  hold  hold  hold
2023-01-13  hold  hold  sell   buy  hold  hold  hold  hold  hold   buy
2023-01-17  hold  hold  sell   buy  hold  hold  hold  hold  hold   buy
2023-01-18  hold  hold  sell   buy  hold  hold  hold  hold  hold  hold
2023-01-19  hold  hold  sell   buy  hold  hold  hold  hold  hold  hold
2023-0