## Imports

In [180]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping
from tqdm import tqdm
import warnings
import matplotlib as plt
import matplotlib
warnings.filterwarnings('ignore')

## Helper functions

In [181]:
def preprocess_data(prices):
    """Min-max scale a price series into the [0, 1] range.

    Args:
        prices: Object exposing ``.values`` (the training loop passes a
            DataFrame column such as ``data['Close']``).

    Returns:
        tuple: ``(scaled_prices, scaler)`` where ``scaled_prices`` is a
        single-column 2-D array and ``scaler`` is the fitted
        ``MinMaxScaler``, which callers keep to invert the scaling later.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    # The scaler expects a 2-D input, so present the prices as one column.
    price_column = prices.values.reshape(-1, 1)
    return scaler.fit_transform(price_column), scaler

def create_sequences(data, look_back):
    """Cut ``data`` into overlapping windows of ``look_back`` consecutive rows.

    One window is produced for every index ``end`` in
    ``look_back .. len(data) - 1``; it holds rows ``end - look_back`` up to
    (but not including) ``end``, so the row at ``end`` itself is left out
    and can serve as that window's target.

    Args:
        data: Sliceable sequence/array of observations.
        look_back: Number of rows per window.

    Returns:
        np.ndarray: The stacked windows. When ``len(data) <= look_back`` no
        window fits and the result is an empty array.
    """
    window_ends = range(look_back, len(data))
    return np.array([data[end - look_back:end] for end in window_ends])

def build_lstm_model(input_shape):
    """Assemble and compile the single-layer LSTM regressor.

    Architecture: one 50-unit LSTM that emits only its final state
    (``return_sequences=False``) followed by a 1-unit Dense output.

    Args:
        input_shape: Shape of one input sample; the training loop passes
            ``(X_train.shape[1], X_train.shape[2])``.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer and
        mean-squared-error loss.
    """
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=False, input_shape=input_shape))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def generate_future_signals(df_predictions, threshold):
    """Turn a series of predicted prices into buy / sell / hold signals.

    For every row the relative change to the *next* row's prediction is
    computed as ``(next - current) / current``. The signal for the row is:

    * ``'buy'``  when the change is greater than ``threshold``
    * ``'sell'`` when the change is less than ``-threshold``
    * ``'hold'`` otherwise (including when the change is NaN)

    Rows without a following prediction (the last row) are dropped.

    Args:
        df_predictions: DataFrame with a ``'date'`` and a ``'prediction'``
            column. It is not modified; the function works on a copy.
        threshold: Relative change that must be exceeded to leave ``'hold'``.

    Returns:
        DataFrame with the columns ``['date', 'signal']``. It is empty (but
        still has both columns) when the input has fewer than two rows.
    """
    forecast = df_predictions.copy()
    forecast['next_day_prediction'] = forecast['prediction'].shift(-1)
    forecast['predicted_change'] = forecast['next_day_prediction'] - forecast['prediction']
    forecast['pct_change'] = forecast['predicted_change'] / forecast['prediction']

    # Vectorised labelling. Besides avoiding a Python call per row, this also
    # works when no rows remain: a row-wise DataFrame.apply on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column.
    pct = forecast['pct_change'].to_numpy(dtype=float)
    forecast['signal'] = np.select(
        [pct > threshold, pct < -threshold],
        ['buy', 'sell'],
        default='hold',
    )

    # Drop rows that have no next-day prediction to compare against.
    has_next = forecast['next_day_prediction'].notna()
    return forecast.loc[has_next, ['date', 'signal']]

## Read the List of S&P 500 Stocks

In [182]:
# Directory containing the per-ticker CSV files (one "<TICKER>.csv" each)
csv_directory = 'clean_csvs'

# Derive the tickers from the CSV filenames.
# - os.path.splitext strips only the trailing extension, so the name
#   round-trips with the f"{ticker}.csv" path rebuilt in the training loop.
# - Sort BEFORE truncating: os.listdir returns entries in arbitrary order,
#   so slicing first would select an unpredictable subset of tickers.
tickers = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir(csv_directory)
    if filename.endswith('.csv')
)

# Quick-test cap: only the first three tickers (alphabetically) are processed.
# Remove or raise the slice for a full run.
tickers = tickers[:3]

## Define Parameters

In [183]:
# Training window: only rows dated within [start_date, end_date] are used
start_date, end_date = '2010-01-01', '2023-12-31'
# Number of consecutive past observations in each model input window
look_back = 10
# Relative predicted change that must be exceeded to emit buy/sell (tune as needed)
threshold = 0.01

## Automate the LSTM Model for Each Stock

In [184]:
# Per-ticker signal frames, keyed by ticker symbol
signals_dict = {}

# Loop-invariant setup, computed once instead of once per ticker.
train_start = pd.to_datetime(start_date)
train_end = pd.to_datetime(end_date)
# Business days to forecast. NOTE: bdate_range only skips weekends, so market
# holidays (e.g. 2024-01-01) are still part of the forecast horizon.
future_dates = pd.bdate_range(start='2024-01-01', end='2024-01-31')
n_future = len(future_dates)

for ticker in tqdm(tickers):
    try:
        print(f"\nProcessing ticker: {ticker}")

        # Read data from CSV, indexed and ordered by date
        csv_path = os.path.join(csv_directory, f"{ticker}.csv")
        data = pd.read_csv(csv_path, parse_dates=['Date'])
        data = data.set_index('Date').sort_index()

        # Keep only rows inside the training window (both ends inclusive)
        data = data.loc[(data.index >= train_start) & (data.index <= train_end)]

        # At least look_back + 1 rows are needed to form one (window, target) pair
        if len(data) < look_back + 1:
            print(f"Skipping {ticker}: insufficient data after filtering")
            continue

        # Scale closing prices to [0, 1].
        # NOTE(review): the scaler is fitted on the whole filtered series, i.e.
        # including the rows later used for validation, so val_loss is
        # slightly optimistic.
        scaled_prices, scaler = preprocess_data(data['Close'])

        # Build (window, next value) pairs
        X_all = create_sequences(scaled_prices, look_back)
        y_all = scaled_prices[look_back:]

        # Chronological 80/20 split into training and validation sets
        val_split = int(0.8 * len(X_all))
        if val_split == 0:
            # Too few sequences: the training part of the split would be empty.
            print(f"Skipping {ticker}: not enough sequences for a train/validation split")
            continue
        X_train, X_val = X_all[:val_split], X_all[val_split:]
        y_train, y_val = y_all[:val_split], y_all[val_split:]

        # Build and train model
        print(f"Training model for {ticker}")
        model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
        # restore_best_weights: forecast with the epoch that had the lowest
        # val_loss rather than with whatever epoch training stopped on.
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
        )
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=20,
            batch_size=16,
            verbose=1,
            callbacks=[early_stopping]
        )

        # Recursive multi-step forecast: start from the last look_back scaled
        # prices and feed every prediction back in as the newest observation.
        current_sequence = scaled_prices[-look_back:].reshape(1, look_back, 1)
        future_predictions = []

        for _ in range(n_future):
            # verbose=0: otherwise every single-step predict prints a progress bar
            next_pred_scaled = model.predict(current_sequence, verbose=0)
            future_predictions.append(next_pred_scaled[0, 0])
            # Slide the window: drop the oldest step, append the new prediction
            current_sequence = np.concatenate(
                (current_sequence[:, 1:, :], next_pred_scaled.reshape(1, 1, 1)),
                axis=1,
            )

        # Convert future predictions back to original prices
        future_predictions_unscaled = scaler.inverse_transform(
            np.array(future_predictions).reshape(-1, 1)
        )

        # Create a DataFrame with future dates and predictions.
        # ravel() always yields a 1-D array (squeeze() would collapse a
        # one-day horizon to a scalar).
        future_df = pd.DataFrame({
            'date': future_dates,
            'prediction': future_predictions_unscaled.ravel()
        })

        # Generate signals as a date-indexed frame with one column named after the ticker
        signals = (
            generate_future_signals(future_df, threshold)
            .set_index('date')
            .rename(columns={'signal': ticker})
        )

        # Store signals
        if not signals.empty:
            signals_dict[ticker] = signals[[ticker]]
            print(f"Added signals for {ticker}")
        else:
            print(f"No signals generated for {ticker}")

    except Exception as e:
        # Best effort: report the failure and move on to the next ticker
        print(f"Error processing {ticker}: {e}")
        continue


  0%|          | 0/3 [00:00<?, ?it/s]


Processing ticker: A
Training model for A
Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0124 - val_loss: 0.0017
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3077e-04 - val_loss: 8.1853e-04
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.2982e-04 - val_loss: 7.8236e-04
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.2399e-04 - val_loss: 7.9466e-04
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.1558e-04 - val_loss: 0.0010
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.2762e-04 - val_loss: 7.4750e-04
Epoch 7/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.2747e-04 - val_loss: 7.8148e-04
Epoch 8/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

 33%|███▎      | 1/3 [00:17<00:34, 17.08s/it]

Added signals for A

Processing ticker: AAPL
Training model for AAPL
Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0118 - val_loss: 8.8629e-04
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.1525e-04 - val_loss: 9.3899e-04
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.1177e-04 - val_loss: 0.0011
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9.6369e-05 - val_loss: 8.3022e-04
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9.3932e-05 - val_loss: 8.5902e-04
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9.4193e-05 - val_loss: 0.0018
Epoch 7/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9.2278e-05 - val_loss: 0.0010
Epoch 8/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━

 67%|██████▋   | 2/3 [00:33<00:16, 16.57s/it]

Added signals for AAPL

Processing ticker: ABBV
Training model for ABBV
Epoch 1/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0099 - val_loss: 7.8228e-04
Epoch 2/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.1214e-04 - val_loss: 8.2372e-04
Epoch 3/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.4495e-04 - val_loss: 6.6967e-04
Epoch 4/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.3879e-04 - val_loss: 7.7062e-04
Epoch 5/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.2585e-04 - val_loss: 5.7226e-04
Epoch 6/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.8664e-04 - val_loss: 5.3881e-04
Epoch 7/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.0231e-04 - val_loss: 5.0810e-04
Epoch 8/20
[1m138/138[0m [32m━━━━

100%|██████████| 3/3 [00:44<00:00, 14.75s/it]

Added signals for ABBV





## Aggregate the Signals into a DataFrame

In [185]:
# Combine the per-ticker signal columns into one date-indexed table
if signals_dict:
    signal_df = pd.concat(signals_dict.values(), axis=1).sort_index()
else:
    print("No signals were generated.")
    # Still define signal_df so the following cells neither raise NameError
    # nor silently reuse a stale value left in the kernel. The empty
    # DatetimeIndex keeps the date-range comparison further down valid.
    signal_df = pd.DataFrame(index=pd.DatetimeIndex([], name='date'))


## Filter the DataFrame for the Desired Date Range

In [186]:
# Signal window: keep only rows dated inside [start_signal_date, end_signal_date]
start_signal_date = '2024-01-01'
end_signal_date = '2024-01-31'

# Build the inclusive date mask once, then apply it
window_start = pd.to_datetime(start_signal_date)
window_end = pd.to_datetime(end_signal_date)
in_window = (signal_df.index >= window_start) & (signal_df.index <= window_end)
signal_df = signal_df.loc[in_window]


In [187]:
# Inspect the raw price forecast. future_df is reassigned on every iteration of
# the training loop, so this shows only the last ticker that reached the
# forecasting step, not all tickers.
print(future_df)


         date  prediction
0  2024-01-01  152.685852
1  2024-01-02  152.077988
2  2024-01-03  151.293091
3  2024-01-04  150.390640
4  2024-01-05  149.384644
5  2024-01-08  148.372131
6  2024-01-09  147.409561
7  2024-01-10  146.460403
8  2024-01-11  145.548294
9  2024-01-12  144.673798
10 2024-01-15  143.849686
11 2024-01-16  143.029022
12 2024-01-17  142.247223
13 2024-01-18  141.501221
14 2024-01-19  140.787933
15 2024-01-22  140.104095
16 2024-01-23  139.448090
17 2024-01-24  138.819183
18 2024-01-25  138.215759
19 2024-01-26  137.636612
20 2024-01-29  137.080475
21 2024-01-30  136.546494
22 2024-01-31  136.032822


In [188]:
# Display the filtered signal table (rows are dates, one column per ticker)
print(signal_df)
# Save the signal DataFrame to a CSV file in the current working directory;
# the date index is written as the first column
signal_df.to_csv('signals_test_output.csv')

print("Signals saved to 'signals_test_output.csv'")


               A  AAPL  ABBV
date                        
2024-01-01  hold  hold  hold
2024-01-02  hold  sell  hold
2024-01-03  hold  sell  hold
2024-01-04  hold  sell  hold
2024-01-05  hold  sell  hold
2024-01-08  hold  sell  hold
2024-01-09  hold  sell  hold
2024-01-10  hold  sell  hold
2024-01-11  hold  sell  hold
2024-01-12  hold  sell  hold
2024-01-15  hold  sell  hold
2024-01-16  hold  sell  hold
2024-01-17  hold  sell  hold
2024-01-18  hold  sell  hold
2024-01-19  hold  sell  hold
2024-01-22  hold  sell  hold
2024-01-23  hold  sell  hold
2024-01-24  hold  hold  hold
2024-01-25  hold  hold  hold
2024-01-26  hold  hold  hold
2024-01-29  hold  hold  hold
2024-01-30  hold  hold  hold
Signals saved to 'signals_test_output.csv'
