## Imports

In [16]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import os

# Suppress warnings for cleaner output
import warnings

warnings.filterwarnings('ignore')


## Helper Functions

These functions handle data preprocessing, signal generation, and model construction.


In [17]:
# ## Helper Functions
def create_sequences(data, dates, look_back):
    """
    Build sliding-window samples from a time series for LSTM input.

    For every position i from `look_back` up to the end of `data`, one sample
    is produced: the `look_back` observations immediately before i as input,
    the observation at i as target, and `dates[i]` as the target's date.

    Parameters:
    - data (numpy array): Series of observations (e.g. scaled "Close" prices).
    - dates (pandas DatetimeIndex): Dates aligned position-by-position with `data`.
    - look_back (int): Number of previous time steps in each input window.

    Returns:
    - X (numpy array): Stacked input windows, one per target.
    - y (numpy array): Target values, `data[i]` for each window.
    - y_dates (list): `dates[i]` for each target value.
    """
    target_positions = range(look_back, len(data))

    # Window for target position `pos` covers [pos - look_back, pos)
    windows = [data[pos - look_back:pos] for pos in target_positions]
    targets = [data[pos] for pos in target_positions]
    target_dates = [dates[pos] for pos in target_positions]

    return np.array(windows), np.array(targets), target_dates


def generate_signal(df, threshold=0.01):
    """
    Generates trading signals ('buy', 'sell', 'hold') based on predicted price changes.

    The signal for a row compares that row's prediction with the next row's
    prediction: a relative rise above `threshold` is 'buy', a relative fall
    below `-threshold` is 'sell', anything else is 'hold'. Rows without a
    following row (the last one) are dropped because no change can be computed.

    The input frame is not modified; all helper columns live on an internal copy.

    Parameters:
    - df (pandas DataFrame): DataFrame containing 'date', 'prediction', and 'actual'
      columns, ordered chronologically.
    - threshold (float): Relative change expressed as a fraction (0.01 means 1%).

    Returns:
    - df_signals (pandas DataFrame): Independent DataFrame with 'date', 'signal',
      'prediction', and 'actual' columns. The original row labels are kept.
    """
    work = df.copy()
    work['next_day_prediction'] = work['prediction'].shift(-1)
    work['next_day_actual'] = work['actual'].shift(-1)

    pct_change = (work['next_day_prediction'] - work['prediction']) / work['prediction']

    # Comparisons against NaN are False, so undefined changes fall through to
    # 'hold' (those rows are removed by the dropna below anyway).
    work['signal'] = np.select(
        [(pct_change > threshold).to_numpy(), (pct_change < -threshold).to_numpy()],
        ['buy', 'sell'],
        default='hold',
    )

    work = work.dropna(subset=['next_day_prediction', 'next_day_actual'])

    # Return an explicit copy so callers can add columns without touching a view
    return work[['date', 'signal', 'prediction', 'actual']].copy()

def build_lstm_model(input_shape):
    """
    Create a compiled single-layer LSTM regressor.

    The network is one LSTM layer with 50 units that emits only its final
    output, followed by a Dense layer with a single output unit. It is
    compiled with the Adam optimizer and mean-squared-error loss.

    Parameters:
    - input_shape (tuple): Shape of one input sample (look_back, features).

    Returns:
    - model (keras Sequential): Compiled LSTM model.
    """
    recurrent_layer = LSTM(units=50, return_sequences=False, input_shape=input_shape)
    output_layer = Dense(units=1)

    model = Sequential([recurrent_layer, output_layer])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

## Define Parameters and Prepare Directories

Specify the list of stock tickers, threshold for signal generation, date ranges for training and testing, and create necessary directories for saving models and signals.


In [18]:
# Derive the tickers to process from the CSV files present in 'clean_csvs'.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# runs reproducible (a later cell takes a positional slice of it).
# splitext removes only the trailing '.csv' extension.
tickers = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir('clean_csvs')
    if filename.endswith('.csv')
)

# Define a uniform threshold for all stocks
uniform_threshold = 0.01  # 1% threshold (expressed as a fraction)

# Define date ranges: training covers start_date..split_date, testing covers
# split_date..end_date.
# NOTE(review): yfinance documents `end` as exclusive, so 2024-01-31 itself is
# presumably not part of the test window - confirm this is intended.
start_date = "2010-01-01"
split_date = "2023-12-31"
end_date = "2024-01-31"

# Define look-back period
look_back = 10  # Number of previous days to consider

# Create directories for saving models and signals if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('signals', exist_ok=True)

Define Parameters

## Automate the LSTM Model for Each Stock

For each ticker:
1. Download historical data.
2. Preprocess the data.
3. Train the LSTM model on training data.
4. Make predictions on test data.
5. Generate trading signals based on predictions.
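6. Save each ticker's signals to `signals/<ticker>_signals.csv` and collect them so they can be compiled into a unified dataframe in the next section.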


In [None]:
# Initialize an empty list to collect signals from all stocks
all_signals = []

# Fraction of the chronologically last training sequences held out for
# validation / early stopping. The test window must not be used for this:
# with restore_best_weights=True that would select the model on test data.
validation_fraction = 0.1

for ticker in tickers:
    print(f"\nProcessing {ticker}...")

    try:
        # Download historical data
        data_train = yf.download(ticker, start=start_date, end=split_date)
        data_test = yf.download(ticker, start=split_date, end=end_date)

        # Check if sufficient data is available
        if len(data_train) < look_back + 1 or len(data_test) < look_back + 1:
            print(f"Insufficient data for {ticker}. Skipping.")
            continue

        # Extract 'Close' prices and reshape to a single-feature column
        prices_train = data_train['Close'].values.reshape(-1, 1)
        prices_test = data_test['Close'].values.reshape(-1, 1)

        # Normalize the data using MinMaxScaler (fit on training data only)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_prices_train = scaler.fit_transform(prices_train)
        scaled_prices_test = scaler.transform(prices_test)

        # Create sequences for training
        dates_train = data_train.index
        X_train, y_train, y_train_dates = create_sequences(scaled_prices_train, dates_train, look_back=look_back)

        # Split off the most recent training sequences as the validation set.
        # The split is positional (no shuffling) to respect time order.
        n_val = max(1, int(len(X_train) * validation_fraction))
        if len(X_train) <= n_val:
            print(f"Insufficient data for {ticker}. Skipping.")
            continue
        X_fit, y_fit = X_train[:-n_val], y_train[:-n_val]
        X_val, y_val = X_train[-n_val:], y_train[-n_val:]

        # Create sequences for testing
        # For testing, use the last 'look_back' days from training + test data
        # so that the very first test day already has a full input window.
        combined_scaled = np.concatenate((scaled_prices_train[-look_back:], scaled_prices_test))
        combined_dates = np.concatenate((dates_train[-look_back:], data_test.index))
        X_test, y_test, y_test_dates = create_sequences(combined_scaled, combined_dates, look_back=look_back)

        # Build the LSTM model
        model = build_lstm_model((X_train.shape[1], X_train.shape[2]))

        # Define Early Stopping callback (monitors the held-out training tail)
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        # Train the model
        model.fit(
            X_fit, y_fit,
            epochs=20,
            batch_size=32,
            validation_data=(X_val, y_val),
            callbacks=[early_stop],
            verbose=1
        )

        # Evaluate the model (test data is only touched from here on)
        train_loss = model.evaluate(X_train, y_train, verbose=0)
        test_loss = model.evaluate(X_test, y_test, verbose=0)
        print(f"{ticker} - Train Loss: {train_loss:.6f}, Test Loss: {test_loss:.6f}")

        # Make predictions on test data
        test_predictions = model.predict(X_test)

        # Denormalize the predictions and actual values
        test_predictions_unscaled = scaler.inverse_transform(test_predictions)
        y_test_unscaled = scaler.inverse_transform(y_test)

        # Create a DataFrame for predictions.
        # ravel() always yields a 1-D array, even for a single test sample.
        df_predictions = pd.DataFrame({
            'date': y_test_dates,
            'prediction': test_predictions_unscaled.ravel(),
            'actual': y_test_unscaled.ravel()
        })
        df_predictions['date'] = pd.to_datetime(df_predictions['date'])
        df_predictions = df_predictions.sort_values('date').reset_index(drop=True)

        # Generate trading signals; assign() returns a new frame, so the ticker
        # column is never written onto a slice of another DataFrame.
        df_signals = generate_signal(df_predictions, threshold=uniform_threshold).assign(ticker=ticker)

        # Save individual ticker signals with predictions and actuals for detailed evaluation
        df_signals.to_csv(f'signals/{ticker}_signals.csv', index=False)
        print(f"Signals for {ticker} saved to 'signals/{ticker}_signals.csv'.")

        # Append to all_signals list
        all_signals.append(df_signals)

    except Exception as e:
        # Best effort per ticker: report the failure and move on to the next one
        print(f"An error occurred while processing {ticker}: {e}")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Processing A...
Epoch 1/20





[1m 16/110[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - loss: 0.1301 

## Aggregate the Signals into a Single CSV

Combine the signals from all stocks into a single dataframe with dates as rows and tickers as columns. The content of each cell will be the generated signal (`buy`, `sell`, `hold`) for that stock on that date.


In [5]:
if not all_signals:
    print("No signals were generated.")
else:
    # Stack the per-ticker frames into one long table
    combined_signals = pd.concat(all_signals, ignore_index=True)

    # Make sure the date column is datetime before comparing against the bounds
    combined_signals['date'] = pd.to_datetime(combined_signals['date'])

    # Keep only rows whose date lies between split_date and end_date (inclusive)
    on_or_after_split = combined_signals['date'] >= split_date
    on_or_before_end = combined_signals['date'] <= end_date
    filtered_signals = combined_signals[on_or_after_split & on_or_before_end]

    # Reshape to wide form: one row per date, one column per ticker
    pivoted_signals = filtered_signals.pivot(index='date', columns='ticker', values='signal')

    # Dates on which a ticker has no signal default to 'hold'
    pivoted_signals = pivoted_signals.fillna('hold')

    # Persist the wide table
    pivoted_signals.to_csv('trading_signals.csv')
    print("\nPivoted trading signals (split to end) saved to 'trading_signals.csv'")

    # Preview the first rows
    print("\nPivoted Trading Signals:")
    print(pivoted_signals.head())

Unnamed: 0,date,ticker,signal
0,2024-01-02,AAPL,hold
1,2024-01-03,AAPL,hold
2,2024-01-04,AAPL,sell
3,2024-01-05,AAPL,sell
4,2024-01-08,AAPL,hold


## Evaluate Signal Accuracy

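Compare the generated trading signals to actual stock price movements to assess their accuracy. For each 'buy' signal, check if the stock price increased the next day. For each 'sell' signal, check if the stock price decreased the next day. Calculate the accuracy metrics accordingly. The cell below also reports the mean absolute error (MAE) and mean squared error (MSE) between predicted and actual prices for January 2024, and plots actual versus predicted prices for each evaluated ticker.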


In [1]:
# Collect one metrics record per evaluated ticker
evaluation_results = []

# Upper bound on how many tickers are evaluated and plotted, to keep the
# notebook output readable
max_eval_tickers = 10

# Evaluate predictions vs actual data for January
for ticker in tickers[:max_eval_tickers]:
    # Load the individual ticker signals with predictions and actuals
    ticker_signals_path = f'signals/{ticker}_signals.csv'
    if not os.path.exists(ticker_signals_path):
        print(f"\nSignals file for {ticker} not found. Skipping evaluation.")
        continue

    # Load the data
    ticker_signals = pd.read_csv(ticker_signals_path, parse_dates=['date'])
    if ticker_signals.empty:
        print(f"\nNo signals found for {ticker}. Skipping evaluation.")
        continue

    # Attach the following row's actual price so each signal can be checked
    # against the next observed move. The last row has no successor and
    # therefore gets NaN.
    ticker_signals = ticker_signals.sort_values('date').reset_index(drop=True)
    ticker_signals['next_day_actual'] = ticker_signals['actual'].shift(-1)

    # Filter for January 2024
    january_data = ticker_signals[(ticker_signals['date'] >= '2024-01-01') & (ticker_signals['date'] <= '2024-01-31')]

    if january_data.empty:
        print(f"No data for January 2024 for {ticker}. Skipping.")
        continue

    # Price-level error metrics
    prediction_error = january_data['actual'] - january_data['prediction']
    mae = prediction_error.abs().mean()
    mse = (prediction_error ** 2).mean()

    # Directional accuracy: a 'buy' is correct when the next actual price is
    # higher, a 'sell' when it is lower. 'hold' signals and rows without a
    # known next-day price are not scored.
    directional = january_data[
        january_data['signal'].isin(['buy', 'sell']) & january_data['next_day_actual'].notna()
    ]
    is_buy = directional['signal'] == 'buy'
    went_up = directional['next_day_actual'] > directional['actual']
    went_down = directional['next_day_actual'] < directional['actual']
    hits = (is_buy & went_up) | (~is_buy & went_down)
    n_directional = len(directional)
    signal_accuracy = float(hits.mean()) if n_directional else float('nan')

    # Append metrics to evaluation results
    evaluation_results.append({
        'ticker': ticker,
        'MAE': mae,
        'MSE': mse,
        'signal_accuracy': signal_accuracy,
        'n_directional_signals': n_directional,
    })

    # Plot actual vs predicted
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(january_data['date'], january_data['actual'], label='Actual Price', marker='o')
    ax.plot(january_data['date'], january_data['prediction'], label='Predicted Price', linestyle='--', marker='x')
    ax.set_title(f'{ticker} Actual vs Predicted Prices (January 2024)')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid()
    plt.show()
    # Release the figure explicitly so figures do not pile up across the loop
    plt.close(fig)

# Create a DataFrame for evaluation results
results_df = pd.DataFrame(evaluation_results)

# Display results
if not results_df.empty:
    print("\nEvaluation Metrics for January 2024:")
    print(results_df)
else:
    print("No evaluation results available for January 2024.")

NameError: name 'tickers' is not defined