In [None]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization

def import_data(stock, timeframe, period="2y", interval="1d"):
    """Download price history for ``stock`` and add indicators plus the target.

    Args:
        stock: Ticker symbol passed straight to ``yf.download``.
        timeframe: Number of rows (bars) to look ahead when computing the
            ``Pct_Change`` target.
        period: History length requested from yfinance (default ``"2y"``).
        interval: Bar size requested from yfinance (default ``"1d"``).

    Returns:
        DataFrame with the downloaded columns plus ``EMA_50``, ``EMA_200``,
        ``SMA_50``, ``SMA_200``, ``RSI`` and ``Pct_Change``. Every row that
        contains a NaN is removed, which includes the final ``timeframe``
        rows because their forward return is not known yet.

    Raises:
        ValueError: If the download returned no rows.
    """
    data = yf.download(stock, period=period, interval=interval)
    if data is None or data.empty:
        # Fail early with a readable message; callers catch and skip the ticker.
        raise ValueError(f"No price data returned for {stock}")

    # Keep only the first column level so columns can be addressed by name.
    data.columns = data.columns.get_level_values(0)

    close = data['Close']
    data['EMA_50'] = ta.ema(close, length=50)
    data['EMA_200'] = ta.ema(close, length=200)
    data['SMA_50'] = ta.sma(close, length=50)
    data['SMA_200'] = ta.sma(close, length=200)
    data['RSI'] = ta.rsi(close, length=14)

    # Forward return: percent change from this row's close to the close
    # `timeframe` rows later (pct_change looks back, shift(-n) realigns it).
    data['Pct_Change'] = close.pct_change(periods=timeframe).shift(-timeframe)

    # Remove any rows with NaN values (unfilled indicators, unknown target).
    data = data.dropna()
    return data

def prepare_lstm_data(data, feature_columns, target_column, time_steps=60, horizon=5):
    """Slice a frame into fixed-length feature windows and aligned targets.

    For every row position ``i`` in ``range(time_steps, len(data) - horizon)``
    one sample is produced: the features of rows ``i - time_steps`` up to (but
    not including) ``i``, paired with ``target_column`` at row ``i``.

    Args:
        data: DataFrame holding the feature and target columns.
        feature_columns: Column names that make up each window.
        target_column: Column name of the value to predict.
        time_steps: Number of consecutive rows per window.
        horizon: Number of trailing rows excluded from sampling. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays. ``X`` has shape
        ``(samples, time_steps, len(feature_columns))`` and ``Y`` has shape
        ``(samples,)``. Both are empty 1-D arrays when no window fits.

    Raises:
        ValueError: If ``horizon`` is negative.
    """
    if horizon < 0:
        raise ValueError("horizon must be non-negative")

    # Convert once instead of re-selecting DataFrame columns on every step.
    features = data[feature_columns].values
    targets = data[target_column].values

    positions = range(time_steps, len(data) - horizon)
    X = [features[i - time_steps:i] for i in positions]
    Y = [targets[i] for i in positions]
    return np.array(X), np.array(Y)

def build_training_data(tickers, timeframe):
    """Collect LSTM training windows and targets across several tickers.

    Each ticker is downloaded with ``import_data``, its feature columns are
    min-max scaled independently of the other tickers, and the result is cut
    into 60-row windows by ``prepare_lstm_data``. Tickers that raise any
    exception are reported on stdout and skipped.

    NOTE: the scaler is fitted on each ticker's whole history, so scaled
    values of early rows depend on later prices.

    Args:
        tickers: Iterable of ticker symbols.
        timeframe: Look-ahead (in rows) forwarded to ``import_data``.

    Returns:
        Tuple ``(X_train, Y_train)`` of numpy arrays with the samples of all
        usable tickers stacked along the first axis. Both are empty arrays
        when no ticker produced a sample.
    """
    feature_columns = ['Close', 'EMA_50', 'EMA_200', 'SMA_50', 'SMA_200', 'RSI']
    time_steps = 60  # Number of time steps for LSTM

    window_batches = []
    target_batches = []

    for ticker in tickers:
        try:
            data = import_data(ticker, timeframe)
            # Fresh scaler per ticker: nothing fitted on one symbol can be
            # reused for another by accident.
            data[feature_columns] = MinMaxScaler().fit_transform(data[feature_columns])
            X, Y = prepare_lstm_data(data, feature_columns, 'Pct_Change', time_steps)
        except Exception as e:
            # Best effort: one bad ticker must not abort the whole run.
            print(f"Skipping {ticker} due to an error: {e}")
            continue

        if len(X) > 0 and len(Y) > 0:
            window_batches.append(X)
            target_batches.append(Y)

    if not window_batches:
        return np.array([]), np.array([])

    # Stack whole per-ticker arrays instead of growing Python lists row by row.
    return np.concatenate(window_batches), np.concatenate(target_batches)

def build_and_train_model(X_train, y_train):
    """Build a two-layer LSTM regressor and fit it on the given samples.

    The network is LSTM(64, returning sequences) -> BatchNormalization ->
    LSTM(32) -> Dense(1), compiled with the Adam optimizer and mean squared
    error loss, then trained for 50 epochs with batch size 16 and
    ``validation_split=0.2``.

    Args:
        X_train: Array indexed as (samples, time steps, features); axes 1 and
            2 define the input shape of the first layer.
        y_train: Regression targets, one per sample.

    Returns:
        The fitted Keras model.
    """
    n_steps = X_train.shape[1]
    n_features = X_train.shape[2]

    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(n_steps, n_features)))
    model.add(BatchNormalization())
    model.add(LSTM(32))
    model.add(Dense(1))  # Single unit: regression output

    model.compile(optimizer='adam', loss='mse')
    model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=16,
        validation_split=0.2,
    )
    return model

def predict_and_decide(tickers, tf, buy):
    """Train on the given tickers, then pick the ones to buy or to sell.

    A model is trained on all tickers, each ticker is downloaded again and
    its most recent 60 scaled rows are fed to the model. Tickers are ranked by
    predicted change, highest first.

    NOTE: ``import_data`` drops the last ``tf`` rows (their forward return is
    unknown), so the window used for prediction ends ``tf`` rows before the
    newest downloaded bar.

    Args:
        tickers: Sequence of ticker symbols.
        tf: Look-ahead in rows, forwarded to the data-loading helpers.
        buy: If truthy, return tickers with a predicted change > 0;
            otherwise return tickers with a predicted change <= 0.

    Returns:
        numpy array of the selected ticker symbols in ranked order, or an
        empty array when no training data could be built.
    """
    tickers = np.array(tickers)

    # Build the training set; without it there is nothing to fit.
    X_train, y_train = build_training_data(tickers, tf)
    if len(X_train) == 0 or len(y_train) == 0:
        print("Training data is empty. Ensure the tickers array is valid and data was processed correctly.")
        return np.array([])

    model = build_and_train_model(X_train, y_train)

    feature_columns = ['Close', 'EMA_50', 'EMA_200', 'SMA_50', 'SMA_200', 'RSI']
    time_steps = 60
    scaler = MinMaxScaler()
    ranked = []

    for ticker in tickers:
        try:
            frame = import_data(ticker, tf)
            # Refit the scaler on this ticker's own history.
            frame[feature_columns] = scaler.fit_transform(frame[feature_columns])

            if len(frame) < time_steps:
                print(f"Not enough data for {ticker} to make a prediction.")
                continue

            # Most recent window, with a leading batch axis of size one.
            window = frame[feature_columns].iloc[-time_steps:].values[np.newaxis, ...]
            ranked.append((ticker, model.predict(window)[0][0]))
        except Exception as e:
            print(f"Skipping {ticker} due to an error: {e}")

    # Highest predicted change first.
    ranked = sorted(ranked, key=lambda pair: pair[1], reverse=True)

    sorted_tickers = np.array([symbol for symbol, _ in ranked])
    y_pred = np.array([change for _, change in ranked])

    # Buy list: strictly positive predictions. Sell list: zero or negative.
    mask = y_pred > 0 if buy else y_pred <= 0
    return sorted_tickers[mask]

In [20]:
def backtest(model, tickers, start_date, feature_columns, time_steps=60, timeframe=5):
    """Simulate always buying the ticker with the highest predicted change.

    For every ticker the history is loaded with ``import_data``, the feature
    columns are min-max scaled per ticker (the same treatment as in
    ``build_training_data``), and the model predicts the forward change for
    each row that has a full ``time_steps`` window behind it. On each date
    from ``start_date`` onwards the ticker with the highest prediction is
    "bought" and the realized forward return stored in ``Pct_Change`` is
    applied to the current capital.

    The realized return comes from ``Pct_Change`` (computed by ``import_data``
    on unscaled closes, ``timeframe`` rows ahead) rather than from the
    ``Close`` column, because ``Close`` may have been overwritten by scaling
    and a fixed calendar-day offset frequently lands on a non-trading day.

    NOTE: a trade is opened on every signal date and its multi-row return is
    compounded immediately, so holding periods overlap; treat the curve as a
    sequence-of-trades approximation, not a portfolio simulation.
    NOTE: the scaler is fitted on the full history of each ticker, which
    leaks later prices into earlier scaled features.

    Args:
        model: Fitted model exposing ``predict(X, verbose=0)``.
        tickers: Iterable of ticker symbols.
        start_date: First date (anything ``pd.Timestamp`` accepts) on which
            trades may be opened.
        feature_columns: Feature column names, in the order the model was
            trained on.
        time_steps: Window length used when the model was trained.
        timeframe: Look-ahead in rows forwarded to ``import_data``.

    Returns:
        List of capital values: the starting capital followed by the capital
        after each simulated trade. Contains only the starting capital when
        no ticker produced a usable signal.
    """
    capital = 10000  # Starting capital
    equity_curve = [capital]
    current_capital = capital

    # Compare on timezone-naive timestamps throughout.
    start = pd.Timestamp(start_date)
    if start.tzinfo is not None:
        start = start.tz_localize(None)

    signals = []
    for ticker in tickers:
        print(f"Backtesting {ticker}...")
        try:
            data = import_data(ticker, timeframe)
            data.index = data.index.tz_localize(None)
            data[feature_columns] = MinMaxScaler().fit_transform(data[feature_columns])
            X_test, _ = prepare_lstm_data(data, feature_columns, 'Pct_Change', time_steps)
            if len(X_test) == 0:
                print(f"Not enough data for {ticker} to make a prediction.")
                continue

            # Window k covers rows [k, k + time_steps) and belongs to row
            # k + time_steps; predict all windows in a single batch.
            predicted = np.full(len(data), np.nan)
            batch = model.predict(X_test, verbose=0).reshape(-1)
            predicted[time_steps:time_steps + len(batch)] = batch
            data['Predicted'] = predicted

            in_range = data.loc[data.index >= start, ['Predicted', 'Pct_Change']]
            signals.append(in_range.dropna(subset=['Predicted']))
        except Exception as e:
            print(f"Skipping {ticker} due to an error: {e}")

    if not signals:
        return equity_curve

    # Every date on which at least one ticker has a prediction, in order.
    trade_dates = signals[0].index
    for frame in signals[1:]:
        trade_dates = trade_dates.union(frame.index)
    trade_dates = trade_dates.sort_values()

    for date in trade_dates:
        best_return = None
        max_pred = -np.inf

        # Find the stock with the highest predicted percent change that day.
        for frame in signals:
            if date not in frame.index:
                continue
            predicted_change = frame.at[date, 'Predicted']
            if predicted_change > max_pred:
                max_pred = predicted_change
                best_return = frame.at[date, 'Pct_Change']

        # "Buy" it and book the realized forward return.
        if best_return is not None:
            current_capital += best_return * current_capital
            equity_curve.append(current_capital)

    return equity_curve

In [21]:
# from sklearn.metrics import roc_auc_score
# # start_date = '2022-06-01'
# # equity_curve = backtest(model, TICKERS, start_date, feature_columns, time_steps)

# # # Plot the equity curve
# # plt.plot(equity_curve)
# # plt.title('Equity Curve')
# # plt.xlabel('Time')
# # plt.ylabel('Equity ($)')
# # plt.show()

# y_pred = model.predict(X_test)
# roc_auc = roc_auc_score(Y_test, y_pred)
# Symbols to rank, and the look-ahead (in rows of daily data) for the target.
tickers = ['TSLA', 'META', 'BABA', 'NVDA', 'GOOGL', 'MSFT', 'AMZN', 'JPM', 'NFLX', 'AAPL']
timeframe = 5
# predict_and_decide(tickers, tf, buy): the look-ahead must be passed
# explicitly; True selects the buy list (predicted change > 0).
print(predict_and_decide(tickers, timeframe, True))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed

['TSLA' 'BABA' 'MSFT' 'JPM' 'AAPL' 'META' 'NVDA' 'NFLX' 'GOOGL']



