In [1]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import backtrader as bt
import quantstats as qs
import pyfolio as pf
import json
from sklearn.ensemble import RandomForestClassifier
import os
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from fredapi import Fred
import numpy as np

import matplotlib.pyplot as plt



In [2]:
# Study window. These two module-level strings are reused by the DGS2 date
# filter and passed as `start=` / `end=` to the yfinance downloads below.
start='2007-01-01'
end='2019-10-04'

In [3]:
# Index put/call volume history; the first two lines of the file precede the header row.
indexpc = pd.read_csv('indexpc.csv', skiprows=2)

# Parse the dates; anything unparseable becomes NaT and is removed by the range filter.
indexpc['DATE'] = pd.to_datetime(indexpc['DATE'], errors='coerce')

# Keep only the study window (inclusive on both ends), driven by the shared
# `start` / `end` constants rather than a second copy of the literals.
# `.copy()` makes this an independent frame, so later in-place edits cannot
# trigger SettingWithCopy problems against the unfiltered data.
indexpc = indexpc[indexpc['DATE'].between(start, end)].copy()

indexpc  # P/C Ratio = Puts / Calls

Unnamed: 0,DATE,CALL,PUT,TOTAL,P/C Ratio
41,2007-01-03,205724,309416,515140,1.50
42,2007-01-04,270783,293745,564528,1.08
43,2007-01-05,290570,352167,642737,1.21
44,2007-01-08,232352,273485,505837,1.18
45,2007-01-09,237668,276434,514102,1.16
...,...,...,...,...,...
3248,2019-09-30,582589,987408,1569997,1.69
3249,2019-10-01,617097,1050008,1667105,1.70
3250,2019-10-02,1376258,1782956,3159214,1.30
3251,2019-10-03,769129,1260434,2029563,1.64


In [4]:
# 2-year Treasury yield series, read from a local CSV with 'DATE' and 'DGS2' columns.
dgs2_data = pd.read_csv('DGS2.csv')

# Convert the 'DATE' column to datetime and the 'DGS2' column to numeric.
# Non-numeric placeholders in 'DGS2' become NaN.
dgs2_data['DATE'] = pd.to_datetime(dgs2_data['DATE'])
dgs2_data['DGS2'] = pd.to_numeric(dgs2_data['DGS2'], errors='coerce')

# Daily percentage change of the yield, computed on the full history so the
# first row inside the study window still has a previous value to compare to.
dgs2_data['DGS2_pct_change'] = dgs2_data['DGS2'].pct_change()

# Restrict to the study window [start, end] (inclusive). `.copy()` detaches
# the result from `dgs2_data` so downstream code can modify it safely.
dgs2_filtered = dgs2_data[dgs2_data['DATE'].between(start, end)].copy()

In [5]:
class Stock:
    """Feature-engineering wrapper around a daily, date-indexed price frame.

    ``data`` must provide the columns that :meth:`factor` reads or drops:
    ``Open``, ``High``, ``Low``, ``Close``, ``Adj Close`` and ``Volume``
    (presumably the frame returned by ``yf.download`` -- verify against the
    installed yfinance version, whose column layout has changed over time).

    After :meth:`factor` has run, ``self.data`` holds only feature columns and
    ``self.label`` holds the matching next-day direction labels
    (1.0 = next adjusted close is higher, 0.0 = otherwise).
    """

    def __init__(self, data):
        self.data = data
        self.label = []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _download_series(self, ticker, column='Adj Close'):
        """Return one price column of ``ticker`` covering the dates in ``self.data``.

        Downloads are cached per (ticker, date span) so that features sharing
        a ticker (SPY, ^VIX, ^TNX) hit the network only once.
        """
        cache = self.__dict__.setdefault('_price_cache', {})
        first_day = self.data.index.min()
        last_day = self.data.index.max()
        key = (ticker, first_day, last_day)
        if key not in cache:
            cache[key] = yf.download(
                ticker,
                start=first_day.strftime('%Y-%m-%d'),
                # yfinance treats `end` as exclusive; step one day past the
                # last row so that row's quote is included.
                end=(last_day + pd.Timedelta(days=1)).strftime('%Y-%m-%d'),
            )
        return cache[key][column]

    @staticmethod
    def _clean_fill(series):
        """Turn +/-inf into NaN, then back-fill the gaps.

        NOTE(review): back-filling copies the *next* valid observation
        backwards in time; forward-fill may be the safer choice for a
        predictive model. Kept as in the original notebook.
        """
        return series.replace([np.inf, -np.inf], np.nan).bfill()

    def _add_momentum(self, column, window):
        """Store the trailing ``window``-day return of 'Adj Close' in ``column``.

        The value at row t is P[t] / P[t - window] - 1, i.e. it only uses
        prices known at t. (An earlier version shifted this by ``-window``,
        which placed the *future* return on each row and leaked the label.)
        """
        self.data[column] = self.data['Adj Close'].pct_change(periods=window)

    def _add_spy_ratio(self, ticker, pct_column, ratio_column):
        """Add ``ticker``'s daily return and the SPY-return / ticker-return ratio."""
        self.download_etf_data("SPY", "SPY_pct_change")
        self.download_etf_data(ticker, pct_column)
        ratio = self.data['SPY_pct_change'] / self.data[pct_column]
        # Division by a zero return yields +/-inf; treat those as missing.
        self.data[ratio_column] = self._clean_fill(ratio)

    # ------------------------------------------------------------------
    # Pipeline entry point
    # ------------------------------------------------------------------
    def factor(self):
        """Build every feature column and the labels.

        Relies on the notebook-level frames ``indexpc`` (put/call data) and
        ``dgs2_filtered`` (2-year yield data). Rows with any missing value are
        dropped at the end, raw price columns and a few helper columns are
        removed, and the remaining column names are printed.
        """
        # drop()/bfill() return new frames, so the caller's DataFrame is untouched.
        self.data = self.data.drop(columns="Close").bfill()

        adj_close = self.data['Adj Close']
        volume = self.data['Volume']

        # label[t] = 1.0 if Adj Close rises from t to t+1, else 0.0.
        # The last row has no "tomorrow" and stays NaN (dropped below).
        next_close = adj_close.shift(-1)
        self.data['label'] = (next_close > adj_close).astype(float).where(
            next_close.notna() & adj_close.notna()
        )

        # Volume change (daily percentage change) and volume moving averages.
        self.data['volume_pct_change'] = volume.pct_change()
        for window in (5, 20, 50, 200):
            self.data[f'vol_ma_{window}'] = volume.rolling(window=window).mean()

        # Price moving averages.
        for window in (5, 10, 20, 50, 200):
            self.data[f'ma_{window}'] = adj_close.rolling(window=window).mean()

        self.merge_pc_ratio(indexpc)
        self.add_volume_oscillator()
        self.add_relative_volume()
        self.add_volume_relative_to_ma()
        self.add_price_volume_trend()
        self.add_vix_feature()
        self.add_spy_vix_ratio_feature()
        self.add_rsi_feature()
        self.add_bollinger_bands_feature()
        self.add_cci_feature()

        self.add_10yr_bond_yield_change_feature()
        self.add_yield_spread_feature(dgs2_filtered)
        self.add_atr_feature()

        self.add_stochastic_oscillator()
        self.add_spy_iwm_ratio_feature()
        self.add_spy_qqq_ratio_feature()
        self.add_spy_dia_ratio_feature()
        self.add_dxy_feature()
        self.add_obv_feature()

        self.add_momentum_feature_5d(window=5)
        self.add_volatility_feature_long(window=252)
        self.add_momentum_volatility_feature()

        # Long rolling windows leave NaNs at the start and the label leaves
        # one at the end; keep only fully populated rows.
        self.data = self.data.dropna(how="any")
        self.label = list(self.data["label"])

        # Remove the label, raw prices and intermediate helper columns so
        # only model inputs remain.
        self.data = self.data.drop(columns=[
            "label", "Adj Close", "Open", "High", "Low",
            "avg_volume", "vix_pct_change", "vix",
        ])
        print(self.data.columns)

    # ------------------------------------------------------------------
    # Price-derived features
    # ------------------------------------------------------------------
    def add_reversal_feature(self, window=5):
        """Add 'reversal_feature': the z-scored negative ``window``-day return."""
        reversal = -self.data['Adj Close'].pct_change(periods=window)
        reversal = reversal.ffill()
        # Z-score so the feature is on a scale comparable to the others.
        self.data['reversal_feature'] = (reversal - reversal.mean()) / reversal.std()

    def price_acceleration(self):
        """Add 'price_acceleration': change in the 5-day price slope between
        the latest 5-day span and the one before it, per day."""
        lagged_day = 5
        adj_close = self.data['Adj Close']
        lag_1 = adj_close.shift(lagged_day)
        lag_2 = adj_close.shift(lagged_day * 2)
        rate_recent = (adj_close - lag_1) / lagged_day
        rate_prior = (lag_1 - lag_2) / lagged_day
        self.data['price_acceleration'] = (rate_recent - rate_prior) / lagged_day

    def percent_off_52_weeks_high(self):
        """Add the fractional distance of 'Adj Close' below its 252-row rolling high."""
        one_year_high = self.data['Adj Close'].rolling(window=252).max()
        self.data['percent_off_52_weeks_high'] = (one_year_high - self.data['Adj Close']) / one_year_high

    def add_atr_feature(self, window=14):
        """Add 'atr': rolling mean of the true range over ``window`` rows.

        True range is the largest of High-Low, |High - previous Adj Close|
        and |Low - previous Adj Close|.
        """
        prev_close = self.data['Adj Close'].shift()
        ranges = pd.concat(
            [
                self.data['High'] - self.data['Low'],
                (self.data['High'] - prev_close).abs(),
                (self.data['Low'] - prev_close).abs(),
            ],
            axis=1,
        )
        # Row-wise max; NaNs (first row has no previous close) are skipped.
        true_range = ranges.max(axis=1)
        self.data['atr'] = true_range.rolling(window=window).mean()

    def add_momentum_volatility_feature(self, momentum_window=14, vol_window=21):
        """Add 'momentum_volatility': rolling std of the ``momentum_window``-day return."""
        momentum = self.data['Adj Close'].pct_change(periods=momentum_window)
        self.data['momentum_volatility'] = momentum.rolling(window=vol_window).std()

    def add_volatility_feature_short(self, window=21):  # 21 trading days ~ 1 month
        """Add 'volatility_21d': rolling std of daily returns scaled by sqrt(window)."""
        daily_returns = self.data['Adj Close'].pct_change()
        self.data['volatility_21d'] = daily_returns.rolling(window=window).std() * np.sqrt(window)

    def add_volatility_feature_long(self, window=252):  # 252 trading days ~ 1 year
        """Add 'volatility_252d': rolling std of daily returns scaled by sqrt(window)."""
        daily_returns = self.data['Adj Close'].pct_change()
        self.data['volatility_252d'] = daily_returns.rolling(window=window).std() * np.sqrt(window)

    def add_momentum_feature_5d(self, window=5):
        """Add 'momentum_5d': trailing ``window``-day return."""
        self._add_momentum('momentum_5d', window)

    def add_momentum_feature_10d(self, window=10):
        """Add 'momentum_10d': trailing ``window``-day return."""
        self._add_momentum('momentum_10d', window)

    def add_momentum_feature_20d(self, window=20):
        """Add 'momentum_20d': trailing ``window``-day return."""
        self._add_momentum('momentum_20d', window)

    def add_momentum_feature_50d(self, window=50):
        """Add 'momentum_50d': trailing ``window``-day return."""
        self._add_momentum('momentum_50d', window)

    def add_momentum_feature_200d(self, window=200):
        """Add 'momentum_200d': trailing ``window``-day return."""
        self._add_momentum('momentum_200d', window)

    def add_rsi_feature(self, window=14):
        """Add 'rsi': Relative Strength Index from simple rolling means of gains and losses."""
        delta = self.data['Adj Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

        rs = gain / loss
        self.data['rsi'] = 100 - (100 / (1 + rs))

    def add_bollinger_bands_feature(self, window=20, num_std=2):
        """Add 'bollinger_upper' / 'bollinger_lower': rolling mean +/- ``num_std`` rolling stds."""
        rolling = self.data['Adj Close'].rolling(window=window)
        centre = rolling.mean()
        band = rolling.std() * num_std

        self.data['bollinger_upper'] = centre + band
        self.data['bollinger_lower'] = centre - band

    def add_stochastic_oscillator(self, k_window=14, d_window=3):
        """Add '%K' (position of the close within the ``k_window`` high/low range, 0-100)
        and '%D' (``d_window`` rolling mean of %K)."""
        min_low = self.data['Low'].rolling(window=k_window).min()
        max_high = self.data['High'].rolling(window=k_window).max()

        self.data['%K'] = 100 * ((self.data['Adj Close'] - min_low) / (max_high - min_low))
        self.data['%D'] = self.data['%K'].rolling(window=d_window).mean()

    def add_cci_feature(self, window=20):
        """Add 'cci': deviation of the typical price from its rolling mean,
        scaled by 0.015 x the rolling standard deviation."""
        typical_price = (self.data['High'] + self.data['Low'] + self.data['Adj Close']) / 3
        rolling = typical_price.rolling(window=window)
        self.data['cci'] = (typical_price - rolling.mean()) / (0.015 * rolling.std())

    def calculate_macd(self):
        """Add 'macd' (EMA12 - EMA26 of 'Adj Close') and 'macd_signal' (EMA9 of macd)."""
        adj_close = self.data['Adj Close']
        ema_short = adj_close.ewm(span=12, adjust=False, min_periods=12).mean()
        ema_long = adj_close.ewm(span=26, adjust=False, min_periods=26).mean()
        self.data['macd'] = ema_short - ema_long
        self.data['macd_signal'] = self.data['macd'].ewm(span=9, adjust=False, min_periods=9).mean()

    def classify_trend(self):
        """Add 0/1 flags 'is_uptrend' / 'is_downtrend' from MACD vs. its signal line."""
        self.calculate_macd()
        self.data['is_uptrend'] = (self.data['macd'] > self.data['macd_signal']).astype(int)
        self.data['is_downtrend'] = (self.data['macd'] < self.data['macd_signal']).astype(int)

    # ------------------------------------------------------------------
    # Volume-derived features
    # ------------------------------------------------------------------
    def add_volume_oscillator(self):
        """Add 'volume_oscillator': 5-row minus 10-row rolling mean of 'Volume'."""
        short_term = 5
        long_term = 10
        volume = self.data['Volume']
        self.data['volume_oscillator'] = (
            volume.rolling(window=short_term).mean() - volume.rolling(window=long_term).mean()
        )

    def add_relative_volume(self, comparison_period=20):
        """Add 'avg_volume' (rolling mean) and 'relative_volume' (Volume / avg_volume)."""
        self.data['avg_volume'] = self.data['Volume'].rolling(window=comparison_period).mean()
        self.data['relative_volume'] = self.data['Volume'] / self.data['avg_volume']

    def add_volume_relative_to_ma(self, period=50):
        """Add 'vol_relative_to_ma': Volume divided by its ``period``-row rolling mean."""
        self.data['vol_relative_to_ma'] = self.data['Volume'] / self.data['Volume'].rolling(window=period).mean()

    def add_volume_spikes(self, threshold=2):
        """Add boolean 'vol_spike': Volume above ``threshold`` x its 50-row rolling mean."""
        self.data['vol_spike'] = self.data['Volume'] > self.data['Volume'].rolling(window=50).mean() * threshold

    def add_price_volume_trend(self):
        """Add 'pvt': cumulative sum of Volume x daily change in 'Adj Close'."""
        self.data['pvt'] = (self.data['Volume'] * self.data['Adj Close'].diff()).cumsum()

    def add_obv_feature(self):
        """Add 'obv': cumulative Volume signed by the direction of the daily price change."""
        signed_volume = np.sign(self.data['Adj Close'].diff()) * self.data['Volume']
        self.data['obv'] = signed_volume.fillna(0).cumsum()

    # ------------------------------------------------------------------
    # Features joined from caller-supplied frames
    # ------------------------------------------------------------------
    def merge_pc_ratio(self, pc_ratio_data):
        """Join the daily percentage change of the put/call ratio as 'PC_Ratio_Change'.

        Args:
            pc_ratio_data (DataFrame): has a 'P/C Ratio' column and dates either
                in a 'DATE' column or already as the index. Not modified.
        """
        pc_ratio = pc_ratio_data
        if 'DATE' in pc_ratio.columns:
            # assign()/set_index() return new frames; the caller's frame keeps its shape.
            pc_ratio = pc_ratio.assign(DATE=pd.to_datetime(pc_ratio['DATE'])).set_index('DATE')

        change = pc_ratio['P/C Ratio'].pct_change().rename('PC_Ratio_Change')

        # Drop a stale column first so that re-running does not fail on overlapping names.
        self.data = self.data.drop(columns='PC_Ratio_Change', errors='ignore').join(change, how='left')
        self.data['PC_Ratio_Change'] = self._clean_fill(self.data['PC_Ratio_Change'])

    def add_2yr_treasury_yield_change_feature(self, dgs2_data):
        """Join the 2-year Treasury yield percentage change as 'DGS2_pct_change'.

        Args:
            dgs2_data (DataFrame): has a 'DGS2_pct_change' column and dates either
                in a 'DATE' column or already as the index. Not modified.
        """
        dgs2 = dgs2_data
        if 'DATE' in dgs2.columns:
            dgs2 = dgs2.assign(DATE=pd.to_datetime(dgs2['DATE'])).set_index('DATE')

        self.data = self.data.drop(columns='DGS2_pct_change', errors='ignore').join(
            dgs2['DGS2_pct_change'], how='left'
        )
        self.data['DGS2_pct_change'] = self._clean_fill(self.data['DGS2_pct_change'])

    def add_yield_spread_feature(self, dgs2_data):
        """Add 'yield_spread': 10-year yield (^TNX) minus the local 2-year yield.

        Args:
            dgs2_data (DataFrame): has a 'DGS2' column and dates either in a
                'DATE' column or already as the index. Not modified.
        """
        dgs2 = dgs2_data.set_index('DATE') if 'DATE' in dgs2_data.columns else dgs2_data
        two_year = dgs2['DGS2'].copy()
        two_year.index = pd.to_datetime(two_year.index)

        # Align the 10-year series to the 2-year dates; where ^TNX has no quote
        # for a date, the next available quote is used.
        ten_year = self._download_series("^TNX").reindex(two_year.index, method='bfill')

        # Assignment aligns on the date index; dates missing from the spread become NaN.
        self.data['yield_spread'] = ten_year - two_year
        self.data['yield_spread'] = self.data['yield_spread'].bfill()

    # ------------------------------------------------------------------
    # Features downloaded via yfinance
    # ------------------------------------------------------------------
    def download_vix(self):
        """Store the ^VIX 'Close' series for the date span of ``self.data`` in ``self.vix_data``."""
        self.vix_data = self._download_series("^VIX", column='Close')

    def add_vix_feature(self):
        """Add 'vix' (level) and 'vix_pct_change' (daily percentage change)."""
        self.download_vix()

        # Align VIX to the stock dates, back-filling dates VIX has no quote for.
        self.data['vix'] = self.vix_data.reindex(self.data.index, method='bfill')
        self.data['vix_pct_change'] = self.data['vix'].pct_change().bfill()

    def download_spy(self):
        """Store the SPY 'Adj Close' series for the date span of ``self.data`` in ``self.spy_data``."""
        self.spy_data = self._download_series("SPY")

    def IWM_moving_beta_change(self):
        """Add 'Change_in_Rolling_Beta_IWM': daily change of IWM's 30-day rolling
        beta versus SPY, lagged by 5 rows."""
        spy_returns = self._download_series("SPY").pct_change()
        iwm_returns = self._download_series("IWM").pct_change()

        # beta = cov(IWM, SPY) / var(SPY), both over a 30-row window.
        rolling_beta = (
            iwm_returns.rolling(window=30).cov(spy_returns)
            / spy_returns.rolling(window=30).var()
        )
        self.data['Change_in_Rolling_Beta_IWM'] = rolling_beta.diff().shift(5)

    def add_spy_vix_ratio_feature(self):
        """Add 'SPY_VIX_ratio': SPY daily return divided by VIX daily percentage change."""
        self.download_spy()
        self.download_vix()

        ratio = self.spy_data.pct_change() / self.vix_data.pct_change()
        # Column assignment aligns on dates; the cleaning runs on the aligned column.
        self.data['SPY_VIX_ratio'] = ratio
        self.data['SPY_VIX_ratio'] = self._clean_fill(self.data['SPY_VIX_ratio'])

    def download_etf_data(self, ticker, column_name):
        """Add ``ticker``'s daily 'Adj Close' percentage change as ``column_name``."""
        self.data[column_name] = self._download_series(ticker).pct_change()

    def add_spy_iwm_ratio_feature(self):
        """Add 'SPY_pct_change', 'IWM_pct_change' and their ratio 'SPY_IWM_ratio'."""
        self._add_spy_ratio("IWM", "IWM_pct_change", "SPY_IWM_ratio")

    def add_spy_qqq_ratio_feature(self):
        """Add 'SPY_pct_change', 'QQQ_pct_change' and their ratio 'SPY_QQQ_ratio'."""
        self._add_spy_ratio("QQQ", "QQQ_pct_change", "SPY_QQQ_ratio")

    def add_spy_dia_ratio_feature(self):
        """Add 'SPY_pct_change', 'DIA_pct_change' and their ratio 'SPY_DIA_ratio'."""
        self._add_spy_ratio("DIA", "DIA_pct_change", "SPY_DIA_ratio")

    def add_dxy_feature(self):
        """Add 'DXY_pct_change': daily percentage change of the US Dollar Index (DX-Y.NYB)."""
        self.data['DXY_pct_change'] = self._download_series("DX-Y.NYB").pct_change()
        self.data['DXY_pct_change'] = self._clean_fill(self.data['DXY_pct_change'])

    def download_bond_yield_data(self, ticker, column_name):
        """Add ``ticker``'s daily 'Adj Close' percentage change as ``column_name``."""
        self.data[column_name] = self._download_series(ticker).pct_change()

    def add_10yr_bond_yield_change_feature(self):
        """Add '10yr_bond_yield_change': daily percentage change of ^TNX."""
        self.download_bond_yield_data("^TNX", "10yr_bond_yield_change")

    # ------------------------------------------------------------------
    # Regime flags
    # ------------------------------------------------------------------
    def add_vix_categories(self):
        """Add 0/1 flags for the VIX level: high (> 30), moderate (20-30), low (< 20).

        Requires a 'vix' column.
        """
        vix = self.data['vix']
        self.data['is_vix_high'] = (vix > 30).astype(int)
        self.data['is_vix_moderate'] = ((vix >= 20) & (vix <= 30)).astype(int)
        self.data['is_vix_low'] = (vix < 20).astype(int)

    def create_combined_conditions(self):
        """Add 0/1 flags for every trend (up/down) x VIX level (high/moderate/low) pair."""
        self.classify_trend()
        self.add_vix_categories()

        for trend in ('up', 'down'):
            for level in ('high', 'moderate', 'low'):
                self.data[f'{trend}_{level}'] = (
                    self.data[f'is_{trend}trend'] & self.data[f'is_vix_{level}']
                ).astype(int)

    # ------------------------------------------------------------------
    # Scaling
    # ------------------------------------------------------------------
    def standardize(self):
        """Replace ``self.data`` with its column-wise z-scores.

        The scaler is fit on all rows currently in ``self.data`` and the
        result is the array returned by ``StandardScaler.fit_transform``
        (column names and the date index are not kept).
        """
        scaler = StandardScaler()
        self.data = scaler.fit_transform(self.data)

    def normalize(self):
        """Replace ``self.data`` with its column-wise min-max scaling.

        The scaler is fit on all rows currently in ``self.data`` and the
        result is the array returned by ``MinMaxScaler.fit_transform``.
        """
        scaler = MinMaxScaler()
        self.data = scaler.fit_transform(self.data)
  
    # Assuming self.data is a pandas DataFrame and self.label is the target variable

Step 1: Prepare the Data
Step 2: Standardize and Normalize the Data
Step 3: Create Sequences
Step 4: Split the Data into Training and Testing Sets
Step 5: Build and Train the LSTM Model
Step 6: Evaluate the Model

In [6]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MinMaxScaler 

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [7]:
# Download daily SPY prices for the study window and build the feature
# matrix; factor() also fills stock.label and prints the feature names.
yf_data = yf.download("SPY", start=start, end=end)
stock = Stock(yf_data)
stock.factor()

# Standardize the data (column-wise z-score; stock.data becomes the scaler's
# output array).
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split further down, so test-period statistics influence the training inputs.
stock.standardize()

# Normalize the data (column-wise min-max scaling of the standardized values).
# NOTE(review): min-max scaling after z-scoring looks redundant -- confirm
# both steps are intended.
stock.normalize()

def create_sequence(data, label, time_steps=1) :
    """Cut a feature matrix into overlapping windows with one target each.

    Args:
        data: 2-D array-like of shape (rows, features), ordered in time.
        label: sequence with one label per row of ``data``.
        time_steps: number of consecutive rows per window.

    Returns:
        (X, y) where ``X`` has shape (n_windows, time_steps, features) and
        ``y`` has shape (n_windows,). Window ``i`` covers rows
        ``i .. i + time_steps - 1`` and is paired with the label of its last
        row, so each window has exactly one target (previously a slice of
        ``time_steps`` labels was returned per window, which does not match a
        model that emits one value per window).
    """
    Xs, ys = [], []

    for i in range(len(data) - time_steps) :
        window_end = i + time_steps
        Xs.append(data[i:window_end])
        # Label of the window's most recent row.
        ys.append(label[window_end - 1])

    return np.array(Xs), np.array(ys)

# Number of consecutive rows in each input window passed to create_sequence.
time_steps = 60 # adjust based on temporal frame

# X: stacked windows of stock.data; y: the targets built from stock.label.
X, y = create_sequence(stock.data, stock.label, time_steps)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Index(['Volume', 'volume_pct_change', 'vol_ma_5', 'vol_ma_20', 'vol_ma_50',
       'vol_ma_200', 'ma_5', 'ma_10', 'ma_20', 'ma_50', 'ma_200',
       'PC_Ratio_Change', 'volume_oscillator', 'relative_volume',
       'vol_relative_to_ma', 'pvt', 'SPY_VIX_ratio', 'rsi', 'bollinger_upper',
       'bollinger_lower', 'cci', '10yr_bond_yield_change', 'yield_spread',
       'atr', '%K', '%D', 'SPY_pct_change', 'IWM_pct_change', 'SPY_IWM_ratio',
       'QQQ_pct_change', 'SPY_QQQ_ratio', 'DIA_pct_change', 'SPY_DIA_ratio',
       'DXY_pct_change', 'obv', 'momentum_5d', 'volatility_252d',
       'momentum_volatility'],
      dtype='object')





In [8]:
# Train-test split
# Chronological train/test split: the first 80% of windows train the model
# and the last 20% are held out. No shuffling, so the test set stays strictly
# later in time than the training set.
train_size = int(len(X) * 0.8)

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Define the LSTM model: two stacked LSTM layers followed by a small dense head.
model = keras.Sequential()
model.add(layers.LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(layers.LSTM(50, return_sequences=False))
model.add(layers.Dense(25))
# The targets are 0/1 direction labels, so the output is a probability
# (sigmoid) rather than an unbounded linear value.
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model: binary cross-entropy is the matching loss for a sigmoid
# output, and accuracy is reported so each epoch shows a classification metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. validation_split holds out the last 10% of the training
# windows for validation; the History object is kept for later inspection.
history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.1)

2024-02-27 02:14:34.042475: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-02-27 02:14:34.043237: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-02-27 02:14:34.043730: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/100


2024-02-27 02:14:34.171406: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-02-27 02:14:34.287680: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-02-27 02:14:34.288363: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-02-27 02:14:34.289094: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN



2024-02-27 02:14:36.324972: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-02-27 02:14:36.325410: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-02-27 02:14:36.325834: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x286bfc810>