In [16]:
import numpy as np
import pandas as pd
import sys
sys.path.append("utils/")
from joblib import Parallel, delayed
from lightgbm import LGBMRegressor
from strategies_indicators import *
from tqdm import tqdm
from exploratory_data_analyzer import EDA_Preprocessor
from regressor_utils import Regressor, regression_metrics, evaluate
from sklearn.preprocessing import LabelEncoder

%load_ext autoreload
%autoreload 2

In [2]:
# Display configuration: show every column and allow 200-character-wide output.
pd.options.display.max_columns = None
pd.options.display.width = 200

# Set Training Data

In [None]:
# load data
# The CSV is read with a two-row header (header=[0, 1]), so the columns form a
# two-level MultiIndex; the first CSV column becomes the row index.
df_bio = pd.read_csv("data/df_bio_nasdaq.csv", header=[0, 1], index_col=0)
print(df_bio.shape)
df_bio.head(1)

In [None]:
# fill missing values
# Forward-fill first, then back-fill whatever is still missing at the start of
# each column. `fillna(method=...)` is deprecated in pandas >= 2.1; `.ffill()`
# and `.bfill()` are the supported equivalents and produce the same values.
# NOTE(review): back-filling copies later observations into earlier rows
# (e.g. before a column's first valid value) — confirm this is acceptable for
# the downstream indicators.
df_bio = df_bio.ffill().bfill()

In [None]:
# find companies
# Distinct labels on the first level of the column MultiIndex, in order of appearance.
companies = df_bio.columns.get_level_values(0).unique().tolist()
len(companies)

In [None]:
# mean change in price: start to end
# Ratio of the last Close to the first Close, one value per company.
changes = [
    df_bio[company]["Close"].iloc[-1] / df_bio[company]["Close"].iloc[0]
    for company in companies
]
np.mean(changes)

In [None]:
# filter companies
# Keep a company only if its last Close is more than MIN_CLOSE_RATIO times its
# first Close (previously an unnamed 0.15 literal).
# NOTE(review): the filter looks at the final price of the sample, i.e. it uses
# information from the end of the period to choose the training universe —
# confirm this selection is intended.
MIN_CLOSE_RATIO = 0.15
surviving_companies = [
    company
    for company in companies
    if df_bio[company]["Close"].iloc[-1] / df_bio[company]["Close"].iloc[0] > MIN_CLOSE_RATIO
]
print(len(surviving_companies))
df_bio = df_bio[surviving_companies]
df_bio.shape

In [None]:
# mean change in price: start to end
# Same last/first Close ratio as before, now restricted to the surviving companies.
changes = [
    df_bio[company]["Close"].iloc[-1] / df_bio[company]["Close"].iloc[0]
    for company in surviving_companies
]
np.mean(changes)

# Prepare Data

In [None]:
# example dataframe: vtyx
# Work on a copy of the VTYX columns so new columns added to `vtyx` do not
# touch df_bio.
vtyx = df_bio["VTYX"].copy()
print(vtyx.shape)
vtyx.head(1)

In [None]:
# set target
# 18-row moving average of Close, shifted 18 rows back so each row sees the
# average of the rows that follow it.
vtyx["moving_average"] = vtyx["Close"].rolling(18).mean()
vtyx["future_moving_average"] = vtyx["moving_average"].shift(periods=-18)
# regression target: percentage gap between the future average and today's Close
vtyx["target_regression"] = (vtyx["future_moving_average"] / vtyx["Close"] - 1) * 100
# classification target: +1 when the future average is above today's Close, else -1
vtyx["target_classification"] = np.where(vtyx["future_moving_average"].div(vtyx["Close"]) > 1, 1, -1)

# plot targets for vtyx stock
# NOTE(review): `.iplot` is not a pandas method; presumably it is provided by a
# plotting extension pulled in through the star import — confirm.
vtyx[["Close", "future_moving_average", "target_regression", "target_classification"]].iplot()

In [None]:
# prepare training data
def incorporate_indicators(company):
    """Build the indicator and target frame for a single company.

    Copies the company's columns out of the global ``df_bio``, tags the rows
    with the company label, runs the indicator helpers over the frame, derives
    the regression and classification targets from an 18-row forward-looking
    moving average of ``Close``, and finally fills remaining gaps.

    Parameters
    ----------
    company
        First-level column label of ``df_bio`` selecting the company.

    Returns
    -------
    pandas.DataFrame
        The company's frame with the indicator columns, ``moving_average``,
        ``future_moving_average``, ``target_regression`` and
        ``target_classification`` added.
    """
    # set company
    df_company = df_bio[company].copy()
    df_company["company"] = company

    # incorporate indicators; these helpers return a 3-tuple whose first item
    # is the updated frame, the other two items are not used here
    df_company, _, _ = sma_strategy(df_company, 200, 1200)
    df_company, _, _ = ema_strategy(df_company, 150, 1350)
    df_company, _, _ = smaema_strategy(df_company, 275)
    df_company, _, _ = macd_strategy(df_company, 125, 170, 100)
    df_company, _, _ = rsi_strategy(df_company, 105, 35)
    df_company, _, _ = stochastic_oscillator_alternative(df_company, 35, 40)
    df_company, _, _ = bollinger_bands(df_company, 45, 1)
    #df_company, _, _ = fibonacci_retracement(df_company, 420)
    for window in (3, 5):
        df_company, _, _ = rate_of_change(df_company, window)
    for window in (3, 5):
        df_company, _, _ = williams_r(df_company, window)
    df_company, _, _ = chaikin_money_flow(df_company, 820, -0.05)
    # these helpers return only the frame
    df_company = on_balance_volume(df_company)
    for window in (7, 35, 70):
        df_company = average_true_range(df_company, window)

    # set target
    df_company["moving_average"] = df_company["Close"].rolling(window=18).mean()
    df_company['future_moving_average'] = df_company['moving_average'].shift(-18)
    df_company['target_regression'] = ((df_company['future_moving_average'] / df_company['Close']) - 1) * 100
    df_company['target_classification'] = np.where(df_company['future_moving_average'] / df_company['Close'] > 1, 1, -1)

    # fill empty values
    # `fillna(method=...)` is deprecated in pandas >= 2.1; .ffill()/.bfill()
    # give the same result.
    # NOTE(review): shift(-18) leaves the last 18 rows without a future average,
    # so their target_regression is forward-filled from the last valid row and
    # their target_classification is -1 (NaN > 1 is False). Consider dropping
    # those rows before training.
    df_company = df_company.ffill().bfill()
    return df_company

# Parallel() without n_jobs runs every task in a single worker, so the original
# call was effectively sequential; n_jobs=-1 uses all available cores.
# NOTE(review): incorporate_indicators reads the global df_bio, which has to be
# shipped to worker processes — if memory becomes a problem, lower n_jobs.
frames = Parallel(n_jobs=-1)(
    delayed(incorporate_indicators)(company) for company in tqdm(surviving_companies)
)  # ["TNYA", "VTYX"] / surviving_companies

# Stack the per-company frames with a fresh 0..n-1 index and drop the helper
# columns that were only needed to derive the targets.
# NOTE(review): discarding the original index also discards whatever it held
# (presumably dates) — confirm it is not needed downstream.
df = pd.concat(frames, ignore_index=True).drop(columns=["moving_average", "future_moving_average"])
print(df.shape)
df.head(1)

In [None]:
# save data
# Persist the combined frame; the Training section reads this same file back.
df.to_parquet("data/df_bio_training.parquet")

# Training

In [3]:
# load data
# NOTE: from here on `df_bio` is the flat training frame read from parquet,
# not the two-level price panel that was loaded from CSV under the same name
# earlier in the notebook.
df_bio = pd.read_parquet("data/df_bio_training.parquet")
print(df_bio.shape)
df_bio.head(1)

(2335480, 50)


Unnamed: 0,Adj Close,Close,High,Low,Open,Volume,company,SMA_Close_S,SMA_Close_L,SMA_strategy,EMA_Close_S,EMA_Close_L,EMA_strategy,SMAEMA_Close_S,SMAEMA_Close_L,SMAEMA_strategy,MACD_Close_S,MACD_Close_L,MACD_strategy,SMA_up,SMA_down,RSI,RSI_strategy,K,SOA_strategy,SMA_Close,Upper_Band,Lower_Band,BB_strategy,ROC_3,ROC_3_strategy,ROC_5,ROC_5_strategy,WR_3,WR_3_strategy,WR_5,WR_5_strategy,MF_multiplier,MF_volume,CMF,CMF_strategy,OBV,TR_7,ATR_7,TR_35,ATR_35,TR_70,ATR_70,target_regression,target_classification
0,177.570007,177.570007,178.220001,175.699997,176.449997,283105.0,A,166.294233,146.610049,-1,167.134467,136.01397,-1,159.290791,163.461708,-1,-1.507703,-1.332527,-1,0.252194,0.298575,45.789423,0,40.826811,0,177.728263,179.48937,175.967156,0,0.001633,0,-5.6e-05,0,-15.47614,0,-14.01497,0,0.484133,137060.388826,-0.020017,0,0,1.289993,0.0,1.289993,0.0,1.289993,0.0,0.353275,1


In [4]:
# Per-column dtype and non-null count for the training frame.
df_bio.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335480 entries, 0 to 2335479
Data columns (total 50 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Adj Close              2335480 non-null  float64
 1   Close                  2335480 non-null  float64
 2   High                   2335480 non-null  float64
 3   Low                    2335480 non-null  float64
 4   Open                   2335480 non-null  float64
 5   Volume                 2335480 non-null  float64
 6   company                2335480 non-null  object 
 7   SMA_Close_S            2335480 non-null  float64
 8   SMA_Close_L            2335480 non-null  float64
 9   SMA_strategy           2335480 non-null  int32  
 10  EMA_Close_S            2335480 non-null  float64
 11  EMA_Close_L            2335480 non-null  float64
 12  EMA_strategy           2335480 non-null  int32  
 13  SMAEMA_Close_S         2335480 non-null  float64
 14  SMAEMA_Close_L    

In [5]:
# generate EDA_Preprocessor object
# NOTE(review): EDA_Preprocessor is a project class and the arguments are
# positional; presumably (data, columns to keep, columns to drop, task type,
# target column) — confirm against exploratory_data_analyzer. The summary it
# prints reports 49 columns versus 50 in the input frame.
eda = EDA_Preprocessor(df_bio, [], ["target_classification"], "regression", "target_regression")

EDA_Preprocessor instance initialized with data:
 	Keeping columns: 0
 	Numeric features: 47
 	Categorical features: 1
 	Binary features: 0
EDA data is now as follows:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335480 entries, 0 to 2335479
Data columns (total 49 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ATR_7              float64
 1   ATR_35             float64
 2   ATR_70             float64
 3   Adj_Close          float64
 4   BB_strategy        int32  
 5   CMF                float64
 6   CMF_strategy       int32  
 7   Close              float64
 8   EMA_Close_L        float64
 9   EMA_Close_S        float64
 10  EMA_strategy       int32  
 11  High               float64
 12  K                  float64
 13  Low                float64
 14  Lower_Band         float64
 15  MACD_Close_L       float64
 16  MACD_Close_S       float64
 17  MACD_strategy      int32  
 18  MF_multiplier      float64
 19  MF_volume          float64
 20  OBV       

In [6]:
# categorical label encode
# Fit the encoder on the company labels, then overwrite the column with the codes.
le = LabelEncoder().fit(eda.df["company"])
eda.df["company"] = le.transform(eda.df["company"])
# lookup from original company label to its integer code
mapping = {label: code for code, label in enumerate(le.classes_)}
eda.df["company"].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

# Regressor Instance: Train / Test (0.75 / 0.25)

In [None]:
# instance of regressor
# NOTE(review): Regressor is a project class; how generate_train_test splits the
# rows (ratio, shuffling) is decided inside it. The section heading states
# 0.75 / 0.25 — confirm, and check whether rows of the same company and period
# can land on both sides of the split.
regressor = Regressor(eda.df, [], "target_regression")
regressor.generate_train_test()

In [None]:
# score in test
# Scores a LightGBM regressor with library-default hyperparameters through the
# project's Regressor helper.
regressor.score_in_test(LGBMRegressor())

In [None]:
 # Inspect the test-split target values.
 # NOTE(review): this cell is indented by one stray space; presumably it only
 # runs because IPython strips leading indentation — consider removing it.
 regressor.y_test

In [None]:
# predictions
# Start from the test features and append, in order: the true target, the model
# output, and the +1 / -1 sign labels derived from each of them.
df_pred = regressor.X_test.assign(
    target_regression=regressor.y_test.copy(),
    prediction=regressor.pred_test,
    target_classification=lambda frame: np.where(frame["target_regression"] > 0, 1, -1),
    prediction_classification=lambda frame: np.where(frame["prediction"] > 0, 1, -1),
)
df_pred.head(1)

In [None]:
# evaluate binary
# Share of rows where the predicted sign label matches the true sign label.
df_pred["target_classification"].eq(df_pred["prediction_classification"]).value_counts(normalize=True)

In [None]:
# First 25 prediction rows.
df_pred.head(25)

# Manual Strategy

In [None]:
# TODO:
# Try all the different target definitions
# Maybe define "periods of opportunity" (not yet clear how): windows of time in which one should buy, and windows in which one should sell
# Backtest everything
# Backtest against the best possible strategy derived from the ground truth!

In [7]:
# Integer code that the label encoder assigned to the VTYX company label.
mapping["VTYX"]

637

In [8]:
# separate vtyx as a test set
# Look the code up in `mapping` instead of hard-coding 637, so the split stays
# correct if the set of companies (and therefore the encoding) changes.
vtyx_code = mapping["VTYX"]
is_vtyx = eda.df["company"] == vtyx_code
df_vtyx = eda.df[is_vtyx].reset_index(drop=True)
df_train = eda.df[~is_vtyx].reset_index(drop=True)
df_train.shape, df_vtyx.shape

((2331968, 49), (3512, 49))

In [9]:
# manually split data
# Take the feature list from df_train and apply it to both frames. Selecting
# "everything except the target" directly on df_vtyx is not idempotent: once
# the prediction cell has added target_classification / ml_prediction /
# ml_strategy to df_vtyx, re-running this cell would feed those columns to the
# model as features.
TARGET = "target_regression"
feature_cols = [col for col in df_train.columns if col != TARGET]
X_train = df_train[feature_cols]
y_train = df_train[TARGET]
X_test = df_vtyx[feature_cols]
y_test = df_vtyx[TARGET]

In [10]:
# train the model
# LightGBM regressor with library-default hyperparameters.
# NOTE(review): the integer-encoded `company` column is part of X_train, and the
# VTYX code was removed from the training rows, so the model never sees the
# test company's code during fitting — consider dropping `company` as a feature.
model = LGBMRegressor()
model.fit(X_train, y_train)

LGBMRegressor()

In [11]:
# do predictions
# true direction label: +1 when the regression target is positive, else -1
df_vtyx["target_classification"] = np.where(df_vtyx["target_regression"].gt(0), 1, -1)
# raw model output and the direction label derived from its sign
df_vtyx["ml_prediction"] = model.predict(X_test)
df_vtyx["ml_strategy"] = np.where(df_vtyx["ml_prediction"].gt(0), 1, -1)

In [12]:
# binary evaluation
# Count of rows where the predicted direction matches / differs from the true one.
df_vtyx["target_classification"].eq(df_vtyx["ml_strategy"]).value_counts()

False    1766
True     1746
dtype: int64

In [13]:
# check buy/sell
# NOTE(review): decide_buy_sell and backtest_strategy are project helpers;
# presumably "ml" selects the ml_strategy column and the backtest returns a
# performance figure — confirm against strategies_indicators.
buy, sell = decide_buy_sell(df_vtyx, "ml")
backtest_strategy(df_vtyx, buy, sell)

2.4694555957601096

In [14]:
# First row of the VTYX frame, including the prediction columns added above.
df_vtyx.head(1)

Unnamed: 0,ATR_7,ATR_35,ATR_70,Adj_Close,BB_strategy,CMF,CMF_strategy,Close,EMA_Close_L,EMA_Close_S,EMA_strategy,High,K,Low,Lower_Band,MACD_Close_L,MACD_Close_S,MACD_strategy,MF_multiplier,MF_volume,OBV,Open,ROC_3,ROC_3_strategy,ROC_5,ROC_5_strategy,RSI,RSI_strategy,SMAEMA_Close_L,SMAEMA_Close_S,SMAEMA_strategy,SMA_Close,SMA_Close_L,SMA_Close_S,SMA_down,SMA_strategy,SMA_up,SOA_strategy,TR_7,TR_35,TR_70,Upper_Band,Volume,WR_3,WR_3_strategy,WR_5,WR_5_strategy,company,target_regression,target_classification,ml_prediction,ml_strategy
0,0.0,0.0,0.0,16.0,0,0.020667,0,16.0,15.259568,16.0,-1,16.0,83.440849,16.0,16.0,0.08854,0.0,-1,0.273584,0.0,0,16.0,0.0,0,0.0,0,100.0,0,16.414102,16.833848,-1,16.0,15.636927,16.0,0.0,-1,0.0,0,0.0,0.0,0.0,16.0,0.0,-16.559151,0,-16.559151,0,637,0.0,-1,0.527066,1


In [None]:
# plot them all
# plot_generic is a project helper; it receives both target columns and the
# buy / sell values returned by decide_buy_sell.
plot_generic(df_vtyx, "target_regression", "target_classification", buy, sell)

In [18]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [22]:
def plot_(df, col1, dates_of_buy, dates_of_sell):
    """Plot ``Close`` and one extra column on separate y-axes with buy/sell markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Close`` column and the column named by ``col1``; its
        index is used for the x-axis.
    col1 : str
        Column drawn against the secondary y-axis.
    dates_of_buy : iterable
        x positions at which a dashed green vertical line is drawn.
    dates_of_sell : iterable
        x positions at which a dashed red vertical line is drawn.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Single subplot with a secondary y-axis
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, specs=[[{'secondary_y': True}]])

    # Close on the primary y-axis
    fig.add_trace(
        go.Scatter(x=df.index, y=df["Close"], mode='lines', name="Close"),
        row=1, col=1, secondary_y=False,
    )

    # col1 on the secondary y-axis, via the make_subplots API rather than a
    # hand-written yaxis='y2' reference
    fig.add_trace(
        go.Scatter(x=df.index, y=df[col1], mode='lines', name=col1),
        row=1, col=1, secondary_y=True,
    )

    # Full-height dashed vertical lines: buys first (green), then sells (red)
    for color, dates in (("green", dates_of_buy), ("red", dates_of_sell)):
        for date in dates:
            fig.add_shape(type="line", x0=date, x1=date, y0=0, y1=1, xref="x", yref="paper", line=dict(color=color, dash="dash"))

    # Update the layout
    fig.update_layout(
        title_text="Stock Prices & Strategy",
        xaxis_title="Date",
        xaxis_rangeslider_visible=True,  # Add a range slider for zooming
        width=1200,
        height=600
    )

    # Axis titles and ranges; the second call overrides the secondary axis
    fig.update_yaxes(title_text="Close", range=[df["Close"].min(), df["Close"].max()], row=1, col=1)
    fig.update_yaxes(title_text=col1, range=[df[col1].min(), df[col1].max()], secondary_y=True, row=1, col=1)

    fig.show()

In [25]:
# Smallest target_regression value in the VTYX frame.
df_vtyx.target_regression.min()

-17.75731476860135

In [26]:
# Largest target_regression value in the VTYX frame.
df_vtyx.target_regression.max()

61.66366235099212

In [27]:
# Average target_regression value in the VTYX frame.
df_vtyx.target_regression.mean()

0.4240570716135757

In [24]:
# plot them all
# Close against target_regression on two y-axes, with the buy / sell markers
# from decide_buy_sell.
plot_(df_vtyx, "target_regression", buy, sell)