In [515]:
import pandas as pd
import numpy as np
import yaml
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from scipy.fft import fft
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import openpyxl
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score,precision_score, f1_score
import xgboost as xgb
import optuna
import talib
import json
import pickle
import joblib


In [516]:
def load_yaml(file):
    import yaml
    with open(file, 'r') as f:
        config = yaml.safe_load(f)
    return config
    



In [517]:
config = load_yaml('../config.yaml')

In [518]:
#trading_data = pd.read_excel(config['data_excel_path'], sheet_name='Data_Basic')
#trading_data.count()
all_trading_data_dfs = []
sheet_names = ['5minData11-6-2014', '5minData12-17-2019']
for sheet in sheet_names:
    temp_df = pd.read_excel(config['all_data_excel_path'], sheet_name=sheet)
    all_trading_data_dfs.append(temp_df)
all_trading_data = pd.concat(all_trading_data_dfs, ignore_index=True)

# Load 5 min data
all_trading_data['Date'] = pd.to_datetime(all_trading_data['Date'])


In [519]:
##### Set the Date Start and End for the filtering of trading data

train_start_date = pd.to_datetime('2023-01-01')
train_end_date = pd.to_datetime('2023-12-31')

trading_data_raw = all_trading_data[(all_trading_data['Date'] >= train_start_date) & (all_trading_data['Date'] <= train_end_date)]
trading_data_raw = trading_data_raw.reset_index(drop=True)
trading_data_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19430 entries, 0 to 19429
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    19430 non-null  datetime64[ns]
 1   Symbol  19430 non-null  object        
 2   Open    19430 non-null  float64       
 3   High    19430 non-null  float64       
 4   Low     19430 non-null  float64       
 5   Close   19430 non-null  float64       
 6   Volume  19430 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 1.0+ MB


In [520]:
def calculate_bollinger_bands(data, window_size, num_std_dev):
    rolling_mean = data['Close'].rolling(window=window_size).mean()
    rolling_std = data['Close'].rolling(window=window_size).std()
    data['BOLLBU'] = rolling_mean + (rolling_std * num_std_dev)
    data['BOLLBM'] = rolling_mean
    data['BOLLBL'] = rolling_mean - (rolling_std * num_std_dev)
    
    return data

def calculate_donchn_bands(data, window_size):
    data['DONCH_U'] = data['High'].rolling(window=window_size).max()
    data['DONCH_L'] = data['Low'].rolling(window=window_size).min()
    
    return data
    
def calculate_tchr(data):
    period = config['tchr_period']
    retracement = config['tchr_retracement']
    adj = config['tchr_adj']
    range = config['tchr_range']

    if range == 'highlow':
        data['TCHR_U'] = talib.MAX(data['High'], timeperiod=period) + adj
        data['TCHR_L'] = talib.MIN(data['Low'], timeperiod=period) - adj
    elif range == 'close':
        data['TCHR_U'] = talib.MAX(data['Close'], timeperiod=period) + adj
        data['TCHR_L'] = talib.MIN(data['Close'], timeperiod=period) - adj
    
    #calculate retracement
    if retracement == "long":
        data['TCHR'] = (data['Close'] - data['TCHR_L']) / (data['TCHR_U'] - data['TCHR_L'])
    elif retracement == "short":
        data['TCHR'] = (data['TCHR_U'] - data['Close']) / (data['TCHR_U'] - data['TCHR_L'])
    
    return data
    
def calculate_adwm(data):
    period = config['adwm_period']
    data['Previous_Close']= data['Close'].shift(1)
    data['TRH'] = data[['High', 'Previous_Close']].max(axis=1)
    data['TRL'] = data[['Low', 'Previous_Close']].min(axis=1)

    data['ADWM_AD'] = 0.0
    data.loc[data['Close'] > data['Previous_Close'], 'ADWM_AD'] = (data['Close'] - data['TRL'])
    data.loc[data['Close'] < data['Previous_Close'], 'ADWM_AD'] = (data['Close'] - data['TRH'])

    data['ADWM'] = data['ADWM_AD']

    data['ADWMMA'] = data['ADWM'].rolling(window=period).mean()

    return data

def calculate_si(row, prev_row, limit):
    if pd.isna(prev_row['Close']):
        return 0
    c = row['Close']
    c_prev = prev_row['Close']
    o = row['Open']
    o_prev = prev_row['Open']
    return (50 * ((c - c_prev) + (0.5 * (c - o)) + (0.25 * (c_prev - o_prev))) / limit)

def calculate_WASI(data):
    wasi_limit = config['wasi_limit']
    data['SI'] = data.apply(lambda row: calculate_si(row, data.shift(1).loc[row.name], wasi_limit), axis=1)
    data['WASI'] = data['SI']
    return data

def calculate_ATR(data):
    atr_period = config['atr_period']
    atr_ma = config['atr_ma']
    data['ATR'] = talib.ATR(data['High'], data['Low'], data['Close'], timeperiod=atr_period)
    data['ADJATR'] = talib.SMA(data['ATR'], timeperiod=atr_ma)
    return data


def compute_fourier_df(value_series, n_components=10):
    fft_result = np.fft.fft(value_series)
    real = fft_result.real[:n_components]
    imag = fft_result.imag[:n_components]
    mag = np.abs(fft_result)[:n_components]

    real_min, real_max = real.min(), real.max()
    imag_min, imag_max = imag.min(), imag.max()
    mag_min, mag_max = mag.min(), mag.max()
    if real_max > real_min:
        real = (real - real_min) / (real_max - real_min)
    else:
        real = np.full_like(real, 0.5)
    
    if imag_max > imag_min:
        imag = (imag - imag_min) / (imag_max - imag_min)
    else:
        imag = np.full_like(imag, 0.5)
    
    if mag_max > mag_min:
        mag = (mag - mag_min) / (mag_max - mag_min)
    else:
        mag = np.full_like(mag, 0.5)

    return real, imag, mag

    

In [521]:
fourier_lookback_window = config['fourier_lookback_window']
fourier_n_components = config['fourier_n_components']
raw_features_g1 = config['raw_features_g1'].split(',')

#g2 is the other features. need to use standard scaler for this
raw_features_g2 = config['raw_features_g2'].split(',')

#g3 is volume features. need to use min max scaler separately
raw_features_g3 = config['raw_features_g3'].split(',')

raw_features_g4 = config['raw_features_g4'].split(',')

In [522]:
def add_new_features_df(data):
    global fourier_lookback_window
    global fourier_n_components
    global raw_features_g1
    global raw_features_g2
    global raw_features_g3
    global raw_features_g4
    data['Date'] = pd.to_datetime(data['Date'])
    #print(f"bolband period : {config['bolband_period']}")
    bolband_period = config['bolband_period']
    bolband_width = config['bolband_width']
    upper, middle, lower = talib.BBANDS(data['Close'], timeperiod=bolband_period, nbdevup=bolband_width, nbdevdn=bolband_width, matype=0)
    data['BOLLBU'] = upper
    data['BOLLBM'] = middle
    data['BOLLBL'] = lower
    #data = calculate_bollinger_bands(data, int(config['bolband_period']), int(config['bolband_width']))

    # Calculate DONCHN Bands
    donchn_period = config['donchn_period']
    data['DONUP'] = talib.MAX(data['High'], timeperiod=donchn_period)

    data['DONLOW'] = talib.MIN(data['Low'], timeperiod=donchn_period)

    data['DONMID'] = (data['DONLOW'] + data['DONUP']) / 2

    data['MA20'] = talib.SMA(data['Close'], timeperiod=20)

    data['MA50'] = talib.SMA(data['Close'], timeperiod=50)

    data['MA100'] = talib.SMA(data['Close'], timeperiod=100)

    data['EMA20'] = talib.EMA(data['Close'], timeperiod=20)

    # Calculate the pivot points
    data['PVPT'] = (data['High'] + data['Low'] + data['Close']) / 3
    data['PVPTR1'] = (2 * data['PVPT']) - data['Low']

    data['PVPTR2'] = data['PVPT'] + data['High'] - data['Low']

    data['PVPTR3'] = data['High'] + 2 * (data['PVPT'] - data['Low'])

    data['PVPTS1'] = (2 * data['PVPT']) - data['High']

    data['PVPTS2'] = data['PVPT'] + data['High'] - data['Low']

    data['PVPTS3'] = data['Low'] - 2 * (data['High'] - data['PVPT'])

    data = calculate_tchr(data)

    data = calculate_adwm(data)

    data = calculate_WASI(data)

    volume_ma_period = config['volume_ma_period']

    data['VOLMA'] = talib.SMA(data['Volume'], timeperiod=volume_ma_period)

    data = calculate_ATR(data)

    data['DayofWeek'] = data['Date'].dt.dayofweek

    data['DayofWeek'] = data['DayofWeek'].astype('category')

    # Add fourier columns to the df
    for i in range(fourier_n_components):
        data[f'fourier_real_{i+1}'] = np.nan
        data[f'fourier_imag_{i+1}'] = np.nan
        data[f'fourier_mag_{i+1}'] = np.nan

    features = []
    epsilon = 1e-5
    #print(f"fourier window - 1 : {fourier_lookback_window - 1}")
    for i in range(len(data)):
        if i >= fourier_lookback_window - 1:
            #print("entered point 1")
            close_window = data['Close'].iloc[i - fourier_lookback_window + 1: i + 1].values
            real, imag, mag = compute_fourier_df(close_window, n_components=fourier_n_components)

            for j in range(fourier_n_components):
                data.loc[i, f'fourier_real_{j+1}'] = real[j]
                data.loc[i, f'fourier_imag_{j+1}'] = imag[j]
                data.loc[i, f'fourier_mag_{j+1}'] = mag[j]
    # Add the time of day feature to the trading data
    data['Minutes_Passed'] = (data['Date'].dt.hour* 60) + data['Date'].dt.minute
    #invalid_Rows = data[data['Minutes_Passed'].isna() | data['Minutes_Passed'].isin([np.inf, -np.inf])]
    #print(f"Invalid rows count : {invalid_Rows.shape[0]}")
    #print(invalid_Rows.head(10))
    data['TimeOfDay_Group'] = (data['Minutes_Passed'] // 5).astype('int')
    data['Sine_TimeOfDay'] = np.sin(2 * np.pi * data['TimeOfDay_Group'])
    data['Cosine_TimeOfDay'] = np.cos(2 * np.pi * data['TimeOfDay_Group'])
    data['Take_Profit_Level'] = (data['Close'] * config['atr_multiplier'] * data['ADJATR'])
    #print(f"Last row after adding features : ")
    #print(data.tail())
    return data

def calculate_label(data):
    data['Label'] = 'H'
    for i in range(len(data)):
        stop_loss_level_buy = data['Close'][i] - (data['Take_Profit_Level'][i] * config['risk'])
        stop_loss_level_sell = data['Close'][i] + (data['Take_Profit_Level'][i] * config['risk'])
        for j in range(1, config['bars_no_to_wait']):
            if i+j < len(data):
                if data['High'][i+j] > (data['Close'][i] + data['Take_Profit_Level'][i]):
                    if data['Low'][i+j] < stop_loss_level_buy:
                        break
                    else:
                        data.iloc[i, data.columns.get_loc('Label')] = 'B'
                        break
                elif data['Low'][i+j] < (data['Close'][i] - data['Take_Profit_Level'][i]):
                    if data['High'][i+j] > stop_loss_level_sell:
                        break
                    else:
                        data.iloc[i, data.columns.get_loc('Label')] = 'S'
                    break
    
    return data

In [523]:
trading_data = add_new_features_df(trading_data_raw)
trading_data = trading_data.dropna()
trading_data.reset_index(drop=True, inplace=True)

In [524]:
trading_data['Take_Profit_Level'].mean()

np.float64(4.0674445228720275)

In [525]:

trading_data = calculate_label(trading_data)
trading_data['Label'] = trading_data['Label'].astype('category')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [526]:
trading_data.value_counts('Label')

Label
H    17238
S     1050
B     1043
Name: count, dtype: int64

In [527]:
#trading_data['Date'] = pd.to_datetime(trading_data['Date'])
fig = go.Figure(data=[go.Candlestick(x=trading_data['Date'], open=trading_data['Open'], high=trading_data['High'], low=trading_data['Low'], close=trading_data['Close'])])
fig.update_layout(title='CandleStick Chart SPY', xaxis_title='Date', yaxis_title='Price', xaxis_rangeslider_visible=False, yaxis=dict(fixedrange=False), xaxis=dict(type='category'))
buy_signals = trading_data[trading_data['Label'] == 'B']
sell_signals = trading_data[trading_data['Label'] == 'S']
fig.add_trace(go.Scatter(x=buy_signals['Date'], y=buy_signals['Low'], mode='markers', name='Buy Signal', marker=dict(color='blue', size=10)))
fig.add_trace(go.Scatter(x=sell_signals['Date'], y=sell_signals['High'], mode='markers', name='Sell Signal', marker=dict(color='yellow', size=10)))
fig.show()

In [528]:

def get_features(data, inference=False):
    
    # Define global variables
    global minmax_scaling_params
    global stand_scaler
    global raw_features_g1
    global raw_features_g2
    global raw_features_g3
    global raw_features_g4
    
    # need to use min max scaler for g1


    #print(data[raw_features_g1].head())
    #print(data[raw_features_g1].dtypes)
    if inference:
        global_max_g1 = minmax_scaling_params['g1']['max']
        global_min_g1 = minmax_scaling_params['g1']['min']
    else:
        global_min_g1 = data[raw_features_g1].min().min()
        global_max_g1 = data[raw_features_g1].max().max()

    #Apply min max scaler to g1
    if (global_max_g1 - global_min_g1) == 0:
        print("Divide by 0 coming")
    data[raw_features_g1] = (data[raw_features_g1] - global_min_g1) / (global_max_g1 - global_min_g1)


    # Apply standard scaler to g2
    #print("Before standard scaler")
    if inference:
        data[raw_features_g2] = stand_scaler.transform(data[raw_features_g2])
    else:
        scaler = StandardScaler()
        data[raw_features_g2] = scaler.fit_transform(data[raw_features_g2])
        #print("After standard scaler")

        with open('../Saved_Data/g2_standard_scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)
    
    eps = 1e-5
    #print(raw_features_g3)
    data[raw_features_g3] = np.log(data[raw_features_g3] + eps)

    # Apply min max scaler to g3
    if inference:
        global_max_g3 = minmax_scaling_params['g3']['max']
        global_min_g3 = minmax_scaling_params['g3']['min']
    else:
        global_min_g3 = data[raw_features_g3].min().min()
        global_max_g3 = data[raw_features_g3].max().max()
        minmax_scaling_params = {
            'g1': {'min': global_min_g1, 'max': global_max_g1},
            'g3': {'min': global_min_g3, 'max': global_max_g3}
        }

        with open('../Saved_Data/minmax_scaling_params.json', 'w') as f:
            json.dump(minmax_scaling_params, f)
            
    if (global_max_g3 - global_min_g3) == 0:
        print("3 Divide by 0 coming")
        
    data[raw_features_g3] = (data[raw_features_g3] - global_min_g3) / (global_max_g3 - global_min_g3)
  
    
        
    # Add one hot encoding for dayofweek column
    #encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories=[range(7)])
    #encoded_days = encoder.fit_transform(data[['DayofWeek']])
    #encoded_days_df = pd.DataFrame(encoded_days, columns=[f'Day_{i}' for i in range(7)])
    #data = pd.concat([data, encoded_days_df], axis=1)
    
    # Set categorical column to category dtype
    

    #output_excel_path = '../temp_data/input_features.xlsx'
    #data.to_excel(output_excel_path, index=False)
    return data


In [529]:
fourier_columns = [f'fourier_real_{j+1}' for j in range(fourier_n_components)] + [f'fourier_imag_{j+1}' for j in range(fourier_n_components)] + [f'fourier_mag_{j+1}' for j in range(fourier_n_components)]

#all_feature_columns = raw_features_g1 + raw_features_g2 + raw_features_g3 + raw_features_g4 + fourier_columns + ['DayofWeek']
all_feature_columns = raw_features_g1 + raw_features_g2 + raw_features_g3 + raw_features_g4 + fourier_columns
trading_data_full = get_features(trading_data)
trading_data_full = trading_data_full.dropna()
#trading_features = trading_data_full[all_feature_columns]
#trading_labels = trading_data_full['Label']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [530]:
print(trading_data_full[all_feature_columns].shape)

(19331, 60)


In [531]:
trading_data_full.dropna(inplace=True)
trading_data_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19331 entries, 0 to 19330
Data columns (total 75 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               19331 non-null  datetime64[ns]
 1   Symbol             19331 non-null  object        
 2   Open               19331 non-null  float64       
 3   High               19331 non-null  float64       
 4   Low                19331 non-null  float64       
 5   Close              19331 non-null  float64       
 6   Volume             19331 non-null  float64       
 7   BOLLBU             19331 non-null  float64       
 8   BOLLBM             19331 non-null  float64       
 9   BOLLBL             19331 non-null  float64       
 10  DONUP              19331 non-null  float64       
 11  DONLOW             19331 non-null  float64       
 12  DONMID             19331 non-null  float64       
 13  MA20               19331 non-null  float64       
 14  MA50  

In [532]:

trading_data_full['Label'] = trading_data_full['Label'].astype('category')

label_encoder = LabelEncoder()
trading_data_full['Encoded_Label'] = label_encoder.fit_transform(trading_data_full['Label'])
trading_data_full['Encoded_Label'] = trading_data_full['Encoded_Label'].astype('int')
trading_data_full['Encoded_Label'] = trading_data_full['Encoded_Label'].astype('category')
#trading_data_full.loc[:, 'Encoded_Label'] = trading_data_full['Encoded_Label'].astype(int)
#trading_data_full['Encoded_Label'].astype('category')
#print(trading_data_full['Encoded_Label'].dtype)
#with open('../Saved_Data/label_encoder.pkl', 'wb') as f:
#    pickle.dump(label_encoder, f)

trading_data_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19331 entries, 0 to 19330
Data columns (total 76 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               19331 non-null  datetime64[ns]
 1   Symbol             19331 non-null  object        
 2   Open               19331 non-null  float64       
 3   High               19331 non-null  float64       
 4   Low                19331 non-null  float64       
 5   Close              19331 non-null  float64       
 6   Volume             19331 non-null  float64       
 7   BOLLBU             19331 non-null  float64       
 8   BOLLBM             19331 non-null  float64       
 9   BOLLBL             19331 non-null  float64       
 10  DONUP              19331 non-null  float64       
 11  DONLOW             19331 non-null  float64       
 12  DONMID             19331 non-null  float64       
 13  MA20               19331 non-null  float64       
 14  MA50  

In [533]:
with open('../Saved_Data/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)


In [534]:

label_column_name = 'Encoded_Label'
#assert trading_features.shape[0] == trading_labels.shape[0], "Mismatch between features and labels length"
X = trading_data_full[all_feature_columns]
y = trading_data_full[label_column_name]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=46)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=46)


In [535]:


#convert data to d matrix to use with xgb
enab_cat = True
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=enab_cat )
dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=enab_cat)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
dtrain_valid = xgb.DMatrix(data = pd.concat([X_train, X_valid]),label=pd.concat([y_train, y_valid]), enable_categorical=enab_cat)



In [536]:
y_train.unique()

[0, 1, 2]
Categories (3, int64): [0, 1, 2]

In [537]:
num_boosting_rounds = 5000

In [None]:

##### Implement initial training of the model   
learning_rate = 0.3
starting_tree_method = 'approx'
#metric can be mlogloss, auc, merror etc
metric = 'auc'

base_params = {
    'objective': 'multi:softprob',
    'eval_metric': metric,
    'num_class': 3
}

params = {
    'learning_rate': learning_rate,
    'tree_method': starting_tree_method
}

params.update(base_params)

model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boosting_rounds, evals=[(dtrain, 'train')], early_stopping_rounds=50)

[0]	train-mlogloss:0.70017
[1]	train-mlogloss:0.51046
[2]	train-mlogloss:0.39551
[3]	train-mlogloss:0.32706
[4]	train-mlogloss:0.27735
[5]	train-mlogloss:0.24298
[6]	train-mlogloss:0.21645
[7]	train-mlogloss:0.19545
[8]	train-mlogloss:0.17949
[9]	train-mlogloss:0.16222
[10]	train-mlogloss:0.14965
[11]	train-mlogloss:0.14132
[12]	train-mlogloss:0.13308
[13]	train-mlogloss:0.12623
[14]	train-mlogloss:0.11789
[15]	train-mlogloss:0.11267
[16]	train-mlogloss:0.10983
[17]	train-mlogloss:0.10514
[18]	train-mlogloss:0.09707
[19]	train-mlogloss:0.09006
[20]	train-mlogloss:0.08312
[21]	train-mlogloss:0.07936
[22]	train-mlogloss:0.07594
[23]	train-mlogloss:0.07214
[24]	train-mlogloss:0.06551
[25]	train-mlogloss:0.06050
[26]	train-mlogloss:0.05822
[27]	train-mlogloss:0.05493
[28]	train-mlogloss:0.05180
[29]	train-mlogloss:0.04914
[30]	train-mlogloss:0.04703
[31]	train-mlogloss:0.04546
[32]	train-mlogloss:0.04286
[33]	train-mlogloss:0.04023
[34]	train-mlogloss:0.03826
[35]	train-mlogloss:0.03653
[3

In [539]:
##### CReate the objective function for optuna to tune tree parameters

def objective(trial):
    params = {
        'tree_method' : trial.suggest_categorical('tree_method', ['approx', 'hist', 'exact']) , 
        'learning_rate': learning_rate,
        'gamma': trial.suggest_float('gamma', 1e-9, 1e-3),
       'max_depth': trial.suggest_int('max_depth', 3, 6),
       'min_child_weight': trial.suggest_float('min_child_weight', 1, 250),
       'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('reg_lambda', 0.5, 25), 
        'alpha': trial.suggest_float('alpha', 0.001, 0.5, log = True),
    }
    params.update(base_params)
    #thresholds = [trial.suggest_float(f'threshold_{i}', 0.1, 0.9) for i in ]
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'valid-{metric}')

    xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boosting_rounds, 
                          evals=[(dtrain, 'train'),(dvalid, 'valid')],
                          early_stopping_rounds=50,
                          verbose_eval=0,
                          callbacks=[pruning_callback])
    trial.set_user_attr('best_iteration', xgb_model.best_iteration)
    #xgb.XGBClassifier(**params, random_state=46, early_stopping_rounds=30, objective='multi:softprob', lambda_=config['lam'], alpha=config['alpha'], n_estimators=100)
    #xgb_model.fit(X_train, Y_train, eval_set=[(X_train, Y_train), (X_valid, Y_valid)])

    y_pred = xgb_model.predict(dvalid)
    y_pred = y_pred.argmax(axis=1)
    #accuracy = accuracy_score(Y_test, y_pred)
    #score = recall_score(Y_test, y_pred, average='macro')
    #score = precision_score(y_valid, y_pred, average='macro')
    score = recall_score(y_valid, y_pred, average='macro')

    return score

In [560]:

study = optuna.create_study(direction='maximize') # for metric auc its maximuze, and for mlogloss its minimie

study.optimize(objective, n_trials=50)

# Get the best parameters
print(f" Best parameters: {study.best_params}")
print(f" Best Accuracy: {study.best_value}")

[I 2025-02-01 22:22:08,671] A new study created in memory with name: no-name-2ef89920-763b-4c4f-884d-f0f05482c339
[I 2025-02-01 22:22:17,973] Trial 0 finished with value: 0.8248942471217336 and parameters: {'tree_method': 'hist', 'gamma': 0.00024292467250342108, 'max_depth': 5, 'min_child_weight': 86.71619364090158, 'subsample': 0.45858552914738904, 'colsample_bytree': 0.9944376268299656, 'reg_lambda': 7.797095342365796, 'alpha': 0.24917558992386102}. Best is trial 0 with value: 0.8248942471217336.
[I 2025-02-01 22:23:24,234] Trial 1 finished with value: 0.8657828907958421 and parameters: {'tree_method': 'approx', 'gamma': 1.3134872611657228e-05, 'max_depth': 6, 'min_child_weight': 138.03746811454647, 'subsample': 0.8126835373440766, 'colsample_bytree': 0.7179098343114585, 'reg_lambda': 20.419727607694604, 'alpha': 0.17846474319940664}. Best is trial 1 with value: 0.8657828907958421.
[I 2025-02-01 22:23:59,742] Trial 2 finished with value: 0.6641359240445025 and parameters: {'tree_meth

 Best parameters: {'tree_method': 'approx', 'gamma': 1.3134872611657228e-05, 'max_depth': 6, 'min_child_weight': 138.03746811454647, 'subsample': 0.8126835373440766, 'colsample_bytree': 0.7179098343114585, 'reg_lambda': 20.419727607694604, 'alpha': 0.17846474319940664}
 Best Accuracy: 0.8657828907958421


In [561]:
best_params = study.best_params
best_params

{'tree_method': 'approx',
 'gamma': 1.3134872611657228e-05,
 'max_depth': 6,
 'min_child_weight': 138.03746811454647,
 'subsample': 0.8126835373440766,
 'colsample_bytree': 0.7179098343114585,
 'reg_lambda': 20.419727607694604,
 'alpha': 0.17846474319940664}

In [562]:
low_learning_rate = 0.01

params = {}
params.update(base_params)
params.update(study.best_params)
params['learning_rate'] = low_learning_rate

model_stage2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boosting_rounds, 
                         evals=[(dtrain, 'train'), (dvalid, 'valid')], 
                         early_stopping_rounds=50,
                         verbose_eval=0)

In [563]:
model_stage2.best_iteration # got the best iteration from stage 2 training

9999

In [564]:
model_final = xgb.train(params=params, dtrain=dtrain_valid,
                        num_boost_round = model_stage2.best_iteration,
                        verbose_eval=0)

In [565]:
model_path = f'../Saved_Data/{config['model_save_name']}'
model_final.save_model(model_path)

In [566]:
y_pred_test_prob = model_final.predict(dtest)
#y_probmax = np.argmax(y_pred_test, axis=1)
#print(y_pred_test)
classes = ['B', 'H', 'S']

y_pred_test = np.argmax(y_pred_test_prob, axis=1)

test_dist = pd.Series(y_test).value_counts().sort_index()
print(f"Class distribution in test set : \n {test_dist}")

pred_dist = pd.Series(y_pred_test).value_counts().sort_index()
print(f"Class distribution in predictions : \n {pred_dist}")

conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_matrix_df = pd.DataFrame(conf_matrix, index=classes, columns=classes)

print("Confusion Matrix : ")
print(conf_matrix_df)
#Evaluate test set
print(f"Test Accuracy : {accuracy_score(y_test, y_pred_test)}")
print("classification_report : ")
print(classification_report(y_test, y_pred_test))

Class distribution in test set : 
 Encoded_Label
0     159
1    2580
2     161
Name: count, dtype: int64
Class distribution in predictions : 
 0      97
1    2716
2      87
Name: count, dtype: int64
Confusion Matrix : 
    B     H   S
B  91    67   1
H   6  2569   5
S   0    80  81
Test Accuracy : 0.9451724137931035
classification_report : 
              precision    recall  f1-score   support

           0       0.94      0.57      0.71       159
           1       0.95      1.00      0.97      2580
           2       0.93      0.50      0.65       161

    accuracy                           0.95      2900
   macro avg       0.94      0.69      0.78      2900
weighted avg       0.94      0.95      0.94      2900



In [567]:
def objective_threshold(trial):
    # threshold values to be tuned
    thresholds = [trial.suggest_float(f'threshold_{i}', 0.1, 0.9) for i in range(y_pred_test_prob.shape[1])]
    
    y_pred = np.zeros_like(y_pred_test_prob)
    for class_idx, threshold in enumerate(thresholds):
        y_pred[:, class_idx] = (y_pred_test_prob[:, class_idx] >= threshold).astype(int)
    
    y_pred_final = np.argmax(y_pred, axis=1)

    
    # Calculate the F1 score for each class
    f1_sc = f1_score(y_test, y_pred_final, average='macro')
    
    return f1_sc

In [568]:
study_threshold = optuna.create_study(direction='maximize')
study_threshold.optimize(objective_threshold, n_trials=50)
best_thresholds = study_threshold.best_params
print(f"Best Thresholds : {best_thresholds}")

[I 2025-02-01 22:59:12,466] A new study created in memory with name: no-name-9c1a6bda-5ca9-4f9e-9b8f-71ef6c43d68a
[I 2025-02-01 22:59:12,578] Trial 0 finished with value: 0.6084714069228883 and parameters: {'threshold_0': 0.5359150151499734, 'threshold_1': 0.7942880506756551, 'threshold_2': 0.6804235054410629}. Best is trial 0 with value: 0.6084714069228883.
[I 2025-02-01 22:59:12,597] Trial 1 finished with value: 0.6479284878545714 and parameters: {'threshold_0': 0.29592964872118854, 'threshold_1': 0.15356458277760457, 'threshold_2': 0.1524762830907961}. Best is trial 1 with value: 0.6479284878545714.
[I 2025-02-01 22:59:12,604] Trial 2 finished with value: 0.6648114392190222 and parameters: {'threshold_0': 0.4622752086469655, 'threshold_1': 0.7776131799224427, 'threshold_2': 0.5996309544298608}. Best is trial 2 with value: 0.6648114392190222.
[I 2025-02-01 22:59:12,611] Trial 3 finished with value: 0.7361982613931407 and parameters: {'threshold_0': 0.12094637407608957, 'threshold_1':

Best Thresholds : {'threshold_0': 0.3189756333479579, 'threshold_1': 0.6311117939025653, 'threshold_2': 0.12165625162493515}


In [569]:
with open('../Saved_Data/minmax_scaling_params.json', 'r') as f:
    minmax_scaling_params = json.load(f)

# Load standard scaler
with open('../Saved_Data/g2_standard_scaler.pkl', 'rb') as f:
    stand_scaler = pickle.load(f)

with open('../Saved_Data/label_encoder.pkl', 'rb') as f:
    loaded_label_encoder = pickle.load(f)

In [570]:
## Load the new data to run inference
#new_data = pd.read_excel(config['data_excel_path'], sheet_name='TestNew')
new_data_start_date = pd.to_datetime('2024-01-01')
new_data_end_date = pd.to_datetime('2024-12-31')
new_data = all_trading_data[(all_trading_data['Date'] >= new_data_start_date) & (all_trading_data['Date'] <= new_data_end_date)]
new_data = new_data.reset_index(drop=True)

In [571]:
new_data.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2024-01-02 09:30:00,SPY,472.16,472.8,472.05,472.67,2339778
1,2024-01-02 09:35:00,SPY,472.67,472.74,471.88,471.92,1574945
2,2024-01-02 09:40:00,SPY,471.92,472.1,471.71,471.8,1634708
3,2024-01-02 09:45:00,SPY,471.79,472.09,471.39,471.39,1398881
4,2024-01-02 09:50:00,SPY,471.395,471.95,471.36,471.42,1396561


In [572]:
bolband_period = config['bolband_period']
donchn_period = config['donchn_period']
max_MA_period = 100
max_EMA_period = 20
tchr_period = config['tchr_period']
adwm_period = config['adwm_period']
atr_period = config['atr_period']
volume_ma_period = config['volume_ma_period']
print(f" Fourier window : {fourier_lookback_window}")
max_window = max(bolband_period, 
                 donchn_period, 
                 max_MA_period, 
                 max_EMA_period,
                 tchr_period,
                 adwm_period,
                 atr_period,
                 volume_ma_period, fourier_lookback_window)


 Fourier window : 100


In [573]:
def process_data(raw_data):
    trades = []
    balance = 0
    profit_amount = 10
    profit_count = 0
    loss_count = 0
    loss_amount = profit_amount * config['risk']
    trading_signals = pd.DataFrame(columns=['Date','High', 'Low', 'Open', 'Close', 'Signal'])
    thres_opt = config['use_threshold_optimization']
    print(f"Use threshold optimization : {thres_opt}")
                                   
    global label_encoder
    # This is the dataframe to which we will be adding the rows live
    historical_data = pd.DataFrame(columns=['Date','High', 'Low', 'Open', 'Close', 'Volume'])
    numeric_columns = ['High', 'Low', 'Open', 'Close', 'Volume'] 
    historical_data[numeric_columns] = historical_data[numeric_columns].apply(pd.to_numeric)
    predictions = []
    for index,row in raw_data.iterrows():
        #print(f'volume : {row['Volume']}')
        if index % 100 == 0:
            print(f'Index : {index}')
        if index > 2000:
            break
        new_row = pd.DataFrame({'Date': [row['Date']], 'High': [row['High']], 'Low': [row['Low']], 'Open': [row['Open']], 'Close': [row['Close']], 'Volume': [row['Volume']]})
        signal_row = pd.DataFrame({'Date': [row['Date']], 'High': [row['High']], 'Low': [row['Low']], 'Open': [row['Open']], 'Close': [row['Close']], 'Signal': ['N']})
        trading_signals = pd.concat([trading_signals, signal_row], ignore_index=True)
        historical_data = pd.concat([historical_data, new_row], ignore_index=True)
        historical_data['Volume'] = historical_data['Volume'].astype(int)
        #print(historical_data['Volume'])
        #print(f'historical data length : {len(historical_data)}')
        if len(historical_data) > max_window:
            #print(f"At index : {index}")
            #print(f"length of historical data : {len(historical_data)} , so splicing")
            historical_data = historical_data.iloc[-max_window:].reset_index(drop=True)
            #print(f"Now lenngth : {len(historical_data)} and max window : {max_window}")
            
        if len(historical_data) >= max_window:
            updated_data = add_new_features_df(historical_data.copy())
            #print("Historical data 1 : ")
            #print(historical_data[['Volume']].tail())
            #last_row_to_print = historical_data.iloc[[-1]]
            #for column, value in last_row_to_print.items():
            #    print(f'{column} : {value}')
            #updated_data.info()
            inf_features = get_features(updated_data.copy(), inference=True)
            #print("Historical data 2 : ")
            #print(historical_data[['Volume']].tail())
            last_row_features = inf_features.iloc[[-1]]
            last_row_features = last_row_features[all_feature_columns]
            if last_row_features.isna().any().any():
                print(f'The inference row at index : {index} contains na')
                #last_row_to_print = updated_data.iloc[[-1]]
                #for column, value in last_row_to_print.items():
                #    print(f'{column} : {value}')
                print(historical_data[['Volume']].tail())
                break
                continue
        
            last_row_dm = xgb.DMatrix(last_row_features, enable_categorical=True)
            prob_prediction = model_final.predict(last_row_dm)
            if thres_opt:
                prob_prediction_final = np.zeros_like(prob_prediction)
                for class_idx, threshold in enumerate(best_thresholds.values()):
                    prob_prediction_final[:, class_idx] = (prob_prediction[:, class_idx] > threshold).astype(int)

                predicted_class_index = np.argmax(prob_prediction_final, axis=1)  
            else:
                predicted_class_index = np.argmax(prob_prediction, axis=1)  
            #predicted_class_index = np.argmax(prob_prediction, axis=1)
            prediction = label_encoder.inverse_transform(predicted_class_index)
            #print(f'Prediction : {prediction}')
            trading_signals.loc[trading_signals.index[-1], 'Signal'] = prediction[0]
            predictions.append(prediction[0])
            for trade in trades:
                if (trade['Type'] == 'B' and row['High'] >= trade['TakeProfit']) or (trade['Type'] == 'S' and row['Low'] <= trade['TakeProfit']):
                    trade['Active'] = 'N'
                    balance += profit_amount
                    profit_count += 1
                    print(f"Profit, New Balance : {balance}")
                elif (trade['Type'] == 'B' and row['Low'] <= trade['StopLoss']) or (trade['Type'] == 'S' and row['High'] >= trade['StopLoss']):
                    trade['Active'] = 'N'
                    balance -= loss_amount
                    loss_count += 1
                    print(f"Loss, New Balance : {balance}")
            
            # Filter out all trades that are not active
            trades = [trade for trade in trades if trade['Active'] == 'Y']
            
            if len(trades) == 0:
                if prediction[0] == 'B':
                    take_profit = updated_data.iloc[-1]['Take_Profit_Level'] 
                    trades.append({"Type": "B", 
                                "TakeProfit": row['Close'] + take_profit,
                                "StopLoss": row['Close'] - (take_profit * config['risk']),
                                "Active": "Y"
                                })
                elif prediction[0] == 'S':
                    take_profit = updated_data.iloc[-1]['Take_Profit_Level'] 
                    trades.append({"Type": "S", 
                                "TakeProfit": row['Close'] - take_profit,
                                "StopLoss": row['Close'] + (take_profit * config['risk']),
                                "Active": "Y"
                                })
        


    #print(f' Predictions : {predictions}')
    counts = pd.Series(predictions).value_counts()
    print(counts)
    print(f"Final Balance : {balance} \n Profit count : {profit_count} \n Loss count : {loss_count}")
    
    return trading_signals

            

        
    

In [574]:
new_data.isna().any(axis=1).sum()

np.int64(0)

In [555]:
print(volume_ma_period)

10


In [575]:
new_data.shape[0]

19473

In [576]:
tr_signals = process_data(new_data)


Use threshold optimization : True
Index : 0



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Index : 100
Index : 200
Index : 300
Loss, New Balance : -5.0
Index : 400
Index : 500
Index : 600
Index : 700
Index : 800
Index : 900
Index : 1000
Index : 1100
Loss, New Balance : -10.0
Index : 1200
Profit, New Balance : 0.0
Index : 1300
Index : 1400
Index : 1500
Profit, New Balance : 10.0
Index : 1600
Index : 1700
Index : 1800
Index : 1900
Index : 2000
H    1892
S       9
B       1
Name: count, dtype: int64
Final Balance : 10.0 
 Profit count : 2 
 Loss count : 2


In [577]:
#trading_data['Date'] = pd.to_datetime(trading_data['Date'])
fig = go.Figure(data=[go.Candlestick(x=tr_signals['Date'], open=tr_signals['Open'], high=tr_signals['High'], low=tr_signals['Low'], close=tr_signals['Close'])])
fig.update_layout(title='CandleStick Chart Trading Signals SPY', xaxis_title='Date', yaxis_title='Price', xaxis_rangeslider_visible=False, yaxis=dict(fixedrange=False), xaxis=dict(type='category'))
buy_signals = tr_signals[tr_signals['Signal'] == 'B']
sell_signals = tr_signals[tr_signals['Signal'] == 'S']
fig.add_trace(go.Scatter(x=buy_signals['Date'], y=buy_signals['Low'], mode='markers', name='Buy Signal', marker=dict(color='blue', size=10)))
fig.add_trace(go.Scatter(x=sell_signals['Date'], y=sell_signals['High'], mode='markers', name='Sell Signal', marker=dict(color='yellow', size=10)))
fig.show()

In [None]:
def simulate_returns_trading_signal(ts_df):
    