In [10]:

from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', None)

In [2]:

df22 = pd.read_csv("/Users/sibysuriyan/Documents/code_projects/TradingApplication/machine_learning/price_levels/price_level22.csv")

In [3]:
def time_independent_features(df, price_level_num):
    # Mid Price
    df["Mid Price"] = (df["Bid Price Level 1"] + df["Ask Price Level 1"]) / 2
    # Mid Price Velocity
    df["Mid Price Velocity"] = df["Mid Price"] - df["Mid Price"].shift(1)
    # Mid Price Acceleration
    df["Mid Price Acceleration"] = df["Mid Price Velocity"] - df["Mid Price Velocity"].shift(1)
    # Bid Ask Spread
    df["Bid Ask Spread"] = (df["Bid Price Level 1"] - df["Ask Price Level 1"])
    # Volume Level Log Bid & Ask
    for i in range(price_level_num):
        df[f"Bid Volume Level {i+1} Log"] = np.log(df[f"Bid Volume Level {i+1}"])
    for i in range(price_level_num):
        df[f"Ask Volume Level {i+1} Log"] = np.log(df[f"Ask Volume Level {i+1}"])
    # Volume Level Difference Bid & Ask
    for i in range(1,price_level_num):
        
        df[f'Bid Volume Diff {i}'] = df[f'Bid Volume Level {i+1}'] - df[f'Bid Volume Level {i}']
    for i in range(1,price_level_num):
        df[f'Ask Volume Diff {i}'] = df[f'Ask Volume Level {i+1}'] - df[f'Ask Volume Level {i}']
    # Imbalance
    df["Imbalance"] = (df["Bid Volume Level 1"] - df["Ask Volume Level 1"]) / (df["Bid Volume Level 1"] + df["Ask Volume Level 1"])
    # VWAP Bid & Ask
    def vwap_helper(df, side, levels):
        total_volume = sum(df[[f"{side} Volume Level {i}" for i in range(1,levels+1)]])
        vwap = sum(df[f'{side} Price Level {i}'] * df[f'{side} Volume Level {i}'] for i in range(1, levels+1)) / total_volume
        return vwap
    df[f"Bid VWAP {price_level_num}"] = df.apply(lambda row: vwap_helper(row, "Bid", price_level_num), axis = 1)
    df[f"Ask VWAP {price_level_num}"] = df.apply(lambda row: vwap_helper(row, "Ask", price_level_num), axis = 1)
    # Total Volume Bid & Ask
    df[f"Total Bid Volume {price_level_num}"] = df[[f'Bid Volume Level {i}' for i in range(1, price_level_num+1)]].sum(axis=1)
    df[f"Total Ask Volume {price_level_num}"] = df[[f'Ask Volume Level {i}' for i in range(1, price_level_num+1)]].sum(axis=1)
    # Price Range
    df["Price Range"] = df["Ask Price Level 10"] - df["Bid Price Level 10"]
    # Time Difference
    df['Timestamp'] = pd.to_datetime(df['Timestamp'],format='ISO8601')
    df['Time Since Last Update'] = df['Timestamp'].diff().dt.total_seconds()

    return df
    

In [4]:
def time_dependent_features(df, windows):
    
    # Mid Price Volatility Calculation
    def volatility_helper(i, series, window):
        if i - window + 1 >= 0:
            std = series[i-window+1:i+1].std()
            return std
        return np.nan
    
    mid_price_series = df["Mid Price"]
    
    for window in windows:
        if window == 1:
            continue
        df[f"Mid Price Volatility {window}"] = df.index.to_series().apply(
            lambda i: volatility_helper(i, mid_price_series, window)
        )

    def create_sma_helper(series, window):
        sma = [None] * len(series)
        for i in range(len(series)):
            if i >= window:
                sma[i] = sum(series[i-window+1:i+1]) / window
        return sma

    def create_rate_of_change_helper(series):
        roc = [None]*len(series)
        for i in range(1,len(series)):
            if series[i-1]!=0:
                roc[i] = (series[i]-series[i-1] / series[i-1])
        return roc

    def create_ema_helper(series, window):
        alpha = 2 / (window+1)
        ema = [None]*len(series)
        ema[0] = series[0]
        for i in range(1, len(series)):
            ema[i] = (alpha * series[i] + (1-alpha) * ema[i-1])
        return ema

    def create_rsi_helper(series, window):
        deltas = [None]*len(series)
        for i in range(1,len(series)):
            deltas[i] = series[i] - series[i-1]
    
        gains = [max(delta, 0) if delta is not None else None for delta in deltas]
        losses = [-min(delta, 0) if delta is not None else None for delta in deltas]
    
        avg_gains = create_sma_helper(gains,window)
        avg_losses = create_sma_helper(losses,window)
    
        rs = [None]*len(series)
        rsi = [None]* len(series)
    
        for i in range(len(series)):
            if avg_gains[i] is None or avg_losses[i] is None:
                continue
                
            if avg_losses[i] == 0:
                rsi[i] = 100
            else:
                rs_value = avg_gains[i] / avg_losses[i]
                rs[i] = rs_value
                rsi[i] = (100 - (100 / (1 + rs_value)))
        return rsi

    for window in windows:
        df[f"Mid Price SMA {window}"] = create_sma_helper(df["Mid Price"], window)

    df["Mid Price ROC"] = create_rate_of_change_helper(df["Mid Price"])

    for window in windows:
        df[f"Mid Price EMA {window}"] = create_ema_helper(df["Mid Price"], window)

    df["Mid Price RSI"] = create_rsi_helper(series = df["Mid Price"], window=14)

    return df


    
    

In [5]:
def create_dataset_helper(df, target_lag_column, price_level_num, time_lag, windows, percent_gain):

    # Feature Columns
    df_copy = df.copy()
    df_copy = time_independent_features(df_copy,price_level_num)
    df_copy = time_dependent_features(df_copy, windows)

    return df_copy


def create_dataset(dfs, target_lag_column, price_level_num, time_lag, windows, percent_gain):
    res_df = pd.DataFrame()
    
    for df in dfs:
        df_done = create_dataset_helper(df, target_lag_column, price_level_num, time_lag, windows, percent_gain)
        res_df = pd.concat([res_df, df_done], ignore_index = True)

    return res_df

In [6]:
def graph(y_actual, y_pred, start = None, end = None):

    if not start:
        start = 0
    if not end:
        end = len(y_actual)

    
    plt.figure(figsize=(10, 5))
    plt.plot(np.arange(len(y_actual))[start:end], y_actual[start:end], label="Actual", color='blue', linewidth=2)
    plt.plot(np.arange(len(y_pred))[start:end], y_pred[start:end], label="Predicted", color='red', linestyle='dashed', linewidth=2)
    #plt.plot(np.arange(len(mid_price))[start:end], mid_price[start:end], label="Mid Price", color='green', linestyle='dashed', linewidth=2)
    plt.xlabel("Time Steps")
    plt.ylabel("Value")
    plt.title("Actual vs Predicted Over Time")
    plt.legend()
    plt.grid(True)
    plt.show()
    

In [7]:
##################################
######### CREATE DATASET #########
##################################
dfs = [df22]
target_lag_column = "Mid Price"
price_level_num = 10
time_lag = 3
windows = [1,2,5,10]
percent_gain = 0.00000
full_df = create_dataset(dfs=dfs, target_lag_column = target_lag_column, price_level_num = price_level_num, time_lag = time_lag, windows = windows, percent_gain = percent_gain)

In [8]:
##########################################
###### EVALUATE INFERENCE PROCESS ########
##########################################
class BinaryClassifier:
    def __init__(self, price_level_num):
        self.price_level_num = price_level_num
        self.historical_inference_vectors = pd.DataFrame()

    def bc_create_inference_vector(self, bids, asks, timestamp_str):
        inf_row = {}

        for i in range(self.price_level_num):
            inf_row[f"Bid Price Level {i+1}"] = bids[i][0]
            inf_row[f"Bid Volume Level {i+1}"] = bids[i][1]

        for i in range(self.price_level_num):
            inf_row[f"Ask Price Level {i+1}"] = asks[i][0]
            inf_row[f"Ask Volume Level {i+1}"] = asks[i][1]


        inf_row["Timestamp"] = timestamp_str

        inf_df = pd.DataFrame([inf_row])

        inf_df = self.bc_time_independent_features(inf_df, self.price_level_num)

        inf_df = self.bc_time_dependent_features(inf_df, [1,2,5,10])


        self.historical_inference_vectors = pd.concat([self.historical_inference_vectors,inf_df], ignore_index= True)

        if len(self.historical_inference_vectors) >= 500:

            self.historical_inference_vectors = self.historical_inference_vectors.tail(200)
            

        return inf_df




    def bc_time_independent_features(self,df, price_level_num):

        # Mid Price
        df["Mid Price"] = (df["Bid Price Level 1"] + df["Ask Price Level 1"]) / 2

        # Mid Price Velocity
        if len(self.historical_inference_vectors) < 1:
            df["Mid Price Velocity"] = np.nan
        else:
            df["Mid Price Velocity"] = df["Mid Price"] - self.historical_inference_vectors.iloc[-1]["Mid Price"] # df["Mid Price"].shift(1)

        # Mid Price Acceleration
        if len(self.historical_inference_vectors) < 2:
            df["Mid Price Acceleration"] = np.nan
        else:
            df["Mid Price Acceleration"] = df["Mid Price Velocity"] - self.historical_inference_vectors.iloc[-1]["Mid Price Velocity"] # df["Mid Price Velocity"].shift(1)
        
        # Bid Ask Spread
        df["Bid Ask Spread"] = (df["Bid Price Level 1"] - df["Ask Price Level 1"])
        
        # Volume Level Log Bid & Ask
        for i in range(price_level_num):
            df[f"Bid Volume Level {i+1} Log"] = np.log(df[f"Bid Volume Level {i+1}"])
        for i in range(price_level_num):
            df[f"Ask Volume Level {i+1} Log"] = np.log(df[f"Ask Volume Level {i+1}"])
        
        # Volume Level Difference Bid & Ask
        for i in range(1,price_level_num):
            df[f'Bid Volume Diff {i}'] = df[f'Bid Volume Level {i+1}'] - df[f'Bid Volume Level {i}']
        for i in range(1,price_level_num):
            df[f'Ask Volume Diff {i}'] = df[f'Ask Volume Level {i+1}'] - df[f'Ask Volume Level {i}']
        
        # Imbalance
        df["Imbalance"] = (df["Bid Volume Level 1"] - df["Ask Volume Level 1"]) / (df["Bid Volume Level 1"] + df["Ask Volume Level 1"])
        
        # VWAP Bid & Ask
        def vwap_helper(df, side, levels):
            total_volume = sum(df[[f"{side} Volume Level {i}" for i in range(1,levels+1)]].values)
            vwap = sum(df[f'{side} Price Level {i}'] * df[f'{side} Volume Level {i}'] for i in range(1, levels+1)) / total_volume
            return vwap
        df[f"Bid VWAP {price_level_num}"] = df.apply(lambda row: vwap_helper(row, "Bid", price_level_num), axis = 1)
        df[f"Ask VWAP {price_level_num}"] = df.apply(lambda row: vwap_helper(row, "Ask", price_level_num), axis = 1)
       
        # Total Volume Bid & Ask
        df[f"Total Bid Volume {price_level_num}"] = df[[f'Bid Volume Level {i}' for i in range(1, price_level_num+1)]].sum(axis=1)
        df[f"Total Ask Volume {price_level_num}"] = df[[f'Ask Volume Level {i}' for i in range(1, price_level_num+1)]].sum(axis=1)
        
        # Price Range
        df["Price Range"] = df["Ask Price Level 10"] - df["Bid Price Level 10"]
        
        # Time Difference
        df['Timestamp'] = pd.to_datetime(df['Timestamp'],format='ISO8601')
        if len(self.historical_inference_vectors) >= 1:
            df['Time Since Last Update'] = (df['Timestamp'] - self.historical_inference_vectors.iloc[-1]["Timestamp"]).dt.total_seconds()
        else:
            df['Time Since Last Update'] = np.nan

        return df


    def bc_time_dependent_features(self, df, windows):

        temp_df = pd.concat([self.historical_inference_vectors, df], ignore_index = True)
                
        # Mid Price Volatility Calculation
        def volatility_helper(series, window):
            if len(series) >= window:
                std = series.tail(window).std()
                return std
            return np.nan
        
        for window in windows:
            if window == 1:
                continue
            df[f"Mid Price Volatility {window}"] = volatility_helper(temp_df["Mid Price"], window)

        

        def create_sma_helper(series, window):

            if len(series) < window:
                return np.nan

            return sum(series.tail(window)) / window
        
        for window in windows:
            df[f"Mid Price SMA {window}"] = create_sma_helper(temp_df["Mid Price"], window)


        
        def create_rate_of_change_helper(series):
            if len(series) < 2:
                return np.nan

            return (series.iloc[-1] - series.iloc[-2]) / (series.iloc[-2])
        
        df[f"Mid Price ROC"] = create_rate_of_change_helper(temp_df["Mid Price"])


        
        def create_ema_helper(series, window):
            alpha = 2 / (window+1)
            ema = [None]*len(series)
            if len(self.historical_inference_vectors) == 0:
                ema[0] = df["Mid Price"]
            else:
                ema[0] = self.historical_inference_vectors.iloc[-1][f"Mid Price EMA {window}"]

            for i in range(1, len(series)):
                ema[i] = (alpha * series[i] + (1-alpha) * ema[i-1])
            return ema[-1]
        
        for window in windows:
            df[f"Mid Price EMA {window}"] = create_ema_helper(temp_df["Mid Price"], window)


        
        def create_rsi_helper(series, window):
            deltas = [None]*len(series)
            for i in range(1,len(series)):
                deltas[i] = series[i] - series[i-1]
        
            gains = [max(delta, 0) if delta is not None else None for delta in deltas]
            losses = [-min(delta, 0) if delta is not None else None for delta in deltas]
        
            avg_gain = create_sma_helper(pd.Series(gains),window)
            avg_loss = create_sma_helper(pd.Series(losses),window)

            if avg_loss == None or avg_gain == None:
                return None

            if avg_loss == 0:
                return 100

            rs = avg_gain / avg_loss
                
            rsi =  (100 - (100 / (1 + rs)))
            
            return rsi
        

        df["Mid Price RSI"] = create_rsi_helper(temp_df["Mid Price"], window=14)

        return df


    def create_training_check(self, df, price_level_num, windows):
        df_copy = df.copy()
        df_copy = time_independent_features(df_copy, price_level_num)
        df_copy = time_dependent_features(df_copy, windows)
        return df_copy
    

In [9]:
binary_classifier = BinaryClassifier(10)
inf_rows = []
for i in range(len(df22)):
    new_row = df22.iloc[i]
    bids = []
    asks = []
    timestamp_str = new_row["Timestamp"]
    for j in range(10):
        bids.append((new_row[f"Bid Price Level {j+1}"],new_row[f"Bid Volume Level {j+1}"]))
    for j in range(10):
        asks.append((new_row[f"Ask Price Level {j+1}"],new_row[f"Ask Volume Level {j+1}"]))
    new_x = binary_classifier.bc_create_inference_vector(bids, asks, timestamp_str)
    inf_rows.append(new_x)

inf_df = pd.concat(inf_rows)



In [26]:
if len(inf_df) != len(full_df):
    print(len(inf_df), len(full_df))
else:
    columns = inf_df.columns

    for i in range(len(full_df)):
        for column_name in columns:
            train_value = full_df.iloc[i][column_name]
            inf_value = full_df.iloc[i][column_name]
            if inf_value != train_value:
                print(f"Train value: {train_value}; inf value: {inf_value}; {i, column_name}")



Train value: nan; inf value: nan; (0, 'Mid Price Velocity')
Train value: nan; inf value: nan; (0, 'Mid Price Acceleration')
Train value: nan; inf value: nan; (0, 'Time Since Last Update')
Train value: nan; inf value: nan; (0, 'Mid Price Volatility 2')
Train value: nan; inf value: nan; (0, 'Mid Price Volatility 5')
Train value: nan; inf value: nan; (0, 'Mid Price Volatility 10')
Train value: nan; inf value: nan; (0, 'Mid Price SMA 1')
Train value: nan; inf value: nan; (0, 'Mid Price SMA 2')
Train value: nan; inf value: nan; (0, 'Mid Price SMA 5')
Train value: nan; inf value: nan; (0, 'Mid Price SMA 10')
Train value: nan; inf value: nan; (0, 'Mid Price ROC')
Train value: nan; inf value: nan; (0, 'Mid Price RSI')
Train value: nan; inf value: nan; (1, 'Mid Price Acceleration')
Train value: nan; inf value: nan; (1, 'Mid Price Volatility 5')
Train value: nan; inf value: nan; (1, 'Mid Price Volatility 10')
Train value: nan; inf value: nan; (1, 'Mid Price SMA 2')
Train value: nan; inf value: n

In [12]:
feature_columns = [
    # 'Mid Price', 
    
    'Bid Ask Spread',
    
    'Bid Volume Level 1','Bid Volume Level 2','Bid Volume Level 3',
    # 'Bid Volume Level 4',
    # 'Bid Volume Level 5','Bid Volume Level 6','Bid Volume Level 7', 'Bid Volume Level 8',
    # 'Bid Volume Level 9','Bid Volume Level 10',
    
    'Ask Volume Level 1','Ask Volume Level 2','Ask Volume Level 3', 
    # 'Ask Volume Level 4',
    # 'Ask Volume Level 5','Ask Volume Level 6','Ask Volume Level 7', 'Ask Volume Level 8',
    # 'Ask Volume Level 9','Ask Volume Level 10',

    'Bid Price Level 1','Bid Price Level 2','Bid Price Level 3', 
    # 'Bid Price Level 4',
    # 'Bid Price Level 5','Bid Price Level 6','Bid Price Level 7', 'Bid Price Level 8',
    # 'Bid Price Level 9','Bid Price Level 10',
    
    'Ask Price Level 1','Ask Price Level 2','Ask Price Level 3', 
    # 'Ask Price Level 4',
    # 'Ask Price Level 5','Ask Price Level 6','Ask Price Level 7', 'Ask Price Level 8',
    # 'Ask Price Level 9','Ask Price Level 10',
    
    'Bid Volume Level 1 Log','Bid Volume Level 2 Log','Bid Volume Level 3 Log', 'Bid Volume Level 4 Log',
    'Bid Volume Level 5 Log',
    # 'Bid Volume Level 6 Log','Bid Volume Level 7 Log', 'Bid Volume Level 8 Log',
    # 'Bid Volume Level 9 Log','Bid Volume Level 10 Log',
    
    'Ask Volume Level 1 Log','Ask Volume Level 2 Log','Ask Volume Level 3 Log', 'Ask Volume Level 4 Log',
    'Ask Volume Level 5 Log',
    # 'Ask Volume Level 6 Log','Ask Volume Level 7 Log', 'Ask Volume Level 8 Log',
    # 'Ask Volume Level 9 Log','Ask Volume Level 10 Log',
    
    # 'Bid Volume Diff 1', 'Bid Volume Diff 2', 'Bid Volume Diff 3',
    # 'Bid Volume Diff 4', 'Bid Volume Diff 5', 'Bid Volume Diff 6',
    # 'Bid Volume Diff 7', 'Bid Volume Diff 8', 'Bid Volume Diff 9',
    
    # 'Ask Volume Diff 1', 'Ask Volume Diff 2', 'Ask Volume Diff 3',
    # 'Ask Volume Diff 4', 'Ask Volume Diff 5', 'Ask Volume Diff 6',
    # 'Ask Volume Diff 7', 'Ask Volume Diff 8', 'Ask Volume Diff 9',
    
    'Imbalance',
    'Mid Price Velocity',
    'Mid Price Acceleration',
    'Time Since Last Update',
    'Bid VWAP 10',
    'Ask VWAP 10', 
    # 'Total Bid Volume 10',
    # 'Total Ask Volume 10', 
    'Price Range', 
    "Mid Price Volatility 2","Mid Price Volatility 5","Mid Price Volatility 10",
    # 'Mid Price SMA 1', 'Mid Price SMA 2', 'Mid Price SMA 5','Mid Price SMA 10',
    # 'Mid Price ROC',
    # 'Mid Price EMA 1', 'Mid Price EMA 2', 'Mid Price EMA 5', 'Mid Price EMA 10',
    'Mid Price RSI'
]

In [13]:
full_X = full_df[feature_columns]
inf_X = inf_df[feature_columns]

filename = "3_update_lag_xgbclassifier1.pkl"
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

filename_scaler = "3_update_lag_xgbclassifier1_scaler.pkl"
with open(filename_scaler, "rb") as file:
    loaded_scaler = joblib.load(file)
full_X_dropna = full_X.dropna(ignore_index=True)
inf_X_dropna = inf_X.dropna(ignore_index=True)
full_X_dropna_scaled = loaded_scaler.transform(full_X_dropna)
inf_X_dropna_scaled = loaded_scaler.transform(inf_X_dropna)
full_proba = loaded_model.predict_proba(full_X_dropna_scaled)
inf_proba = loaded_model.predict_proba(inf_X_dropna_scaled)

In [14]:
full_proba

array([[0.29097529, 0.70902471],
       [0.40721197, 0.59278803],
       [0.40790522, 0.59209478],
       ...,
       [0.72336154, 0.27663846],
       [0.74472863, 0.25527137],
       [0.84353586, 0.15646414]])

In [15]:
inf_proba

array([[0.29097529, 0.70902471],
       [0.40721197, 0.59278803],
       [0.40790522, 0.59209478],
       ...,
       [0.72336154, 0.27663846],
       [0.74472863, 0.25527137],
       [0.84353586, 0.15646414]])