In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.5f' % x)

import Config
import datetime
import time
import copy
import BS

from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, precision_score

from pymongo import MongoClient
client=MongoClient(Config.DB_Hostname,Config.DB_Port)

class Gap_Move_Classifier:
    """Feature builder, hyperparameter tuner and predictor for the gap-move direction model.

    The "gap move" of a date is the relative change from the ``open`` of the candle
    with ``batch_id == 375 - (start_holding_period - 1)`` on the previous row (date)
    to the ``close`` of the candle with ``batch_id == end_holding_period`` on the
    date itself. An XGBoost classifier is trained to predict whether that move is
    non-negative (``gap_move_flag == 1``).

    All inputs are read from, and all results written to, the MongoDB instance
    behind the module-level ``client``.
    """

    def __init__(self, start_date, end_date, prediction_date, underlying, start_holding_period, end_holding_period):
        """Download prices, vol surface and expiry calendar, then build the feature matrices.

        Parameters
        ----------
        start_date, end_date : str
            Inclusive date bounds, compared directly against the ``date`` field of
            the stored documents (ISO ``YYYY-MM-DD`` strings in the calling cells).
        prediction_date : str
            First date for which ``Predictor`` produces a prediction.
        underlying : str
            Symbol used to select collections/documents, e.g. ``NIFTY`` or ``BANKNIFTY``.
        start_holding_period, end_holding_period : int
            Holding periods in minutes before the previous close / after the open.

        Raises
        ------
        Exception
            If ``start_date`` is not strictly before ``prediction_date``.
        """
        self.start_date = start_date #2017-01-01
        self.end_date = end_date #2017-01-01
        self.prediction_date = prediction_date #2017-01-01
        self.underlying = underlying #NIFTY, BANKNIFTY
        self.underlying_dynamics = None
        self.vol_surface = None
        self.days_to_expiry = None
        self.data_matrix = None
        self.return_matrix = None
        self.start_holding_period = start_holding_period #in minutes
        self.end_holding_period = end_holding_period #in minutes
        self.XGBoost_Parameters = None
        if self.start_date>=self.prediction_date:
            raise Exception("Start date has to be before Prediction Date")

        # Intraday OHLC candles of the underlying, restricted to the requested window.
        self.underlying_dynamics = pd.DataFrame(client[f'{Config.Data_DB}'][f'{self.underlying}OHLC'].find())
        self.underlying_dynamics.drop(columns=['_id'],inplace=True)
        self.underlying_dynamics.sort_values(['date','batch_id'],inplace=True)
        self.underlying_dynamics=self.underlying_dynamics[(self.underlying_dynamics.date>=self.start_date)&(self.underlying_dynamics.date<=self.end_date)]
        self.underlying_dynamics.batch_id=self.underlying_dynamics.batch_id.astype(int)
        self.underlying_dynamics.set_index('date',inplace=True)

        # Vol surface for this symbol; every remaining column becomes a model feature.
        self.vol_surface=pd.DataFrame(client[f'{Config.Data_DB}']['Vol_Surface'].find({"SYMBOL" : self.underlying}))
        self.vol_surface.drop(columns=['_id', 'CONTRACTS', 'VAL_INLAKH','OPEN_INT', 'CHG_IN_OI', 'SYMBOL'], inplace=True)
        self.vol_surface.sort_values('date',inplace=True)
        self.vol_surface=self.vol_surface[(self.vol_surface.date>=self.start_date)&(self.vol_surface.date<=self.end_date)]
        self.vol_surface.set_index('date',inplace=True)

        # Days to expiry per date (the ``current_week`` / ``next_week`` columns used below).
        self.days_to_expiry = pd.DataFrame(client[f'{Config.Data_DB}']['Days_To_Expiry'].find({"underlying":self.underlying}))
        self.days_to_expiry.drop(columns = ['_id','underlying'],inplace=True)
        self.days_to_expiry.sort_values('date',inplace=True)
        self.days_to_expiry=self.days_to_expiry[(self.days_to_expiry.date>=self.start_date)&(self.days_to_expiry.date<=self.end_date)]
        self.days_to_expiry.set_index('date',inplace=True)

        self._make_features()

    def _option_return(self, row, pricer, days_to_expiry):
        """Return the relative price change of one option between entry and exit.

        ``pricer`` is ``BS.bs_call`` or ``BS.bs_put``; it is called positionally with
        (underlying price, strike, time to expiry in years, interest rate, vol).
        The strike is the entry price compounded at ``Config.interest_rate`` over
        ``days_to_expiry / 252`` years. The entry leg is priced once and reused for
        both the numerator and the denominator.
        """
        strike = row.open * np.exp(Config.interest_rate * days_to_expiry / 252)
        # Exit: priced on the exit candle, with the post-open holding minutes removed
        # from the remaining life (375 presumably = minutes per session — TODO confirm).
        exit_price = pricer(row.close,
                            strike,
                            (days_to_expiry - self.end_holding_period/375) / 252,
                            Config.interest_rate,
                            row.IVOL)
        # Entry: priced on the previous row's entry candle; the extra 0.75 presumably
        # stands for the overnight part of the day — TODO confirm.
        entry_price = pricer(row.open,
                             strike,
                             (days_to_expiry + 0.75 + self.start_holding_period/375) / 252,
                             Config.interest_rate,
                             row.IVOL)
        return (exit_price - entry_price) / entry_price

    def _make_features(self):
        """Build ``self.data_matrix`` (label + features) and ``self.return_matrix`` (option returns)."""
        vol_surface = copy.deepcopy(self.vol_surface)
        # Make sure end_date has a row so that the shifted (previous-row) values can land on it.
        if self.end_date not in vol_surface.index:
            vol_surface.loc[self.end_date] = [np.nan]*len(vol_surface.columns)

        # Entry price: open of the candle start_holding_period minutes before the last
        # candle (batch_id 375), shifted one row so it aligns with the following date.
        closing_candle = self.underlying_dynamics[self.underlying_dynamics.batch_id==375 - (self.start_holding_period - 1)][['open']]
        if self.end_date not in closing_candle.index:
            closing_candle.loc[self.end_date] = np.nan
        closing_candle=closing_candle.shift()

        # Exit price: close of candle number end_holding_period of the same date.
        opening_candle = self.underlying_dynamics[self.underlying_dynamics.batch_id==1 + (self.end_holding_period - 1)][['close']]
        if self.end_date not in opening_candle.index:
            opening_candle.loc[self.end_date] = np.nan

        candle = pd.merge(closing_candle, opening_candle, left_index=True, right_index=True)
        # Where the exit price is missing, fall back to the entry price (zero move)
        # so that the row is not dropped by the dropna() calls below.
        candle.close = np.where(candle.close.isna(), candle.open, candle.close)

        self.data_matrix = pd.DataFrame()
        self.data_matrix["gap_move"] = (candle.close - candle.open) / candle.open
        self.data_matrix["gap_move_flag"] = np.where(self.data_matrix.gap_move>=0, 1, 0)
        self.data_matrix["gap_move_lagged_1"] = self.data_matrix.gap_move.shift()
        self.data_matrix["gap_move_lagged_2"] = self.data_matrix.gap_move.shift(2)
        self.data_matrix["gap_move_lagged_5"] = self.data_matrix.gap_move.shift(5)
        # Only the previous row's vol surface is used as a feature for a given date.
        self.data_matrix = pd.merge(self.data_matrix, vol_surface.shift(), left_index = True, right_index = True).dropna()

        candle = pd.merge(candle, self.days_to_expiry, left_index=True, right_index=True)
        candle["IVOL"] = vol_surface["vol_0"].shift()
        candle.dropna(inplace = True)

        # The four option legs differ only in the pricer and the expiry column.
        option_legs = (("call_current_week_return", BS.bs_call, "current_week"),
                       ("call_next_week_return", BS.bs_call, "next_week"),
                       ("put_current_week_return", BS.bs_put, "current_week"),
                       ("put_next_week_return", BS.bs_put, "next_week"))
        for column, pricer, expiry_column in option_legs:
            candle[column] = candle.apply(
                lambda x, pricer=pricer, expiry_column=expiry_column: self._option_return(x, pricer, x[expiry_column]),
                axis = 1)

        self.return_matrix = candle[["call_current_week_return", "call_next_week_return", "put_current_week_return", "put_next_week_return"]]

    def _get_XGB_params(self):
        """Load the most recently dated tuned parameters for this underlying into ``self.XGBoost_Parameters``."""
        params = pd.DataFrame(client['Gap_Move_Strategy']['XGBoost_Filter_Params'].find({"underlying" : self.underlying}))
        params = params[params.date == sorted(params.date.unique())[-1]]
        params.drop(columns = ['_id', 'date', 'underlying', 'accuracy', 'max_accuracy', 'min_accuracy', 'sharpe'], inplace = True)

        # The tuner stores every value as a float; restore the types XGBoost expects.
        params.max_depth = params.max_depth.astype(int)
        params.n_estimators = params.n_estimators.astype(int)
        params.learning_rate = params.learning_rate.astype(float)
        params.reg_lambda = params.reg_lambda.astype(float)
        params.subsample = params.subsample.astype(float)
        params.sample_weight = params.sample_weight.astype(float)

        self.XGBoost_Parameters = params.to_dict('records')[0]

    def _get_scaled_parameters(self, split_index):
        """Split ``data_matrix`` into train/test at ``split_index`` rows from the end and standardise X.

        The scaler is fitted on the training rows only and then applied to the test rows.
        Returns ``(X_train, X_test, y_train, y_test)`` as numpy arrays; y has shape (n, 1).
        """
        sc = StandardScaler()
        y = self.data_matrix[['gap_move_flag']].values
        X = self.data_matrix.drop(columns = ['gap_move', 'gap_move_flag']).values

        X_train = sc.fit_transform(X[:-split_index])
        X_test = sc.transform(X[-split_index:])
        y_train = y[:-split_index]
        y_test = y[-split_index:]

        return X_train, X_test, y_train, y_test

    def _XGB(self, params):
        """Fit an XGBoost classifier with ``params`` and return its predictions for ``params['X_test']``.

        ``params['sample_weight']`` is the weight given to class-0 training rows;
        class-1 rows always get weight 1.

        Raises
        ------
        RuntimeError
            If model construction, fitting or prediction fails (original error chained).
        """
        try:

            model = XGBClassifier(nthread = -1,
                                  n_estimators = params['n_estimators'],
                                  learning_rate = params['learning_rate'],
                                  max_depth = params['max_depth'],
                                  reg_lambda = params['reg_lambda'],
                                  subsample = params['subsample'])

            model.fit(params['X_train'], params['y_train'],
                      sample_weight = [params['sample_weight'] if y==0 else 1 for y in params['y_train'].ravel()])

            return model.predict(params['X_test'])

        except Exception as e:
            raise RuntimeError(f"{e}, Error in running XGBoost") from e

    def _XGB_Grid_Simulator(self, X_train, X_test, y_train, y_test):
        """Evaluate every hyperparameter combination of the grid on one train/test split.

        Returns a float DataFrame with one row per combination (columns: the six
        hyperparameters plus ``accuracy``), ordered with ``sample_weight`` varying
        fastest and ``max_depth`` slowest.
        """
        from itertools import product

        max_depth_array = [2, 3, 4, 5, 6, 7] #no change
        n_estimators_array = [10, 30, 50, 100] #no change
        learning_rate_array = [0.01, 0.05, 0.075, 0.1, 0.5, 1] #no change
        reg_lambda_array = [0, 0.1, 0.01, 0.001] #no change
        subsample_array = [0.7, 0.8, 0.9] #no change
        sample_weight_array= [0.1, 0.25, 0.5, 0.67, 0.8, 0.9, 1, 1.1, 1.2, 1.5, 2, 4, 10, 50, 100]

        columns = ['max_depth', 'n_estimators', 'learning_rate', 'reg_lambda',
                   'subsample', 'sample_weight', 'accuracy']
        rows = []

        grid = product(max_depth_array, n_estimators_array, learning_rate_array,
                       reg_lambda_array, subsample_array, sample_weight_array)
        for max_depth, n_estimators, learning_rate, reg_lambda, subsample, sample_weight in grid:

            params = {'max_depth' : max_depth,
                      'n_estimators': n_estimators,
                      'learning_rate' : learning_rate,
                      'reg_lambda' : reg_lambda,
                      'subsample' : subsample,
                      'sample_weight' : sample_weight,
                      'X_train' : X_train,
                      'X_test' : X_test,
                      'y_train' : y_train}

            y_pred = self._XGB(params)
            # One row per combination. Previously the row counter only advanced once
            # per subsample value, so the 15 sample weights overwrote each other and
            # only the last one was kept.
            rows.append([max_depth,
                         n_estimators,
                         learning_rate,
                         reg_lambda,
                         subsample,
                         sample_weight,
                         accuracy_score(y_test, y_pred)])

        # Built in one go (row-by-row .loc growth is quadratic); float dtype matches
        # what the row-wise construction produced.
        return pd.DataFrame(rows, columns = columns, dtype = float)

    def Hyperparameter_Tuner(self):
        """Grid-search XGBoost parameters over five walk-forward folds and store the winner.

        Each fold trains on everything before the last ``split`` rows and tests on
        the first 50 rows after the training set. Accuracy is averaged over folds;
        ``sharpe`` is mean accuracy divided by the (max - min) accuracy range.
        Among the combinations with ``sharpe`` at or above the median, the one
        with the highest mean accuracy is written to
        ``Gap_Move_Strategy.XGBoost_Filter_Params`` (replacing any document for the
        same underlying and ``end_date``). The full table is also written to
        ``summary_XGB.csv`` in the working directory.
        """
        splits = [250, 200, 150, 100, 50]
        accuracy_list_XGB = []

        for split in splits:

            print(split)
            #preparing datasets
            X_train_split, X_test_split, y_train_split, y_test_split = self._get_scaled_parameters(split)

            # Keep only the first 50 test rows so every fold is scored on 50 rows.
            if split - 50 != 0:
                X_test_split = X_test_split[:-(split-50)]
                y_test_split = y_test_split[:-(split-50)]

            #XGBoost grid sim
            summary_XGB = self._XGB_Grid_Simulator(X_train_split, X_test_split, y_train_split, y_test_split)
            accuracy_list_XGB.append(summary_XGB.accuracy.tolist())

        #best XGB params (the last fold's frame is reused for its hyperparameter columns)
        summary_XGB.accuracy = np.array(accuracy_list_XGB).mean(axis=0)
        summary_XGB["underlying"] = self.underlying
        summary_XGB["date"] = self.end_date
        summary_XGB["max_accuracy"] = np.array(accuracy_list_XGB).max(axis=0)
        summary_XGB["min_accuracy"] = np.array(accuracy_list_XGB).min(axis=0)

        # NOTE(review): the range is zero when a combination scores the same on every
        # fold, which makes sharpe inf (or NaN when accuracy is 0).
        summary_XGB["sharpe"] = summary_XGB.max_accuracy - summary_XGB.min_accuracy
        summary_XGB.sharpe = summary_XGB.accuracy / summary_XGB.sharpe
        summary_XGB.to_csv('summary_XGB.csv')
        summary_XGB = summary_XGB[summary_XGB.sharpe>=summary_XGB.sharpe.quantile(0.5)].sort_values(['accuracy','max_accuracy','min_accuracy'], ascending=[False, False, True]).head(1)

        #inserting into XGB DB
        print(client['Gap_Move_Strategy']['XGBoost_Filter_Params'].delete_many({"date" : self.end_date, 'underlying' : self.underlying}).deleted_count,f" documents deleted for {self.underlying} and {self.end_date} from XGBoost_Filter_Params")
        print(len(client['Gap_Move_Strategy']['XGBoost_Filter_Params'].insert_many(summary_XGB.to_dict('records')).inserted_ids),f" documents entered for {self.underlying} and {self.end_date} into XGBoost_Filter_Params")

    def Predictor(self):
        """Predict the gap-move flag for every date >= ``prediction_date`` and store the results.

        For each date the model is retrained on all earlier rows using the stored
        parameters, and a single prediction is made. Predictions are joined with the
        option returns and written to ``Gap_Move_Strategy.Ensemble_Prediction``.
        """
        start = time.time()
        self._get_XGB_params()

        date_list = self.return_matrix.index.unique()
        initial_loc = date_list.get_loc(self.prediction_date)
        summary_XGB = pd.DataFrame(columns = ['date', 'XGB_prediction'])
        prediction_dates = date_list[date_list >= self.prediction_date]

        for date in sorted(prediction_dates):

            # NOTE(review): the split offset is derived from return_matrix dates but
            # applied to data_matrix rows; this assumes both end on the same rows.
            X_train, X_test, y_train, y_test = self._get_scaled_parameters(len(date_list) - initial_loc)
            # Only the first held-out row (the current date) is predicted.
            X_test = X_test[:1]
            initial_loc += 1

            params_XGB = copy.deepcopy(self.XGBoost_Parameters)
            params_XGB['X_train'] = X_train
            params_XGB['X_test'] = X_test
            params_XGB['y_train'] = y_train
            summary_XGB.loc[initial_loc] = [date, self._XGB(params_XGB).ravel()[0]]


        if len(summary_XGB)==len(prediction_dates):

            summary_XGB = pd.merge(summary_XGB.set_index('date'), self.return_matrix, left_index=True, right_index=True).dropna().reset_index()
            # NOTE(review): the delete is keyed on date only, so it also removes documents
            # written for other underlyings on the same dates — confirm this is intended.
            print(client['Gap_Move_Strategy']['Ensemble_Prediction'].delete_many({"date":{"$in":summary_XGB.date.unique().tolist()}}).deleted_count, f" records deleted")
            print(len(client['Gap_Move_Strategy']['Ensemble_Prediction'].insert_many(summary_XGB.to_dict('records')).inserted_ids), " records inserted")
            print(summary_XGB)
        else:
            print("Could not predict for all dates")

        print(f"Prediction Took {time.time() - start} Seconds")

In [2]:
# Run configuration for a single classifier instance (dates are ISO 'YYYY-MM-DD' strings).
start_date = '2015-01-09' # first data point; best left untouched
end_date = '2024-01-24' # last day for which you want a prediction - should be the next trading day (earlier values used: '2022-01-03', '2023-07-03')
prediction_date = '2024-01-19' # must be a legitimate date after start_date and before end_date. Only dates >= this are processed and pushed into the DB
underlying = 'NIFTY' # NIFTY/BANKNIFTY
start_holding_period = 10 # in minutes
end_holding_period = 10 # in minutes
# The instance is bound to the module-level name `self` (convenient for pasting method bodies into cells).
self = Gap_Move_Classifier(start_date, end_date, prediction_date, underlying, start_holding_period, end_holding_period)

In [7]:
from scipy import stats
import pandas as pd
import numpy as np

def _sim(start_holding_period, end_holding_period, start_date='2015-01-09', end_date='2024-01-22',
         prediction_date='2024-01-19', underlying='NIFTY', return_multiplier=8.9-0.0023, window=250):
    """Summarise the always-long gap-move return series for one holding-period pair.

    Parameters
    ----------
    start_holding_period, end_holding_period : int
        Holding periods in minutes, passed straight to ``Gap_Move_Classifier``.
    start_date, end_date, prediction_date, underlying
        Remaining ``Gap_Move_Classifier`` arguments; defaults are the values this
        notebook was run with.
    return_multiplier : float
        Factor applied to every gap move before the statistics are computed.
    window : int
        Number of rows in the rolling compounding window.

    Returns
    -------
    list
        ``[start_holding_period, end_holding_period, mean move in %, then mean /
        median / max / min of the rolling ``window``-row compounded growth factor,
        16 * mean / std of the moves, worst drawdown, median of the negative
        drawdowns, percentile rank of 1.0 among the rolling growth factors]``.
    """
    classifier = Gap_Move_Classifier(start_date, end_date, prediction_date, underlying,
                                     start_holding_period, end_holding_period)

    # NOTE(review): the original statement was `gap_move *= 8.9-0.0023`, which scales
    # by 8.8977. If "8.9x leverage minus 0.0023 cost per trade" was intended, this
    # should be `gap_move * 8.9 - 0.0023` — confirm before relying on the numbers.
    scaled_move = classifier.data_matrix.gap_move * return_multiplier
    growth = 1 + scaled_move

    # Rolling compounded growth; computed once and reused for the breakeven percentile
    # (the Python-level rolling apply is the slow part of this function).
    ts = growth.rolling(window).apply(lambda x : x.prod()).dropna()

    # Drawdown of the compounded equity curve relative to its running maximum.
    equity = growth.cumprod()
    mdd = equity / equity.cummax() - 1

    return [start_holding_period, end_holding_period, 100*scaled_move.mean(), ts.mean(),
            ts.median(), ts.max(), ts.min(),
            16*scaled_move.mean() / scaled_move.std(),  # 16 ~ sqrt(252), presumably annualising daily figures
            mdd.min(), mdd[mdd<0].median(),
            stats.percentileofscore(ts, 1)]
# Sweep every (start, end) holding-period pair from 1 to 10 minutes and collect one
# row of metrics per pair; rows are labelled 1..100 in sweep order.
summary = pd.DataFrame(columns = ["start_holding_period", "end_holding_period", "Avg_Return", "Mean", "Median", "Max", "Min", "Sharpe", "MDD", "Median_MDD", "Breakeven_percentage"])
holding_period_pairs = [(first, second) for first in range(1, 11) for second in range(1, 11)]
num = 0
for num, (start, end) in enumerate(holding_period_pairs, start=1):
    print(start, end)
    summary.loc[num] = _sim(start, end)

1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
4 10
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
5 10
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 10
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
7 10
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
8 10
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
9 10
10 1
10 2
10 3
10 4
10 5
10 6
10 7
10 8
10 9
10 10


In [8]:
# Distribution of each metric across the 100 (start, end) holding-period pairs.
summary.describe()

Unnamed: 0,start_holding_period,end_holding_period,Avg_Return,Mean,Median,Max,Min,Sharpe,MDD,Median_MDD,Breakeven_percentage
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,5.5,5.5,0.70456,10.08955,3.27878,326.75588,0.01963,1.82859,-0.99599,-0.14989,12.26913
std,2.88675,2.88675,0.03906,2.31807,0.34024,120.17087,0.01243,0.09806,0.00275,0.01355,0.95832
min,1.0,1.0,0.61905,6.31805,2.67977,150.74958,0.00553,1.60396,-0.99912,-0.17986,8.46427
25%,3.0,3.0,0.68598,8.58295,3.045,257.7757,0.00946,1.7696,-0.99848,-0.16121,12.10086
50%,5.5,5.5,0.70019,9.80999,3.20415,310.85206,0.01535,1.82916,-0.99663,-0.14913,12.51901
75%,8.0,8.0,0.72512,10.99214,3.49722,366.25734,0.0275,1.90338,-0.99426,-0.13804,12.72174
max,10.0,10.0,0.78349,17.96215,4.48566,780.81402,0.05238,2.00731,-0.98991,-0.12574,13.78611


In [9]:
# Holding-period pairs that sit in the best quintile on every criterion at once:
# high median / worst-case rolling growth, high Sharpe, shallow typical drawdown,
# and a low breakeven percentile.
high_median = summary.Median >= summary.Median.quantile(0.8)
high_sharpe = summary.Sharpe >= summary.Sharpe.quantile(0.8)
high_worst_case = summary.Min >= summary.Min.quantile(0.8)
shallow_drawdown = summary.Median_MDD >= summary.Median_MDD.quantile(0.8)
low_breakeven = summary.Breakeven_percentage <= summary.Breakeven_percentage.quantile(0.2)
summary[high_median & high_sharpe & high_worst_case & shallow_drawdown & low_breakeven]

Unnamed: 0,start_holding_period,end_holding_period,Avg_Return,Mean,Median,Max,Min,Sharpe,MDD,Median_MDD,Breakeven_percentage
3,1.0,3.0,0.7168,9.59564,3.59731,242.1799,0.0323,1.92951,-0.99217,-0.13061,10.2889
4,1.0,4.0,0.74118,9.85808,3.7674,252.60744,0.03642,1.98554,-0.99238,-0.12859,10.18753
84,9.0,4.0,0.74903,9.78525,3.59751,312.96315,0.05238,1.98961,-0.98996,-0.12574,8.46427
85,9.0,5.0,0.75893,10.94998,3.66181,386.52166,0.04038,2.00314,-0.99297,-0.12634,9.22453


In [10]:
# Pairs whose median rolling growth is at least 4.4 (the maximum in the describe() table is 4.48566).
summary[summary.Median >= 4.4]

Unnamed: 0,start_holding_period,end_holding_period,Avg_Return,Mean,Median,Max,Min,Sharpe,MDD,Median_MDD,Breakeven_percentage
10,1.0,10.0,0.77581,12.28352,4.48566,288.88186,0.00695,1.99659,-0.99897,-0.14318,12.62038
