# In [1]:
import Config
import ann_helper

import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

import multiprocessing
import datetime
import time
import copy

from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, precision_score
import tensorflow as tf
from tensorflow.keras import regularizers
from keras.layers import Dropout
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from pymongo import MongoClient
client=MongoClient(Config.DB_Hostname,Config.DB_Port)

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

class Classification_Engine:
    
    def __init__(self, start_date, end_date, prediction_date, underlying):
        
        self.start_date=start_date #2017-01-01
        self.end_date=end_date #2017-01-01
        self.prediction_date=prediction_date #2017-01-01
        self.underlying=underlying #NIFTY,BANKNIFTY
        self.XGBoost_Parameters = None
        self.ANN_Parameters = None
        
        if self.start_date>=self.prediction_date:
            raise Exception("Start date has to be before Prediction Date")

        start = time.time()
        self.underlying_dynamics = pd.DataFrame(client[f'{Config.Data_DB}'][f'{self.underlying}OHLC'].find())
        self.underlying_dynamics.drop(columns=['_id'],inplace=True)
        self.underlying_dynamics.sort_values(['date','batch_id'],inplace=True)
        self.underlying_dynamics=self.underlying_dynamics[(self.underlying_dynamics.date>=self.start_date)&(self.underlying_dynamics.date<=self.end_date)]
        self.underlying_dynamics.batch_id=self.underlying_dynamics.batch_id.astype(int)
        self.underlying_dynamics.set_index('date',inplace=True)
        print(f"Prices Downloaded in {time.time() - start} seconds")
        
        start=time.time()
        self.vol_surface=pd.DataFrame(client[f'{Config.Data_DB}']['Vol_Surface'].find({"SYMBOL":self.underlying}))
        self.vol_surface.drop(columns=['_id', 'expected_skew', 'expected_kurt', 'strike_conc_3rd_moment', 'strike_conc_4th_moment', 
                                       'CONTRACTS', 'VAL_INLAKH','OPEN_INT', 'CHG_IN_OI'],inplace=True)
        self.vol_surface.sort_values('date',inplace=True)
        self.vol_surface=self.vol_surface[(self.vol_surface.date>=self.start_date)&(self.vol_surface.date<=self.end_date)]
        self.vol_surface.set_index('date',inplace=True)
        print(f"Vol Surface Downloaded in {time.time() - start} seconds")
        
        start=time.time()
        self.days_to_expiry = pd.DataFrame(client[f'{Config.Data_DB}']['Days_To_Expiry'].find({"underlying":self.underlying}))
        self.days_to_expiry.drop(columns = ['_id','underlying'],inplace=True)
        self.days_to_expiry.sort_values('date',inplace=True)
        self.days_to_expiry=self.days_to_expiry[(self.days_to_expiry.date>=self.start_date)&(self.days_to_expiry.date<=self.end_date)]
        self.days_to_expiry.set_index('date',inplace=True)
        print(f"Expiry Dates Downloaded in {time.time()-start} seconds")
        
        start=time.time()
        self.predicted_params=pd.DataFrame(client[f'{Config.Data_DB}'][f'{self.underlying}_Predicted_Parameters'].find())
        self.predicted_params.drop(columns=['_id'],inplace=True)
        self.predicted_params.sort_values(['date','strategy_variant'],inplace=True)
        self.predicted_params=self.predicted_params[(self.predicted_params.date>=self.start_date)&(self.predicted_params.date<=self.end_date)]
        self.predicted_params.set_index('date',inplace=True)
        self.predicted_params.drop(columns=['PnL', 'Drawdown_Count', 'Trade_Count', 'first_drawdown_pnl'],inplace=True)
        print(f"Predicted Parameters Downloaded in {time.time() - start} seconds")
        
        start=time.time()
        self.underlying_simulated_parameters=pd.DataFrame(client[f'{Config.Data_DB}'][f'{self.underlying}_Simulated_Parameters'].find())
        self.underlying_simulated_parameters=self.underlying_simulated_parameters.sort_values(['date', 'strategy_variant']).drop(columns=['_id'])
        self.underlying_simulated_parameters=self.underlying_simulated_parameters[(self.underlying_simulated_parameters.date>=self.start_date)&(self.underlying_simulated_parameters.date<=self.end_date)]
        self.underlying_simulated_parameters.set_index('date',inplace=True)
        print(f"Simulated Parameters Downloaded in {time.time()-start} seconds")
        
        start=time.time()
        self.underlying_guided_parameters=pd.DataFrame(client[f'{Config.Data_DB}'][f'{self.underlying}_Simulated_Parameters'].find())
        self.underlying_guided_parameters=self.underlying_guided_parameters.sort_values(['date', 'strategy_variant']).drop(columns=['_id'])
        self.underlying_guided_parameters=self.underlying_guided_parameters[(self.underlying_guided_parameters.date>=self.start_date)&(self.underlying_guided_parameters.date<=self.end_date)]        
        self.underlying_guided_parameters.set_index('date',inplace=True)
        print(f"Guided Parameters Downloaded in {time.time()-start} seconds")
        
        '''
        start=time.time()
        self.strategy_clusters=pd.DataFrame(client[f'{Config.Data_DB}']['Strategy_Clusters'].find({"underlying" : self.underlying}))
        self.strategy_clusters = self.strategy_clusters.sort_values('strategy_variant').drop(columns=['_id'])
        print(f"Strategy Clusters Downloaded in {time.time()-start} seconds")

        start=time.time()
        self.ensemble_predictions = pd.DataFrame(client[f'{Config.Data_DB}']['Ensemble_Prediction'].find())
        self.ensemble_predictions = self.ensemble_predictions.sort_values(['date', 'strategy_variant']).drop(columns=['_id'])
        print(f"Ensemble Predictions Downloaded in {time.time()-start} seconds")
        '''
        
        self._create_features()
        self._aggregate()
        
    def _create_features(self):
        """Turn the raw frames loaded by __init__ into lagged feature tables.

        Rewrites these attributes in place:
          * self.predicted_params_list - one frame per strategy variant holding the
            target and the lagged profitability / pnl-rank features;
          * self.predicted_params - reduced to three columns;
          * self.underlying_simulated_parameters - becomes a list with one frame of
            lagged rolling statistics per strategy variant;
          * self.underlying_dynamics and self.vol_surface - replaced by one-row-lagged
            daily statistics;
          * self.underlying_guided_parameters - becomes a list with one frame per
            strategy variant holding guided-minus-predicted parameter differences.

        Many results are written back positionally with .tolist(), which relies on
        the source frames already being ordered by (date, strategy_variant), as
        __init__ sorts them.
        """
        
        start=time.time()
        # Binary target: 1 when second_drawdown_pnl is non-negative, otherwise 0.
        self.predicted_params["Profitability"] = np.where(self.predicted_params.second_drawdown_pnl>=0,1,0)
        # Profitability_k: rolling k-row mean of the target inside each strategy variant,
        # shifted by one row inside the variant so a row only carries earlier rows' values.
        self.predicted_params["Profitability_1"] = self.predicted_params.groupby('strategy_variant').rolling(1).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).Profitability.tolist()
        self.predicted_params["Profitability_2"] = self.predicted_params.groupby('strategy_variant').rolling(2).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).Profitability.tolist()
        self.predicted_params["Profitability_3"] = self.predicted_params.groupby('strategy_variant').rolling(3).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).Profitability.tolist()
        self.predicted_params["Profitability_4"] = self.predicted_params.groupby('strategy_variant').rolling(4).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).Profitability.tolist()
        self.predicted_params["Profitability_5"] = self.predicted_params.groupby('strategy_variant').rolling(5).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).Profitability.tolist()
        # pnl_rank_k: rolling k-row median of second_drawdown_pnl per variant, turned into a
        # percentile rank across the variants of the same date, then shifted one row per variant.
        self.predicted_params['pnl_rank_1'] = self.predicted_params.groupby('strategy_variant').rolling(1).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).second_drawdown_pnl.tolist()
        self.predicted_params['pnl_rank_2'] = self.predicted_params.groupby('strategy_variant').rolling(2).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).second_drawdown_pnl.tolist()
        self.predicted_params['pnl_rank_3'] = self.predicted_params.groupby('strategy_variant').rolling(3).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).second_drawdown_pnl.tolist()
        self.predicted_params['pnl_rank_4'] = self.predicted_params.groupby('strategy_variant').rolling(4).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).second_drawdown_pnl.tolist()
        self.predicted_params['pnl_rank_5'] = self.predicted_params.groupby('strategy_variant').rolling(5).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant']).second_drawdown_pnl.tolist()
        predicted_columns = ['Profitability', 'pos_param', 'mov_param', 'prof_param', 'Profitability_1', 
                             'Profitability_2', 'Profitability_3', 'Profitability_4', 'Profitability_5', 
                             'pnl_rank_1', 'pnl_rank_2', 'pnl_rank_3','pnl_rank_4', 'pnl_rank_5']
        # One frame per variant number 1..max; variant v is stored at list position v-1.
        self.predicted_params_list=[self.predicted_params[self.predicted_params.strategy_variant==i][predicted_columns] for i in range(1,1+max(self.predicted_params.strategy_variant))]
        self.predicted_params=self.predicted_params[['strategy_variant','second_drawdown_pnl','Profitability']]
        print(f"Predicted Parameters Processed in {time.time()-start} seconds")
        
        start=time.time()
        # Columns removed from the simulated parameters, plus every column whose name contains 'pmpr'.
        columns_to_drop = ['underlying', 'sim_number','pos_param', 'mov_param', 'prof_param', 'strike_offset', 'drawdown_count', 
                         'trade_count', 'capital', 'first_drawdown_pnl','second_drawdown_pnl']+[col for col in self.underlying_simulated_parameters.columns if 'pmpr' in col]
        # Columns that get a cross-variant percentile rank ("index") per date.
        indexable_columns = ['pnl', 'taylor_sensitivity_1', 'taylor_sensitivity_2', 'taylor_sensitivity_3', 'pos_dev', 'mov_dev', 'prof_dev']
        # Columns used in absolute value before any statistic is taken.
        absolute_value_columns = ['taylor_sensitivity_1', 'taylor_sensitivity_2', 'taylor_sensitivity_3']
        # Columns summarised with a rolling mean; every other remaining column uses a rolling median.
        mean_columns = ["sim_vol", "pos_diff", "mov_diff", "prof_diff"]
        # 'variant_' (with trailing underscore) does not match 'strategy_variant', which is kept.
        variant_columns=[col for col in  self.underlying_simulated_parameters.columns if 'variant_' in col]
        self.underlying_simulated_parameters.drop(columns = columns_to_drop + variant_columns, inplace = True)

        # When end_date has no rows, the rows of the last available date are copied and
        # re-dated to end_date - presumably so the shifted statistics below have a slot
        # for end_date; confirm.
        # NOTE(review): .loc on a single label returns a Series when only one row carries
        # that date, in which case reset_index() would not yield a 'date' column - verify
        # that every date always has several rows.
        if self.end_date not in self.underlying_simulated_parameters.index:
            temp = self.underlying_simulated_parameters.loc[self.underlying_simulated_parameters.index[-1]].reset_index()
            temp.date = self.end_date
            temp.set_index('date', inplace = True)
            self.underlying_simulated_parameters = pd.concat([self.underlying_simulated_parameters, temp])

        # Output table: same row order as the simulated parameters, filled column by column.
        underlying_simulated_parameters_list=pd.DataFrame(index=self.underlying_simulated_parameters.index)
        underlying_simulated_parameters_list["strategy_variant"]=self.underlying_simulated_parameters.strategy_variant.tolist()

        df = self.underlying_simulated_parameters[['strategy_variant'] + indexable_columns]
        for col in absolute_value_columns:
            df[col] = df[col].abs().tolist()

        # Rolling median per variant (windows 1, 10, 15), percentile-ranked across the
        # variants of each date, then shifted one row inside each variant.
        index_1 = df.groupby('strategy_variant').rolling(1).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])
        index_10 = df.groupby('strategy_variant').rolling(10).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])
        index_15 = df.groupby('strategy_variant').rolling(15).median().groupby(level='date').rank(pct=True).groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])

        for columns in indexable_columns:

            underlying_simulated_parameters_list[columns + "_index_1"] = index_1[columns].tolist()
            underlying_simulated_parameters_list[columns + "_index_10"] = index_10[columns].tolist()
            underlying_simulated_parameters_list[columns + "_index_15"] = index_15[columns].tolist()

        # median_columns still contains 'strategy_variant' (needed as the group key).
        median_columns = [col for col in self.underlying_simulated_parameters.columns if col not in mean_columns]
        median_df = self.underlying_simulated_parameters[median_columns]
        mean_df = self.underlying_simulated_parameters[['strategy_variant'] + mean_columns]

        for col in absolute_value_columns:
            median_df[col] = median_df[col].abs().tolist()

        # Un-ranked rolling statistics per variant, again shifted one row inside each variant.
        median1 = median_df.groupby('strategy_variant').rolling(1).median().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])
        mean1 = mean_df.groupby('strategy_variant').rolling(1).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])

        median10 = median_df.groupby('strategy_variant').rolling(10).median().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])
        mean10 = mean_df.groupby('strategy_variant').rolling(10).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])

        median15 = median_df.groupby('strategy_variant').rolling(15).median().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])
        mean15 = mean_df.groupby('strategy_variant').rolling(15).mean().groupby(level='strategy_variant').shift().reset_index().sort_values(['date','strategy_variant'])

        for columns in mean_columns:

            # Always true here: mean_columns never contains 'strategy_variant'.
            if columns != 'strategy_variant':
                underlying_simulated_parameters_list[columns + "_mean_1"] = mean1[columns].tolist()
                underlying_simulated_parameters_list[columns + "_mean_10"] = mean10[columns].tolist()
                underlying_simulated_parameters_list[columns + "_mean_15"] = mean15[columns].tolist()

        for columns in median_columns:

            # Skips the group key, which is part of median_columns.
            if columns != 'strategy_variant':
                underlying_simulated_parameters_list[columns + "_median_1"] = median1[columns].tolist()
                underlying_simulated_parameters_list[columns + "_median_10"] = median10[columns].tolist()
                underlying_simulated_parameters_list[columns + "_median_15"] = median15[columns].tolist()
        
        # Split into one frame per variant number 1..max (variant v at list position v-1).
        self.underlying_simulated_parameters = [underlying_simulated_parameters_list[
            underlying_simulated_parameters_list.strategy_variant==i].drop(columns=['strategy_variant']) 
                        for i in range(1,underlying_simulated_parameters_list.strategy_variant.max()+1)]
        print(f"Simulated Parameters Processed in {time.time()-start} seconds")
        
        start=time.time()
        # Per-date statistics of the intraday price rows.
        temp=pd.DataFrame(index=self.underlying_dynamics.index.unique())
        # NOTE(review): the denominator is the per-date MAX of 'low'; a high/low range
        # would normally divide by the per-date minimum of 'low' - confirm whether
        # .min().low was intended.
        temp["daily_range"] = (self.underlying_dynamics.groupby('date').max().high/self.underlying_dynamics.groupby('date').max().low-1)
        # Percentage change between the last 'close' of consecutive dates.
        temp["daily_return"] = self.underlying_dynamics.groupby('date').tail(1).close.pct_change()
        # Per-row high/low and close/open ratios, averaged (and below: std-dev'ed) per date.
        temp["minute_range"] = (self.underlying_dynamics.high/self.underlying_dynamics.low-1).groupby('date').mean()
        temp["minute_return"] = (self.underlying_dynamics.close/self.underlying_dynamics.open-1).groupby('date').mean()
        temp["range_std"] = (self.underlying_dynamics.high/self.underlying_dynamics.low-1).groupby('date').std()
        temp["return_std"] = (self.underlying_dynamics.close/self.underlying_dynamics.open-1).groupby('date').std()
        # Dates whose range_std is NaN are discarded.
        temp = temp[~temp.range_std.isna()]

        # Add an all-NaN row for end_date so the shift below produces a row for it.
        if self.end_date not in temp.index:
            temp.loc[self.end_date] = [np.nan]*(len(temp.columns))
            
        self.underlying_dynamics = pd.DataFrame(index=temp.index)
        for i in [1]:
            # Window of 1 followed by shift(): every row carries the previous row's values.
            df_temp = temp.rolling(i).median().shift()
            df_temp.columns = [ col+f"_median_{i}" for col in temp.columns]
            self.underlying_dynamics = pd.merge(self.underlying_dynamics, df_temp, left_index=True, right_index=True)

        print(f"Prices Processed in {time.time()-start} seconds")
        
        start=time.time()
        # Same one-row lag applied to the volatility surface.
        temp = self.vol_surface.copy()
        if self.end_date not in temp.index:
            temp.loc[self.end_date] = [np.nan]*(len(temp.columns))
            
        self.vol_surface=pd.DataFrame(index=temp.index)
        for i in [1]:

            # NOTE(review): the statistic is a rolling median but the columns are
            # suffixed "_mean_" - naming only, the values are medians.
            df_temp = temp.rolling(i).median().shift()
            df_temp.columns = [ col+f"_mean_{i}" for col in df_temp.columns]
            self.vol_surface = pd.merge(self.vol_surface, df_temp, left_index=True, right_index = True)
            
        print(f"Vol Surface Processed in {time.time()-start} seconds")
        
        start=time.time()
        # Guided parameters: keep the three parameters and attach current_week by date.
        self.underlying_guided_parameters = self.underlying_guided_parameters[['strategy_variant', 'pos_param', 'mov_param', 'prof_param']]
        self.underlying_guided_parameters = pd.merge(self.underlying_guided_parameters, self.days_to_expiry[['current_week']], left_index=True, right_index= True)
        underlying_guided_parameters_list=[]

        # One frame per variant, with an all-NaN row for end_date when missing, shifted one row.
        for i in range(self.underlying_guided_parameters.strategy_variant.max()):

            temp = self.underlying_guided_parameters[self.underlying_guided_parameters.strategy_variant == i+1].drop(columns=['strategy_variant'])
            if self.end_date not in temp.index:
                temp.loc[self.end_date]=[np.nan]*len(temp.columns)

            temp = temp.shift()
            underlying_guided_parameters_list.append(temp)

        self.underlying_guided_parameters=[]
        for i in range(len(underlying_guided_parameters_list)):

            # Within each run of 24 variants, the first 12 are paired with the variant 12
            # positions later: on rows where current_week == 1 the three parameters are
            # taken from that partner instead of the variant's own values.
            # NOTE(review): i+12 is out of range unless the partner exists - presumably
            # the number of variants is a multiple of 24; confirm.
            if i%24<12:
                temp=underlying_guided_parameters_list[i+12].copy()
                temp.columns=["temp_" + col for col in underlying_guided_parameters_list[i].columns]
                cols_to_drop=temp.columns
                cols_to_fix=[col for col in underlying_guided_parameters_list[i].columns if col != "current_week"]
                temp=pd.merge(underlying_guided_parameters_list[i], temp, left_index=True, right_index=True)

                for col in cols_to_fix:
                    temp[col]=np.where(temp.current_week==1, temp[f'temp_{col}'], temp[col])

                temp.drop(columns=list(cols_to_drop)+["current_week"],inplace=True)

            else:
                temp=underlying_guided_parameters_list[i].drop(columns=['current_week'])
                
            temp.dropna(inplace=True)  
            # Index-aligned difference: guided parameters minus this variant's predicted ones.
            temp-=self.predicted_params_list[i][["pos_param","mov_param","prof_param"]]
            # NOTE(review): cols_to_fix is only assigned in the if-branch above; the
            # else-branch reuses the value left over from an earlier iteration.
            temp.columns=["guided_"+col for col in cols_to_fix]
            self.underlying_guided_parameters.append(temp)
        
        print(f"Guided Parameters Processed in {time.time()-start} seconds")

    def _aggregate(self):
        
        start=time.time()
        self.parameters=[]
                        
        for i in range(len(self.predicted_params_list)):
            temp = pd.merge(self.predicted_params_list[i],self.underlying_simulated_parameters[i],left_index=True, right_index=True)
            temp = pd.merge(temp, self.underlying_guided_parameters[i], left_index=True, right_index=True)
            temp = pd.merge(temp, self.vol_surface, left_index=True, right_index=True)
            temp = pd.merge(temp, self.underlying_dynamics, left_index=True, right_index=True)                
            self.parameters.append(temp.dropna())
        
        self.categorical_parameter = pd.DataFrame()
        self.categorical_parameter["date"] = self.parameters[0].index
        self.categorical_parameter.set_index('date', inplace = True)
        if self.underlying=="NIFTY":
            self.categorical_parameter["weekly_flag"]= np.where(self.categorical_parameter.index>="2019-02-11", 1, 0)
        else:
            self.categorical_parameter["weekly_flag"]= np.where(self.categorical_parameter.index>="2016-05-27", 1, 0)
                                   
        del self.predicted_params_list
        del self.underlying_simulated_parameters
        del self.underlying_guided_parameters
        del self.days_to_expiry
        del self.vol_surface
        del self.underlying_dynamics

        print(f"All parameters aggregated in {time.time()-start} seconds")

    def _get_scaled_parameters(self, strategy_variant, split_index):
        """Return the train/test arrays of one strategy variant.

        The last ``split_index`` rows form the test set and everything before
        them the training set. Feature columns are standardised with a scaler
        fitted on the training rows only; the categorical flag columns are
        appended unscaled as the last columns.

        Returns X_train, X_test, y_train, y_test as numpy arrays.
        """

        #strategy_variant input here expects the actual variant - 1, i.e., [0 - 383]
        variant_frame = self.parameters[strategy_variant]

        scaler = StandardScaler()
        target = variant_frame[['Profitability']].values
        features = variant_frame.drop(columns = ['Profitability']).values

        # Fit on the training rows only so the test rows do not influence the scaling.
        scaled_train = scaler.fit_transform(features[:-split_index])
        scaled_test = scaler.transform(features[-split_index:])

        flags = self.categorical_parameter.values
        X_train = np.concatenate((scaled_train, flags[:-split_index]), axis = 1)
        X_test = np.concatenate((scaled_test, flags[-split_index:]), axis = 1)

        return X_train, X_test, target[:-split_index], target[-split_index:]

    def _XGB(self, params):

        try:

            model = XGBClassifier(nthread = -1, n_estimators = params['n_estimators'], learning_rate = params['learning_rate'], 
                                  max_depth = params['max_depth'], reg_lambda = params['reg_lambda'], subsample = params['subsample'])

            model.fit(params['X_train'], params['y_train'], 
                      sample_weight = [params['sample_weight'] if y==0 else 1 for y in params['y_train'].ravel()])

            return model.predict(params['X_test'])

        except Exception as e:
            print(e, ". Error in running XGBoost")
            return np.zeros(len(params['X_test'])).reshape(-1,1)

    def _ANN(self, params_list):

        try:            
            with multiprocessing.Pool() as pool:
                res = pool.map(ann_helper.run_sim, iterable = params_list)

            results = {identifier : prediction for identifier, prediction in res}

        except Exception as e:
            print(e,". Failed to parallelize ANN")
            results = {param['identifier'] : np.array([[0] for i in range(len(param['X_test']))])
                       for param in params_list}

        return results

    def _XGB_Grid_Simulator(self, X_train, X_test, y_train, y_test):
        """Evaluate every XGBoost hyper-parameter combination on one train/test split.

        Returns a DataFrame with one row per combination, numbered from 0 in
        grid order. Columns are the six hyper-parameters followed by 'precision'
        (precision_score against y_test) and 'num_signals' (sum of the
        predictions).
        """
        # Local import: itertools is not part of the file's import header.
        from itertools import product

        # Listed from the slowest-varying to the fastest-varying parameter, which
        # is the order product() iterates in (last iterable changes fastest).
        grid = {
            'max_depth' : [2, 3, 4, 5], #no change
            'n_estimators' : [10, 30, 50], #no change
            'learning_rate' : [0.05, 0.075, 0.1, 0.5], #no change
            'reg_lambda' : [0, 0.01], #no change
            'subsample' : [0.9], #no change
            'sample_weight' : [2, 3, 4, 5, 10, 25, 50, 75, 100],
        }

        summary = pd.DataFrame(columns = list(grid) + ['precision', 'num_signals'])

        for counter, combination in enumerate(product(*grid.values())):

            params = dict(zip(grid, combination))
            params.update({'X_train' : X_train, 'X_test' : X_test, 'y_train' : y_train})

            # Flatten so a column-shaped fallback prediction still yields a scalar count.
            y_pred = np.ravel(self._XGB(params))
            summary.loc[counter] = list(combination) + [precision_score(y_test, y_pred), y_pred.sum()]

        return summary

    def _ANN_Grid_Simulator(self, X_train, X_test, y_train, y_test):
        """Evaluate every ANN hyper-parameter combination on one train/test split.

        One parameter dictionary is built per combination (tagged with an
        'identifier' counting from 0 in grid order) and the whole list is handed
        to self._ANN. Returns a DataFrame indexed by identifier and sorted by it,
        with the thirteen hyper-parameters followed by 'precision'
        (precision_score against y_test) and 'num_signals' (sum of the
        predictions).
        """
        # Local import: itertools is not part of the file's import header.
        from itertools import product

        # Listed from the slowest-varying to the fastest-varying parameter, which
        # is the order product() iterates in (last iterable changes fastest).
        grid = {
            'num_layer' : [2, 4],
            'num_cell' : [30],
            'kernel_regularizer_l1' : [0],
            'kernel_regularizer_l2' : [0.0001, 0.001],
            'bias_regularizer' : [0.00001],
            'activity_regularizer' : [0],
            'dropout' : [0.2],
            'activation' : ['swish', 'tanh'],
            'learning_rate' : [0.00001],
            'beta' : [0.9],
            'class_weight' : [10, 25, 50],
            'precision_filter' : [0.5, 0.6, 0.7, 0.8],
            'epoch' : [25, 100],
        }

        # Column order of the summary, also used as key order of each parameter dictionary.
        hyper_columns = ['num_layer', 'num_cell', 'kernel_regularizer_l1', 'kernel_regularizer_l2',
                         'bias_regularizer', 'activity_regularizer', 'dropout', 'activation',
                         'learning_rate', 'beta', 'class_weight', 'epoch', 'precision_filter']
        summary = pd.DataFrame(columns = hyper_columns + ['precision', 'num_signals'])

        params_list = []
        for counter, combination in enumerate(product(*grid.values())):
            chosen = dict(zip(grid, combination))
            params = {column : chosen[column] for column in hyper_columns}
            params.update({'X_train' : X_train, 'y_train' : y_train,
                           'X_test' : X_test, 'y_test' : y_test,
                           'identifier' : counter})
            params_list.append(params)

        results = self._ANN(params_list)
        for identifier, prediction in results.items():
            params = params_list[identifier]
            # np.ravel makes the count work for 1-D as well as column-shaped predictions.
            summary.loc[identifier] = [params[column] for column in hyper_columns] + [
                precision_score(params['y_test'], prediction), np.ravel(prediction).sum()]

        return summary.sort_index()

    def _eval_hyperparams(self, summary_, size):

        summary = summary_.copy()
        
        if summary[summary.precision==1].num_signals.max() >= 0.02*size:
            summary = summary[summary.precision==1]

        elif len(summary[(summary.precision>=0.75)&(summary.precision<1)]) > 0:
            summary = summary[(summary.precision>=0.75)&(summary.precision<1)]

        elif len(summary[(summary.precision>=0.6)&(summary.precision<0.75)]) > 0:
            summary = summary[(summary.precision>=0.6)&(summary.precision<0.75)]

        elif len(summary[(summary.precision>=0.5)&(summary.precision<0.6)]) > 0:
            summary = summary[(summary.precision>=0.5)&(summary.precision<0.6)]

        elif len(summary[(summary.precision>=0.3)&(summary.precision<0.5)]) > 0:
            summary = summary[(summary.precision>=0.3)&(summary.precision<0.5)]

        summary_subset = summary[summary.precision >= summary.precision.quantile(0.75)]
        return summary_subset.sort_values(['True_Positive','precision'], ascending=False).head(1)

    def Hyperparameter_Tuner(self, start_variant = 0):
        """Grid-search XGBoost and ANN hyperparameters per strategy variant and store the winners.

        For every strategy variant both grid simulators are run on three
        train/test splits (301, 201 and 101); each test set is trimmed by
        ``split - 100`` rows from its end. Precision and signal counts are
        averaged across the splits, the result is passed through
        ``_eval_hyperparams`` and then written to XGBoost_Filter_Params and
        ANN_Filter_Params, replacing any documents already stored for the same
        date, underlying and strategy variant.

        Parameters:
            start_variant: 1-based number of the first strategy variant to
                tune; earlier variants are skipped (0 or 1 tunes all of them).
        """
        for strategy_variant in range(len(self.parameters)):

            # variants are 0-based here but 1-based for the caller and in the DB
            if strategy_variant + 1 < start_variant:
                continue

            tic = time.time()
            print(f"Tuning XGBoost & ANN parameters for strategy variant {strategy_variant+1}")
            xgb_precisions, xgb_signal_counts = [], []
            ann_precisions, ann_signal_counts = [], []

            for split in (301, 201, 101):

                #preparing datasets
                trim = split - 100
                X_train_split, X_test_split, y_train_split, y_test_split = self._get_scaled_parameters(strategy_variant, split)
                X_test_split, y_test_split = X_test_split[:-trim], y_test_split[:-trim]

                #XGBoost grid sim (signal counts rescaled to a per-100-rows basis)
                summary_XGB = self._XGB_Grid_Simulator(X_train_split, X_test_split, y_train_split, y_test_split)
                summary_XGB.num_signals = summary_XGB.num_signals * (100/len(X_test_split))
                xgb_precisions.append(summary_XGB.precision.tolist())
                xgb_signal_counts.append(summary_XGB.num_signals.tolist())

                #ANN grid sim (same rescaling)
                summary_ANN = self._ANN_Grid_Simulator(X_train_split, X_test_split, y_train_split, y_test_split)
                summary_ANN.num_signals = summary_ANN.num_signals * (100/len(X_test_split))
                ann_precisions.append(summary_ANN.precision.tolist())
                ann_signal_counts.append(summary_ANN.num_signals.tolist())

            #best XGB params: the last split's grid is reused as the frame, its metrics replaced by the cross-split averages
            summary_XGB.precision = np.mean(np.array(xgb_precisions), axis=0)
            summary_XGB.num_signals = np.mean(np.array(xgb_signal_counts), axis=0)
            summary_XGB["True_Positive"] = summary_XGB.precision * summary_XGB.num_signals
            summary_XGB = self._eval_hyperparams(summary_XGB, 100).assign(
                underlying = self.underlying,
                strategy_variant = strategy_variant + 1,
                date = self.end_date,
            )

            #best ANN params
            summary_ANN.precision = np.mean(np.array(ann_precisions), axis=0)
            summary_ANN.num_signals = np.mean(np.array(ann_signal_counts), axis=0)
            summary_ANN["True_Positive"] = summary_ANN.precision * summary_ANN.num_signals
            summary_ANN = self._eval_hyperparams(summary_ANN, 100).assign(
                underlying = self.underlying,
                strategy_variant = strategy_variant + 1,
                date = self.end_date,
            )

            database = client[f'{Config.Data_DB}']
            variant_filter = {"date" : self.end_date, 'underlying' : self.underlying, "strategy_variant": strategy_variant + 1}

            #inserting into XGB DB
            xgb_collection = database['XGBoost_Filter_Params']
            xgb_deleted = xgb_collection.delete_many(variant_filter).deleted_count
            print(xgb_deleted,f" documents deleted for {self.underlying} and {self.end_date} from XGBoost_Filter_Params")
            xgb_inserted = xgb_collection.insert_many(summary_XGB.to_dict('records')).inserted_ids
            print(len(xgb_inserted),f" documents entered for {self.underlying} and {self.end_date} into XGBoost_Filter_Params")

            #inserting into ANN DB
            ann_collection = database['ANN_Filter_Params']
            ann_deleted = ann_collection.delete_many(variant_filter).deleted_count
            print(ann_deleted,f" documents deleted for {self.underlying} and {self.end_date} from ANN_Filter_Params")
            ann_inserted = ann_collection.insert_many(summary_ANN.to_dict('records')).inserted_ids
            print(len(ann_inserted),f" documents entered for {self.underlying} and {self.end_date} into ANN_Filter_Params")

            print(f"Completed in {round(time.time()-tic)} seconds")

    def _get_XGB_params(self):
        """Load the most recently stored XGBoost hyperparameters for this underlying.

        Reads every XGBoost_Filter_Params document for ``self.underlying``,
        keeps only those carrying the latest ``date``, drops the bookkeeping
        and metric columns, casts the remaining columns to the types the model
        expects and stores the result in ``self.XGBoost_Parameters`` as a list
        of dicts ordered by ``strategy_variant``.

        Raises:
            ValueError: if no parameters are stored for the underlying.
        """
        collection = client[f'{Config.Data_DB}']['XGBoost_Filter_Params']
        params = pd.DataFrame(collection.find({"underlying" : self.underlying}))

        # an empty result has no columns at all, so fail here with a clear message
        if params.empty:
            raise ValueError(f"No XGBoost parameters found for {self.underlying} in XGBoost_Filter_Params")

        params = params[params.date == params.date.max()]
        params = params.drop(columns = ['_id', 'date', 'underlying', 'precision', 'num_signals', 'True_Positive'])
        params = params.sort_values('strategy_variant').set_index('strategy_variant')

        # values coming back from the DB are cast explicitly to the expected types
        params = params.astype({
            'max_depth': int,
            'n_estimators': int,
            'learning_rate': float,
            'reg_lambda': float,
            'subsample': float,
            'sample_weight': float,
        })

        # NOTE(review): the list is consumed by position (index 0 = variant 1), which
        # presumably requires the latest date to hold every variant - confirm.
        self.XGBoost_Parameters = params.to_dict('records')
        
    def _get_ANN_params(self):
        """Load the most recently stored ANN hyperparameters for this underlying.

        Reads every ANN_Filter_Params document for ``self.underlying``, keeps
        only those carrying the latest ``date``, drops the bookkeeping and
        metric columns, casts the remaining columns to the types the model
        expects and stores the result in ``self.ANN_Parameters`` as a list of
        dicts ordered by ``strategy_variant``.

        Raises:
            ValueError: if no parameters are stored for the underlying.
        """
        collection = client[f'{Config.Data_DB}']['ANN_Filter_Params']
        params = pd.DataFrame(collection.find({"underlying" : self.underlying}))

        # an empty result has no columns at all, so fail here with a clear message
        if params.empty:
            raise ValueError(f"No ANN parameters found for {self.underlying} in ANN_Filter_Params")

        params = params[params.date == params.date.max()]
        params = params.drop(columns = ['_id', 'date', 'underlying', 'precision', 'num_signals', 'True_Positive'])
        params = params.sort_values('strategy_variant').set_index('strategy_variant')

        # values coming back from the DB are cast explicitly to the expected types
        params = params.astype({
            'num_layer': int,
            'num_cell': int,
            'kernel_regularizer_l1': float,
            'kernel_regularizer_l2': float,
            'bias_regularizer': float,
            'activity_regularizer': float,
            'dropout': float,
            'activation': str,
            'learning_rate': float,
            'beta': float,
            'class_weight': float,
            'epoch': int,
            'precision_filter': float,
        })

        # NOTE(review): the list is consumed by position (index 0 = variant 1), which
        # presumably requires the latest date to hold every variant - confirm.
        self.ANN_Parameters = params.to_dict('records')
    
    def Predictor(self, single_date_prediction = False, start_variant = 0):
        """Produce XGBoost and ANN predictions per strategy variant and store them in Ensemble_Prediction.

        The latest stored hyperparameters are loaded first. Then, for every
        strategy variant and every date >= ``self.prediction_date``, the scaled
        train/test data is fetched, the first test row is predicted with
        XGBoost, and the ANN inputs are queued.

        * Multi-date mode (default): the ANN is run once per strategy variant
          with the date as identifier, and each variant's records are replaced
          in the DB as soon as it finishes.
        * Single-date mode: the ANN is run once after all variants with the
          variant number as identifier, and all records are replaced together.

        Existing records are only deleted for this underlying and for the
        strategy variants that are about to be re-inserted.

        Parameters:
            single_date_prediction: selects single-date mode when True.
            start_variant: 1-based number of the first strategy variant to
                process; earlier variants are skipped (0 or 1 processes all).

        Raises:
            Exception: if the ANN returned no prediction for a date (multi-date
                mode) or for a strategy variant (single-date mode).
        """
        self._get_XGB_params()
        self._get_ANN_params()

        date_list = self.parameters[0].index.unique()
        # prediction_date has to be present in the date index for this lookup to succeed
        initial_location = date_list.get_loc(self.prediction_date)
        prediction_dates = sorted(date_list[date_list >= self.prediction_date])
        ensemble_collection = client[f'{Config.Data_DB}']['Ensemble_Prediction']
        summary_list = []
        ANN_params_list = []

        for strategy_variant in range(len(self.parameters)):

            if strategy_variant<start_variant-1:
                continue

            start = time.time()
            print(f"Starting XGB prediction for strategy_variant {strategy_variant+1}")
            initial_loc = initial_location
            summary_XGB = pd.DataFrame(columns = ['date', 'strategy_variant', 'XGB_prediction'])

            # multi-date mode runs the ANN per variant, so its queue restarts here;
            # single-date mode keeps accumulating across variants
            if not single_date_prediction:
                ANN_params_list = []

            # NOTE(review): in single-date mode every date in this loop gets the same ANN
            # identifier (the variant number), so that mode presumably expects
            # prediction_date to be the last available date - confirm.
            for date in prediction_dates:

                # the split size shrinks by one per date so that the first test row is `date`
                X_train, X_test, y_train, y_test = self._get_scaled_parameters(strategy_variant, len(date_list) - initial_loc)
                X_test = X_test[:1]
                y_test = y_test[:1]

                #XGBoost Params
                params_XGB = copy.deepcopy(self.XGBoost_Parameters[strategy_variant])
                params_XGB['X_train'] = X_train
                params_XGB['X_test'] = X_test
                params_XGB['y_train'] = y_train
                summary_XGB.loc[initial_loc] = [date, strategy_variant + 1, self._XGB(params_XGB).ravel()[0]]
                initial_loc += 1

                #Ann Params List
                params_ANN = copy.deepcopy(self.ANN_Parameters[strategy_variant])
                params_ANN['X_train'] = X_train
                params_ANN['X_test'] = X_test
                params_ANN['y_train'] = y_train

                if single_date_prediction:
                    params_ANN['identifier'] = strategy_variant + 1
                else:
                    params_ANN['identifier'] = date

                ANN_params_list.append(params_ANN)

            #combining XGB and ANN results for multi date prediction
            if not single_date_prediction:

                print(f"Starting ANN prediction for strategy_variant {strategy_variant+1}")
                summary_dict_ANN = self._ANN(ANN_params_list)
                summary_dict_ANN = {identifier : prediction.ravel()[0] for identifier, prediction in summary_dict_ANN.items()}
                summary_XGB["ANN_prediction"] = summary_XGB.date.map(summary_dict_ANN)

                # check for gaps BEFORE the integer cast: casting NaN to int raises on its
                # own and would hide this more informative error
                missing = summary_XGB.ANN_prediction.isna()
                if missing.any():
                    raise Exception(f"ANN prediction missing for strategy_variant {strategy_variant+1} on "
                                    f"{summary_XGB[missing].date.tolist()}")

                summary_XGB.ANN_prediction = summary_XGB.ANN_prediction.astype(int)
                summary_XGB["underlying"] = self.underlying

                # restrict the delete to this underlying so another underlying's records survive
                print(ensemble_collection.delete_many({"date":{"$in":summary_XGB.date.unique().tolist()},
                "strategy_variant": strategy_variant+1, "underlying": self.underlying}).deleted_count,
                      f"records deleted for strategy_variant {strategy_variant+1}")
                print(len(ensemble_collection.insert_many(summary_XGB.to_dict('records')).inserted_ids),
                      f"records inserted for strategy_variant {strategy_variant+1}")

            summary_list.append(summary_XGB)
            print(f"completed in {round(time.time()-start)} seconds")

        summary_list = pd.concat(summary_list)

        #combining XGB and ANN results for single date prediction
        if single_date_prediction:

            print(f"Starting ANN prediction for {self.prediction_date}")
            summary_dict_ANN = self._ANN(ANN_params_list)
            summary_dict_ANN = {identifier: prediction.ravel()[0] for identifier, prediction in summary_dict_ANN.items()}
            summary_list["ANN_prediction"] = summary_list.strategy_variant.map(summary_dict_ANN)

            # same ordering as above: detect gaps first, cast afterwards
            missing = summary_list.ANN_prediction.isna()
            if missing.any():
                raise Exception(f"ANN prediction missing for strategy_variant {summary_list[missing].strategy_variant}")

            summary_list.ANN_prediction = summary_list.ANN_prediction.astype(int)
            summary_list["underlying"] = self.underlying

            # only replace what is about to be re-inserted: this underlying and the
            # variants actually processed (start_variant may have skipped some)
            processed_variants = [int(variant) for variant in summary_list.strategy_variant.unique()]
            print(ensemble_collection.delete_many({"date":self.prediction_date, "underlying": self.underlying,
            "strategy_variant": {"$in": processed_variants}}).deleted_count, f"records deleted for {self.prediction_date}")
            print(len(ensemble_collection.insert_many(summary_list.to_dict('records')).inserted_ids), f"records inserted for {self.prediction_date}")
            

In [2]:
# Run configuration for the classification engine.
start_date = '2015-01-12' # starting data point; best left untouched
end_date = '2024-01-19' # last day for which a prediction is wanted - should be the next trading day (earlier runs used '2022-01-03' and '2023-07-03')
prediction_date = '2024-01-19' # must be a legitimate date after start_date and on or before end_date; only dates >= prediction_date are processed and pushed into the DB
underlying = 'NIFTY' # NIFTY/BANKNIFTY
# the engine instance is bound to the module-level name `self`; the following cells call its methods through that name
self = Classification_Engine(start_date, end_date, prediction_date, underlying)

Prices Downloaded in 5.01796293258667 seconds
Vol Surface Downloaded in 0.04097890853881836 seconds
Expiry Dates Downloaded in 0.017759084701538086 seconds
Predicted Parameters Downloaded in 4.793071746826172 seconds
Simulated Parameters Downloaded in 275.0002899169922 seconds
Guided Parameters Downloaded in 289.3746449947357 seconds
Predicted Parameters Processed in 24.57867193222046 seconds
Simulated Parameters Processed in 22.53779411315918 seconds
Prices Processed in 1.1080691814422607 seconds
Vol Surface Processed in 0.01850295066833496 seconds
Guided Parameters Processed in 2.6183621883392334 seconds
All parameters aggregated in 2.0233309268951416 seconds


In [3]:
start = time.time()
# hyperparameter tuning is disabled for this run, so the timer below measures nothing but its own overhead;
# uncomment the next line to re-tune before predicting
#self.Hyperparameter_Tuner(start_variant = 0)
print(time.time()-start)

3.790855407714844e-05


In [4]:
start = time.time()
# multi-date mode (default): every date >= prediction_date is predicted, one strategy variant at a time
self.Predictor(start_variant = 0)
# alternative single-date mode (disabled):
#self.Predictor(start_variant = 0, single_date_prediction = True)
print(time.time()-start)

Starting XGB prediction for strategy_variant 1
Starting ANN prediction for strategy_variant 1
0 records deleted for strategy_variant 1
1 records inserted for strategy_variant 1
completed in 7 seconds
Starting XGB prediction for strategy_variant 2
Starting ANN prediction for strategy_variant 2
0 records deleted for strategy_variant 2
1 records inserted for strategy_variant 2
completed in 6 seconds
Starting XGB prediction for strategy_variant 3
Starting ANN prediction for strategy_variant 3
0 records deleted for strategy_variant 3
1 records inserted for strategy_variant 3
completed in 4 seconds
Starting XGB prediction for strategy_variant 4
Starting ANN prediction for strategy_variant 4
0 records deleted for strategy_variant 4
1 records inserted for strategy_variant 4
completed in 5 seconds
Starting XGB prediction for strategy_variant 5
Starting ANN prediction for strategy_variant 5
0 records deleted for strategy_variant 5
1 records inserted for strategy_variant 5
completed in 3 seconds


0 records deleted for strategy_variant 41
1 records inserted for strategy_variant 41
completed in 3 seconds
Starting XGB prediction for strategy_variant 42
Starting ANN prediction for strategy_variant 42
0 records deleted for strategy_variant 42
1 records inserted for strategy_variant 42
completed in 5 seconds
Starting XGB prediction for strategy_variant 43
Starting ANN prediction for strategy_variant 43
0 records deleted for strategy_variant 43
1 records inserted for strategy_variant 43
completed in 4 seconds
Starting XGB prediction for strategy_variant 44
Starting ANN prediction for strategy_variant 44
0 records deleted for strategy_variant 44
1 records inserted for strategy_variant 44
completed in 4 seconds
Starting XGB prediction for strategy_variant 45
Starting ANN prediction for strategy_variant 45
0 records deleted for strategy_variant 45
1 records inserted for strategy_variant 45
completed in 3 seconds
Starting XGB prediction for strategy_variant 46
Starting ANN prediction for 

0 records deleted for strategy_variant 82
1 records inserted for strategy_variant 82
completed in 3 seconds
Starting XGB prediction for strategy_variant 83
Starting ANN prediction for strategy_variant 83
0 records deleted for strategy_variant 83
1 records inserted for strategy_variant 83
completed in 5 seconds
Starting XGB prediction for strategy_variant 84
Starting ANN prediction for strategy_variant 84
0 records deleted for strategy_variant 84
1 records inserted for strategy_variant 84
completed in 5 seconds
Starting XGB prediction for strategy_variant 85
Starting ANN prediction for strategy_variant 85
0 records deleted for strategy_variant 85
1 records inserted for strategy_variant 85
completed in 4 seconds
Starting XGB prediction for strategy_variant 86
Starting ANN prediction for strategy_variant 86
0 records deleted for strategy_variant 86
1 records inserted for strategy_variant 86
completed in 3 seconds
Starting XGB prediction for strategy_variant 87
Starting ANN prediction for 

Starting ANN prediction for strategy_variant 122
0 records deleted for strategy_variant 122
1 records inserted for strategy_variant 122
completed in 6 seconds
Starting XGB prediction for strategy_variant 123
Starting ANN prediction for strategy_variant 123
0 records deleted for strategy_variant 123
1 records inserted for strategy_variant 123
completed in 3 seconds
Starting XGB prediction for strategy_variant 124
Starting ANN prediction for strategy_variant 124
0 records deleted for strategy_variant 124
1 records inserted for strategy_variant 124
completed in 3 seconds
Starting XGB prediction for strategy_variant 125
Starting ANN prediction for strategy_variant 125
0 records deleted for strategy_variant 125
1 records inserted for strategy_variant 125
completed in 6 seconds
Starting XGB prediction for strategy_variant 126
Starting ANN prediction for strategy_variant 126
0 records deleted for strategy_variant 126
1 records inserted for strategy_variant 126
completed in 3 seconds
Starting 

0 records deleted for strategy_variant 162
1 records inserted for strategy_variant 162
completed in 4 seconds
Starting XGB prediction for strategy_variant 163
Starting ANN prediction for strategy_variant 163
0 records deleted for strategy_variant 163
1 records inserted for strategy_variant 163
completed in 4 seconds
Starting XGB prediction for strategy_variant 164
Starting ANN prediction for strategy_variant 164
0 records deleted for strategy_variant 164
1 records inserted for strategy_variant 164
completed in 5 seconds
Starting XGB prediction for strategy_variant 165
Starting ANN prediction for strategy_variant 165
0 records deleted for strategy_variant 165
1 records inserted for strategy_variant 165
completed in 4 seconds
Starting XGB prediction for strategy_variant 166
Starting ANN prediction for strategy_variant 166
0 records deleted for strategy_variant 166
1 records inserted for strategy_variant 166
completed in 3 seconds
Starting XGB prediction for strategy_variant 167
Starting 

0 records deleted for strategy_variant 202
1 records inserted for strategy_variant 202
completed in 3 seconds
Starting XGB prediction for strategy_variant 203
Starting ANN prediction for strategy_variant 203
0 records deleted for strategy_variant 203
1 records inserted for strategy_variant 203
completed in 3 seconds
Starting XGB prediction for strategy_variant 204
Starting ANN prediction for strategy_variant 204
0 records deleted for strategy_variant 204
1 records inserted for strategy_variant 204
completed in 3 seconds
Starting XGB prediction for strategy_variant 205
Starting ANN prediction for strategy_variant 205
0 records deleted for strategy_variant 205
1 records inserted for strategy_variant 205
completed in 4 seconds
Starting XGB prediction for strategy_variant 206
Starting ANN prediction for strategy_variant 206
0 records deleted for strategy_variant 206
1 records inserted for strategy_variant 206
completed in 3 seconds
Starting XGB prediction for strategy_variant 207
Starting 

0 records deleted for strategy_variant 242
1 records inserted for strategy_variant 242
completed in 4 seconds
Starting XGB prediction for strategy_variant 243
Starting ANN prediction for strategy_variant 243
0 records deleted for strategy_variant 243
1 records inserted for strategy_variant 243
completed in 6 seconds
Starting XGB prediction for strategy_variant 244
Starting ANN prediction for strategy_variant 244
0 records deleted for strategy_variant 244
1 records inserted for strategy_variant 244
completed in 3 seconds
Starting XGB prediction for strategy_variant 245
Starting ANN prediction for strategy_variant 245
0 records deleted for strategy_variant 245
1 records inserted for strategy_variant 245
completed in 3 seconds
Starting XGB prediction for strategy_variant 246
Starting ANN prediction for strategy_variant 246
0 records deleted for strategy_variant 246
1 records inserted for strategy_variant 246
completed in 3 seconds
Starting XGB prediction for strategy_variant 247
Starting 

0 records deleted for strategy_variant 282
1 records inserted for strategy_variant 282
completed in 6 seconds
Starting XGB prediction for strategy_variant 283
Starting ANN prediction for strategy_variant 283
0 records deleted for strategy_variant 283
1 records inserted for strategy_variant 283
completed in 3 seconds
Starting XGB prediction for strategy_variant 284
Starting ANN prediction for strategy_variant 284
0 records deleted for strategy_variant 284
1 records inserted for strategy_variant 284
completed in 4 seconds
Starting XGB prediction for strategy_variant 285
Starting ANN prediction for strategy_variant 285
0 records deleted for strategy_variant 285
1 records inserted for strategy_variant 285
completed in 6 seconds
Starting XGB prediction for strategy_variant 286
Starting ANN prediction for strategy_variant 286
0 records deleted for strategy_variant 286
1 records inserted for strategy_variant 286
completed in 6 seconds
Starting XGB prediction for strategy_variant 287
Starting 

Starting ANN prediction for strategy_variant 322
0 records deleted for strategy_variant 322
1 records inserted for strategy_variant 322
completed in 4 seconds
Starting XGB prediction for strategy_variant 323
Starting ANN prediction for strategy_variant 323
0 records deleted for strategy_variant 323
1 records inserted for strategy_variant 323
completed in 3 seconds
Starting XGB prediction for strategy_variant 324
Starting ANN prediction for strategy_variant 324
0 records deleted for strategy_variant 324
1 records inserted for strategy_variant 324
completed in 6 seconds
Starting XGB prediction for strategy_variant 325
Starting ANN prediction for strategy_variant 325
0 records deleted for strategy_variant 325
1 records inserted for strategy_variant 325
completed in 4 seconds
Starting XGB prediction for strategy_variant 326
Starting ANN prediction for strategy_variant 326
0 records deleted for strategy_variant 326
1 records inserted for strategy_variant 326
completed in 4 seconds
Starting 

0 records deleted for strategy_variant 362
1 records inserted for strategy_variant 362
completed in 6 seconds
Starting XGB prediction for strategy_variant 363
Starting ANN prediction for strategy_variant 363
0 records deleted for strategy_variant 363
1 records inserted for strategy_variant 363
completed in 4 seconds
Starting XGB prediction for strategy_variant 364
Starting ANN prediction for strategy_variant 364
0 records deleted for strategy_variant 364
1 records inserted for strategy_variant 364
completed in 4 seconds
Starting XGB prediction for strategy_variant 365
Starting ANN prediction for strategy_variant 365
0 records deleted for strategy_variant 365
1 records inserted for strategy_variant 365
completed in 5 seconds
Starting XGB prediction for strategy_variant 366
Starting ANN prediction for strategy_variant 366
0 records deleted for strategy_variant 366
1 records inserted for strategy_variant 366
completed in 3 seconds
Starting XGB prediction for strategy_variant 367
Starting 