# Financial ML Final Project-Part 2: Jumps Modeling


Work by: Skander Chouchene, Mohamed Amine Mairech, Oussema Labidi
3rd year EGES - EPT - 2020/2021

In [None]:
%matplotlib inline
import pandas as pd 
import numpy as np
import pickle
import datetime
import pandas_datareader.data as web
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

#import tensorflow.keras as keras
#from tensorflow.keras import layers


## Loading and preparing raw data

### Raw data loading

We start by setting up the directory/path to the raw data. Notice that I clean up the names of the data folders and csv files. 

In [None]:
raw_data_dir = '/content/drive/MyDrive/data'

The following loads the raw historical daily prices. 

In [None]:
px_db = pd.read_csv(raw_data_dir+'/SHARADAR_SEP.csv')

In [None]:
#px_db=px_db.iloc[int(3*px_db.shape[0]/4):]

In [None]:
px_db.date=pd.to_datetime(px_db.date, format='%Y-%m-%d', errors='ignore')


The following loads the meta-data of the stocks/tickers (exhange, sector, market cap etc.)

In [None]:
ticker_meta= pd.read_csv(raw_data_dir+'/SHARADAR_TICKERS.csv')

  interactivity=interactivity, compiler=compiler, result=result)


The following loads the constituents of the spy500 index

In [None]:
sp500 = pd.read_csv(raw_data_dir+'/SHARADAR_SP500.csv', encoding='latin-1')

The following loads the last ~10 years of the relevant ETF prices from yahoo finance.

In [None]:
sector_etf_list = ['XLK', 'XLI', 'XLV', 'XLE', 'XLP', 'XLY', 'XTL', 'XLRE', 'XLB', 'XLF', 'XLU']
etf_list = ['SPY', 'XLK', 'XLI', 'XLV', 'XLE', 'XLP', 'XLY', 'XTL', 'XLRE', 'XLB', 'XLF', 'XLU']
etf_dict = {'SPY': 'Market', 
            'XLK': 'Technology', 
            'XLI': 'Industrials', 
            'XLV': 'Healthcare', 
            'XLE': 'Energy', 
            'XLP': 'Consumer Defensive', 
            'XLY': 'Consumer Cyclical', 
            'XTL': 'Communication Services', 
            'XLRE': 'Real Estate', 
            'XLB': 'Basic Materials', 
            'XLF': 'Financial Services', 
            'XLU': 'Utilities'}


In [None]:
start = datetime.datetime(2010, 1, 1)
end = datetime.date.today() 
etf_data = []
for etf in etf_list:
    df = web.DataReader(etf,'yahoo',  start, end)
    df['sector'] = etf_dict[etf]
    df['etf']    = etf
    df['Date']   = df.index.tolist()
    etf_data.append(df)
    
etf_db = pd.concat(etf_data)
etf_db.reset_index(drop = True, inplace= True)
# etf_df = web.DataReader(etf_list,'yahoo',  start, end)

### Panel data preparation

Join meta-data data to historical prices and filter relevant information:
    1. tickers that belong to the S&P500 index
    2. Firms listed in NYSE and NASDAQ
    3. 2015 to 2019 data

In [None]:
# limiting the meta-data info to the table of historical prices
ticker_sep = ticker_meta.loc[ticker_meta.table == 'SEP', :]
# limiting the tickers to the sp500 universe
ticker_sep_sp500 = ticker_sep.loc[ticker_sep['ticker'].isin(sp500['ticker'].tolist())  , :]
pricing_db_sp500 = px_db.loc[px_db['ticker'].isin(sp500['ticker'].tolist())  , :]
# joining meta-data and historical prices
pricing_db_sp500 = pd.merge(pricing_db_sp500, ticker_sep_sp500[['ticker','sector','scalemarketcap','exchange']], on = 'ticker', how = 'left')
# construct the variable year
pricing_db_sp500['year'] = [str(s)[0:4] for s in pricing_db_sp500.date]

In [None]:
idx = (pricing_db_sp500['year'].isin(['2015','2016','2017', '2018', '2019']) ) & (pricing_db_sp500['exchange'].isin(['NYSE', 'NASDAQ']) ) 

In [None]:
sample_db_sp500 = pricing_db_sp500.loc[idx,:]
sample_db_sp500.reset_index(drop = True, inplace = True)

In [None]:
sample_db_sp500.shape

(827165, 14)

## Feature Engineering

In [None]:
eps = 1e-8
def get_change_perc(X, prior_X, eps):
    return 100*(X-prior_X)/(np.abs(prior_X)+eps)
def get_change(X, prior_X, eps):
    return 10000* (np.log(X) - np.log(prior_X+eps) )


In [None]:
def extract_features(px_db_sample, etf_db):
    
    etf_db['TradeDtKey'] = [ int(str(s)[0:10].replace('-','')) for s in etf_db['Date'] ] 
    etf_db.sort_values(by ='Date', ascending=True, inplace=True)

    etf_db['next_price_close'] = etf_db.groupby('etf')['Adj Close'].shift(-1)
    etf_db['next_price_close_2'] = etf_db.groupby('etf')['Adj Close'].shift(-2)
    etf_db['next_price_close_5'] = etf_db.groupby('etf')['Adj Close'].shift(-5)
    etf_db['next_price_close_10'] = etf_db.groupby('etf')['Adj Close'].shift(-10)
    etf_db['next_price_close_20'] = etf_db.groupby('etf')['Adj Close'].shift(-20)

    etf_db['prior_price_close'] = etf_db.groupby('etf')['Adj Close'].shift(1)
    etf_db['prior_price_close_20'] = etf_db.groupby('etf')['Adj Close'].shift(20)
    etf_db['prior_price_close_60'] = etf_db.groupby('etf')['Adj Close'].shift(60)
    etf_db['prior_price_close_120'] = etf_db.groupby('etf')['Adj Close'].shift(120)
    etf_db['prior_price_close_250'] = etf_db.groupby('etf')['Adj Close'].shift(250)
    

    etf_db['prior_ret'] = get_change(etf_db['Adj Close'] , etf_db['prior_price_close'] , eps)
    etf_db['prior_ret_20'] = get_change(etf_db['Adj Close'] , etf_db['prior_price_close_20'] , eps)
    etf_db['prior_ret_60'] = get_change(etf_db['Adj Close'] , etf_db['prior_price_close_60'] , eps)
    etf_db['prior_ret_120'] = get_change(etf_db['Adj Close'] , etf_db['prior_price_close_120'] , eps)
    etf_db['prior_ret_250'] = get_change(etf_db['Adj Close'] , etf_db['prior_price_close_250'] , eps)

    etf_db['next_ret'] = get_change( etf_db['next_price_close'] ,etf_db['Adj Close'] , eps)
    etf_db['next_ret_2'] = get_change(etf_db['next_price_close_2'] , etf_db['Adj Close'] , eps)
    etf_db['next_ret_5'] = get_change(etf_db['next_price_close_5'] ,etf_db['Adj Close'] ,  eps)
    etf_db['next_ret_10'] = get_change(etf_db['next_price_close_10'] ,etf_db['Adj Close'] ,  eps)
    etf_db['next_ret_20'] = get_change(etf_db['next_price_close_20'] , etf_db['Adj Close'] , eps)
    
    
    
    px_db_sample['Date'] = [ int(str(s)[0:10].replace('-','')) for s in px_db_sample.date ] 
    px_db_sample.sort_values(by ='Date', ascending=True, inplace=True)
    px_db_sample = px_db_sample.loc[px_db_sample['exchange'].isin(['NASDAQ', 'NYSE']), :]
    px_db_sample = px_db_sample.loc[px_db_sample['scalemarketcap'].isin(['3 - Small', '4 - Mid', '5 - Large', '6 - Mega']), :]


    px_db_sample.reset_index(drop = True, inplace =True)
    px_db_sample['px_rank'] = px_db_sample.groupby('Date')['close'].rank(ascending = False, method = 'dense', pct= True)

    px_db_sample['next_px_rank'] = px_db_sample.groupby('ticker')['px_rank'].shift(-1)
    px_db_sample['next_price_close'] = px_db_sample.groupby('ticker')['close'].shift(-1)
    px_db_sample['next_price_close_2'] = px_db_sample.groupby('ticker')['close'].shift(-2)
    px_db_sample['next_price_close_5'] = px_db_sample.groupby('ticker')['close'].shift(-5)
    px_db_sample['next_price_close_10'] = px_db_sample.groupby('ticker')['close'].shift(-10)
    px_db_sample['next_price_close_20'] = px_db_sample.groupby('ticker')['close'].shift(-20)

    px_db_sample['prior_price_close'] = px_db_sample.groupby('ticker')['close'].shift(1)
    px_db_sample['prior_price_close_20'] = px_db_sample.groupby('ticker')['close'].shift(20)
    px_db_sample['prior_price_close_60'] = px_db_sample.groupby('ticker')['close'].shift(60)
    px_db_sample['prior_price_close_120'] = px_db_sample.groupby('ticker')['close'].shift(120)
    px_db_sample['prior_price_close_250'] = px_db_sample.groupby('ticker')['close'].shift(250)


    px_db_sample['range'] = (px_db_sample['high'] - px_db_sample['low'])/ (px_db_sample['close']+eps)
    px_db_sample['px_rank_chg'] = px_db_sample['next_px_rank'] - px_db_sample['px_rank'] 


    for w in [5,10,20,60]:
        px_db_sample['90PctileRank'+'_'+str(w)] = px_db_sample.groupby('ticker')['px_rank'].apply(lambda x:x.rolling(window=w, min_periods=w).quantile(0.9))
        px_db_sample['10PctileRank'+'_'+str(w)] = px_db_sample.groupby('ticker')['px_rank'].apply(lambda x:x.rolling(window=w, min_periods=w).quantile(0.1))
        px_db_sample['MnRank'+'_'+str(w)] = 0.5*(px_db_sample['90PctileRank'+'_'+str(w)]  + px_db_sample['10PctileRank'+'_'+str(w)])
        px_db_sample['RangeRank'+'_'+str(w)] = (px_db_sample['90PctileRank'+'_'+str(w)]  - px_db_sample['10PctileRank'+'_'+str(w)])
        px_db_sample['Rank_Zscore'+'_'+str(w)] =  (px_db_sample['px_rank'] - px_db_sample['MnRank'+'_'+str(w)] )/px_db_sample['RangeRank'+'_'+str(w)]


    px_db_sample['prior_ret'] = get_change(px_db_sample['close'] , px_db_sample['prior_price_close'] , eps)
    px_db_sample['prior_ret_20'] = get_change(px_db_sample['close'] , px_db_sample['prior_price_close_20'] , eps)
    px_db_sample['prior_ret_60'] = get_change(px_db_sample['close'] , px_db_sample['prior_price_close_60'] , eps)
    px_db_sample['prior_ret_120'] = get_change(px_db_sample['close'] , px_db_sample['prior_price_close_120'] , eps)
    px_db_sample['prior_ret_250'] = get_change(px_db_sample['close'] , px_db_sample['prior_price_close_250'] , eps)

    px_db_sample['next_ret'] = get_change( px_db_sample['next_price_close'] ,px_db_sample['close'] , eps)
    px_db_sample['next_ret_2'] = get_change(px_db_sample['next_price_close_2'] , px_db_sample['close'] , eps)
    px_db_sample['next_ret_5'] = get_change(px_db_sample['next_price_close_5'] ,px_db_sample['close'] ,  eps)
    px_db_sample['next_ret_10'] = get_change(px_db_sample['next_price_close_10'] ,px_db_sample['close'] ,  eps)
    px_db_sample['next_ret_20'] = get_change(px_db_sample['next_price_close_20'] , px_db_sample['close'] , eps)
    
    
    px_db_sample['vol_20'] = px_db_sample.groupby('ticker')['prior_ret'].apply(lambda x : x.rolling(20, min_periods = 1).std())
    px_db_sample['vol_60'] = px_db_sample.groupby('ticker')['prior_ret'].apply(lambda x : x.rolling( 60, min_periods = 1).std())
    px_db_sample['vol_90'] = px_db_sample.groupby('ticker')['prior_ret'].apply(lambda x : x.rolling( 90, min_periods = 1).std())
    px_db_sample['vol_120'] = px_db_sample.groupby('ticker')['prior_ret'].apply(lambda x : x.rolling( 120, min_periods = 1).std())
    px_db_sample['vol_250'] = px_db_sample.groupby('ticker')['prior_ret'].apply(lambda x : x.rolling( 250, min_periods = 1).std())



    px_db_sample['DollarVolume'] = px_db_sample['close'] * px_db_sample['volume']
    px_db_sample['adv90'] = px_db_sample.groupby('ticker')['volume'].apply(lambda x : x.rolling( 90, min_periods = 1).mean())/1000000
    px_db_sample['dollar_adv90'] = px_db_sample.groupby('ticker')['DollarVolume'].apply(lambda x: x.rolling(90, min_periods = 1).mean())/1000000

    px_db_sample['FiftyTwoWk_High'] = px_db_sample.groupby('ticker')['close'].apply(lambda x: x.rolling(window= 260, min_periods = 5).max()) 
    px_db_sample['FiftyTwoWk_Low'] = px_db_sample.groupby('ticker')['close'].apply(lambda x: x.rolling(window= 260, min_periods = 5).min()) 

    px_db_sample['px_to_52wk_high'] = get_change(px_db_sample['close'] , px_db_sample['FiftyTwoWk_High'] , eps)
    px_db_sample['px_to_52wk_low'] = get_change(px_db_sample['close'] , px_db_sample['FiftyTwoWk_Low'] , eps)
    
    etf_db_cols = [ 'sector',
                     'etf',
                     
                     'TradeDtKey',
                     'prior_ret',
                     'prior_ret_20',
                     'prior_ret_60',
                     'prior_ret_120',
                     'prior_ret_250',
                     'next_ret',
                     'next_ret_2',
                     'next_ret_5',
                     'next_ret_10',
                     'next_ret_20']
    market_db = etf_db.loc[etf_db['etf'].isin(['SPY']),:][etf_db_cols]
    market_db.rename(columns = dict(zip(etf_db_cols[3:], ['market_'+s for s in etf_db_cols[3:] ])), inplace = True)
    
    sector_db = etf_db.loc[etf_db['etf'].isin(sector_etf_list),:][etf_db_cols]
    sector_db.rename(columns = dict(zip(etf_db_cols[3:], ['sector_'+s for s in etf_db_cols[3:] ])), inplace = True)    
    
    px_db_sample = pd.merge(px_db_sample,
                            sector_db , 
                            left_on = ['Date' , 'sector'], 
                            right_on = ['TradeDtKey', 'sector'] ,
                           how = 'left')
    px_db_sample = pd.merge(px_db_sample,
                            market_db[['TradeDtKey',
                                     'market_prior_ret',
                                     'market_prior_ret_20',
                                     'market_prior_ret_60',
                                     'market_prior_ret_120',
                                     'market_prior_ret_250',
                                     'market_next_ret',
                                     'market_next_ret_2',
                                     'market_next_ret_5',
                                     'market_next_ret_10',
                                     'market_next_ret_20']] , 
                            left_on = ['TradeDtKey'  ], 
                            right_on = ['TradeDtKey' ] ,
                           how = 'left')    
    
    feat_db_gp = px_db_sample.groupby('ticker')
    
    Beta_trail = []
    
    for k,v in feat_db_gp:
        v_tmp = v[['TradeDtKey', 'ticker', 'prior_ret', 'market_prior_ret', 'sector_prior_ret']].copy()
        v_tmp.dropna(inplace = True)
        v_tmp.reset_index(drop = True, inplace = True)
        N = np.minimum(250, v_tmp.shape[0])
        
        if N>10:
            beta = v['prior_ret'].rolling(window = N, min_periods=10).cov(v['market_prior_ret'])/v['market_prior_ret'].rolling(window = N, min_periods=10).var()
            sec_beta = v['prior_ret'].rolling(window = N, min_periods=10).cov(v['sector_prior_ret'])/v['sector_prior_ret'].rolling(window = N, min_periods=10).var()
        else:
            beta = v['prior_ret'] * 0
            sec_beta = v['prior_ret'] * 0
        v_df = pd.DataFrame({'ticker': k,
                            'TradeDtKey': v['TradeDtKey'].tolist() ,
                            'Beta': beta.tolist(),
                            'SecBeta':sec_beta.tolist() })
        
        Beta_trail.append(v_df)
        
    Beta_trail_df  = pd.concat(Beta_trail)
    Beta_trail_df.reset_index(drop = True, inplace = True)
    
    px_db_sample = pd.merge(px_db_sample, 
                            Beta_trail_df,
                           
                            left_on = ['ticker' , 'TradeDtKey'], 
                            right_on = ['ticker', 'TradeDtKey'] ,
                           how = 'left')    
    
    px_db_sample['market_relative_prior_ret'] = px_db_sample['prior_ret'] -px_db_sample['Beta']*px_db_sample['market_prior_ret']
    px_db_sample['market_relative_prior_ret_20'] = px_db_sample['prior_ret_20'] -px_db_sample['Beta']*px_db_sample['market_prior_ret_20']
    px_db_sample['market_relative_prior_ret_60'] = px_db_sample['prior_ret_60'] -px_db_sample['Beta']*px_db_sample['market_prior_ret_60']
    px_db_sample['market_relative_prior_ret_120'] = px_db_sample['prior_ret_120'] -px_db_sample['Beta']*px_db_sample['market_prior_ret_120']
    px_db_sample['market_relative_prior_ret_250'] = px_db_sample['prior_ret_250'] -px_db_sample['Beta']*px_db_sample['market_prior_ret_250']

    px_db_sample['sector_relative_prior_ret'] = px_db_sample['prior_ret'] -px_db_sample['SecBeta']*px_db_sample['sector_prior_ret']
    px_db_sample['sector_relative_prior_ret_20'] = px_db_sample['prior_ret_20'] -px_db_sample['SecBeta']*px_db_sample['sector_prior_ret_20']
    px_db_sample['sector_relative_prior_ret_60'] = px_db_sample['prior_ret_60'] -px_db_sample['SecBeta']*px_db_sample['sector_prior_ret_60']
    px_db_sample['sector_relative_prior_ret_120'] = px_db_sample['prior_ret_120'] -px_db_sample['SecBeta']*px_db_sample['sector_prior_ret_120']
    px_db_sample['sector_relative_prior_ret_250'] = px_db_sample['prior_ret_250'] -px_db_sample['SecBeta']*px_db_sample['sector_prior_ret_250']
    
    
    
    
    return px_db_sample






In [None]:
feat_db = extract_features(sample_db_sp500, etf_db) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
feat_db.fillna(0, inplace = True)
feat_db.replace(+np.Inf,0,inplace=True)
feat_db.replace(-np.Inf,0,inplace=True)

### Potential features 

Potential features/drivers that can be used to predict/forecast future return may include:


 
 'MnRank_5',
 'RangeRank_5',
 'Rank_Zscore_5',
 
 'MnRank_10',
 'RangeRank_10',
 'Rank_Zscore_10',
 
 'MnRank_20',
 'RangeRank_20',
 'Rank_Zscore_20',
 
 'MnRank_60',
 'RangeRank_60',
 'Rank_Zscore_60',
 'prior_ret',
 'prior_ret_20',
 'prior_ret_60',
 'prior_ret_120',
 'prior_ret_250',
 
 'vol_20',
 'vol_60',
 'vol_90',
 'vol_120',
 'vol_250',
 
 'adv90',
 'dollar_adv90',
 
 'px_to_52wk_high',
 'px_to_52wk_low',
 
 
 'sector_prior_ret',
 'sector_prior_ret_20',
 'sector_prior_ret_60',
 'sector_prior_ret_120',
 'sector_prior_ret_250',
 
 
 
 'market_prior_ret',
 'market_prior_ret_20',
 'market_prior_ret_60',
 'market_prior_ret_120',
 'market_prior_ret_250',
 
 'Beta',
 'SecBeta',
 'market_relative_prior_ret',
 'market_relative_prior_ret_20',
 'market_relative_prior_ret_60',
 'market_relative_prior_ret_120',
 'market_relative_prior_ret_250',
 'sector_relative_prior_ret',
 'sector_relative_prior_ret_20',
 'sector_relative_prior_ret_60',
 'sector_relative_prior_ret_120',
 'sector_relative_prior_ret_250'

### Potential response variables

Potential response variables may include:
    
'next_ret',
'next_ret_2',
'next_ret_5',
'next_ret_10',
'next_ret_20'
    
But can also include market relative returns or sector relative returns.    
    

## Jumps Identification

#### Close2Open returns

In [None]:
def close2open(df):
  s=np.log(df["open"]/df.shift(1)["close"])
  s.rename("Close2Open",inplace=True)
  return s


In [None]:
feat_db["Close2Open"]=np.nan

for ticker in feat_db["ticker"].unique():
  df=close2open(feat_db[feat_db["ticker"]==ticker])
  feat_db.update(df)

#### PV30

In [None]:
def log_high_low(df):
  ind=df.index
  return pd.DataFrame(np.log(df["high"]/df["low"]),index=ind)

def PV30(df):
  df_ret=log_high_low(df).rolling(30,min_periods = 1).std()
  df_ret.columns=["PV30"]
  return df_ret

In [None]:
feat_db["PV30"]=np.nan

for ticker in feat_db["ticker"].unique():
  df=PV30(feat_db[feat_db["ticker"]==ticker])
  feat_db.update(df)

#### Median PV30

In [None]:
def medianPV30(df):
  df_ret=df["PV30"].rolling(30,min_periods = 1).median()
  df_ret=pd.DataFrame(df_ret)
  df_ret.columns=["median PV30"]
  return df_ret

In [None]:
feat_db["median PV30"]=np.nan

for ticker in feat_db["ticker"].unique():
  df=medianPV30(feat_db[feat_db["ticker"]==ticker])
  feat_db.update(df)

#### ^VIX

In [None]:
start = datetime.datetime(2010, 1, 1)
end = datetime.date.today()
VIX = web.DataReader("^VIX",'yahoo',  start, end)
VIX.index=pd.to_datetime(VIX.index, format='%Y-%m-%d', errors='ignore')
VIX["date"]=VIX.index
VIX.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,21.68,20.030001,21.68,20.040001,0,20.040001,2010-01-04
2010-01-05,20.129999,19.34,20.049999,19.35,0,19.35,2010-01-05
2010-01-06,19.68,18.77,19.59,19.16,0,19.16,2010-01-06
2010-01-07,19.709999,18.700001,19.68,19.059999,0,19.059999,2010-01-07
2010-01-08,19.27,18.110001,19.27,18.129999,0,18.129999,2010-01-08


In [None]:
VIX["median VIX 30 days"]=VIX["Adj Close"].rolling(30,min_periods = 1).median()

In [None]:
VIX_subset=VIX[["date","Close","median VIX 30 days"]]
VIX_subset.columns=["date","VIX","median VIX 30 days"]

In [None]:
feat_db=pd.merge(feat_db, VIX_subset, how='outer', on='date')

In [None]:
feat_db.dropna(inplace=True)

In [None]:
feat_db.tail()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,sector,scalemarketcap,exchange,year,Date,px_rank,next_px_rank,next_price_close,next_price_close_2,next_price_close_5,next_price_close_10,next_price_close_20,prior_price_close,prior_price_close_20,prior_price_close_60,prior_price_close_120,prior_price_close_250,range,px_rank_chg,90PctileRank_5,10PctileRank_5,MnRank_5,RangeRank_5,Rank_Zscore_5,90PctileRank_10,10PctileRank_10,MnRank_10,RangeRank_10,Rank_Zscore_10,90PctileRank_20,...,px_to_52wk_low,etf,TradeDtKey,sector_prior_ret,sector_prior_ret_20,sector_prior_ret_60,sector_prior_ret_120,sector_prior_ret_250,sector_next_ret,sector_next_ret_2,sector_next_ret_5,sector_next_ret_10,sector_next_ret_20,market_prior_ret,market_prior_ret_20,market_prior_ret_60,market_prior_ret_120,market_prior_ret_250,market_next_ret,market_next_ret_2,market_next_ret_5,market_next_ret_10,market_next_ret_20,Beta,SecBeta,market_relative_prior_ret,market_relative_prior_ret_20,market_relative_prior_ret_60,market_relative_prior_ret_120,market_relative_prior_ret_250,sector_relative_prior_ret,sector_relative_prior_ret_20,sector_relative_prior_ret_60,sector_relative_prior_ret_120,sector_relative_prior_ret_250,Close2Open,PV30,median PV30,VIX,median VIX 30 days
2153701,FCX,2019-12-31,12.98,13.15,12.92,13.12,11716373.0,0.0,13.12,2020-05-01,Basic Materials,5 - Large,NYSE,2019,20191231.0,0.94859,0.0,0.0,0.0,0.0,0.0,0.0,13.01,11.45,8.83,11.1,10.07,0.01753,0.0,0.948743,0.947107,0.947925,0.001637,0.406691,0.94901,0.946932,0.947971,0.002078,0.298104,0.94901,...,4328.958398,XLB,20191231.0,73.534275,382.657263,923.609523,712.595269,2397.861093,-117.917207,-280.684429,-300.798358,-149.268548,-413.851665,24.263135,371.723568,942.515524,821.4874,2948.469634,93.084108,17.074148,80.149258,194.760548,179.208913,1.868948,1.603378,38.848367,666.748546,2198.315272,136.609595,-2864.765362,-33.708354,747.936194,2478.932257,529.366988,-1198.907564,-0.002309,0.010229,0.012756,13.78,12.725
2153702,FDX,2019-12-31,149.22,151.57,148.75,151.21,2153367.0,0.0,151.21,2020-05-01,Industrials,5 - Large,NYSE,2019,20191231.0,0.223881,0.0,0.0,0.0,0.0,0.0,0.0,150.14,158.03,143.0,162.6,157.19,0.01865,0.0,0.225092,0.21925,0.222171,0.005842,0.29259,0.2295,0.215443,0.222471,0.014057,0.100258,0.227415,...,885.938128,XLI,20191231.0,-6.135062,145.791067,796.051492,661.548192,2810.892954,187.261877,167.968265,184.852911,254.496949,189.671196,24.263135,371.723568,942.515524,821.4874,2948.469634,93.084108,17.074148,80.149258,194.760548,179.208913,1.4694,1.371427,35.361816,-987.363544,-826.682722,-1933.329652,-4720.338246,79.427856,-641.094641,-533.476502,-1633.500768,-4242.790022,-0.006146,0.009344,0.006871,13.78,12.725
2153703,FE,2019-12-31,48.49,48.61,48.18,48.6,2929696.0,0.0,48.6,2020-05-01,Utilities,5 - Large,NYSE,2019,20191231.0,0.688226,0.0,0.0,0.0,0.0,0.0,0.0,48.49,47.52,48.23,43.59,36.71,0.008848,0.0,0.695228,0.688226,0.691727,0.007002,-0.5,0.693429,0.686313,0.689871,0.007116,-0.231201,0.693069,...,3045.589425,XLU,20191231.0,38.762464,388.995982,54.612029,729.698826,2481.041029,-126.140747,-105.789247,-115.17677,194.627054,697.766052,24.263135,371.723568,942.515524,821.4874,2948.469634,93.084108,17.074148,80.149258,194.760548,179.208913,0.312841,0.979639,15.068886,108.438083,-218.434789,830.962481,1883.340341,-15.313836,-156.347186,22.922878,373.116013,375.218091,0.0,0.003407,0.004588,13.78,12.725
2153704,F,2019-12-31,9.25,9.33,9.24,9.3,32342009.0,0.0,9.3,2020-05-01,Consumer Cyclical,5 - Large,NYSE,2019,20191231.0,0.965174,0.0,0.0,0.0,0.0,0.0,0.0,9.25,9.01,8.74,10.19,7.78,0.009677,0.0,0.965174,0.963744,0.964459,0.00143,0.5,0.965191,0.962084,0.963637,0.003108,0.494452,0.965174,...,1979.265536,XLY,20191231.0,14.362293,355.3918,539.890839,287.735919,2642.423209,118.10113,32.636667,74.668289,78.625087,-33.543752,24.263135,371.723568,942.515524,821.4874,2948.469634,93.084108,17.074148,80.149258,194.760548,179.208913,1.072312,1.038522,27.890836,-81.810192,-389.628164,-1794.814886,-1377.09736,38.992912,-52.289085,60.353342,-1212.744688,-959.635189,0.0,0.005096,0.005374,13.78,12.725
2153705,DG,2019-12-31,157.17,157.45,155.23,155.98,1097189.0,0.0,155.98,2020-05-01,Consumer Defensive,5 - Large,NYSE,2019,20191231.0,0.20398,0.0,0.0,0.0,0.0,0.0,0.0,157.35,154.68,160.97,140.36,107.84,0.014233,0.0,0.205355,0.20398,0.204668,0.001375,-0.5,0.205702,0.200564,0.203133,0.005138,0.164909,0.206271,...,4561.242442,XLP,20191231.0,9.531811,215.853029,335.653213,682.121515,2541.147785,-79.706836,-95.725099,-114.980822,88.524301,145.021778,24.263135,371.723568,942.515524,821.4874,2948.469634,93.084108,17.074148,80.149258,194.760548,179.208913,0.637505,0.583349,-102.916177,-153.282524,-915.760952,531.46975,1811.126053,-93.008667,-42.224337,-510.705086,657.257657,2208.41593,-0.001145,0.00584,0.007188,13.78,12.725


#### JUMP

In [None]:
#feat_db=pd.read_csv("/content/drive/MyDrive/data/feat_db.csv")

In [None]:
difference=pd.DataFrame(feat_db["Close2Open"]-feat_db["median PV30"]*feat_db["median VIX 30 days"]/feat_db["VIX"].shift(1))

In [None]:
feat_db["JUMP"]=np.sign(difference)

In [None]:
JUMP_subset=feat_db[feat_db["JUMP"]==1]

In [None]:
JUMP_subset.head()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,sector,scalemarketcap,exchange,year,Date,px_rank,next_px_rank,next_price_close,next_price_close_2,next_price_close_5,next_price_close_10,next_price_close_20,prior_price_close,prior_price_close_20,prior_price_close_60,prior_price_close_120,prior_price_close_250,range,px_rank_chg,90PctileRank_5,10PctileRank_5,MnRank_5,RangeRank_5,Rank_Zscore_5,90PctileRank_10,10PctileRank_10,MnRank_10,RangeRank_10,Rank_Zscore_10,90PctileRank_20,...,etf,TradeDtKey,sector_prior_ret,sector_prior_ret_20,sector_prior_ret_60,sector_prior_ret_120,sector_prior_ret_250,sector_next_ret,sector_next_ret_2,sector_next_ret_5,sector_next_ret_10,sector_next_ret_20,market_prior_ret,market_prior_ret_20,market_prior_ret_60,market_prior_ret_120,market_prior_ret_250,market_next_ret,market_next_ret_2,market_next_ret_5,market_next_ret_10,market_next_ret_20,Beta,SecBeta,market_relative_prior_ret,market_relative_prior_ret_20,market_relative_prior_ret_60,market_relative_prior_ret_120,market_relative_prior_ret_250,sector_relative_prior_ret,sector_relative_prior_ret_20,sector_relative_prior_ret_60,sector_relative_prior_ret_120,sector_relative_prior_ret_250,Close2Open,PV30,median PV30,VIX,median VIX 30 days,JUMP
811,EQIX,2015-01-02,228.51,229.56,224.28,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.02651,224.32,219.43,216.89,220.24,218.0,0.0,0.0,0.0,0.0,0.0,0.023296,-0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008173,0.0,0.0,17.790001,14.65,1.0
812,EQIX,2015-01-02,228.51,229.56,224.28,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.02651,224.32,219.43,216.89,220.24,218.0,0.0,0.0,0.0,0.0,0.0,0.023296,-0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008173,0.0,0.0,17.790001,14.65,1.0
813,EQIX,2015-01-02,228.51,229.56,224.28,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.02651,224.32,219.43,216.89,220.24,218.0,0.0,0.0,0.0,0.0,0.0,0.023296,-0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008173,0.0,0.0,17.790001,14.65,1.0
814,EQIX,2015-01-02,228.51,229.56,224.28,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.02651,224.32,219.43,216.89,220.24,218.0,0.0,0.0,0.0,0.0,0.0,0.023296,-0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008173,0.0,0.0,17.790001,14.65,1.0
815,EQIX,2015-01-02,228.51,229.56,224.28,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.02651,224.32,219.43,216.89,220.24,218.0,0.0,0.0,0.0,0.0,0.0,0.023296,-0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008173,0.0,0.0,17.790001,14.65,1.0


In [None]:
#JUMP_subset.to_csv("/content/drive/MyDrive/data/jump_without_returns.csv")

In [None]:
#JUMP_subset=pd.read_csv("/content/drive/MyDrive/data/jump_without_returns.csv")

## Predicting JUMP's Response

### Forward 1,2,5,10 and 20 day returns

In [None]:
def forward_returns(df,window):
  ch="forward returns "+str(window)
  df_ret=np.log(df.shift(-1*window)["close"]/df["close"])
  df_ret=pd.DataFrame(df_ret)
  df_ret.columns=[ch]
  return df_ret

In [None]:
JUMP_subset["forward returns 1"]=np.nan
JUMP_subset["forward returns 2"]=np.nan
JUMP_subset["forward returns 5"]=np.nan
JUMP_subset["forward returns 10"]=np.nan
JUMP_subset["forward returns 20"]=np.nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [None]:
for ticker in JUMP_subset["ticker"].unique():
  df1=forward_returns(JUMP_subset[JUMP_subset["ticker"]==ticker],1)

  JUMP_subset.update(df1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
for ticker in JUMP_subset["ticker"].unique():
  df2=forward_returns(JUMP_subset[JUMP_subset["ticker"]==ticker],2)

  JUMP_subset.update(df2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
for ticker in JUMP_subset["ticker"].unique():
  df5=forward_returns(JUMP_subset[JUMP_subset["ticker"]==ticker],5)

  JUMP_subset.update(df5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
for ticker in JUMP_subset["ticker"].unique():
  df10=forward_returns(JUMP_subset[JUMP_subset["ticker"]==ticker],10)

  JUMP_subset.update(df10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
for ticker in JUMP_subset["ticker"].unique():
  df20=forward_returns(JUMP_subset[JUMP_subset["ticker"]==ticker],20)

  JUMP_subset.update(df20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
JUMP_subset.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Categorization

In [None]:
JUMP_subset["forward returns 1"]=np.sign(JUMP_subset["forward returns 1"])
JUMP_subset["forward returns 2"]=np.sign(JUMP_subset["forward returns 2"])
JUMP_subset["forward returns 5"]=np.sign(JUMP_subset["forward returns 5"])
JUMP_subset["forward returns 10"]=np.sign(JUMP_subset["forward returns 10"])
JUMP_subset["forward returns 20"]=np.sign(JUMP_subset["forward returns 20"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [None]:
JUMP_subset

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,sector,scalemarketcap,exchange,year,Date,px_rank,next_px_rank,next_price_close,next_price_close_2,next_price_close_5,next_price_close_10,next_price_close_20,prior_price_close,prior_price_close_20,prior_price_close_60,prior_price_close_120,prior_price_close_250,range,px_rank_chg,90PctileRank_5,10PctileRank_5,MnRank_5,RangeRank_5,Rank_Zscore_5,90PctileRank_10,10PctileRank_10,MnRank_10,RangeRank_10,Rank_Zscore_10,90PctileRank_20,...,sector_prior_ret_120,sector_prior_ret_250,sector_next_ret,sector_next_ret_2,sector_next_ret_5,sector_next_ret_10,sector_next_ret_20,market_prior_ret,market_prior_ret_20,market_prior_ret_60,market_prior_ret_120,market_prior_ret_250,market_next_ret,market_next_ret_2,market_next_ret_5,market_next_ret_10,market_next_ret_20,Beta,SecBeta,market_relative_prior_ret,market_relative_prior_ret_20,market_relative_prior_ret_60,market_relative_prior_ret_120,market_relative_prior_ret_250,sector_relative_prior_ret,sector_relative_prior_ret_20,sector_relative_prior_ret_60,sector_relative_prior_ret_120,sector_relative_prior_ret_250,Close2Open,PV30,median PV30,VIX,median VIX 30 days,JUMP,forward returns 1,forward returns 2,forward returns 5,forward returns 10,forward returns 20
811,EQIX,2015-01-02,228.51,229.560,224.280,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.026510,224.32,219.43,216.89,220.24,218.00,0.00,0.00,0.00,0.00,0.00,0.023296,-0.000316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008173,0.000000,0.000000,17.790001,14.650,1.0,0.0,0.0,0.0,0.0,0.0
812,EQIX,2015-01-02,228.51,229.560,224.280,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.026510,224.32,219.43,216.89,220.24,218.00,0.00,0.00,0.00,0.00,0.00,0.023296,-0.000316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008173,0.000000,0.000000,17.790001,14.650,1.0,0.0,0.0,0.0,0.0,0.0
813,EQIX,2015-01-02,228.51,229.560,224.280,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.026510,224.32,219.43,216.89,220.24,218.00,0.00,0.00,0.00,0.00,0.00,0.023296,-0.000316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008173,0.000000,0.000000,17.790001,14.650,1.0,0.0,0.0,0.0,0.0,0.0
814,EQIX,2015-01-02,228.51,229.560,224.280,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.026510,224.32,219.43,216.89,220.24,218.00,0.00,0.00,0.00,0.00,0.00,0.023296,-0.000316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008173,0.000000,0.000000,17.790001,14.650,1.0,0.0,0.0,0.0,0.0,0.0
815,EQIX,2015-01-02,228.51,229.560,224.280,226.65,383808.0,0.0,226.65,2018-06-13,Real Estate,5 - Large,NASDAQ,2015,20150102.0,0.026826,0.026510,224.32,219.43,216.89,220.24,218.00,0.00,0.00,0.00,0.00,0.00,0.023296,-0.000316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008173,0.000000,0.000000,17.790001,14.650,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2119795,RDS.A,2019-10-11,57.80,58.205,57.770,57.85,2365519.0,0.0,57.85,2020-05-01,Energy,6 - Mega,NYSE,2019,20191011.0,0.586207,0.588816,57.82,57.54,57.72,59.68,60.25,57.26,56.87,63.13,65.00,65.26,0.007519,0.002609,0.584511,0.580434,0.582472,0.004077,0.915914,0.586264,0.580269,0.583266,0.005995,0.490493,0.591292,...,-1500.983713,-1966.011695,-5.183723,36.209562,-149.672408,277.676756,472.342478,103.134715,-114.971045,-39.627103,210.011546,907.620814,-11.142959,87.373575,56.879273,177.968303,418.423550,0.698609,0.678014,30.460689,251.174738,-845.742535,-1312.054138,-1839.330524,11.723239,472.812393,-418.902127,-147.650060,127.725250,0.009386,0.003576,0.004273,15.580000,16.155,1.0,1.0,1.0,1.0,1.0,1.0
2121524,UN,2019-10-16,59.50,59.785,59.300,59.72,1422886.0,0.0,59.72,2020-05-01,Consumer Defensive,5 - Large,NYSE,2019,20191016.0,0.577741,0.569558,60.46,60.50,59.04,59.05,59.68,58.46,60.33,60.45,59.26,53.71,0.008121,-0.008183,0.583553,0.571523,0.577538,0.012029,0.016911,0.581086,0.570467,0.575776,0.010618,0.185055,0.580645,...,707.863830,1476.305929,46.296730,66.072022,148.054445,149.686489,162.739656,-16.073650,-44.002217,-8.403062,263.230781,817.163042,29.447437,-14.420653,49.475679,190.533503,352.299857,0.365802,0.683541,219.121905,-85.529090,-118.422187,-18.966104,761.757621,206.442274,-81.045124,-198.163072,-406.529954,51.561414,0.017634,0.002960,0.003011,13.680000,15.450,1.0,1.0,-1.0,1.0,-1.0,-1.0
2122187,RDS.A,2019-10-17,58.19,58.325,57.907,58.16,2543197.0,0.0,58.16,2020-05-01,Energy,6 - Mega,NYSE,2019,20191017.0,0.585925,0.596026,57.72,58.28,59.57,57.97,59.13,57.39,58.31,63.17,63.13,64.70,0.007187,0.010102,0.591991,0.586038,0.589014,0.005954,-0.518959,0.589345,0.580566,0.584955,0.008779,0.110415,0.589040,...,-1250.614641,-1948.869235,-52.465931,124.806976,293.911833,109.291384,391.670649,29.447437,-13.890498,-25.840839,277.013955,992.074153,-43.868090,23.694564,36.355074,134.416576,337.399367,0.703705,0.675079,112.555315,-15.982916,-808.132403,-1014.918567,-1763.760754,128.566703,418.533839,-198.803172,24.280495,250.006085,0.013843,0.003486,0.004207,13.790000,15.295,1.0,1.0,1.0,1.0,1.0,1.0
2122301,UN,2019-10-17,60.63,60.630,60.340,60.46,3920417.0,0.0,60.46,2020-05-01,Consumer Defensive,5 - Large,NYSE,2019,20191017.0,0.569558,0.577815,60.50,59.50,59.03,59.24,59.12,59.72,60.34,59.89,59.31,52.70,0.004797,0.008256,0.583553,0.570963,0.577258,0.012590,-0.611601,0.581086,0.570398,0.575742,0.010687,-0.578631,0.580645,...,741.882318,1535.685478,19.775291,52.648957,135.989506,88.685352,113.180910,29.447437,-13.890498,-25.840839,277.013955,992.074153,-43.868090,23.694564,36.355074,134.416576,337.399367,0.358986,0.683648,112.578934,24.854055,104.000941,92.596372,1017.524140,91.499488,7.677202,-59.318922,-315.145899,323.796680,0.015123,0.003054,0.003011,13.790000,15.295,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


Our data finally!

In [None]:
#JUMP_subset.to_csv("/content/drive/MyDrive/data/jump_final.csv")

## Modeling

In [None]:
JUMP_subset=pd.read_csv("/content/drive/MyDrive/jump_final.csv")

#### Final touches

In [None]:
JUMP_subset=JUMP_subset[JUMP_subset["forward returns 1"]!=0]
JUMP_subset=JUMP_subset[JUMP_subset["forward returns 2"]!=0]
JUMP_subset=JUMP_subset[JUMP_subset["forward returns 5"]!=0]
JUMP_subset=JUMP_subset[JUMP_subset["forward returns 10"]!=0]
JUMP_subset=JUMP_subset[JUMP_subset["forward returns 20"]!=0]

In [None]:
train_val_threshold=664978

In [None]:
to_remove=["ticker","date","sector","lastupdated","scalemarketcap","exchange","year","Date","JUMP","etf","Unnamed: 0"]
JUMP_subset=JUMP_subset.drop(to_remove,axis=1)

In [None]:
y=JUMP_subset.iloc[:,-5:]
X=JUMP_subset.iloc[:,:-5]

In [None]:
X_train=X.loc[:train_val_threshold]
X_test=X.loc[train_val_threshold:]
y_train=y.loc[:train_val_threshold]
y_test=y.loc[train_val_threshold:]

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
lr=LogisticRegression()

for i in range(5):
  lr.fit(X_train,y_train.iloc[:,i])
  pred=lr.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.57      0.73     14317

    accuracy                           0.57     14317
   macro avg       0.50      0.29      0.36     14317
weighted avg       1.00      0.57      0.73     14317





  _warn_prf(average, modifier, msg_start, len(result))


Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.52      0.69     14317

    accuracy                           0.52     14317
   macro avg       0.50      0.26      0.34     14317
weighted avg       1.00      0.52      0.69     14317





  _warn_prf(average, modifier, msg_start, len(result))


Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         1
         1.0       1.00      0.49      0.66     14316

    accuracy                           0.49     14317
   macro avg       0.50      0.25      0.33     14317
weighted avg       1.00      0.49      0.66     14317



Classification report, forward_returns  10
              precision    recall  f1-score   support

        -1.0       0.00      1.00      0.00         1
         1.0       1.00      0.49      0.66     14316

    accuracy                           0.49     14317
   macro avg       0.50      0.75      0.33     14317
weighted avg       1.00      0.49      0.66     14317



Classification report, forward_returns  20
              precision    recall  f1-score   support

        -1.0       0.00      1.00      0.00         2
         1.0       1.00      0.53      0.69     14315

    accuracy                           0.53     14

#### L1-Regularized Logistic Regression

In [None]:
l1=LogisticRegression(penalty='l1',solver="saga",l1_ratio=0.1)

for i in range(5):
  l1.fit(X_train,y_train.iloc[:,i])
  pred=l1.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.57      0.73     12360

    accuracy                           0.57     12360
   macro avg       0.50      0.29      0.36     12360
weighted avg       1.00      0.57      0.73     12360





  _warn_prf(average, modifier, msg_start, len(result))
  "(penalty={})".format(self.penalty))


Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.53      0.69     12360

    accuracy                           0.53     12360
   macro avg       0.50      0.26      0.35     12360
weighted avg       1.00      0.53      0.69     12360





  _warn_prf(average, modifier, msg_start, len(result))
  "(penalty={})".format(self.penalty))


Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.51      0.67     12360

    accuracy                           0.51     12360
   macro avg       0.50      0.25      0.34     12360
weighted avg       1.00      0.51      0.67     12360





  _warn_prf(average, modifier, msg_start, len(result))
  "(penalty={})".format(self.penalty))


Classification report, forward_returns  10
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.50      0.66     12360

    accuracy                           0.50     12360
   macro avg       0.50      0.25      0.33     12360
weighted avg       1.00      0.50      0.66     12360



Classification report, forward_returns  20
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.54      0.70     12360

    accuracy                           0.54     12360
   macro avg       0.50      0.27      0.35     12360
weighted avg       1.00      0.54      0.70     12360





  _warn_prf(average, modifier, msg_start, len(result))


#### L2-Regularized Logistic Regression

In [None]:
l2=LogisticRegression(penalty='l2')

for i in range(5):
  l2.fit(X_train,y_train.iloc[:,i])
  pred=l2.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  0
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.60      0.75       237

    accuracy                           0.60       237
   macro avg       0.50      0.30      0.37       237
weighted avg       1.00      0.60      0.75       237



Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         1.0       1.00      0.55      0.71       237

    accuracy                           0.55       237
   macro avg       0.50      0.27      0.35       237
weighted avg       1.00      0.55      0.71       237



Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.09      0.50      0.16        20
         1.0       0.92      0.55      0.69       217

    accuracy                           0.54       23

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Artificial Neural Network (Shallow)

In [None]:
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input,Flatten
from keras.layers import Reshape

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(102,)),
    tf.keras.layers.Dense(units=256, activation=tf.nn.relu),
    tf.keras.layers.Dense(units=1, activation=tf.nn.sigmoid)
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 102)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               26368     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 26,625
Trainable params: 26,625
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history=model.fit(
    X_train,
    y_train.iloc[:,0],
    epochs=50,
    validation_data=(X_test,y_test.iloc[:,0])
)

#### Pytorch Sequence model + Autoencoder for feature compression: 

###### Setting for a sequence model & scaling:

Here we transform the data frame to the required shape for the LSTM netowrks (batch_size,sequence length, number of features)

In [None]:
def add_window(data, windows_size, encoder_feat) : 
    dataset = data.copy() 
    for feat in encoder_feat : 
        for time_step in range(1,windows_size) : 
            dataset[f"{feat}(t= N - {time_step})"] = dataset[feat].shift(time_step)
    return dataset

In [None]:
def create_window(data, windows_size, encoder_feat):
    is_start = 0
    for feat in encoder_feat:
        col = [f"{feat}(t= N - {i})" for i in range(windows_size - 1, 0, -1)]
        x = data[col].values
        x = np.expand_dims(x, axis=2)
        if is_start != 0:
            input_data = np.concatenate([input_data, x], axis=2)
        else:
            input_data = x
            is_start = 1
    return input_data

In [None]:
window_size=12
encoder_feat=X_train.columns
train_data = add_window(X_train,window_size,encoder_feat)
test_data = add_window(X_test,window_size,encoder_feat)
train_data.dropna( inplace = True)
test_data.dropna( inplace = True)
train_data  = create_window(train_data,window_size,encoder_feat)
train_labels = y_train[window_size-1:]
test_data  = create_window(test_data,window_size,encoder_feat)
test_labels = y_test[window_size-1:]

Scaling features

A function to scale multi dimensional arrays

In [None]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
class NDStandardScaler(TransformerMixin):
    def __init__(self, **kwargs):
        self._scaler = MinMaxScaler(feature_range=(0, 1))
        self._orig_shape = None

    def fit(self, X, **kwargs):
        X = np.array(X)
        # Save the original shape to reshape the flattened X later
        # back to its original shape
        if len(X.shape) > 1:
            self._orig_shape = X.shape[1:]
        X = self._flatten(X)
        self._scaler.fit(X, **kwargs)
        return self

    def transform(self, X, **kwargs):
        X = np.array(X)
        X = self._flatten(X)
        X = self._scaler.transform(X, **kwargs)
        X = self._reshape(X)
        return X

    def _flatten(self, X):
        # Reshape X to <= 2 dimensions
        if len(X.shape) > 2:
            n_dims = np.prod(self._orig_shape)
            X = X.reshape(-1, n_dims)
        return X

    def _reshape(self, X):
        # Reshape X back to it's original shape
        if len(X.shape) >= 2:
            X = X.reshape(-1, *self._orig_shape)
        return X

In [None]:
scaler = NDStandardScaler()
scaler.fit(train_data)
train_data=scaler.transform(train_data)
test_data=scaler.transform(test_data)

##### Modelling the Neural Network

In [None]:
pip install transformers

In [None]:
import torch
import torchvision
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import DataLoader# Gives easier dataset managment and creates mini batches
import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms  # Transformations we can perform on our dataset
from tqdm import tqdm
from torch.cuda import amp
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
class inst_dataset() : 
    def __init__(self,trg=None,data=None): 
        self.trg=trg
        self.data=data
        if trg is None:
          self.feat = data
        else:
          self.target = trg.values 
          self.feat = data
        
    def __len__(self) : 
        l= self.data.shape[0]
        return l 
    def __getitem__(self,item) : 
        out = dict()
        if self.trg is None:
          out['encoder_feat'] = torch.tensor(self.feat[item] ,dtype = torch.float)
        else:

          out['target'] = torch.tensor(self.target[item],dtype = torch.float)
          out['encoder_feat'] = torch.tensor(self.feat[item] ,dtype = torch.float)
        return out  

Due to the Large number of features which can affect the performance of the LSTM network we use an autoencoder to compress the data and try to extract fewer features from the original ones. The Auto Encoder gives the privilege of non linearlity compared to the traditionnal PCA.

In [None]:
class TimeDistributed(nn.Module):
    def __init__(self, module, batch_first=False):
        super(TimeDistributed, self).__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
        if len(x.size()) <= 2:
            return self.module(x)
        x_reshape = x.contiguous().view(-1, x.size(-1))
        y = self.module(x_reshape)
        if self.batch_first:
            y = y.contiguous().view(x.size(0), -1, y.size(-1))
        else:
            y = y.view(-1, x.size(1), y.size(-1))
        return y
    
    
# AutoEncoder Class    
class LSTM_AE(nn.Module):
    def __init__(self,input_size):
        super(LSTM_AE, self).__init__()
        
        self.init_batchnorm = TimeDistributed(nn.BatchNorm1d(input_size, momentum=0.01),batch_first = True)
        self.activ = nn.Sequential(
            nn.Linear(32,40),
            nn.ReLU(inplace=True),
        )
        
        # encoder
        self.encoder_LSTM1 = nn.LSTM(
            input_size=input_size, hidden_size=25, batch_first=True, bidirectional=True
        )
        self.encoder_LSTM2 = nn.LSTM(
            input_size=50, hidden_size=16, batch_first=True, bidirectional=True
        )
        
        # decoder
        self.decoder_LSTM1 = nn.LSTM(
            input_size=40, hidden_size=35, batch_first=True, bidirectional=True
        )
        self.decoder_LSTM2 = nn.LSTM(
            input_size=70, hidden_size=input_size, batch_first=True, bidirectional=False
        )
    def forward(self, x):
        # encoding
        x = self.init_batchnorm(x)
        x_hat, _ = self.encoder_LSTM1(x)
        x_mapped, _ = self.encoder_LSTM2(x_hat)
        x_mapped = self.activ(x_mapped)
 
        # decoding
        x_hat, _ = self.decoder_LSTM1(x_mapped)
        x_hat, _ = self.decoder_LSTM2(x_hat)
        return x_hat,x_mapped 
    
#Base encoder with three LSTM layers    
class StackedLSTMs(nn.Module):
    """
    Computes an encoder based on LSTM layers
    """

    def __init__(self, input_size, add_feature_selection=False):
        super(StackedLSTMs, self).__init__()
        
        self.init_batchnorm = TimeDistributed(nn.BatchNorm1d(input_size, momentum=0.01),batch_first = True)
        self.LSTM1 = nn.LSTM(
            input_size=input_size, hidden_size=32, batch_first=True, bidirectional=True
        )
        self.LSTM2 = nn.LSTM(
            input_size=64, hidden_size=128, batch_first=True, bidirectional=True
        )
        self.LSTM3 = nn.LSTM(
            input_size=256, hidden_size=32, batch_first=True, bidirectional=True
        )
        
        self.batchnorm = TimeDistributed(nn.BatchNorm1d(64, momentum=0.01),batch_first = True)
        self.output_embedding = TimeDistributed(nn.Linear(64 , 32),batch_first = True)
        self.Relu = nn.ReLU(inplace =True)
    def forward(self, enc):
        enc = self.init_batchnorm(enc)
        x_hat, _ = self.LSTM1(enc)
        x_hat, _ = self.LSTM2(x_hat)
        x_hat, _ = self.LSTM3(x_hat)
        x_hat = self.batchnorm(x_hat)
        x_hat = self.output_embedding(x_hat)
        x_hat = self.Relu(x_hat)
        return x_hat

In [None]:
class FullModel(nn.Module):
    def __init__(self, params ):
        super(FullModel, self).__init__()
        
        self.AE= LSTM_AE(len(params["encoder_feat"]))
        self.encoder  = StackedLSTMs(40)
        self.fc = nn.Sequential(
            nn.Linear(32,16),
            nn.ReLU(inplace=True),
            nn.Linear(16, 5),
            nn.Sigmoid()
        )
    def forward(self, enc_feat):
        rec_feat,enc_feat = self.AE(enc_feat)
        x_hat = self.encoder(enc_feat)
        x_hat = self.fc(x_hat[:,-1,:])
        return rec_feat,x_hat


class TestModel():

    def __init__(self, params):
        self.name = "Test_Model"
        self.model = FullModel(params)
        self.model = self.model.to(params["device"])
        self.params = params

    def _create_dataset(self,y=None,dataset=None):
        return inst_dataset(y,
            dataset
        )

    def _model_trainer(self, model, train_dataset, valid_dataset, params):
        run(
            model,
            train_dataset,
            valid_dataset,
            params["lr"],
            params["epochs"],
            params["batch_size"],
            params["batch_size"],
            params["device"],
            params["save_path"],
            self.params["verbose"],
        )

    def train(self, train_dataset, test_dataset,y_train,y_test, params):
        train_dataset = self._create_dataset(y_train,train_dataset)
        valid_dataset = self._create_dataset(y_test,test_dataset)
        if self.params["verbose"]:
            print("Training Model 1 ...")
        self._model_trainer(self.model, train_dataset, valid_dataset, params)

    def eval(self):
        self.model.eval()

    def __call__(self, x, device):
        return self.model(x, device)

    def predict(self, dataset):
        test_dataset = self._create_dataset(None,dataset)
        pred = predict(self.model, test_dataset)
        return pred

##### Engine Class

In [None]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def train_fn(
    data_loader, model, optimizer, scaler, device, verbose, epoch ):
    """
    computes the model training for one epoch
    """
    
    model.train()
    tr_loss = 0
    counter = 0
    if verbose:
        losses = AverageMeter()
        tk0 = tqdm(enumerate(data_loader), total=len(data_loader))
    else:
        tk0 = enumerate(data_loader)
    for bi, d in tk0:
        targets = d["target"].to(device, dtype=torch.float)
        enc_feat   = d["encoder_feat"].to(device, dtype=torch.float)
        criterion= nn.BCEWithLogitsLoss(reduction = 'mean')
        optimizer.zero_grad()
        rec_feat,outputs = model(enc_feat)
        rec_loss = 0.00005*(torch.nn.L1Loss()(rec_feat,enc_feat))
        loss = criterion(outputs,targets) +rec_loss
        tr_loss += loss.item()
        counter += 1
        loss.backward()
        optimizer.step()
        if verbose:
            losses.update(loss.item(), targets.size(0))
            tk0.set_postfix(loss=losses.avg)
    return tr_loss / counter


def eval_fn(data_loader, model, device, verbose, epoch):
    """
    computes the model evaluation for one epoch
    """
    model.eval()
    fin_loss = 0
    counter = 0
    if verbose:
        losses = AverageMeter()
        tk0 = tqdm(enumerate(data_loader), total=len(data_loader))
    else:
        tk0 = enumerate(data_loader)
    with torch.no_grad():
        for bi, d in tk0:
            targets = d["target"].to(device, dtype=torch.float)
            enc_feat   = d["encoder_feat"].to(device, dtype=torch.float)
            criterion= nn.BCEWithLogitsLoss(reduction = 'mean')
            rec_feat,outputs = model(enc_feat)
            rec_loss = 0.00005*(torch.nn.L1Loss()(rec_feat,enc_feat))
            loss = criterion(outputs,targets) +rec_loss
            if verbose:
                losses.update(loss.item(), targets.size(0))
                tk0.set_postfix(loss=losses.avg)
            fin_loss += loss.item()
            counter += 1   
        return fin_loss / counter


def run(
    model,
    train_dataset,
    valid_dataset,
    lr,
    EPOCHS,
    TRAIN_BATCH_SIZE,
    VALID_BATCH_SIZE,
    device,
    path,
    verbose=True,
):
    """
    trains a given model for a given number of epochs and paramters
    """
    train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True,num_workers=4)
    
    valid_data_loader = DataLoader(dataset=valid_dataset, batch_size=VALID_BATCH_SIZE,num_workers=4, shuffle=False)
    
    optimizer = AdamW(model.parameters(), lr=lr,weight_decay=5e-2)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.2, patience=3, verbose=verbose
    )
    
    scaler = amp.GradScaler()
    train_loss = []
    val_loss = []
    best = 50000
    patience = 0
    for epoch in range(EPOCHS):
        if verbose:
            print(f"--------- Epoch {epoch} ---------")
        tr_loss = train_fn(
            train_data_loader,
            model,
            optimizer,
            scaler,
            device,
            verbose,
            epoch,       
        )
        train_loss.append(tr_loss)
        
        if verbose:
            print(f" train_loss  = {tr_loss}")
        val = eval_fn(
            valid_data_loader, model, device, verbose, epoch
        )
        
        val_loss.append(val)
        scheduler.step(val)
            
        if verbose:
            print(f" val_loss  = {val}")
        if val < best:
            best = val
            patience = 0
            torch.save(model.state_dict(), path)
        else:
            patience += 1
        if patience > 2:
            print(f"Eraly Stopping on Epoch {epoch}")
            print(f"Best Loss =  {best}")
            break
    model.load_state_dict(torch.load(path), strict=False)
    return val_loss, train_loss


def predict(model, dataset, device=torch.device("cuda"), is_diff=False):
    """
    computes the prediction a given model and data
    """
    model.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=64, num_workers=4, shuffle=False
    )
    losses = AverageMeter()
    rmse = AverageMeter()
    tk0 = tqdm(enumerate(data_loader), total=len(data_loader))
    with torch.no_grad():
        for bi, d in tk0:
            enc_feat   = d["encoder_feat"].to(device, dtype=torch.float)
            _,outputs = model(enc_feat)
            if bi == 0:
                out = outputs
            else:
                out = torch.cat([out, outputs], dim=0)
    return out.cpu().detach().numpy()

In [None]:
params = dict()
params['encoder_feat'] = encoder_feat
params['window_size'] = window_size
params['epochs'] = 100
params['lr'] = 9e-4
params['batch_size'] = 256
params['device'] = 'cuda'
params['verbose'] = True
params['save_path'] = 'test2.pth'

model = TestModel(params)  
model.train(train_data, test_data,train_labels,test_labels, params)

Training Model 1 ...
--------- Epoch 0 ---------


100%|██████████| 186/186 [00:06<00:00, 29.84it/s, loss=0.717]


 train_loss  = 0.7169101940047357


100%|██████████| 56/56 [00:00<00:00, 67.78it/s, loss=0.852]


 val_loss  = 0.8518812443528857
--------- Epoch 1 ---------


100%|██████████| 186/186 [00:05<00:00, 31.41it/s, loss=0.687]

 train_loss  = 0.6873326035596992



100%|██████████| 56/56 [00:00<00:00, 63.62it/s, loss=0.727]


 val_loss  = 0.7269473033291953
--------- Epoch 2 ---------


100%|██████████| 186/186 [00:05<00:00, 31.39it/s, loss=0.687]

 train_loss  = 0.6870953594484637



100%|██████████| 56/56 [00:00<00:00, 64.11it/s, loss=0.692]


 val_loss  = 0.6917321628757885
--------- Epoch 3 ---------


100%|██████████| 186/186 [00:05<00:00, 31.74it/s, loss=0.686]

 train_loss  = 0.6864696036102951



100%|██████████| 56/56 [00:00<00:00, 63.17it/s, loss=0.689]


 val_loss  = 0.6894866728356907
--------- Epoch 4 ---------


100%|██████████| 186/186 [00:05<00:00, 31.74it/s, loss=0.686]

 train_loss  = 0.6860294213858984



100%|██████████| 56/56 [00:00<00:00, 68.95it/s, loss=0.694]

 val_loss  = 0.6937541716865131
--------- Epoch 5 ---------



100%|██████████| 186/186 [00:05<00:00, 31.82it/s, loss=0.686]

 train_loss  = 0.6860337542590275



100%|██████████| 56/56 [00:00<00:00, 68.22it/s, loss=0.692]

 val_loss  = 0.6917337509138244
--------- Epoch 6 ---------



100%|██████████| 186/186 [00:05<00:00, 31.72it/s, loss=0.686]

 train_loss  = 0.6862333727780209



100%|██████████| 56/56 [00:00<00:00, 70.11it/s, loss=0.692]


 val_loss  = 0.6916808528559548
Eraly Stopping on Epoch 6
Best Loss =  0.6894866728356907


In [None]:
pred=model.predict(test_data)
for i in range(len(pred)):
  for j in range(5):
    if pred[i][j]>0.5:
      pred[i][j]=1
    else:
      pred[i][j]=-1

100%|██████████| 224/224 [00:01<00:00, 145.34it/s]


In [None]:
from sklearn.metrics import classification_report
for i in range(5):
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred[:,i],test_labels.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.97      0.46      0.63     12908
         1.0       0.15      0.88      0.26      1398

    accuracy                           0.50     14306
   macro avg       0.56      0.67      0.44     14306
weighted avg       0.89      0.50      0.59     14306



Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.98      0.51      0.67     12976
         1.0       0.15      0.87      0.26      1330

    accuracy                           0.55     14306
   macro avg       0.56      0.69      0.47     14306
weighted avg       0.90      0.55      0.63     14306



Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.99      0.52      0.69     13660
         1.0       0.08      0.89      0.15       646

    accuracy                           0.54     1430

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf=RandomForestClassifier(max_depth=100)

for i in range(5):
  rf.fit(X_train,y_train.iloc[:,i])
  pred=rf.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.74      0.74      0.74      5267
         1.0       0.81      0.80      0.81      7093

    accuracy                           0.78     12360
   macro avg       0.77      0.77      0.77     12360
weighted avg       0.78      0.78      0.78     12360



Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.73      0.77      0.75      5502
         1.0       0.81      0.77      0.79      6858

    accuracy                           0.77     12360
   macro avg       0.77      0.77      0.77     12360
weighted avg       0.77      0.77      0.77     12360



Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.59      0.78      0.67      4631
         1.0       0.83      0.68      0.75      7729

    accuracy                           0.71     1236

#### Extremely randomized trees (Extra trees forest)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
extra=ExtraTreesClassifier(max_depth=100)

for i in range(5):
  extra.fit(X_train,y_train.iloc[:,i])
  pred=extra.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.68      0.74      0.71      4903
         1.0       0.82      0.77      0.80      7457

    accuracy                           0.76     12360
   macro avg       0.75      0.76      0.75     12360
weighted avg       0.76      0.76      0.76     12360



Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.66      0.77      0.71      4995
         1.0       0.83      0.73      0.78      7365

    accuracy                           0.75     12360
   macro avg       0.75      0.75      0.75     12360
weighted avg       0.76      0.75      0.75     12360



Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.46      0.76      0.57      3693
         1.0       0.86      0.62      0.72      8667

    accuracy                           0.66     1236

#### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb=GradientBoostingClassifier(max_depth=100)

for i in range(5):
  gb.fit(X_train,y_train.iloc[:,i])
  pred=gb.predict(X_test)
  print("Classification report, forward_returns ",[1,2,5,10,20][i])
  print(classification_report(pred,y_test.iloc[:,i]))
  print("\n")

Classification report, forward_returns  1
              precision    recall  f1-score   support

        -1.0       0.67      0.66      0.67      5434
         1.0       0.74      0.75      0.74      6926

    accuracy                           0.71     12360
   macro avg       0.71      0.70      0.71     12360
weighted avg       0.71      0.71      0.71     12360



Classification report, forward_returns  2
              precision    recall  f1-score   support

        -1.0       0.61      0.61      0.61      5852
         1.0       0.65      0.65      0.65      6508

    accuracy                           0.63     12360
   macro avg       0.63      0.63      0.63     12360
weighted avg       0.63      0.63      0.63     12360



Classification report, forward_returns  5
              precision    recall  f1-score   support

        -1.0       0.45      0.59      0.51      4727
         1.0       0.69      0.56      0.62      7633

    accuracy                           0.57     1236

## Conclusions:

The easiest response to predict is the one with the highest accuracy for most of the models: 1-day returns which is totally predictable because the further we go into the future the more we have uncertainties and therefore it is harder to predict. 

Decision Trees based classifiers had the best performance overall, this may be due to their ability to perform feature selection and thus choose the best features that describe best the behaviour of the target variable and due to their ability to beat overfitting with the bagging techniques.

Sequence models based classifiers need more tunning to reach better performance as well as a feature selection mechanism to get rid of the useless features that may introduce noise and affect the performance of the model. Extra study is needed aswell to determin the right compression size for the different features comming out of the Autoencoder because a very low output dimension may lead to loss of information compared to the orginial features.

The 5 targets to predict may be combined to work as a slope prediction that, if predicted with a certain high accuracy, can give a very accurate idea about the trend.