In [1]:
import os
import sys

import feature_modeling.create_feature_objects as fo
import feature_modeling.feature_creation as fc 
import pandas as pd
import numpy as np

import warnings
# Suppress every warning for the rest of the session. This is a blanket filter,
# so deprecation and data-quality warnings are hidden too; remove it when debugging.
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Table and ticker identifiers for the instrument and its benchmark.
# NOTE(review): no other cell visible in this notebook reads input_dict — confirm before removing.
input_dict = {
    "table_name": "vix_etf_intraday",
    "ticker": "vxx",
    "bench_table_name": "vix_index_intraday",
    "bench_ticker": "vix",
}

# Settings shared by every feature configuration. Keeping them in one place
# guarantees the configurations below can only differ in their price files.
_shared_feat_settings = {
    "distribution": "normal",
    "lbk_long": 20,
    "lbk_short": 5,
    "time_frame": "5T",
    "roll_lbk": 20,
    "feature_distribution": "normal",
    "score_distribution": "normal",
    "pctile_distribution": "gaussian",
    "n_quantiles": 10,
}

# Configuration whose ticker prices come from the VXX file.
feat_dict = {
    **_shared_feat_settings,
    "ticker_file_path": "data/prices/VXX_updated_2020.csv",
    "benchmark_file_path": "data/prices/SPY_2020.csv",
}

# Configuration whose ticker prices come from the SPY file (same benchmark file).
feat_dict_2 = {
    **_shared_feat_settings,
    "ticker_file_path": "data/prices/SPY_2020.csv",
    "benchmark_file_path": "data/prices/SPY_2020.csv",
}

In [3]:
# Build one pair of objects per configuration. fo.run returns a pair that is
# unpacked as (feat_obj, prc_obj): feat_obj is used below for the raw features,
# prc_obj for the ratio features and percentile scores.
feat_obj, prc_obj = fo.run(feat_dict)
feat_obj_2, prc_obj_2 = fo.run(feat_dict_2)

# Selected raw features: on_volumes, daily_volumes, gaps, ocr, oc^2r
# Selected processed features: 5/20d on_volumes, 5/20d daily volumes, 5d/20d rv, 5d/20d hlr
# Several of these are shifted by one row with add_lag later in the notebook.

In [4]:
# Raw features.
# Time bounds handed to the volume generators below.
start_time = '9:15:00'
on_start_time = '9:14:00'
end_time = '16:00:00'

# Volume features from the first feature object.
on_volumes = feat_obj._on_volumes(on_start_time, end_time)
daily_volumes = feat_obj._daily_volumes(start_time, end_time)  # needs to be lagged

# The same on_volumes feature from the second feature object.
on_volumes_spy = feat_obj_2._on_volumes(on_start_time, end_time)

# Price-based features from the first feature object.
daily_gaps = feat_obj.gen_gaps()
daily_ocr, daily_oc_2_r = feat_obj.gen_ocr_oc2_r()  # needs to be lagged
daily_ibs = feat_obj.gen_ibs()  # needs to be lagged
daily_hl_range = feat_obj.gen_hl_range()

In [5]:
# Processed (ratio) features.
on_vol_ratio = prc_obj.gen_feature_ratio("on_volumes", on_start_time, end_time)

# Ratio from the second object; its column is renamed straight away so it stays
# distinguishable from the first object's "on_volume_ratio" column.
on_volumes_ratio_spy = prc_obj_2.gen_feature_ratio(
    "on_volumes", on_start_time, end_time
).rename(columns={"on_volume_ratio": "spy_on_vol_ratio"})

daily_vol_ratio = prc_obj.gen_feature_ratio("daily_volumes", start_time, end_time)
rv_ratio = prc_obj.gen_feature_ratio("gen_daily_rv")

# Give the second object's raw on_volume column its own name as well.
on_volumes_spy = on_volumes_spy.rename(columns={"on_volume": "spy_on_volume"})

## Add VVIX feature

In [6]:
# Load the VVIX series and standardise it against its own rolling window.
vvix_file_path = "data/prices/VVIX.csv"
vvix_df = pd.read_csv(vvix_file_path)

# One rolling window object supplies both the mean and the standard deviation.
vvix_rolling = vvix_df["VVIX"].rolling(window=feat_dict["roll_lbk"])
col_mean = vvix_rolling.mean()
col_std = vvix_rolling.std()

# Rolling z-score: (value - rolling mean) / rolling std.
vvix_df["vvix_z_score"] = vvix_df["VVIX"].sub(col_mean).div(col_std)
vvix_df = vvix_df.set_index("date")
vvix_z_score = vvix_df["vvix_z_score"].to_frame()

## Add contango/backwardation features

In [7]:
# Load the VIX product closes and build the two ratios.
vix_file_path = "data/prices/vix_products.csv"
vix_df = pd.read_csv(vix_file_path)
vix_df["vix_ratio1"] = vix_df["vix_close"].div(vix_df["vix9_close"])
vix_df["vix_ratio2"] = vix_df["vx1_close"].div(vix_df["vix_close"])

# Rolling z-score of ratio 1, computed while the frame still has its default index.
ratio1_rolling = vix_df["vix_ratio1"].rolling(window=feat_dict["roll_lbk"])
col_mean_1 = ratio1_rolling.mean()
col_std_1 = ratio1_rolling.std()
vix_df["vix_ratio1_z_score"] = vix_df["vix_ratio1"].sub(col_mean_1).div(col_std_1)
vix_df = vix_df.set_index("date")

# Only the raw ratio is exported as a feature; the z-score stays in vix_df.
# vix_ratio1_z_score = vix_df["vix_ratio1_z_score"].to_frame()
vix_ratio1 = vix_df["vix_ratio1"].to_frame()

# Rolling z-score of ratio 2, computed after the frame was re-indexed by date.
ratio2_rolling = vix_df["vix_ratio2"].rolling(window=feat_dict["roll_lbk"])
col_mean_2 = ratio2_rolling.mean()
col_std_2 = ratio2_rolling.std()
vix_df["vix_ratio2_z_score"] = vix_df["vix_ratio2"].sub(col_mean_2).div(col_std_2)
# vix_ratio2_z_score = vix_df["vix_ratio2_z_score"].to_frame()
vix_ratio2 = vix_df["vix_ratio2"].to_frame()


## Add AR Delta and Turbulence Ratios

In [8]:
# ar_delta_path = "/Users/sujitkhanna/Desktop/Talos/venv/rawdata/US/Stocks/daily/s&p50_denoised_ar_delta.csv"
# turbulance_path = "/Users/sujitkhanna/Desktop/Talos/venv/rawdata/US/Stocks/daily/s&p50_denoised_turbulance.csv"


# ard_df = pd.read_csv(ar_delta_path)
# turb_df = pd.read_csv(turbulance_path)
# ard_df["date"] = pd.to_datetime(ard_df["date"]).dt.date
# turb_df["date"] = pd.to_datetime(turb_df["date"]).dt.date
# ard_df = ard_df.set_index("date")
# turb_df = turb_df.set_index("date")
# ar_delta = ard_df["denoised_ar_delta"].to_frame()
# turbulance = turb_df["turbulance"].to_frame()
# turbulance_exp = turb_df["turbulance_exp"].to_frame()



In [9]:
# lagged daily volumes
def add_lag(df, lag_val):
    """Lag ``df`` by ``lag_val`` rows while keeping the later index labels.

    Row ``i`` of the result carries the values of row ``i - lag_val`` of ``df``
    under the index label of row ``i``. The first ``lag_val`` labels are
    dropped, so the result has ``len(df) - lag_val`` rows (zero rows when
    ``lag_val`` is at least ``len(df)``).

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to lag. It is not modified.
    lag_val : int
        Number of rows to lag by. Must be non-negative.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A new object of the same type as ``df``.

    Raises
    ------
    ValueError
        If ``lag_val`` is negative.
    """
    if lag_val < 0:
        raise ValueError(f"lag_val must be non-negative, got {lag_val}")
    if lag_val == 0:
        # A slice ending at -0 is empty, so a zero lag needs its own path.
        return df.copy()
    # Explicit positional slicing; copy so re-labelling never touches a view of df.
    lagged = df.iloc[:-lag_val].copy()
    lagged.index = df.index[lag_val:]
    return lagged
# Apply a one-row lag to each of these features.
new_daily_volumes = add_lag(daily_volumes, 1)
new_daily_ocr = add_lag(daily_ocr, 1)
new_daily_oc_2_r = add_lag(daily_oc_2_r, 1)
new_daily_ibs = add_lag(daily_ibs, 1)
new_daily_vol_ratio = add_lag(daily_vol_ratio, 1)
new_rv_ratio = add_lag(rv_ratio, 1)
new_vvix_z_score = add_lag(vvix_z_score, 1)
new_vix_ratio1 = add_lag(vix_ratio1, 1)
new_vix_ratio2 = add_lag(vix_ratio2, 1)
# The high-low range is lagged and then converted with to_frame().
new_daily_hl_range = add_lag(daily_hl_range, 1).to_frame()

In [10]:

# Move the index into a column, name that column "date", and index by it again.
new_daily_hl_range = (
    new_daily_hl_range
    .reset_index()
    .rename(columns={"index": "date"})
    .set_index("date")
)
new_daily_hl_range.head()

Unnamed: 0_level_0,hl_range_norm
date,Unnamed: 1_level_1
2009-02-03,0.039238
2009-02-04,0.047481
2009-02-05,0.03604
2009-02-06,0.055858
2009-02-09,0.040808


In [11]:
def merge_iterator(df_list, score_fn=None, verbose=True):
    """Score every frame and outer-merge them all on the ``date`` column.

    Each frame is passed through ``score_fn``, its index is moved into columns
    with ``reset_index()``, and its ``date`` column is parsed with
    ``pd.to_datetime``. The frames are then merged one at a time with an outer
    join on ``date``; after every merge the whole result is forward-filled.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        Frames to combine. Each must expose a ``date`` column after scoring
        and ``reset_index()``.
    score_fn : callable, optional
        Function applied to each frame before merging. Defaults to the
        notebook-level ``prc_obj.gen_percentile_scores``.
    verbose : bool, default True
        When true, print the zero-based position of each merge step.

    Returns
    -------
    pandas.DataFrame
        Merged frame with ``date`` as a regular column.

    Raises
    ------
    ValueError
        If ``df_list`` is empty.
    """
    if len(df_list) == 0:
        raise ValueError("df_list must contain at least one DataFrame")
    if score_fn is None:
        # Looked up at call time so the global is only needed for the default path.
        score_fn = prc_obj.gen_percentile_scores

    def _scored(frame):
        # Score one frame and expose a datetime "date" column to merge on.
        scored = score_fn(frame).reset_index()
        scored["date"] = pd.to_datetime(scored["date"])
        return scored

    merged = _scored(df_list[0])
    for step, frame in enumerate(df_list[1:]):
        if verbose:
            print(step)
        # Outer join keeps dates present in either side; ffill closes the gaps.
        merged = pd.merge(merged, _scored(frame), on="date", how="outer").ffill()
    return merged

def create_bins(df, bins, verbose=True):
    """Discretise every column of ``df`` with ``pd.cut``.

    Each column is cut independently with ``pd.cut(..., bins=bins,
    labels=False)``, so the result holds integer bin codes (NaN where the
    input is missing) rather than interval labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are binned.
    bins : int or sequence
        Passed straight to ``pd.cut``.
    verbose : bool, default True
        When true, print each column name as it is processed.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and column names as ``df``. A frame with no
        columns yields an empty frame that still carries ``df``'s index.
    """
    binned = {}
    for name, col in df.items():
        if verbose:
            print(name)
        binned[name] = pd.cut(col.to_numpy().ravel(), bins=bins, labels=False)
    # Build the frame in one step; passing the index here also covers the
    # zero-column case, where assigning an index afterwards would fail.
    return pd.DataFrame(binned, index=df.index)

# Features to combine, in the order they are merged (and the column order of the result).
feat_df_list = [
    on_volumes,
    daily_gaps,
    new_daily_volumes,
    new_daily_ocr,
    new_daily_oc_2_r,
    new_daily_hl_range,
    on_vol_ratio,
    new_daily_vol_ratio,
    new_rv_ratio,
    new_daily_ibs,
    new_vvix_z_score,
    new_vix_ratio1,
    new_vix_ratio2,
    on_volumes_spy,
    on_volumes_ratio_spy,
]

# Merge everything, index the result by date, and order it chronologically.
stacked_feat_df = (
    merge_iterator(feat_df_list)
    .set_index(["date"])
    .sort_index(ascending=True)
)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [12]:
# Discretise every merged feature column into 5 bins; the bin count is reflected
# in the "_5bins" suffix of the file written later in the notebook.
bin_df = create_bins(stacked_feat_df, bins=5)

on_volume
gaps
volume
ocr
oc2r
hl_range_norm
on_volume_ratio
volume_ratio
gen_daily_rv_ratio
ibs
vvix_z_score
vix_ratio1
vix_ratio2
spy_on_volume
spy_on_vol_ratio


In [13]:
# Display the last 100 rows of the binned features.
bin_df.tail(100)
# Important note: In Meta-Labels remove days when there are no trades made in the market

Unnamed: 0_level_0,on_volume,gaps,volume,ocr,oc2r,hl_range_norm,on_volume_ratio,volume_ratio,gen_daily_rv_ratio,ibs,vvix_z_score,vix_ratio1,vix_ratio2,spy_on_volume,spy_on_vol_ratio
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-03-25,0.0,2,1.0,2,0,2,2.0,2.0,2.0,2,1.0,2.0,1.0,1,4
2020-03-26,0.0,2,0.0,3,2,2,2.0,2.0,2.0,2,0.0,2.0,2.0,0,4
2020-03-27,0.0,2,1.0,1,3,2,4.0,4.0,3.0,0,0.0,3.0,1.0,2,3
2020-03-30,1.0,2,0.0,3,2,1,4.0,4.0,2.0,3,0.0,4.0,2.0,2,4
2020-03-31,0.0,2,1.0,1,2,1,4.0,3.0,3.0,1,0.0,4.0,2.0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-10,1.0,2,0.0,2,2,2,4.0,4.0,3.0,2,0.0,2.0,2.0,2,4
2020-08-11,1.0,0,0.0,2,2,2,3.0,4.0,2.0,2,0.0,2.0,2.0,2,3
2020-08-12,2.0,0,1.0,3,2,3,2.0,3.0,2.0,3,0.0,2.0,2.0,2,2
2020-08-13,3.0,4,1.0,2,2,2,2.0,4.0,1.0,2,0.0,2.0,2.0,2,2


## Concatenating bet-sizing features to individual strategies

In [15]:
# Strategies whose meta-labels get the feature columns attached.
strategy_list = ["VOL_CARRY", "BO_SLOPE", "MULTI_PIVOT_BO", "IDXOG"]

# Loop-invariant settings, defined once instead of on every iteration.
label_thresh = 0.004  # threshold passed to fc.gen_cust_meta_labels
strategy_dir = "data/strategy_files"
meta_label_dir = "data/meta_labels"

# to_csv does not create missing directories; make sure the target exists
# before any per-strategy work is done.
os.makedirs(meta_label_dir, exist_ok=True)

for strategy in strategy_list:
    strategy_file_path = f"{strategy_dir}/{strategy}_full_regime_strategy.csv"
    strategy_df = pd.read_csv(strategy_file_path)
    # Drop the "Unnamed: 0" column when present (presumably a leftover index
    # column from an earlier to_csv — verify); a file without it is accepted too.
    strategy_df = strategy_df.drop(columns=["Unnamed: 0"], errors="ignore")
    strategy_df["datetime"] = pd.to_datetime(strategy_df["datetime"], format="%Y-%m-%d %H:%M:%S")
    strategy_meta_df = fc.gen_cust_meta_labels(strategy_df, "daily", label_thresh)

    # Left joins on the index keep every meta-label row, matched or not.
    # Continuous feature values:
    full_df = strategy_meta_df.join(stacked_feat_df, how='left')
    full_df.to_csv(f"{meta_label_dir}/{strategy}_meta_labels.csv")

    # Binned feature values:
    full_bin_df = strategy_meta_df.join(bin_df, how='left')
    full_bin_df.to_csv(f"{meta_label_dir}/{strategy}_meta_labels_5bins.csv")

In [None]:
# full_bin_df =  strategy_meta_df.join(bin_df, how='left')
# full_bin_df.to_csv(f"{save_path}meta_labels/bins/{strategy}_meta_labels_5bins_2020.csv")