# Imports and Function Definitions

In [51]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn_extensions.kernel_regression import KernelRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [5]:
def tree_feat_importance(m, df):
    """Rank the columns of ``df`` by the importances stored on a fitted model.

    Parameters
    ----------
    m : object exposing ``feature_importances_`` (one value per column of ``df``).
    df : object exposing ``columns`` (e.g. the training DataFrame).

    Returns
    -------
    pandas.DataFrame
        Columns ``cols`` and ``feat_imp``, sorted by descending ``feat_imp``.
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'feat_imp': m.feature_importances_})
    return importance_table.sort_values(by='feat_imp', ascending=False)
def ensemble_tree_feat_importance(m, df):
    """Rank the columns of ``df`` by the unweighted mean importance across ``m.estimators_``.

    Parameters
    ----------
    m : fitted ensemble exposing ``estimators_``; each member must expose
        ``feature_importances_`` (one value per column of ``df``).
    df : object exposing ``columns`` (e.g. the training DataFrame).

    Returns
    -------
    pandas.DataFrame
        Columns ``cols`` and ``feat_imp``, sorted by descending ``feat_imp``.
    """
    per_estimator = [member.feature_importances_ for member in m.estimators_]
    # Element-wise average over the ensemble members (axis 0 = members).
    averaged = np.mean(per_estimator, axis=0)
    ranking = pd.DataFrame({'cols': df.columns, 'feat_imp': averaged})
    return ranking.sort_values(by='feat_imp', ascending=False)
def plot_fi(fi):
    """Plot ``feat_imp`` against ``cols`` as horizontal bars and return whatever ``fi.plot`` returns."""
    return fi.plot(x='cols', y='feat_imp', kind='barh', figsize=(15, 20), legend=False)
def xgb_feat_importance(m, df):
    """Rank the columns of ``df`` by ``m.feature_importances_``.

    Returns a DataFrame with columns ``cols`` and ``feat_imp``, highest
    importance first.
    """
    table = pd.DataFrame(dict(cols=df.columns, feat_imp=m.feature_importances_))
    return table.sort_values('feat_imp', ascending=False)

In [52]:
# A bunch of nonparametric regression models + some parametric baselines
def _search_spaces():
    """Map each model code to ``(estimator factory, hyper-parameter grid)``.

    The factories are zero-argument callables, so only the estimator that is
    actually requested gets instantiated.
    """
    return {
        'GB': (lambda: GradientBoostingRegressor(), {
            'learning_rate': [1e-3],  # disabled candidates: 1e-5, 1e-7
            'n_estimators': [10],     # disabled candidates: 50, 100, 1000
        }),
        'RF': (lambda: RandomForestRegressor(n_jobs=25), {
            'max_depth': [3, 5, 7, 10],
            'n_estimators': [10, 50, 100, 1000],
        }),
        'AB': (lambda: AdaBoostRegressor(), {
            'learning_rate': [1e-4, 1e-2, 1],
            'n_estimators': [10, 50, 100, 1000],
        }),
        'KR': (lambda: KernelRidge(), {
            'alpha': [1e-2, 1e-1, 1, 10, 100],
            'kernel': ["poly", "sigmoid", "rbf", "cosine"],
        }),
        'R': (lambda: Ridge(), {
            'alpha': [1e-2, 1e-1, 1, 10, 100],
        }),
        'LA': (lambda: Lasso(), {
            'alpha': [1e-2, 1e-1, 1, 10, 100],
        }),
        'EN': (lambda: ElasticNet(), {
            'alpha': [1e-2, 1e-1, 1, 10, 100],
            'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95],
        }),
        # For the kernel regression, gamma and kernel are picked by the grid search.
        'NW': (lambda: KernelRegression(), {
            'gamma': [1e-2, 1e-1, 1, 10, 100],
            'kernel': ["rbf", "poly"],
        }),
        'KN': (lambda: KNeighborsRegressor(), {
            'n_neighbors': [5, 10, 40, 100],
            'weights': ["uniform", "distance"],
        }),
    }


def model_selection(ml):
    """Grid-search one regressor on the training split and score it on the test split.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; they must be defined before this function is called.

    Parameters
    ----------
    ml : str
        Model code: one of 'GB', 'RF', 'AB', 'KR', 'R', 'LA', 'EN', 'NW', 'KN'.

    Returns
    -------
    tuple
        ``(model_best, y_test, y_pred)`` where ``model_best`` is the estimator
        with the best grid-search parameters fitted on the whole training split
        and ``y_pred`` are its predictions for ``X_test``.

    Raises
    ------
    ValueError
        If ``ml`` is not a known model code.
    """
    print(ml)
    spaces = _search_spaces()
    if ml not in spaces:
        # Previously an unknown code fell through every branch and surfaced
        # as an UnboundLocalError on `model_best`.
        raise ValueError(
            "Unknown model code %r; expected one of %s" % (ml, sorted(spaces))
        )

    make_estimator, grid = spaces[ml]
    # NOTE(review): the search uses GridSearchCV's default cross-validation
    # splitter; confirm that is intended for data whose train/test split is
    # unshuffled.
    grid_search = GridSearchCV(estimator=make_estimator(), param_grid=grid,
                               scoring='neg_mean_squared_error', n_jobs=25)
    grid_result = grid_search.fit(X_train, y_train)

    # With the default refit=True the search has already refitted the best
    # parameter combination on all of (X_train, y_train), so fitting a fresh
    # copy by hand would only repeat that work.
    model_best = grid_result.best_estimator_
    print(model_best)

    y_pred = model_best.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # MAE% expresses the MAE relative to the mean of the test target.
    print('MAE: ', mae,
          'MAE%: ', 100 * mae / np.mean(y_test),
          '\nMSE: ', mse,
          '\nRMSE: ', np.sqrt(mse))

    return (model_best, y_test, y_pred)

# Cleaning and Merging

In [6]:
# One CSV per instrument; the file names indicate 5-minute BID candlesticks
# covering 11.02.2020-11.02.2023.
# The USO, SPY, VXX and XOM exports are currently not loaded.
fut, gas, btc, bond = (
    pd.read_csv(csv_path)
    for csv_path in (
        'LIGHT.CMDUSD_Candlestick_5_M_BID_11.02.2020-11.02.2023.csv',
        'GAS.CMDUSD_Candlestick_5_M_BID_11.02.2020-11.02.2023.csv',
        'BTCUSD_Candlestick_5_M_BID_11.02.2020-11.02.2023.csv',
        'USTBOND.TRUSD_Candlestick_5_M_BID_11.02.2020-11.02.2023.csv',
    )
)

In [7]:
# Loaded frames keyed by the short instrument name that later becomes the
# column suffix. USO, SPY, VXX and XOM are left out.
data_dict = dict(fut=fut, gas=gas, btc=btc, bond=bond)

Datetime parsing and indexing: every series is truncated to its first 1,000 rows and indexed by the `fut` timestamps.

In [8]:
# Timestamps of the first 1000 `fut` rows, parsed to datetimes.
idx = pd.to_datetime(data_dict['fut']['Local time'].head(1000))

In [9]:
# Keep the first 1000 rows of every instrument, index them by `idx`, and drop
# the listed columns.
# NOTE(review): every frame receives the `fut` timestamps by position rather than
# its own 'Local time' values -- confirm the CSVs are row-aligned.
unused_columns = ['Open', 'High', 'Low', 'Volume', 'Local time']
for data, frame in list(data_dict.items()):
    data_dict[data] = frame[:1000].set_index(idx).drop(columns=unused_columns)

Merging the per-instrument frames into a single DataFrame on their shared index.

In [10]:
# Start from an empty frame on the `fut` index and attach each instrument in turn.
df = pd.DataFrame(index=data_dict['fut'].index)
for data, frame in data_dict.items():
    df = df.merge(frame, left_index=True, right_index=True, suffixes=('', '_' + data))
# The left-hand suffix is empty, so the first frame merged (`fut`) keeps the bare
# 'Close' name; give it an explicit instrument suffix like the others.
df = df.rename(columns={'Close': 'Close_fut'})

In [12]:
# `day` is a second name for the same DataFrame object as `df` (no copy is made),
# so any column added through `day` also appears on `df`.
day = df

In [13]:
# Rolling standard deviation of the futures close over a window of 5 rows;
# the first 4 rows have no full window and are therefore NaN.
day['fut_'+str(5)+'_std'] = day['Close_fut'].rolling(window=5).std()

In [14]:
# Features: the gas, BTC and bond closes. Target: the 5-row rolling std of the
# futures close. Both skip the first 77 rows by position.
X = day.iloc[77:][['Close_gas', "Close_btc", "Close_bond"]]
Y = day.iloc[77:]['fut_5_std']

In [15]:
# Hold out 20% of the rows as the test set; shuffle=False keeps the rows in their
# original order, so the test set is the tail of the data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

## Initial Model testing

In [53]:
# Fit and evaluate each listed model code; `res` maps the code to the tuple
# returned by model_selection().
res = dict.fromkeys(["KN"])
for model in list(res):
    r = model_selection(model)
    res[model] = r

KN
KNeighborsRegressor(n_neighbors=100)
MAE:  0.02333569799748211 MAE%:  56.33718570490614 
MSE:  0.0008688225200020373 
RMSE:  0.02947579549396483
