In [1]:
#%% 
import pandas as pd
import numpy as np
import time
import os
import warnings
import pickle
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression, PLSSVD

from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression #F-value between label/feature for regression tasks.

from statsmodels.tsa.api import VAR
from xgboost import XGBRegressor
pd.options.mode.chained_assignment = None  # default='warn'
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from error_metrics_2020 import *

In [3]:
# Load the raw table from CSV; the 'NDX' column is passed as the forecast
# target in the driver cell further down.
df = pd.read_csv('nasdaq100_padding.csv')
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(40560, 82)

In [4]:
def get_data_split(data_ready, lookback, test_length):
    """Cut the tail of ``data_ready`` into rolling (train window, next row) pairs.

    Only the last ``test_length + lookback`` rows are considered. Pair ``k`` is
    made of ``lookback`` consecutive rows starting at offset ``k`` of that tail
    and the single row that immediately follows them, so ``test_length`` pairs
    are produced when the frame holds at least ``test_length + lookback`` rows.

    Parameters
    ----------
    data_ready : pandas.DataFrame
        Frame to slice; it is only read, never modified.
    lookback : int
        Number of rows in every training window.
    test_length : int
        Number of one-row test slices requested.

    Returns
    -------
    tuple of (list, list)
        ``trains`` and ``tests``, two equally long lists of DataFrame slices.
    """
    tail = data_ready.iloc[-test_length-lookback:, :]
    # One pair per row that still has a full window in front of it.
    n_pairs = max(tail.shape[0] - lookback, 0)

    trains = []
    tests = []
    for first in range(n_pairs):
        after = first + lookback
        trains.append(tail.iloc[first:after])
        tests.append(tail.iloc[after:after + 1])

    print('Split Done. Train Start')
    return trains, tests

In [5]:
def get_lag_diff(raw_data, order, target, exo=True):
    """Turn a level table into first differences plus their lags.

    The result keeps the ``target`` column and adds:

    * ``y_pch``       - first difference of ``target``;
    * ``y_pch_true``  - ``y_pch`` shifted one row back, i.e. the next row's
      difference;
    * ``y_pch_<k>``   - ``y_pch`` lagged by ``k`` rows, for ``k = 1 .. order-1``;
    * when ``exo`` is true, ``<col>_pch`` (first difference) and ``<col>_<k>``
      (that difference lagged by ``k``) for every other input column.

    The original non-target columns are dropped, and so is every row that
    contains a NaN. The input frame is not modified.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input table containing ``target`` and any other columns to difference.
    order : int
        ``order - 1`` lags are generated.
    target : str
        Name of the column whose difference is the dependent variable.
    exo : bool, default True
        Whether to build difference/lag features for the non-target columns.

    Returns
    -------
    pandas.DataFrame
    """
    frame = raw_data.copy()
    # Names of every column except the target, captured before features are added.
    other_cols = frame.columns[frame.columns != target]

    frame['y_pch'] = frame[target].diff()
    frame['y_pch_true'] = frame['y_pch'].shift(-1)

    if exo:
        frame[other_cols + '_pch'] = frame[other_cols].diff()

    # Lag k of the target difference, followed (when requested) by lag k of
    # every exogenous difference - the same column order for both settings.
    for lag in range(1, order):
        frame['y_pch_' + str(lag)] = frame['y_pch'].shift(lag)
        if exo:
            frame[other_cols + ('_' + str(lag))] = frame[other_cols + '_pch'].shift(lag)

    # Drop the raw level series but keep the target: it is needed later to
    # rebuild prices from the differences.
    frame = frame.drop(columns=other_cols).dropna()
    print('Lag %d Diff Generated' %order)
    return frame

In [6]:
def get_moving_pred(raw_data, order, test_length, target, exo=True):
    """Rolling one-step forecast of ``target`` for several models and look-backs.

    For every model in the internal ``models`` dict and every look-back length,
    the differenced data is cut into rolling windows with ``get_data_split``.
    For each window the grid-searched model is refitted and used to predict one
    difference, which is added to the last price of the window to recover a
    price. Error metrics come from ``evaluate``; the look-back with the highest
    ``'mda'`` score is kept as the best run of that model.

    Side effects (all files are pickles written to the working directory):

    * ``<model>_best.txt``       - ``[true_price, pred_price, lookback, selected_params]``
      of the best look-back so far;
    * ``<model>_<lookback>.txt`` - predicted prices of every run;
    * ``df_err_model.txt``       - the accumulated error table;
    * ``pca_num.txt``            - number of PCA components chosen per window.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Level data containing ``target``; it is copied, not modified.
    order : int
        Forwarded to ``get_lag_diff``.
    test_length : int
        Number of one-step forecasts per run, forwarded to ``get_data_split``.
    target : str
        Name of the column to forecast.
    exo : bool, default True
        Forwarded to ``get_lag_diff``.

    Returns
    -------
    tuple
        ``(true_price, df_err_model)``: the true prices of the last run that was
        executed, and a DataFrame with one row of error metrics per
        (model, look-back) pair.

    Notes
    -----
    ``evaluate`` and ``mse`` are not defined in this notebook; presumably they
    come from ``from error_metrics_2020 import *``. ``evaluate`` is used as if it
    returned a dict of metric name -> value.

    NOTE(review): possible look-ahead. ``y_pch_true`` of the last training row
    is part of ``Y_train`` and is also the value ``true_price`` is built from;
    the PCA branch additionally picks ``n_components`` by its error against that
    same value. Confirm this is the intended evaluation protocol before relying
    on the reported scores.

    NOTE(review): the PCA/PLS component grids go up to 30/20 components; with
    ``exo=False`` and a small ``order`` there may be fewer feature columns than
    that - verify before using those settings together.
    """
    X = raw_data.copy()

    # Generate the differences and their lags. `order` and `exo` are forwarded
    # so the caller's settings take effect (previously a hard-coded order of 1
    # with exogenous columns always enabled was used instead).
    data = get_lag_diff(X, order, target, exo)

    # hyper-parameter grids
    params_ols = {'fit_intercept': [True, False]}
    params_rl = {'alpha':[0.01, 0.05, 0.1, 0.3, 0.5, 1, 5, 10, 20, 100, 200, 300]}
    params_dim = {'n_components': [2,3,5,10,20]}
    params_rfr = {'n_estimators':[5,10,20],
               'max_depth':[5,10,20],
               'min_samples_leaf' :[1,5,10],
               'max_leaf_nodes' :[5,10, 15]
               }
    params_xgr = {'n_estimators': [5, 10, 20],
              'min_child_weight': [10, 50, 100],
              'gamma': [0, 0.5, 1, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]
              }
    # The `iid` keyword is no longer passed: 'deprecated' was its default value
    # and scikit-learn >= 0.24 rejects the argument altogether.
    models = {
            'PCA': GridSearchCV(LinearRegression(),
                               param_grid=params_ols, cv=5),
            'XGR': GridSearchCV(XGBRegressor(objective='reg:squarederror'),
                                param_grid=params_xgr, cv=3),
            'RFR': GridSearchCV(RandomForestRegressor(),
                               param_grid=params_rfr, cv=3),
            'OLS': GridSearchCV(LinearRegression(),
                               param_grid=params_ols, cv=5),
            'LR': GridSearchCV(Lasso(),
                               param_grid=params_rl, cv=5),
            'RR': GridSearchCV(Ridge(),
                               param_grid=params_rl, cv=5),
            'PLS': GridSearchCV(PLSRegression(),
                               param_grid=params_dim, cv=5)
            }

    df_err_model = pd.DataFrame([])
    err_frames = []  # one single-row frame per (model, lookback) run
    lookback1 = [30, 50, 100, 500, 1000, 3000, 6000, 10000, 12000, 15000]
    lookback2 = [500, 1000, 3000, 6000, 10000, 12000, 15000]
    pca_num = np.array([])

    for m in models:

        # 'mda' is maximised, so start below any attainable score
        best_score = -100000
        print('Model %s Start' %m)
        pred_price_best = np.array([])
        # the tree ensembles skip the shortest windows
        if m =='XGR' or m == 'RFR':
            lookback = lookback2
        else:
            lookback = lookback1

        for n in lookback:

            # split the data with different lookback n
            trains, tests = get_data_split(data_ready = data, lookback = n, test_length = test_length)
            print('Lookback : %d' %n)

            pred_price = np.array([])
            true_price = np.array([])
            # defined up front so an empty split cannot leave it unbound below
            selected_params = None

            for i in range(len(trains)):

                iteration_start = time.monotonic()
                X_train = trains[i].drop(columns=['y_pch_true', target])
                X_test = tests[i].drop(columns=['y_pch_true', target])
                Y_train = trains[i]['y_pch_true']

                if m == 'PLS':
                    X_train_mean = X_train.mean(axis=0)
                    X_train = X_train - X_train_mean  # centered
                    # Center the test row with the training mean. Plain arrays
                    # are used because multi-dimensional indexing of a pandas
                    # Series is not supported by current pandas.
                    X_test = X_test.values - X_train_mean.values
                    model_fit = models[m].fit(X_train, Y_train)
                    selected_params = model_fit.best_params_
                    best_model = model_fit.best_estimator_
                    # predict() projects X_test itself; no manual x_loadings_ product needed
                    pred_diff = best_model.predict(X_test)

                elif m == 'PCA':
                    X_train_mean = X_train.mean(axis=0)
                    X_train = X_train - X_train_mean  # centered
                    X_test = X_test.values - X_train_mean.values  # centered with the training mean

                    # select best n_component for each train set
                    pca_min_score = 100000000

                    for j in [2,3,5,10,20,30]:
                        # a single fit_transform replaces the former fit + fit_transform pair
                        pca_fit = PCA(n_components=j)
                        train_scores = pca_fit.fit_transform(X_train)
                        model_fit = models[m].fit(train_scores, Y_train)  # OLS on the selected PCs
                        selected_params = model_fit.best_params_
                        best_model = model_fit.best_estimator_
                        # project the centered test row onto the loadings
                        pred_diff_pca = best_model.predict(np.matmul(X_test, np.transpose(pca_fit.components_)))
                        # NOTE(review): the candidate is scored against the same value
                        # that is later used as the truth for this step - see docstring.
                        pca_score = mse(trains[i]['y_pch_true'].iloc[-1:].values, pred_diff_pca)

                        # update whenever mse gets smaller
                        if pca_score < pca_min_score:
                            pca_min_score = pca_score
                            pred_diff = pred_diff_pca
                            best_j = j
                    pca_num = np.append(pca_num, best_j)

                else:
                    model_fit = models[m].fit(X_train, Y_train)
                    selected_params = model_fit.best_params_
                    best_model = model_fit.best_estimator_
                    # keep the features whose importance is at least the median,
                    # then refit the best estimator on that subset only
                    X_new = SelectFromModel(estimator = best_model, threshold='median').fit(X_train,Y_train)
                    best_model.fit(X_new.transform(X_train),Y_train)
                    selected_col = X_train.columns[X_new.get_support()]
                    pred_diff = best_model.predict(X_test[selected_col].values)

                # recover price using predicted difference after feature selection
                # NOTE(review): both prices are anchored on the last *training* row - see docstring.
                true_price = np.append(true_price, trains[i][target].iloc[-1:].values+ trains[i]['y_pch_true'].iloc[-1:].values)
                pred_price = np.append(pred_price, trains[i][target].iloc[-1:].values + pred_diff)

                iteration_end = time.monotonic()
                if(i % 500 == 1):
                    print('{:.2f}%'.format(i/(len(trains)+1)*100))
                    print("Iter time of %s: " %m, iteration_end - iteration_start)
                    print(selected_params)
            err_metric = evaluate(true_price, pred_price)
            score = err_metric['mda']

            if best_score < score:

                print("Saving... Score: %f for lookback: %d " %(score, n))
                fname = str(m)+'_best.txt'
                f = [true_price, pred_price, n, selected_params]
                with open(fname,"wb") as fp:
                    pickle.dump(f,fp)
                pred_price_best = pred_price
                best_score = score
                print(pred_price_best)

            # save every model with every lookback
            fname2 = str(m)+'_'+str(n)+'.txt'
            with open(fname2, "wb") as fp:
                pickle.dump(pred_price,fp)

            # one-row frame: metric names as columns, plus the run identifiers
            err_row = pd.DataFrame(list(err_metric.items()))
            err_row = err_row.transpose()
            err_row.columns = err_row.iloc[0]
            err_row = err_row.drop(err_row.index[[0]]).astype(float) # change to float to use round(3)
            err_row['lookback']=str(n)
            err_row['model']=str(m)
            # DataFrame.append no longer exists in pandas 2; rebuild the table
            # from the collected rows instead.
            err_frames.append(err_row)
            df_err_model = pd.concat(err_frames, ignore_index=True)
            print(df_err_model)
            fname3 = 'df_err_model.txt'
            with open(fname3,"wb") as fp:
                pickle.dump(df_err_model,fp)

        fname4 = 'pca_num.txt'
        with open(fname4,"wb") as fp:
            pickle.dump(pca_num,fp)
        print(pred_price_best.shape)
        evaluate(true_price, pred_price_best)

    return  true_price, df_err_model

# Test length of 2730

In [None]:
#%%
# Run the full rolling experiment on the NDX column; the last 2730 rows of
# the differenced data are forecast one step at a time.
true_price, df_err_model = get_moving_pred(
    raw_data=df,
    order=1,
    test_length=2730,
    target='NDX',
)

Lag 1 Diff Generated
Model PCA Start
Split Done. Train Start
Lookback : 30
0.04%
Iter time of PCA:  0.20300000000861473
{'fit_intercept': False}
18.34%
Iter time of PCA:  0.125
{'fit_intercept': True}
36.65%
Iter time of PCA:  0.125
{'fit_intercept': True}
54.96%
Iter time of PCA:  0.10999999998603016
{'fit_intercept': False}
73.27%
Iter time of PCA:  0.10899999999674037
{'fit_intercept': True}
91.58%
Iter time of PCA:  0.10899999999674037
{'fit_intercept': True}
Saving... Score: 0.850861 for lookback: 30 
[4950.35837264 4950.23894766 4949.42980709 ... 4921.64178004 4921.81989718
 4921.86063599]
0      rmse       mda      mpda     mnda       mpp       mnp lookback model
0  0.935479  0.850861  0.852128  0.84964  0.845185  0.856418       30   PCA
Split Done. Train Start
Lookback : 50
0.04%
Iter time of PCA:  0.125
{'fit_intercept': False}
18.34%
Iter time of PCA:  0.125
{'fit_intercept': True}
36.65%
Iter time of PCA:  0.125
{'fit_intercept': False}
54.96%
Iter time of PCA:  0.1400000000

In [None]:
# Reload the best PCA run written by get_moving_pred:
# [true_price, pred_price, lookback, selected_params].
# The file used to be opened and unpickled twice in a row; once is enough.
# NOTE: pickle.load can execute arbitrary code - only load files produced locally.
with open('PCA_best.txt', "rb") as fp:
    PCA_best = pickle.load(fp)

In [None]:
# Display the loaded list: [true_price, pred_price, lookback, selected_params].
PCA_best

In [None]:
# Overlay the true and the predicted price of the best PCA run. Labels, a
# legend and a title are set so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(PCA_best[0], label='true price')
ax.plot(PCA_best[1], label='predicted price (PCA)')
ax.set(title='Best PCA run: true vs. predicted price',
       xlabel='test step',
       ylabel='price')
ax.legend();

In [None]:
# Recompute the error metrics of the best PCA run from the reloaded
# true (index 0) and predicted (index 1) price arrays.
evaluate(PCA_best[0], PCA_best[1])