In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import arch
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor

# Trying to replicate the results using simple OLS:
import statsmodels.api as sm
from statsmodels.tsa.api import acf, graphics, pacf
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tools import add_constant
from datetime import timedelta

# Data

* Import the data;
* Check the patterns in the dataset:
    * Are there any NaN, $\infty$, or negative values?
    * If there are, then why?
* Divide the data into 2 parts:
    * Regressand (y_init);
    * Regressors Matrix (X_init).
* Create a copy of X_init and y_init, call them X and y, respectively;
    * Always operate on the copies so that the originals stay intact;
    * This also ensures that re-running the code works.
* Check Stationarity of explanatory variables:
    * Import ADF library;
    * If non-stationary and positive $\rightarrow$ take the log difference and rename it with the prefix "dl_";
    * If non-stationary and non-positive $\rightarrow$ take the first difference and rename it with the prefix "d_";
    * else:
        * Pass.
* Confirm stationarity of the transformed explanatory variables:
    * Use KPSS:
    * Take the first difference if not stationary;
    * Use KPSS on the transformed dataset and print non-stationary if any.
* Drop NaN values and even out the indices for both y and X.

In [2]:
from IPython.display import display, HTML

# Load the workbook; its unnamed first column becomes the "Date" index.
data = (
    pd.read_excel("./data_full.xlsx")
    .rename(columns={"Unnamed: 0": "Date"})
    .set_index("Date")
)
# Replace the index with a DatetimeIndex carrying the frequency inferred from the dates.
idx = pd.DatetimeIndex(data.index.values, freq=data.index.inferred_freq)
data = data.set_index(idx)

# Sanity checks: number of NaN entries, number of infinite entries,
# and the columns that contain at least one negative value.
nan_total = data.isna().values.sum()
inf_total = np.isinf(data).values.sum()
negative_mask = (data < 0).sum() > 0
print(f'NaN values:{nan_total}')
print(f'Infinity values: {inf_total}')
print(data.columns[negative_mask])

# Context for the columns with negative values listed above.
print('It is normal to have negative returns')
print("News why oil prices went negative in 2020:")
display(HTML("""<a href="https://google.at">Follow this link</a>"""))

NaN values:0
Infinity values: 0
Index(['CL=F', 'returns'], dtype='object')
It is normal to have negative returns
News why oil prices went negative in 2020:


In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
# Split the dataset into the target and the candidate regressors.
# Regressand (y_init): the "returns" column, kept as a one-column DataFrame.
y_init = data.loc[:, ["returns"]]
# Regressors (X_init): every column except the target and the rv_* columns.
X_init = data.drop(columns=["returns", "rv_d", "rv_w", "rv_m"])

# Min-max scaled copy of the regressors with the same row and column labels as X_init.
X_init_cop = pd.DataFrame(
    MinMaxScaler().fit_transform(X_init),
    index=X_init.index,
    columns=X_init.columns,
)

In [5]:
# Work on copies of X_init and y_init (named X and y) so the originals stay
# untouched and this cell can be re-run safely.
X = X_init.copy()
y = y_init.copy()

# Make the regressors stationary, column by column, using the ADF test:
#   * ADF p-value > 0.05 and strictly positive column -> log difference, renamed "dl_<name>";
#   * ADF p-value > 0.05 and any zero/negative value  -> first difference, renamed "d_<name>";
#   * otherwise                                        -> column is left unchanged.
from arch.unitroot import ADF

# Loop-invariant: which regressors are strictly positive. Columns containing a
# zero are excluded as well, because log(x_t / x_{t-1}) would then be +/-inf or NaN.
strictly_positive = (X_init > 0).all()

for col in X_init.columns:
    # The test is evaluated once per column, on the min-max scaled copy.
    adf_pvalue = ADF(X_init_cop[col]).pvalue
    if adf_pvalue > 0.05:
        if strictly_positive[col]:
            X[f'dl_{col}'] = np.log(X[col] / X[col].shift(1))
        else:
            X[f'd_{col}'] = X[col] - X[col].shift(1)
        # The transformed series replaces the level series (it is appended at the end).
        X = X.drop(columns=[col])
# X.columns

In [6]:
# Confirm stationarity of the transformed explanatory variables with KPSS.
# A p-value below 0.05 is treated as "not stationary":
#   1. first-difference every column that is flagged;
#   2. re-run KPSS on the result and print any column that is still flagged.
import warnings

from arch.unitroot import KPSS


def _kpss_pvalue(series):
    """Return the KPSS p-value for ``series`` computed on its non-missing values.

    The differencing steps use ``shift(1)`` and therefore leave NaN at the
    start of each transformed column; those are dropped here rather than
    relying on how KPSS treats missing values. If KPSS with its default lag
    selection raises, the test is retried with ``lags=1`` and a warning is
    emitted so the fallback is visible.
    """
    observed = series.dropna()
    try:
        return KPSS(observed).pvalue
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        warnings.warn(
            f"KPSS with default lags failed for {series.name!r} ({exc!r}); "
            "retrying with lags=1"
        )
        return KPSS(observed, lags=1).pvalue


# Iterate over a snapshot of the labels because X is modified inside the loop.
for col in list(X.columns):
    if _kpss_pvalue(X[col]) < 0.05:
        X[f'd_{col}'] = X[col] - X[col].shift(1)
        X = X.drop(columns=[col])

# Second pass: report anything KPSS still flags after differencing.
for col in X.columns:
    if _kpss_pvalue(X[col]) < 0.05:
        print(col)
# X.columns

In [7]:
# Remove every row of X that has a missing value, then keep the matching rows of y.
X = X.dropna(axis=0, how="any")
y = y.loc[X.index]

# Remaining NaN counts: X first, then y.
for frame in (X, y):
    print(frame.isna().values.sum())
# Optional standardisation (currently disabled):
# X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)

0
0


In [8]:
# Assemble the final dataset: target first, then the transformed regressors.
data_stationary = pd.concat([y, X], axis=1)

# Append the rv_* columns from the original data, restricted to the retained rows.
for rv_col in ("rv_d", "rv_m", "rv_w"):
    data_stationary[rv_col] = data.loc[X.index, rv_col]

# Write the result to a CSV file in the working directory.
data_stationary.to_csv("data_stationary.csv")

In [9]:
# Total number of infinite entries across the whole exported dataset.
np.isinf(data_stationary).values.sum()

0

In [10]:
# Per-column means of the exported dataset.
# DataFrame.mean() is used instead of np.mean(df): NumPy forwards axis=None, which
# pandas >= 2.0 interprets as a reduction over the whole frame (a single scalar).
data_stationary.mean()

returns        4.997432e-03
TRFUS          1.244410e+06
dl_AVBLS       2.022214e-04
dl_BLCHS       6.271717e-04
dl_CPTRA      -5.343820e-04
dl_DIFF        1.747508e-03
dl_ETRVU       2.963655e-04
dl_HRATE       1.377054e-03
dl_MIREV      -7.889167e-04
dl_MKPRU       4.304207e-05
dl_MKTCP       1.268907e-04
dl_MWNTD      -1.026064e-04
dl_MWNUS       8.432564e-04
dl_MWTRV       3.957391e-04
dl_NADDU      -2.004177e-04
dl_NTRAN      -2.735811e-04
dl_NTRAT       5.856124e-04
dl_NTRBL       4.424806e-05
dl_NTREP      -2.470000e-04
dl_TOTBC       7.897464e-05
dl_TOUTV       8.282527e-04
dl_FFER        6.502313e-05
dl_BTC=F       4.218762e-05
d_CL=F         3.191230e-02
dl_CNYUSD=X   -8.093881e-06
dl_NDAQ        4.467480e-04
dl_SPY         2.358402e-04
dl_^DJI        1.138575e-04
dl_tweet       1.865472e-04
d_ATRCT        9.744214e-04
d_CPTRV       -5.713371e-04
d_ETRAV        7.545745e+01
d_TRFEE       -3.442848e-01
d_TRVOU       -1.018302e+06
d_XRP-USD     -2.783349e-04
d_^GVZ         6.303