In [12]:
import os 
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [13]:
from plotly.offline import iplot
import cufflinks as cf
# Switch cufflinks to offline mode so its DataFrame plotting helpers render
# locally through plotly.offline instead of requiring an online Plotly session.
# NOTE(review): no plotting call is visible in this part of the notebook;
# confirm `iplot`/`cf` are still needed before keeping this cell.
cf.go_offline()

In [14]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by pandas / keras / sklearn in the cells below are hidden as well.
# Consider narrowing it to the specific category that was noisy.
warnings.filterwarnings('ignore')

In [15]:
# Row labels of the score table, formatted as '<source>_<kind>_<scope>'.
# Built in four groups so the ordering is explicit:
#   1. sto  vol/rea  x  all/twt
#   2. sto  si1..si3 for 'all', then for 'twt'
#   3. stw  vol      x  dcl/brt
#   4. stw  si1..si3 for 'dcl', then for 'brt'
index = (
    ['_'.join(('sto', kind, scope)) for kind in ('vol', 'rea') for scope in ('all', 'twt')]
    + ['_'.join(('sto', si, scope)) for scope in ('all', 'twt') for si in ('si1', 'si2', 'si3')]
    + ['_'.join(('stw', 'vol', scope)) for scope in ('dcl', 'brt')]
    + ['_'.join(('stw', si, scope)) for scope in ('dcl', 'brt') for si in ('si1', 'si2', 'si3')]
)

In [43]:
# --- Configuration -----------------------------------------------------------
# Input folders. These are absolute paths on the original author's machine;
# change them here (and only here) to run the notebook elsewhere.
PRED_DIR = 'C:/Users/Comarch/Desktop/miko_work/pred'
PRICES_DIR = 'C:/Users/Comarch/Desktop/miko_work/prices'
ROLL_DIR = 'C:/Users/Comarch/Desktop/miko_work/roll'
# Rows dated strictly before SPLIT_DATE are used for fitting, the rest for scoring.
SPLIT_DATE = pd.to_datetime('2019-05-01')
# Number of consecutive predictor rows flattened into one regression sample.
WINDOW = 3


def _split_at(frame, cutoff=SPLIT_DATE):
    """Return ``(rows with index < cutoff, rows with index >= cutoff)``."""
    return frame[frame.index < cutoff], frame[frame.index >= cutoff]


def _build_xy(data, prices, roll, window=WINDOW):
    """Build the regression design matrix and target vector for one split.

    Parameters
    ----------
    data, prices, roll : pandas.DataFrame
        Predictor, target and rolling-feature frames for the same split.
        They are expected to have the same number of rows; otherwise
        TimeseriesGenerator / np.append raise.
    window : int
        Number of consecutive `data` rows per sample.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        `x` has one row per sample: the `window` rows of `data` flattened,
        followed by one row of `roll`. `y` is the flattened target array.
    """
    # A single batch as large as the frame returns every sample at once.
    # Per the Keras docs each sample is `window` consecutive rows paired with
    # the target row that follows them.
    generator = TimeseriesGenerator(data.values, prices.values, window, batch_size=len(data))
    x, y = generator[0]
    x = x.reshape(-1, x.shape[1] * x.shape[2])
    # Drop the first `window - 1` rows and the last row of `roll` so that it has
    # exactly len(data) - window rows, i.e. one per generated sample.
    x = np.append(x, roll.iloc[window - 1:-1].values, axis=1)
    return x, y.reshape(-1)


# --- Score every prediction file ----------------------------------------------
# One row per label in `index`, one column per symbol. Cells start at 0, so a
# combination for which no file is processed keeps the value 0.
sc = pd.DataFrame(np.zeros((len(index), 4)), index=index, columns=['btc', 'eth', 'aapl', 'spx'])

# sorted() makes the processing (and printing) order deterministic;
# os.listdir gives no ordering guarantee.
for file in sorted(os.listdir(PRED_DIR)):
    # Expected name: pred_<source>_<type1>_<type2>_<symbol>.csv
    parts = file.split('.')[0].split('_')
    if len(parts) != 5:
        # Stray entries (e.g. desktop.ini, Thumbs.db, sub-folders) used to
        # abort the whole run with a tuple-unpacking ValueError.
        continue
    _, source, type1, type2, symbol = parts

    if 'fbp' in (type1, type2):
        continue

    # Row label in `sc`: for 'si*' files the last two name parts are swapped.
    if type2.startswith('si'):
        new = source + '_' + type2 + '_' + type1
    else:
        new = source + '_' + type1 + '_' + type2

    data = pd.read_csv(PRED_DIR + '/' + file, index_col=0, parse_dates=True)
    prices = pd.read_csv(PRICES_DIR + '/' + symbol + '_prices.csv', header=None, index_col=0, parse_dates=True)
    # The regression target is the log of the price level.
    # NOTE(review): the previous version also built a one-step-shifted copy of
    # the prices that was never used; if log *returns* were the intended target
    # this line needs to change -- confirm with the author.
    prices = np.log(prices)
    roll = pd.read_csv(ROLL_DIR + '/' + symbol + '_roll.csv', index_col=0, parse_dates=True)

    data_train, data_test = _split_at(data)
    prices_train, prices_test = _split_at(prices)
    roll_train, roll_test = _split_at(roll)

    x_train, y_train = _build_xy(data_train, prices_train, roll_train)
    x_test, y_test = _build_xy(data_test, prices_test, roll_test)

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    model = LinearRegression()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    r2 = r2_score(y_test, pred)
    sc.loc[new, symbol] = r2
    print(file, str(r2))

pred_sto_all_si1_aapl.csv 0.8372228625213046
pred_sto_all_si1_btc.csv -0.644204938822285
pred_sto_all_si1_eth.csv 0.7324482894677774
pred_sto_all_si1_spx.csv 0.8615685289187118
pred_sto_all_si2_aapl.csv 0.843665265445337
pred_sto_all_si2_btc.csv -0.6468824926486214
pred_sto_all_si2_eth.csv 0.6913087691037898
pred_sto_all_si2_spx.csv 0.861513487780736
pred_sto_all_si3_aapl.csv 0.8485017066511704
pred_sto_all_si3_btc.csv -0.5520774532803918
pred_sto_all_si3_eth.csv 0.735593754392116
pred_sto_all_si3_spx.csv 0.8624118236860284
pred_sto_rea_all_aapl.csv 0.83522313917588
pred_sto_rea_all_btc.csv -0.6647879221957667
pred_sto_rea_all_eth.csv 0.39802411849954655
pred_sto_rea_all_spx.csv 0.8637169127980613
pred_sto_rea_twt_aapl.csv 0.8352092420890393
pred_sto_rea_twt_btc.csv -0.8149869296277947
pred_sto_rea_twt_eth.csv 0.6124053834106651
pred_sto_rea_twt_spx.csv 0.8432212255657165
pred_sto_twt_si1_aapl.csv 0.8434471031873025
pred_sto_twt_si1_btc.csv -0.5394932891201274
pred_sto_twt_si1_eth.csv 

In [41]:
# Display the score table filled by the loop over the prediction files.
# NOTE(review): the output saved with this cell does not match the R2 values
# printed by that loop (e.g. sto_si1_all / aapl is 0.026 here vs 0.837 there)
# and it lists only the 'sto_*' rows, so it appears to come from an earlier
# kernel state. Restart the kernel and run all cells to refresh it.
sc

Unnamed: 0,btc,eth,aapl,spx
sto_vol_all,0.005951,0.02452,0.013792,0.026772
sto_vol_twt,0.007042,0.01295,0.005955,0.022143
sto_rea_all,0.010679,0.015662,0.054885,0.015032
sto_rea_twt,0.013854,0.020412,0.047457,0.012343
sto_si1_all,0.014072,0.013285,0.026455,0.016744
sto_si2_all,0.014392,0.010303,0.027434,0.014912
sto_si3_all,0.01536,0.012912,0.033608,0.019392
sto_si1_twt,0.018247,0.00872,0.010192,0.023275
sto_si2_twt,0.017385,0.00802,0.012078,0.023139
sto_si3_twt,0.026507,0.011587,0.068782,0.025114


In [13]:
# Per-row mean of the absolute values across every column currently in `sc`,
# stored as a new 'avg' column.
sc['avg'] = sc.abs().mean(axis=1)

In [14]:
# Display the score table again, now including the 'avg' column.
# NOTE(review): the values saved with this cell (roughly 0.42-0.75) differ from
# the R2 values printed by the scoring loop, so this output presumably belongs
# to a different run or metric -- re-run from a fresh kernel to confirm.
sc

Unnamed: 0,btc,eth,aapl,spx,avg
sto_vol_all,0.634868,0.746154,0.474026,0.459052,0.578525
sto_vol_twt,0.569079,0.57641,0.538961,0.44181,0.531565
sto_rea_all,0.542763,0.567692,0.481602,0.56681,0.539717
sto_rea_twt,0.502193,0.618462,0.458874,0.685345,0.566218
sto_si1_all,0.625548,0.557436,0.477273,0.538793,0.549762
sto_si2_all,0.633772,0.522564,0.52381,0.5625,0.560661
sto_si3_all,0.593202,0.522051,0.455628,0.534483,0.526341
sto_si1_twt,0.589912,0.587179,0.49026,0.518319,0.546418
sto_si2_twt,0.592105,0.594359,0.506494,0.516164,0.55228
sto_si3_twt,0.595943,0.534872,0.420996,0.512931,0.516185


In [15]:
# Show the table ordered by the 'avg' column, smallest first. This returns a
# sorted copy; `sc` itself is left unchanged.
sc.sort_values(by='avg', ascending=True)

Unnamed: 0,btc,eth,aapl,spx,avg
stw_vol_brt,0.524123,0.502051,0.480519,0.548491,0.513796
sto_si3_twt,0.595943,0.534872,0.420996,0.512931,0.516185
stw_vol_dcl,0.534539,0.515385,0.464286,0.560345,0.518639
sto_si3_all,0.593202,0.522051,0.455628,0.534483,0.526341
sto_vol_twt,0.569079,0.57641,0.538961,0.44181,0.531565
sto_rea_all,0.542763,0.567692,0.481602,0.56681,0.539717
sto_si1_twt,0.589912,0.587179,0.49026,0.518319,0.546418
stw_si2_brt,0.597039,0.557949,0.429654,0.602371,0.546753
stw_si2_dcl,0.594846,0.532308,0.433983,0.633621,0.548689
sto_si1_all,0.625548,0.557436,0.477273,0.538793,0.549762
