# G-Research Crypto Forecasting | Linear Regression

### Author
Jose Manuel Rodriguez Caballero (Caballero Software Inc.)

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)
import gresearch_crypto
from pandas import read_csv, to_datetime
from sklearn.linear_model import LinearRegression
from numpy import abs, log, array, mean

In [None]:
def correct(t):
    """Return ``t`` when its absolute value is below 1, otherwise 0.

    Anything that fails the ``abs(t) < 1`` test (including NaN, for which
    every comparison is false) is replaced by 0.
    """
    if abs(t) < 1:
        return t
    return 0

In [None]:
# Directory that train.csv is read from.
DATA_FOLDER = "../input/g-research-crypto-forecasting/"

# Asset_ID values 0 .. NUMBER_ASSETS - 1 are each handled separately.
NUMBER_ASSETS = 14

# Upper bound on the number of training rows kept per asset.
SAMPLE = 100_000

In [None]:
# Load only the columns used below, drop rows with any missing value, and
# index the frame by a datetime built from the epoch-seconds `timestamp`.
df_train = read_csv(
    DATA_FOLDER + 'train.csv',
    usecols=['Target', 'Asset_ID', 'timestamp', 'Low', 'High'],
    dtype={'Asset_ID': 'int8'},
)
df_train = df_train.dropna()
df_train['datetime'] = to_datetime(df_train['timestamp'], unit='s')
df_train = df_train.set_index('datetime').drop(columns='timestamp')

# From here on `df_train` is a dict: asset id -> that asset's first SAMPLE rows
# (Asset_ID column removed). `drop(columns=...)` is used instead of a
# positional axis argument, which recent pandas releases no longer accept.
df_train = {
    asset_id: (
        df_train[df_train['Asset_ID'] == asset_id]
        .drop(columns='Asset_ID')
        .iloc[0:SAMPLE]
    )
    for asset_id in range(NUMBER_ASSETS)
}

In [None]:
# Fit one LinearRegression per asset and store a predictor callable for each.
#
# For row k of an asset the two features are
#   mean over t = 1..14 of log(Low[k + t] / Low[k])
#   mean over t = 1..14 of log(High[k + t] / High[k])
# and the label is Target[k]. Rows k = 0 .. len - 16 are used.
pred_close = []
for j in range(NUMBER_ASSETS):
    X_low = df_train[j]["Low"].to_numpy()
    X_high = df_train[j]["High"].to_numpy()
    Y = df_train[j][["Target"]].to_numpy()

    n_rows = len(X_high) - 15
    if n_rows <= 0:
        raise ValueError(
            f"asset {j} has too few rows ({len(X_high)}) to build 15-step features"
        )

    # Each slice X[t:t + n_rows] lines row k up with row k + t, so the whole
    # column of features is computed with array operations instead of a
    # Python loop over rows.
    X = array([
        mean([log(X_low[t:t + n_rows] / X_low[:n_rows]) for t in range(1, 15)], axis=0),
        mean([log(X_high[t:t + n_rows] / X_high[:n_rows]) for t in range(1, 15)], axis=0),
    ]).T  # shape (n_rows, 2): one row per sample, columns = (low, high)
    Y = Y[:n_rows]

    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)

    # Bind the fitted model as a default argument. A plain closure would look
    # up `linear_regressor` when called, i.e. after the loop has finished, so
    # every entry of `pred_close` would use the model fitted last.
    pred_close.append(
        lambda x, model=linear_regressor: model.predict(array(x))[0][0]
    )

In [None]:
# Drop the training arrays and frames; only the fitted predictors are needed
# from here on.
del X, Y, X_low, X_high, df_train

In [None]:
# Create the competition environment and its test iterator. The prediction
# loop further down unpacks every item of `iter_test` as a pair
# (test rows, prediction frame) and hands the filled-in frame back through
# `env.predict`.
# NOTE(review): presumably make_env() may only be called once per session --
# confirm against the gresearch_crypto documentation.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

In [None]:
# Per-asset sliding windows holding the 15 most recent High / Low values.
# NOTE(review): the windows start filled with the placeholder 1, so until an
# asset has supplied 15 valid rows the ratios below are partly taken against
# that placeholder rather than a real price.
T_high = [[1] * 15 for _ in range(NUMBER_ASSETS)]
T_low = [[1] * 15 for _ in range(NUMBER_ASSETS)]

for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        newFeature_high = row["High"]
        newFeature_low = row["Low"]
        row_mask = df_pred['row_id'] == row['row_id']

        # NaN is the only value that differs from itself: rows with a missing
        # High or Low get a prediction of 0 and do not enter the windows.
        if newFeature_high != newFeature_high or newFeature_low != newFeature_low:
            df_pred.loc[row_mask, 'Target'] = 0
            continue

        i = int(row["Asset_ID"])

        # Slide this asset's windows: drop the oldest value, append the newest.
        T_high[i] = T_high[i][1:15] + [newFeature_high]
        T_low[i] = T_low[i][1:15] + [newFeature_low]

        # Same (low, high) feature layout as used for fitting, computed from
        # the windows of the asset this row belongs to (index `i`).
        feature = array([[
            mean([log(T_low[i][t] / T_low[i][0]) for t in range(1, 15)]),
            mean([log(T_high[i][t] / T_high[i][0]) for t in range(1, 15)]),
        ]])

        # The model output is negated and then passed through `correct`
        # before being written to the prediction frame.
        df_pred.loc[row_mask, 'Target'] = correct(-pred_close[i](feature))

    env.predict(df_pred)