## Load the training set

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20,7)

# import gresearch_crypto

In [None]:
# Directory holding the competition CSV files. File paths are built from it by
# plain string concatenation, so the trailing slash is required.
data_folder = "../input/g-research-crypto-forecasting/"
# !ls $data_folder

In [None]:
# Full training table; the per-asset helpers filter this module-level frame by
# its "Asset_ID" column and index the result by "timestamp".
crypto_df = pd.read_csv(data_folder + 'train.csv')

In [None]:
# Asset metadata table. Its first two columns are read positionally below; the
# 'Asset_ID' and 'Asset_Name' columns are looked up by name by other helpers.
asset_details = pd.read_csv(data_folder + 'asset_details.csv')

# Map each first-column value to the second-column value of the same row
# (presumably Asset_ID -> weight; confirm against the CSV header).
dict_weights = dict(zip(asset_details.iloc[:, 0].to_numpy(),
                        asset_details.iloc[:, 1].to_numpy()))

# Weights laid out by asset id 0..13.
weights = np.array([dict_weights[asset_id] for asset_id in range(14)])

In [None]:
# Ticker symbol for each asset name. The keys are matched against the
# 'Asset_Name' values read from asset_details.
asset_symbols = {
    'Bitcoin Cash': 'BCH',
    'Binance Coin': 'BNB',
    'Bitcoin': 'BTC',
    'EOS.IO': 'EOS',
    'Ethereum Classic': 'ETC',
    'Ethereum': 'ETH',
    'Litecoin': 'LTC',
    'Monero': 'XMR',
    'TRON': 'TRX',
    'Stellar': 'XLM',
    'Cardano': 'ADA',
    'IOTA': 'IOTA',
    'Maker': 'MKR',
    'Dogecoin': 'DOGE',
}
    

# auxiliary function, from a "dd/mm/YYYY" date string to an epoch timestamp
def totimestamp(s):
    """Convert a ``"dd/mm/YYYY"`` date string to epoch seconds at 00:00 UTC.

    Args:
        s: Date string in day/month/year form, e.g. ``"01/03/2021"``.

    Returns:
        The timestamp as ``np.int32``.

    Raises:
        ValueError: If ``s`` does not match the ``%d/%m/%Y`` format.
    """
    import calendar  # local import keeps this helper self-contained

    parsed = datetime.strptime(s, "%d/%m/%Y")
    # timegm() reads the struct_time as UTC. The previous time.mktime() call
    # read it in the machine's local timezone, so the same date string gave
    # different timestamps depending on the TZ setting.
    # NOTE(review): assumes the data's timestamps are UTC epoch seconds -- confirm.
    return np.int32(calendar.timegm(parsed.timetuple()))

def get_asset_dict():
    """Return ``{ticker symbol: frame}`` for asset ids 0..13.

    Each frame holds the rows of ``crypto_df`` for one asset, indexed by
    "timestamp". The symbol is found by looking the asset's name up in
    ``asset_details`` and translating it through ``asset_symbols``.
    """
    frames_by_symbol = {}
    for asset_id in range(14):
        asset_rows = crypto_df[crypto_df["Asset_ID"] == asset_id].set_index("timestamp")
        name_matches = asset_details.loc[asset_details['Asset_ID'] == asset_id, 'Asset_Name']
        ticker = asset_symbols[name_matches.iloc[0]]
        frames_by_symbol[ticker] = asset_rows
    return frames_by_symbol

def get_asset_dict_by_id():
    """Return ``{asset id: frame}`` for asset ids 0..13.

    Each frame holds the rows of ``crypto_df`` whose "Asset_ID" equals the key,
    indexed by "timestamp". An id with no rows maps to an empty frame.
    """
    # The asset-name lookup that used to run here was never used; dropping it
    # removes a needless dependency on asset_details.
    return {
        asset_id: crypto_df[crypto_df["Asset_ID"] == asset_id].set_index("timestamp")
        for asset_id in range(14)
    }

def find_missing_data(symbol, df_dict=None):
    """Count the gaps between consecutive index values of one asset frame.

    Args:
        symbol: Key of the asset in ``df_dict``.
        df_dict: Mapping from key to a frame whose index holds the timestamps.
            When omitted, a module-level ``df_dict`` is used, as before.

    Returns:
        A pandas Series (not a DataFrame) mapping each observed index
        difference to how often it occurs.

    Raises:
        NameError: If ``df_dict`` is neither passed nor defined at module level.
        KeyError: If ``symbol`` is not a key of ``df_dict``.
    """
    if df_dict is None:
        # Legacy call style: the mapping was read from a global of the same name.
        try:
            df_dict = globals()["df_dict"]
        except KeyError:
            raise NameError("name 'df_dict' is not defined") from None

    stamps = df_dict[symbol].index
    return (stamps[1:] - stamps[:-1]).value_counts()

def handle_missing_data(df_dict):
    """Drop incomplete rows and put every frame on a regular 60-step index.

    For each entry, rows containing NaN are removed and the remainder is
    reindexed onto ``range(first, last + 60, 60)``, with missing steps filled
    from the previous row (``method='pad'``).

    Args:
        df_dict: Mapping from key to a frame with a sorted integer index.
            It is updated in place.

    Returns:
        The same ``df_dict`` object, for call chaining.
    """
    for symbol, asset in df_dict.items():
        cleaned = asset.dropna()
        if cleaned.empty:
            # Nothing to anchor the grid on: keep the empty frame instead of
            # failing on index[0].
            df_dict[symbol] = cleaned
            continue
        # Reindex the cleaned frame. Reindexing `asset` itself (as the code
        # used to) silently discarded the dropna() above.
        grid = range(cleaned.index[0], cleaned.index[-1] + 60, 60)
        df_dict[symbol] = cleaned.reindex(grid, method='pad')
    return df_dict

def handle_asset_missing_data(asset_df):
    """Return a copy of one asset frame without NaN rows, on a 60-step index.

    Rows containing NaN are dropped, then the frame is reindexed onto
    ``range(first, last + 60, 60)``; steps absent from the data take the
    values of the previous row.
    """
    complete_rows = asset_df.dropna()
    first_stamp = complete_rows.index[0]
    last_stamp = complete_rows.index[-1]
    minute_grid = range(first_stamp, last_stamp + 60, 60)
    return complete_rows.reindex(minute_grid, method='pad')

def view_charts(df_dict):
    """Plot the 'Close' column of every frame in its own panel of a 7x2 grid.

    Panels are filled left to right, top to bottom, in the iteration order of
    ``df_dict``; each panel's legend shows the entry's key.
    """
    _, panels = plt.subplots(7, 2, figsize=(15,10))
    for position, (symbol, asset) in enumerate(df_dict.items()):
        panel = panels[position // 2, position % 2]
        panel.plot(asset['Close'])
        panel.legend([symbol], loc='upper center')
    plt.show()

def log_return(series, periods=1):
    """Return ``log(x[t]) - log(x[t - periods])`` for every row of ``series``.

    The first ``periods`` entries have no earlier value to compare with and
    come back as NaN.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

def show_colleration_map():
    """Draw a heat map of the correlations between the assets' log returns.

    For every asset listed in ``asset_details``, the rows of ``crypto_df``
    between 01/01/2021 and 01/05/2021 (dd/mm/YYYY) are put on a 60-step index
    and reduced to one-step log returns of 'Close'. The per-asset return
    columns are outer-joined and their correlation matrix is shown with
    ``plt.imshow``, labelled with the asset names.
    """
    window_start = totimestamp('01/01/2021')
    window_end = totimestamp('01/05/2021')

    returns_2021 = pd.DataFrame([])
    for asset_id, asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
        bars = crypto_df[crypto_df["Asset_ID"]==asset_id].set_index("timestamp")
        bars = bars.loc[window_start:window_end]
        minute_grid = range(bars.index[0], bars.index[-1]+60, 60)
        bars = bars.reindex(minute_grid, method='pad')
        # Drop the first entry: it has no previous price and is always NaN.
        asset_returns = log_return(bars.Close.fillna(0))[1:]
        returns_2021 = returns_2021.join(asset_returns, rsuffix=asset_name, how="outer")

    plt.imshow(returns_2021.corr())
    tick_positions = asset_details.Asset_ID.values
    tick_labels = asset_details.Asset_Name.values
    plt.yticks(tick_positions, tick_labels)
    plt.xticks(tick_positions, tick_labels, rotation='vertical')
    plt.colorbar()

    
def Clean_df(x):
    """Pad a batch to all 14 asset ids, sort by id, and yield its columns.

    Args:
        x: 2-D array whose column 0 is the timestamp and column 1 the asset id.
            The padding rows have 10 entries, so ``x`` is expected to have
            10 columns.

    Returns:
        A generator over the columns of the padded array, sorted by asset id.
        Padding rows carry the first row's timestamp, the missing id, and NaN
        in the remaining eight columns.
    """
    present_ids = x[:,1]
    timestamp = x[0,0]

    if len(present_ids) < 14:
        absent_ids = [asset_id for asset_id in range(14) if asset_id not in present_ids]
        if absent_ids:
            nan_tail = (np.nan,) * 8
            filler = np.array([(timestamp, asset_id) + nan_tail for asset_id in absent_ids])
            x = np.concatenate((x, filler))

    ordered = x[np.argsort(x[:,1])]
    return (ordered[:,column] for column in range(ordered.shape[1]))


# online - deployment
def Base_Feature_fn(timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP):
    """Build the per-row feature matrix from raw candle arrays.

    Prices are normalised by the close ``C``; infinite VWAP values are first
    replaced by the open/close midpoint. ``timestamp`` and ``Asset_ID`` are
    accepted but not used.

    Returns:
        The transpose of the stacked feature arrays, with columns in the order
        Count, O, H, L, C, Price, Volume, VWAP, Dollars, Volume_per_trade,
        Dollars_per_trade, log_ret, GK_vol, RS_vol.
    """
    VWAP = np.where(np.isinf(VWAP), (C+O)/2, VWAP)

    # Everything on the right-hand side still refers to the raw close.
    Price = C
    O, H, L, C, VWAP = O/Price, H/Price, L/Price, C/Price, VWAP/Price

    Dollars = Volume * Price
    Volume_per_trade = Volume/Count
    Dollars_per_trade = Dollars/Count

    log_close_open = np.log(C/O)
    log_high_low = np.log(H/L)

    log_ret = log_close_open
    GK_vol = (1 / 2 * log_high_low ** 2 - (2 * np.log(2) - 1) * log_close_open ** 2)
    RS_vol = np.log(H/C)*np.log(H/O) + np.log(L/C)*np.log(L/O)

    stacked = np.array([
        Count, O, H, L, C, Price, Volume, VWAP,
        Dollars, Volume_per_trade, Dollars_per_trade,
        log_ret, GK_vol, RS_vol,
    ])
    return stacked.T

# offline - training
def create_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads the columns Count, Open, High, Low, Close, Volume, VWAP and Target.
    'VWAP' is overwritten (infinite values replaced by the open/close midpoint,
    then divided by Close); every other feature is written to a new column.

    Returns:
        The same ``df`` object, now carrying the extra columns.
    """
    close = df['Close']

    df['VWAP'] = np.where(np.isinf(df['VWAP']), (df['Close'] + df['Open']) / 2, df['VWAP'])

    # Candle prices relative to the close.
    for short_name, raw_name in (('O', 'Open'), ('H', 'High'), ('L', 'Low'), ('C', 'Close')):
        df[short_name] = df[raw_name] / close
    df['VWAP'] = df['VWAP'] / close

    dollars = df['Volume'] * close
    df['Dollars'] = dollars
    df['Price'] = close

    df['Volume_per_trade'] = df['Volume'] / df['Count']
    df['Dollars_per_trade'] = dollars / df['Count']

    # Return and volatility columns built from the normalised candle.
    df['log_ret'] = np.log(df['C'] / df['O'])
    df['GK_vol'] = (1 / 2 * np.log(df['H'] / df['L']) ** 2
                    - (2 * np.log(2) - 1) * np.log(df['C'] / df['O']) ** 2)
    df['RS_vol'] = (np.log(df['H'] / df['C']) * np.log(df['H'] / df['O'])
                    + np.log(df['L'] / df['C']) * np.log(df['L'] / df['O']))

    # Simple moving averages of the price and their crossover flag.
    df['slow_MA'] = df['Price'].rolling(window=10).mean()
    df['fast_MA'] = df['Price'].rolling(window=5).mean()
    df['bullish_MA'] = (df['fast_MA'] > df['slow_MA']).astype(int)

    # 1 where the target is strictly positive, else 0 (NaN targets give 0).
    df['bullish_target'] = (df['Target'] > 0).astype(int)

    # Exponential moving averages and their crossover flag.
    df['slow_EMA'] = df['Price'].ewm(span=30, adjust=False).mean()
    df['fast_EMA'] = df['Price'].ewm(span=15, adjust=False).mean()
    df['bullish_EMA'] = (df['slow_EMA'] < df['fast_EMA']).astype(int)

    return df

def weighted_correlation(a, b, weights):
    """Return the weighted Pearson correlation of ``a`` and ``b``.

    All three inputs are flattened first. The covariance is computed as the
    weighted mean of ``a * b`` minus the product of the weighted means, and is
    divided by the square root of the product of the weighted variances.
    """
    w, a, b = np.ravel(weights), np.ravel(a), np.ravel(b)
    total_weight = np.sum(w)

    def weighted_mean(values):
        return np.sum(values * w) / total_weight

    mean_a = weighted_mean(a)
    mean_b = weighted_mean(b)

    var_a = np.sum(w * np.square(a - mean_a)) / total_weight
    var_b = np.sum(w * np.square(b - mean_b)) / total_weight

    covariance = np.sum(a * b * w) / total_weight - mean_a * mean_b
    return covariance / np.sqrt(var_a * var_b)

# Function log_return_ahead computes the forward simple return
# R_t = P_{t+1+periods} / P_{t+1} - 1 (for periods=15: P_{t+16} / P_{t+1} - 1).
# Despite the name, the result is exponentiated, so it is not a log return.
def log_return_ahead(series, periods=1):
    log_price = np.log(series)
    # diff with a negative period looks forward: log P_t - log P_{t+periods}
    backward_gap = log_price.diff(periods=-periods)
    # shift(-1) starts the window one step ahead; the sign flip turns it into
    # log(P_{t+1+periods} / P_{t+1})
    forward_log_ratio = -backward_gap.shift(-1)
    return np.exp(forward_log_ratio) - 1

In [None]:
class Asset:
    """Hold a prediction/target pair and the absolute error between them."""

    def __init__(self, prediction=0.0, target=1.0):
        self.prediction = prediction
        self.target = target
        # Filled in by compute_error(); None means "not computed yet".
        self.err = None

    def set_prediction(self, prediction):
        """Store a new prediction (does not recompute the error)."""
        self.prediction = prediction

    # Original, misspelled method name kept so existing callers keep working.
    set_predition = set_prediction

    def set_target(self, target):
        """Store a new target (does not recompute the error)."""
        self.target = target

    def compute_error(self):
        """Recompute ``err`` as ``abs(prediction - target)``."""
        self.err = abs(self.prediction - self.target)

    def get_error(self):
        """Return the last computed error.

        If compute_error() has never been called, the error is computed first
        instead of failing with AttributeError.
        """
        if self.err is None:
            self.compute_error()
        return self.err


# Feature creation

In [None]:
# Candle-shadow features computed from the Open/High/Low/Close columns of an
# asset frame.
def upper_shadow(asset):
    """Return High minus the larger of Close and Open, as a Series."""
    candle_top = np.maximum(asset.Close, asset.Open)
    return pd.Series(asset.High - candle_top)


def lower_shadow(asset):
    """Return the smaller of Close and Open minus Low, as a Series."""
    candle_bottom = np.minimum(asset.Close, asset.Open)
    return pd.Series(candle_bottom - asset.Low)


def get_features(asset_df):
    """Return the gap-filled asset frame without its 'Target' and 'Asset_ID' columns.

    The frame is first passed through handle_asset_missing_data(), so the
    caller's ``asset_df`` is left untouched.
    """
    features = handle_asset_missing_data(asset_df)
    for dropped_column in ('Target', 'Asset_ID'):
        del features[dropped_column]
    return features

# Split data

In [None]:
# select training and test periods
# Each window is a [start, end] pair of timestamps built from "dd/mm/YYYY"
# strings. split_data slices with .loc[start:end], so both ends are inclusive.
train_window = [totimestamp("01/01/2021"), totimestamp("01/03/2021")]
test_window = [totimestamp("02/03/2021"), totimestamp("03/03/2021")]

# divide data into train and test, compute X and y
# we aim to build simple regression models using a window_size of 1
def split_data(X,y):
    """Slice ``X`` and ``y`` into the train and test windows as numpy arrays.

    Rows are selected by index label with the module-level ``train_window``
    and ``test_window``; NaN values are replaced by zero before conversion.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    def window_to_numpy(frame, window):
        # label-based slice, NaN -> 0, then plain ndarray
        return frame.loc[window[0]:window[1]].fillna(0).to_numpy()

    X_train = window_to_numpy(X, train_window)
    y_train = window_to_numpy(y, train_window)
    X_test = window_to_numpy(X, test_window)
    y_test = window_to_numpy(y, test_window)
    return X_train, y_train, X_test, y_test


# Transform data

In [None]:
from sklearn.preprocessing import StandardScaler
# simple preprocessing of the data
# Shared scaler instance. The active scale_data() returns its input unchanged,
# so this object is currently not used by it.
scaler = StandardScaler()
# Scaling variant, currently disabled:
# def scale_data(data):
#     return scaler.fit_transform(data)
def scale_data(data):
    """Return ``data`` unchanged (scaling is switched off)."""
    return data

# Train & Test model

In [None]:
# from sklearn.linear_model import LinearRegression

# implement basic ML baseline (one per asset)
# lr = LinearRegression()
def train(model,X_train,y_train):
    """Fit ``model`` on the training data and return whatever ``fit`` returns."""
    fit_result = model.fit(X_train, y_train)
    return fit_result

# X_test should be scaled if possible
def predict(model,X_test):
    """Return ``model.predict(X_test)``."""
    predictions = model.predict(X_test)
    return predictions

# def test(y_pred, y_test):
#     return np.corrcoef(y_pred, y_test)[0,1]

def test(y_pred, y_test):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so 1-D arrays, ``(N, 1)``
    columns and ``(1, N)`` rows are all accepted. Previously a 1-D ``y_pred``
    was broadcast against an ``(N, 1)`` ``y_test`` into an ``(N, N)``
    comparison, which gave a wrong accuracy.

    Args:
        y_pred: Predicted labels.
        y_test: True labels, with the same number of elements.

    Returns:
        Accuracy as a float in [0, 1] (NaN for empty input).

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    truth = np.ravel(y_test)
    guess = np.ravel(y_pred)
    if truth.shape != guess.shape:
        raise ValueError(
            f"y_pred has {guess.size} elements but y_test has {truth.size}"
        )
    accuracy = (truth == guess).sum() / truth.size
#     TP = ((predictions == 1) & (true_values == 1)).sum()
#     FP = ((predictions == 1) & (true_values == 0)).sum()
#     precision = TP / (TP+FP)
    return accuracy


# This is a metric helper, not a unit test: stop pytest from collecting it
# because of its name.
test.__test__ = False

from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-asset frames keyed by integer asset id (0..13), each indexed by timestamp.
asset_dict = get_asset_dict_by_id()

In [None]:
# Put every frame on a regular 60-step timestamp index. handle_missing_data
# updates the dict in place and returns that same dict.
asset_dict = handle_missing_data(asset_dict)

In [None]:
# asset_features_dict = {id : get_features(asset) for (id,asset) in asset_dict.items()}
# asset_target_dict = {id: asset.Target for (id,asset) in asset_dict.items()}

In [None]:
# create_features adds its columns to the frame it is given and returns that
# frame, so these values are the same objects as the ones in asset_dict.
asset_features_dict = {
    asset_id: create_features(frame) for asset_id, frame in asset_dict.items()
}

In [None]:
# Drop every row that still has a NaN in any column, for each of the asset ids 0..13.
asset_features_dict = {
    asset_id: asset_features_dict[asset_id].dropna(axis=0) for asset_id in range(14)
}

In [None]:
# Inspect the last 30 rows of asset 0: price, both moving averages, the target
# and the two derived 0/1 flags.
asset_features_dict[0][['Price','slow_MA','fast_MA','Target','bullish_MA','bullish_target']].tail(30)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# First XGBoost model for Pima Indians dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

# load data
# dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# # split data into X and y
# X = dataset[:,0:8]
# Y = dataset[:,8]
# # split data into train and test sets
# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data
# model = XGBClassifier()
# model.fit(X_train, y_train)
# # make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# # evaluate predictions
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
def MyModel(X):
    """Return ``1 - X``: for 0/1 inputs this flips every value."""
    flipped = 1 - X
    return flipped
    

In [None]:
# Quick check of MyModel on a single-column array of 0/1 values: print the
# input, then the model output (1 - X).
X = pd.DataFrame([0,1,0,1,1]).to_numpy()
print(X)
print(MyModel(X))

In [None]:
# Per-asset evaluation: one accuracy value is appended to `performance` for each
# asset id 0..13. `models` stays empty while the model-fitting lines are
# commented out.
models = []
performance = []

# Alternative feature sets tried earlier:
# features = ['O','H','L','C','Volume_per_trade','Dollars_per_trade','log_ret','GK_vol','RS_vol']
# features = ['Count','O','H','L','C','Price','Volume','VWAP','Dollars','Volume_per_trade','Dollars_per_trade','log_ret','GK_vol','RS_vol']
# features = ['Count','O','H','L','VWAP','Volume_per_trade','Dollars_per_trade','log_ret','GK_vol','RS_vol']

# features = ['VWAP','Volume_per_trade','Dollars_per_trade','log_ret','GK_vol','RS_vol','bullish_EMA','bullish_MA']

features = ['bullish_MA']

# The loop variable is `asset_id`, not `id`: a module-level `id` would shadow
# the builtin id() for every cell that runs afterwards.
for asset_id in range(14):

    X = asset_features_dict[asset_id]

    # Label: 1 where the target is positive, else 0.
    y = X.bullish_target

    X = X[features]

    X_train, y_train, X_test, y_test = split_data(X,y)

#     X_train_scaled = scale_data(X_train)
#     X_test_scaled = scale_data(X_test)

#     model = LinearRegression()
#     model = XGBClassifier()

#     model = LogisticRegression(random_state=0)

#     model = train(model,X_train_scaled,y_train)
#     y_pred = predict(model,X_test_scaled)

    # Rule-based baseline: predict the opposite of the bullish_MA flag.
    y_pred = MyModel(X_test)

    accuracy = test(y_pred, y_test)
    performance.append(accuracy)
#     models.append(model)


In [None]:
# One accuracy value per asset, in asset-id order 0..13.
print(performance)

In [None]:
# t0 = asset_features_dict[0].shape[0] - 10000
# T = t0 + 100;
# t = [i for i in range(t0,T)]
# for id in range(14):
    
#     try:
        
#         X = asset_features_dict[id]
#         plt.figure()
#         fig, ax1 = plt.subplots()
#         ax1.plot(t,X.Close[t0:T], color='k')
#         ax1.set_ylabel('Close', color='k')
#         ax1.tick_params(axis='y', color='k', labelcolor='k')

#         ax2 = ax1.twinx()
#         ax2.scatter(t, X.Target[t0:T], c = [ 'r' if i < 0 else 'g' for i in X.Target[t0:T]])
#         ax2.set_ylabel('Target', color='g')
#         ax2.tick_params(axis='y', color='g', labelcolor='g')
#         ax2.spines['right'].set_color('g')
#         ax2.spines['left'].set_color('k')
#         ax1.set_xlabel('Time')   

#         ax1.grid(True, which='both')
#     except:
#         pass

# plt.show()

In [None]:
# print(performance)

In [None]:

# gresearch_crypto.make_env.__called__ = False

# env = gresearch_crypto.make_env()
# iter_test = env.iter_test()

# for (test_df, sample_prediction_df) in iter_test:
    
#     print(test_df)
#     print(sample_prediction_df)
#     print('==========================================')
    
#     pred = []
    
#     timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP,row_id = Clean_df(test_df.values)
    
#     Features = Base_Feature_fn(timestamp,Asset_ID,Count,O,H,L,C,Volume,VWAP)
    
#     for i in range(len(Asset_ID)):
        
#         X_test = np.array([Features[i]])
        
#         y = models[int(Asset_ID[i])].predict(X_test)[0]
        
#         pred.append(y)
        
#     sample_prediction_df['Target'] = pd.Series(pred).values
#     sample_prediction_df['Target'] = 0.0
#     env.predict(sample_prediction_df)  