In [None]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
import datetime as dt
from statsmodels.graphics.tsaplots import plot_pacf
import copy
from IPython.display import Image
warnings.simplefilter(action = 'ignore', category = FutureWarning)
pd.options.mode.chained_assignment = None

In [None]:
import os
# Print the full path of every file found under /kaggle/input (os.walk descends into all sub-directories).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def loadData(file):
    """Read a CSV and index it by (timestamp, Asset_ID).

    Parameters
    ----------
    file : anything ``pd.read_csv`` accepts (path or file-like object).
        The data must contain a ``timestamp`` column holding epoch seconds
        and an ``Asset_ID`` column.

    Returns
    -------
    pd.DataFrame
        The parsed rows with ``timestamp`` converted through
        ``pd.to_datetime(..., unit='s')`` and a two-level MultiIndex
        ``['timestamp', 'Asset_ID']``; both columns leave the column set.
    """
    return (
        pd.read_csv(file)
        # epoch seconds -> datetime64, kept in the same column position
        .assign(timestamp = lambda frame: pd.to_datetime(frame['timestamp'], unit = 's'))
        # make multi-index
        .set_index(['timestamp', 'Asset_ID'])
    )

In [None]:
# read data, format, filter time
data = loadData('/kaggle/input/g-research-crypto-forecasting/train.csv')
# keep only the rows whose timestamp is strictly later than 2020-12-30 00:00
data = data[data.index.get_level_values('timestamp') > '2020-12-30']
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
data.info(show_counts = True)
data.head()

In [None]:
# get supp train data
suppData = loadData('../input/g-research-crypto-forecasting/supplemental_train.csv')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
suppData.info(show_counts = True)
suppData.head()

In [None]:
# stack dataframes without overlapping index

overlapDate = suppData.index.get_level_values('timestamp').min() # returns earliest time from suppTrain
data = data[data.index.get_level_values('timestamp') < overlapDate] # filter original DF so there's no overlap

# Plain row-wise concatenation that keeps the existing (timestamp, Asset_ID) index.
# The former `levels = 'timestamp'` argument was dropped: `levels` only has a meaning
# together with `keys`; older pandas silently ignored it and newer releases reject it.
stacked = pd.concat([data, suppData], ignore_index = False)

# row-count reconciliation: stacked rows minus the rows of both inputs (0 when nothing was lost or added)
douplicateRows = stacked.shape[0] - data.shape[0] - suppData.shape[0]
print(f"There are {douplicateRows} missing rows")

In [None]:
# get the asset details into dictionaries

file = '../input/g-research-crypto-forecasting/asset_details.csv'

# Sort by Asset_ID and renumber the rows 0..n-1 so the dictionaries below are filled in ascending Asset_ID order.
assetDetails = (pd.read_csv(file)).sort_values(by = ['Asset_ID']).reset_index(drop = True)

names = {} # Asset_ID -> Asset_Name
weights = {} # Asset_ID -> Weight

# One pass over the rows; .at reads a single cell by (row label, column name).
for row in assetDetails.index:
    assetID = assetDetails.at[row, 'Asset_ID'] 
    names[assetID] = assetDetails.at[row, 'Asset_Name']
    weights[assetID] = assetDetails.at[row, 'Weight']

# Show both lookup tables for a visual sanity check.
print(names)
print(weights)

In [None]:
# create functions to add in feature cols
    
def FeatureCols(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    ``df`` must provide the columns ``Open``, ``High``, ``Low``, ``Close``,
    ``Volume`` and ``Count`` and an index level named ``timestamp`` holding
    datetimes.

    Columns appended, in this order:
      hlDiff   High - Low (volatility measure)
      avgSize  Volume // Count (floor-divided average trade size)
      uShadow  High - max(Close, Open)
      bShadow  min(Close, Open) - Low
      minSin / minCos  sine / cosine of the minute on a 60-step circle
      daySin / dayCos  sine / cosine of the day of month on a 31-step circle

    Returns the same DataFrame object that was passed in.
    """
    stamps = df.index.get_level_values('timestamp')
    closes, opens = df['Close'], df['Open']

    df['hlDiff'] = df['High'] - df['Low'] # high - low to measure volatility
    df['avgSize'] = df['Volume'] // df['Count'] # floor division: average size of each trade

    # candle shadows: distance from the body (open/close) to the extremes
    df['uShadow'] = df['High'] - np.maximum(closes, opens)
    df['bShadow'] = np.minimum(closes, opens) - df['Low']

    # cyclic encoding of the minute of the hour
    minuteAngle = np.asarray(stamps.minute) * (2.*np.pi/60)
    df['minSin'] = np.sin(minuteAngle)
    df['minCos'] = np.cos(minuteAngle)

    # cyclic encoding of the day of the month
    dayAngle = np.asarray(stamps.day) * (2.*np.pi/31)
    df['daySin'] = np.sin(dayAngle)
    df['dayCos'] = np.cos(dayAngle)

    return df

In [None]:
# FeatureCols adds its columns to the frame it receives and returns that same frame,
# so on the next line `final` and `stacked` are one object; the re-ordering below then builds a new frame.
final = FeatureCols(stacked) # Apply feature cols to the entire dataset
final = final[ [ col for col in final.columns if col != 'Target' ] + ['Target'] ] # move target to end
final.head(20)

In [None]:
# show_counts = True asks info() to list the non-null count of every column.
final.info(show_counts = True) # check dataset after feature cols were added

In [None]:
# get libraries for preprocessing 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer

In [None]:
# add in rsi

windowLen = 30

# https://www.alpharithms.com/relative-strength-index-rsi-in-python-470209/
def addRSI(closePrices, window = None):
    """Return the Relative Strength Index for every price in ``closePrices``.

    The first RSI value is seeded with the simple average of the first
    ``window`` gains and losses; every later value uses Wilder's smoothing
    ``avg = (prev_avg * (window - 1) + current) / window``.

    Parameters
    ----------
    closePrices : iterable of float
        Closing prices in chronological order.
    window : int, optional
        Look-back length. ``None`` (the default) uses the module-level
        ``windowLen``.

    Returns
    -------
    list
        One entry per input price. Positions ``0 .. window - 1`` hold the
        placeholder ``0`` because there is not enough history yet. Later
        positions hold ``100 * avg_gain / (avg_gain + avg_loss)``, i.e. 100
        when the window contains only gains, 0.0 when it contains only
        losses, and ``0`` when there was no price movement at all.

    Raises
    ------
    ValueError
        If the look-back length is smaller than 1.
    """
    period = windowLen if window is None else window
    if period < 1:
        raise ValueError("window must be at least 1")

    output = [] # one RSI value (or 0 placeholder) per input price

    gain_sum = 0.0 # running totals for the initial simple average
    loss_sum = 0.0
    avg_gain = None # smoothed averages, defined once `period` differences exist
    avg_loss = None
    previous = None # price of the previous step

    for i, price in enumerate(closePrices):

        if i == 0: # the first price has no predecessor, so no difference
            previous = price
            output.append(0)
            continue

        # Raw difference, deliberately NOT rounded: rounding to 2 decimals
        # turned every move of a sub-cent asset into 0 and flattened its RSI.
        difference = price - previous
        previous = price

        if difference > 0: # Record positive differences as gains
            gain, loss = difference, 0
        elif difference < 0: # Record negative differences as losses
            gain, loss = 0, -difference
        else: # No movement (a NaN difference also lands here)
            gain, loss = 0, 0

        if i <= period:
            gain_sum += gain
            loss_sum += loss

            if i < period: # still collecting the first `period` differences
                output.append(0)
                continue

            # i == period: seed with the simple moving average
            avg_gain = gain_sum / period
            avg_loss = loss_sum / period

        else: # Wilder smoothing after the initial window
            avg_gain = (avg_gain * (period - 1) + gain) / period
            avg_loss = (avg_loss * (period - 1) + loss) / period

        # Same value as 100 - 100 / (1 + avg_gain / avg_loss), but well defined
        # when avg_loss is 0: an all-gain window gives 100 (it used to give 0).
        total = avg_gain + avg_loss
        rsi = 100.0 * avg_gain / total if total > 0 else 0

        output.append(rsi)

    return output

In [None]:
# select cols
noScale_features = ['minSin', 'minCos', 'daySin', 'dayCos']
highVol_features = ['Volume']
cont_features = ['Close', 'avgSize', 'uShadow', 'bShadow', 'rsi']
featureCols = noScale_features + highVol_features + cont_features

# set up pipeline for different data types
def ScaleData(inputDF, noScale_features = noScale_features, highVol_features = highVol_features, cont_features = cont_features):
    """Fit a per-column-group preprocessor on ``inputDF`` and apply it.

    Three column groups are handled by separate pipelines:
      * ``noScale_features``  - most-frequent imputation only
      * ``highVol_features``  - constant (1) imputation, then RobustScaler
        on the 20-80 quantile range
      * ``cont_features``     - mean imputation, then MinMaxScaler to (0, 1)

    Returns a two-element list: the transformed data as a DataFrame (columns
    in the order noScale + highVol + cont, index taken from ``inputDF``) and
    the fitted ColumnTransformer for later re-use.
    """
    cols = noScale_features + highVol_features + cont_features # get cols we want to transform

    # (transformer name, pipeline steps, columns) for each group
    groups = [
        ('noScale',
         [('imputer', SimpleImputer(strategy = 'most_frequent'))],
         noScale_features),
        ('negPos',
         [('imputer', SimpleImputer(strategy = 'constant', fill_value = 1)),
          ('encoder', RobustScaler(quantile_range = (20.0, 80.0)))],
         highVol_features),
        ('cont',
         [('imputer', SimpleImputer(strategy = 'mean')),
          ('encoder', MinMaxScaler(feature_range = (0, 1)))],
         cont_features),
    ]
    preprosessor = ColumnTransformer(transformers = [
        (label, Pipeline(steps = steps), columns) for label, steps, columns in groups])

    df_to_scale = inputDF[cols] # select these cols from input df
    fitScaler = preprosessor.fit(df_to_scale) # fit scaler
    scaled = fitScaler.transform(df_to_scale) # scale

    scaledFrame = pd.DataFrame(scaled, columns = cols, index = inputDF.index)
    return [scaledFrame, fitScaler] # df with scaled data & fit model to be used later

In [None]:
# Split each table into a df, fill missing values, create feature cols, scale

assets = []
assetScalers = {} # save the scaler
closingPrices = {}

for asset in names.keys():
    df = final.xs(asset, level = 'Asset_ID')

    timeStamps = df.index

    # set index so there's no missing times: one row per minute, gaps forward-filled from the last known row
    minDate = timeStamps.min()
    maxDate = timeStamps.max()
    df = df.reindex(index = list(pd.date_range(minDate, maxDate, freq = 'min')), method = 'pad')

    # add in rsi
    df['rsi'] = addRSI(df['Close'].to_list())
    df = df.iloc[windowLen:, :] # drop the warm-up rows, whose rsi is only the 0 placeholder
    closingPrices[asset] = df['Close'].to_list()[-windowLen:] # save closing prices for predictions

    # remove last rows with missing target
    df = df.iloc[:-250, :]

    # scale data
    result = ScaleData(df)
    scaledDF, fitScaler = result[0], result[1]

    # fill na's for target
    scaledDF['Target'] = df['Target'].fillna(0)

    assets.append(scaledDF) # save transformed df
    assetScalers[asset] = fitScaler # save scaler

    # visualize data
    print(names[asset])

    # sns.pairplot creates its own figure, so no plt.figure() call precedes it
    # (that call only produced an extra blank figure per asset).
    # The sample size is capped at the frame length so short frames do not raise.
    pairSample = scaledDF.sample(min(10000, len(scaledDF)), random_state = 10, ignore_index = True)
    sns.pairplot(pairSample)
    plt.show()

    # partial autocorrelation of the target for the first 50 lags
    plot_pacf(scaledDF['Target'].to_list(), lags = 50)
    plt.show()

del data # we no longer need the table. Free up memory.

In [None]:
# create class to store data
class Asset():
    """Plain container for everything kept per asset.

    Attributes are stored exactly as passed in:
      xTrain / xTest              feature matrices
      yTrain / yTest              class labels
      quants                      quantile / mean lookup for the asset
      trainWeights / testWeights  per-sample weights
      builtModel                  trained model, ``None`` until one is attached
    """

    def __init__(self, xTrain, xTest, yTrain, yTest, quants, trainWeights, testWeights, builtModel = None):
        # features and labels
        self.xTrain, self.xTest = xTrain, xTest
        self.yTrain, self.yTest = yTrain, yTest
        # target quantiles / class means
        self.quants = quants
        # sample weights
        self.trainWeights, self.testWeights = trainWeights, testWeights
        # model slot, filled in after training
        self.builtModel = builtModel

In [None]:
from sklearn.utils import class_weight # weight each class

trainPct = 0.8
assetNames = names.keys() # NOTE(review): these are the Asset_ID keys, not the asset names — confirm this is the intended table index

preppedData = [] # store a instance for each asset
assetShapes = [] # store the shape of each assets df
origTargets = {} # save the original target values

for i, asset in enumerate(assets):

    # drop outlier returns
    # NOTE(review): the cut-offs are asymmetric (0.025 on the low side, 0.985 on the high side) — confirm 0.985 is intended
    minOutlier = asset['Target'].quantile(0.025)
    maxOutlier = asset['Target'].quantile(0.985)
    inRange = (asset['Target'] > minOutlier) & (asset['Target'] < maxOutlier)
    asset = asset.loc[inRange].copy() # explicit copy: a column is added below, the frame stored in `assets` stays untouched

    # get quants
    middleL = asset['Target'].quantile(0.3333)
    middleH = asset['Target'].quantile(0.6666)

    bSplit = asset.loc[(asset['Target'] < middleL)]['Target']
    mSplit = asset.loc[(asset['Target'] > middleL) & (asset['Target'] < middleH)]['Target']
    tSplit = asset.loc[(asset['Target'] > middleH)]['Target']

    # mean target of each third; used later to turn a predicted class back into a value
    bottom = bSplit.mean()
    middle = mSplit.mean()
    top = tSplit.mean()

    quants = {'b' : bottom, 'ml' : middleL, 'm' : middle,
              'mh' : middleH, 't' : top}

    # class label: 0 = below middleL, 1 = above middleH, 2 = everything in between.
    # Vectorised; yields the same labels as applying the rule row by row, without the per-row Python call.
    asset['cTarget'] = np.where(asset['Target'] < middleL, 0,
                                np.where(asset['Target'] > middleH, 1, 2))

    # compute class weights (named sampleWeights so the Asset_ID -> Weight dictionary `weights` is not overwritten)
    sampleWeights = class_weight.compute_sample_weight(class_weight = 'balanced', y = asset['cTarget'])

    origTargets[i] = asset['Target'].to_numpy(copy = True) # save original target values

    # split data
    y = asset['cTarget'].to_numpy(copy = True)
    asset = asset.drop(columns = ['Target', 'cTarget'])

    x = np.array(asset)

    # append a class instance with the training and testing data (chronological split, no shuffling)
    trainIndex = int(len(x) * trainPct)
    preppedData.append(Asset(x[:trainIndex], x[trainIndex:], y[:trainIndex], y[trainIndex:],
                             quants, sampleWeights[:trainIndex], sampleWeights[trainIndex:]))

    # append shapes
    shapes = {}
    shapes['xTrain_shape'] = np.shape(preppedData[i].xTrain)
    shapes['xTest_shape'] = np.shape(preppedData[i].xTest)
    shapes['yTrain_shape'] = np.shape(preppedData[i].yTrain)
    shapes['yTest_shape'] = np.shape(preppedData[i].yTest)
    assetShapes.append(shapes)

shapes = pd.DataFrame(assetShapes, index = assetNames)
shapes

In [None]:
# visualize quants
from scipy.stats import gaussian_kde # public import path; the `scipy.stats.kde` module namespace is deprecated

#list of all plot locations, row by row
rows, cols = 3, 5
plotList = [[row, col] for row in range(rows) for col in range(cols)]

# one density panel per asset
fig, axis = plt.subplots(rows, cols, figsize = (10, 10))

for i, asset in enumerate(preppedData):
    target = origTargets[i]

    # A scalar bw_method is used directly as the kernel bandwidth factor; this replaces
    # overriding covariance_factor and calling the private _compute_covariance().
    prob_density = gaussian_kde(target, bw_method = 0.25)

    x = np.linspace(-0.02, 0.02, 300)
    y = prob_density(x)

    pltRow, pltCol = plotList[i][0], plotList[i][1]
    panel = axis[pltRow, pltCol]
    panel.plot(x, y)

    # dashed markers: bottom-class mean, the two class boundaries, top-class mean
    for key in ('b', 'ml', 'mh', 't'):
        panel.axvline(x = asset.quants[key], color = 'red', linestyle = '--')

    panel.set_title(names[i], y = 1.05)
    panel.ticklabel_format(style = 'sci')

fig.suptitle('Density of Target')
fig.tight_layout()

In [None]:
# get libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
# set params
param = {'max_depth' : 15,
         'eta' : 0.05,
         'num_class' : 3,
         'eval_metric' : 'merror',
         'objective' : 'multi:softmax',
         'min_child_weight' : 1,
         'tree_method' : 'hist',
         'gamma' : 0.02}

epochs = 200 # maximum number of boosting rounds
esRounds = 10 # stop when the eval metric has not improved for this many rounds

# save metrics
accScores = {}
predictions = {}

# fit one model per asset
for i, asset in enumerate(preppedData):
    print(names[i])

    # The balanced sample weights stored on the Asset are handed to the training matrix;
    # they were computed and stored but never used before. The evaluation matrix stays
    # unweighted so its merror remains comparable with the accuracy computed below.
    dtrain = xgb.DMatrix(asset.xTrain, feature_names = featureCols, label = asset.yTrain,
                         weight = asset.trainWeights)
    dtest = xgb.DMatrix(asset.xTest, feature_names = featureCols, label = asset.yTest)

    asset.builtModel = xgb.train(param, dtrain, epochs, evals = [(dtest, "Test")], early_stopping_rounds = esRounds)
    prediction = asset.builtModel.predict(dtest) # predict

    predictions[i] = prediction # save predictions
    accScore = accuracy_score(asset.yTest, prediction) # get accuracy
    accScores[names[i]] = "{:.2%}".format(accScore) # append

pd.DataFrame.from_dict(accScores, orient = 'index', columns = ['Accuracy Score'])

In [None]:
# visualize feature importance
# rcParams only affect figures created afterwards, so the size is set once before
# the loop; set inside the loop it came too late for the first asset's figure.
plt.rcParams['figure.figsize'] = [5, 5]

for asset in preppedData:
    xgb.plot_importance(asset.builtModel)
    plt.show()

In [None]:
# visualize results
from sklearn.metrics import confusion_matrix

# confusion matrix
# One heatmap per asset: true classes against the predictions stored during training.
for i, asset in enumerate(preppedData):
    # plt.title opens the current axes; the heatmap and both labels are then drawn onto those same axes
    plt.title('Confusion Matrix for: ' + names[i])
    sns.heatmap(confusion_matrix(asset.yTest, predictions[i]), annot = True, cmap = "YlGn", fmt = 'g')
    plt.xlabel('Predicted classes')
    plt.ylabel('True Classes')
    plt.show()

In [None]:
import gresearch_crypto

In [None]:
# Create the environment object from the gresearch_crypto module; it supplies the test iterator and receives the predictions.
env = gresearch_crypto.make_env()

In [None]:
# Iterator consumed by the prediction loop; each item unpacks to (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
for (test_df, sample_prediction_df) in iter_test:

    # clean input df, set index
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], unit = 's')
    test_df.set_index(['Asset_ID', 'timestamp'], inplace = True)

    # fillna returns a new frame; the result has to be kept or the call does nothing
    test_df = test_df.fillna(0) # fill na's
    test_df = FeatureCols(test_df) # add in feature cols

    rowId = test_df['row_id'].to_list()
    assetId = test_df.index.get_level_values('Asset_ID')
    # every model feature except the last one ('rsi'), which is computed per row below
    rawFeatures = test_df[featureCols[:-1]].to_numpy()

    # make predictions
    # (local names chosen so the notebook-level `predictions` from training is not overwritten)
    targets = []

    for i, row in enumerate(rawFeatures):
        asset = assetId[i]
        newRow = pd.DataFrame(row, index = featureCols[:-1]).T # create a row with just the new data

        # add in rsi
        # The price history is looked up by asset id, not by the row position `i`:
        # rows of a batch are not guaranteed to arrive in Asset_ID order.
        history = closingPrices[asset]
        history.append(newRow.iloc[0]['Close']) # add close
        newRow['rsi'] = addRSI(history)[-1] # calculate RSI & add to df
        history.pop(0) # remove first element so list doesn't get too long

        # predict
        scaledRow = assetScalers[asset].transform(newRow) # scale data
        dInputs = xgb.DMatrix(scaledRow, feature_names = featureCols) # change data type for model
        # predict returns a one-element array; take the element before converting to int
        prediction = int(preppedData[asset].builtModel.predict(dInputs)[0]) # predict

        quants = preppedData[asset].quants # get quants for the asset

        # map the predicted class back to the mean target of that class
        if prediction == 0:
            target = quants['b']
        elif prediction == 1:
            target = quants['t']
        else:
            target = quants['m']

        targets.append(np.float16(target))

    prediction_df = pd.DataFrame(list(zip(rowId, targets)), columns = ['row_id', 'Target'])

    env.predict(prediction_df) # submit