# Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import gc
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read Train Data

In [None]:
# Run-mode switches read by later cells in this notebook:
#   TRAINING - when True the per-asset models are fitted and the submission
#              loop runs; when False the exploratory plot cells run instead.
#   DEBUG    - forwarded as the `debug` flag of fix_train_test_data.
TRAINING = True
DEBUG = False

# Load the raw training rows from the competition CSV.
train_data = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
# Bare last expression: rendered as the cell output.
train_data

In [None]:
# Load the per-asset metadata and order the rows by Asset_ID.
asset_details = (
    pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv")
    .sort_values(by=['Asset_ID'])
)
asset_details

## Define Data Cleaning Function

In [None]:
def fix_train_test_data(df, debug=False):
    """Fill missing values and missing minutes, asset by asset.

    For every Asset_ID present in ``df`` the rows are indexed by
    ``timestamp``, NaNs are forward-filled (then back-filled for leading
    NaNs), and the index is re-sampled on a 60-unit grid from the asset's
    first to its last timestamp, padding any missing step with the previous
    row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` and ``Asset_ID`` columns. Within each asset
        the timestamps must be unique and increasing (required by the padded
        reindex). The frame is NOT modified.
    debug : bool, default False
        When True, print NaN counts and show the most common timestamp gaps
        before and after the fix (uses the notebook's ``display``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``timestamp``; the original columns minus ``timestamp``
        plus two helper columns, ``order1`` (row position in ``df``) and
        ``order2`` (original timestamp). Rows are sorted by
        ``['order1', 'order2']``; padded rows carry the helper values of the
        row they were copied from.
    """
    out_columns = [col for col in df.columns if col != 'timestamp'] + ['order1', 'order2']

    # Iterate over the ids actually present: a missing id no longer crashes on
    # an empty slice and an unexpected id is no longer silently dropped.
    asset_ids = sorted(df.Asset_ID.dropna().unique())

    pieces = []
    total_rows = 0
    for asset_id in asset_ids:
        if debug:
            print("-"*81)
            print("For Asset_ID:", asset_id)
            print()

        mask = (df.Asset_ID == asset_id).to_numpy()
        temp = df[mask].set_index('timestamp')
        # Helper columns are attached to the per-asset slice so the caller's
        # frame is left untouched.
        temp['order1'] = np.flatnonzero(mask)
        temp['order2'] = temp.index

        if debug:
            print("Total number of NaN:", sum(temp.isna().sum()))

        # Forward fill first; back fill only covers NaNs at the very start.
        temp = temp.ffill().bfill()

        if debug:
            print("NaN after fill:", sum(temp.isna().sum()))
            print()
            print("Before fixing gaps:")
            display((temp.index[1:]-temp.index[:-1]).value_counts().head())

        temp = temp.reindex(range(temp.index[0], temp.index[-1]+60, 60), method='pad')

        if debug:
            print()
            print("After fixing gaps:")
            display((temp.index[1:]-temp.index[:-1]).value_counts().head())

        pieces.append(temp)
        total_rows += temp.shape[0]
        if debug:
            print("-"*81)
            print("Total size:", (total_rows, len(out_columns)))

    if not pieces:
        return pd.DataFrame(columns=out_columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pieces).sort_values(by=['order1', 'order2'])

# Clean the training rows; DEBUG switches on the per-asset diagnostics.
fixed_data = fix_train_test_data(train_data, debug=DEBUG)

In [None]:
# Bare expression: render the cleaned training frame as the cell output.
fixed_data

## Check Target Data Fluctuation

In [None]:
# Plot the Target series of the asset with Asset_ID 0 against its index.
temp = fixed_data.loc[fixed_data['Asset_ID'] == 0]
plt.plot(temp.index, temp['Target'], color='#00aa00', alpha=0.75)
plt.show()

# Test Data

In [None]:
# Load the example test CSV and preview its first 7 rows.
test_data = pd.read_csv("../input/g-research-crypto-forecasting/example_test.csv")
test_data.head(7)

In [None]:
# Apply the same cleaning function to the example test rows.
fixed_test = fix_train_test_data(test_data, debug=DEBUG)

In [None]:
# Show the cleaned test frame with pandas' max_rows display option set to 10
# for this cell only.
with pd.option_context('display.max_rows',10):
    display(fixed_test)

# Exploratory Analysis

In [None]:
if not TRAINING:
    # One subplot row per asset: open price, close price and close-open gap.
    # The grid is sized from asset_details and rows are addressed by position,
    # so the plot no longer assumes the ids are exactly 0..13.
    n_assets = len(asset_details)
    fig, ax = plt.subplots(n_assets, 3, sharex=True, figsize=(24, 32), squeeze=False)

    for pos, (i, lab) in enumerate(zip(asset_details.Asset_ID, asset_details.Asset_Name)):
        temp = fixed_data[fixed_data.Asset_ID == i]
        ax[pos, 0].title.set_text(f'{lab} Open Price:')
        ax[pos, 0].plot(temp.index, temp.Open, c="#0000aa", alpha=0.75)
        ax[pos, 1].title.set_text(f'{lab} Close Price:')
        ax[pos, 1].plot(temp.index, temp.Close, c="#aa0000", alpha=0.75)
        ax[pos, 2].title.set_text(f'{lab} Price Gap:')
        ax[pos, 2].plot(temp.index, temp.Close - temp.Open, c="#00aa00", alpha=0.75)
        # Drop the per-asset slice before the next iteration to limit memory use.
        del temp
        gc.collect()
    plt.show()

In [None]:
if not TRAINING:
    # One subplot row per asset: minute low, minute high and high-low range.
    # The grid is sized from asset_details and rows are addressed by position,
    # so the plot no longer assumes the ids are exactly 0..13.
    n_assets = len(asset_details)
    fig, ax = plt.subplots(n_assets, 3, sharex=True, figsize=(24, 32), squeeze=False)

    for pos, (i, lab) in enumerate(zip(asset_details.Asset_ID, asset_details.Asset_Name)):
        temp = fixed_data[fixed_data.Asset_ID == i]
        ax[pos, 0].title.set_text(f'{lab} Minute Lowest Price:')
        ax[pos, 0].plot(temp.index, temp.Low, c="#0000aa", alpha=0.75)
        ax[pos, 1].title.set_text(f'{lab} Minute Highest Price:')
        ax[pos, 1].plot(temp.index, temp.High, c="#aa0000", alpha=0.75)
        ax[pos, 2].title.set_text(f'{lab} Minute Gap:')
        ax[pos, 2].plot(temp.index, temp.High - temp.Low, c="#00aa00", alpha=0.75)
        # Drop the per-asset slice before the next iteration to limit memory use.
        del temp
        gc.collect()
    plt.show()

# Model Creation

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Reshape, BatchNormalization

# Number of units in every LSTM layer built by create_model.
node_size = 128


def create_model(n_features):
    """Build and compile the stacked-LSTM regressor used for each asset.

    The flat input of ``n_features`` values is reshaped to
    ``(n_features, 1)`` and passed through four LSTM layers of ``node_size``
    units, each followed by Dropout(0.1), with BatchNormalization after the
    second and fourth LSTM/Dropout pair, and a final single-unit Dense layer.
    The model is compiled with Adam (learning rate 0.001) and mean squared
    error loss.

    Parameters
    ----------
    n_features : int
        Number of input values per sample.

    Returns
    -------
    The compiled Sequential model.
    """
    layers = [
        Reshape((n_features, 1), input_shape=(n_features,)),

        LSTM(units=node_size, kernel_initializer='random_normal', return_sequences=True),
        Dropout(0.1),
        LSTM(units=node_size, kernel_initializer='random_normal', return_sequences=True),
        Dropout(0.1),
        BatchNormalization(),

        LSTM(units=node_size, kernel_initializer='random_normal', return_sequences=True),
        Dropout(0.1),
        # Last recurrent layer returns only its final output.
        LSTM(units=node_size, kernel_initializer='random_normal'),
        Dropout(0.1),
        BatchNormalization(),

        Dense(units=1),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
    )
    return model
# Instantiate a model for 7 input features and print its layer summary.
model = create_model(7)
model.summary()

In [None]:
# Trained models keyed by asset name, and the feature columns fed to each one.
models = {}
train_columns = ["Count", "Open", "High", "Low", "Close", "Volume", "VWAP"]

if TRAINING:
    # Fit settings shared by every asset.
    batch_size = 256
    epochs = 10

    for i, lab in zip(asset_details.Asset_ID, asset_details.Asset_Name):
        print(f"Training for {lab}:")

        # Rows of the current asset, split into features and target.
        asset_rows = fixed_data[fixed_data.Asset_ID == i]
        y_train = asset_rows.Target
        x_train = asset_rows[train_columns]

        # A fresh model per asset; kept in memory and also written to disk.
        model = create_model(len(train_columns))
        model.fit(
            x_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            verbose=1,
        )
        models[lab] = model
        model.save(f"{lab}_trained_weights.hdf5")

        print(f"Finished training for {lab}.")

# Predict

In [None]:
import gresearch_crypto

if TRAINING:
    env = gresearch_crypto.make_env()
    iter_test = env.iter_test()

    # Asset_ID -> Asset_Name, built once; `models` is keyed by asset name.
    asset_names = dict(zip(asset_details.Asset_ID, asset_details.Asset_Name))

    for i, (df_test, df_pred) in enumerate(iter_test):
        for j, row in df_test.iterrows():

            # Use the model trained for this row's own asset (previously the
            # Asset_ID 0 model was applied to every row).
            model = models[asset_names[int(row['Asset_ID'])]]
            x_test = row[train_columns]
            # The model has a single output unit, so the prediction for one
            # sample is the scalar at [0, 0].
            y_pred = model.predict(x_test.values.reshape(1, -1))[0, 0]

            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred

            # Print just one sample row to get a feeling of what it looks like
            if i == 0 and j == 0:
                display(x_test)

        # Display the first prediction dataframe
        if i == 0:
            display(df_pred)

        # Send submissions
        env.predict(df_pred)

In [None]:
# For each asset, compare the first 100 cleaned rows' Target with the model's
# predictions, and save the figure as a PNG.
for i, lab in zip(asset_details.Asset_ID, asset_details.Asset_Name):
    # `models` is only filled when TRAINING is True; skip instead of raising
    # KeyError when there is no model for this asset.
    if lab not in models:
        print(f"No trained model for {lab}; skipping plot.")
        continue

    temp = fixed_data[fixed_data.Asset_ID == i][:100]
    preds = models[lab].predict(temp[train_columns])

    fig, ax = plt.subplots()
    ax.set_title(f"{lab} Target vs Prediction:")
    ax.plot(temp.index, temp.Target, c='#00aa00')
    ax.plot(temp.index, preds, c='#bb0000')
    ax.legend(['Target', "Prediction"])

    # Save BEFORE showing: once the figure has been shown, a later savefig
    # can write out a blank image instead of this plot.
    fig.savefig(f'{lab}_compare_target.png')
    plt.show()
    plt.close(fig)

To be continued...