In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_probability as tfp
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import gresearch_crypto
import gc


# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

# When True, the loading cell keeps only a fixed row slice of the training data.
DEBUG = False

In [None]:
# Training data, indexed by its `timestamp` column.
train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv').set_index("timestamp")

# Asset metadata and an Asset_ID -> Asset_Name lookup.
assets = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
assets_names = dict(zip(assets.Asset_ID, assets.Asset_Name))

# Asset ordering: position of each Asset_ID within the first 14 rows of
# supplemental_train.csv. Only that column and those rows are parsed, instead of
# loading the whole file and then discarding everything but 14 values.
assets_order = pd.read_csv(
    '../input/g-research-crypto-forecasting/supplemental_train.csv',
    usecols=['Asset_ID'],
    nrows=14,
).Asset_ID
assets_order = {asset_id: position for position, asset_id in enumerate(assets_order)}

if DEBUG:
    # Positional row slice used to shorten debug runs.
    train = train[1000000:12000000]

In [None]:
def add_features(df):
    """Add candle-shadow columns to ``df`` in place and return the same frame.

    Requires the columns ``Open``, ``High``, ``Low`` and ``Close``.

    * ``Upper_Shadow`` = High - max(Close, Open)
    * ``Lower_Shadow`` = min(Close, Open) - Low

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, now with the two new columns.
    """
    opening, closing = df['Open'], df['Close']
    body_top = np.maximum(closing, opening)
    body_bottom = np.minimum(closing, opening)

    df['Upper_Shadow'] = df['High'] - body_top
    df['Lower_Shadow'] = body_bottom - df['Low']
    return df

### Assets correlation

In [None]:
# Keep only the timestamps at which every asset has a row.

# Number of assets expected per timestamp (one entry per asset in the metadata lookup).
n_assets = len(assets_names)

# Rows per timestamp, broadcast back onto every row of that timestamp.
# The index of `train` is the timestamp, so level 0 groups by it directly.
train['assets'] = train.groupby(level=0)['Asset_ID'].transform('size')
train['asset_name'] = train.Asset_ID.map(assets_names)

# Rows available per asset. A bare expression in the middle of a cell is not
# rendered by Jupyter, so it is displayed explicitly.
display(train['asset_name'].value_counts())

all_same_time = train[train['assets'] == n_assets][
    ['Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'asset_name', 'VWAP', 'Target']
]
all_same_time.shape

In [None]:
# Correlation of Target between assets: one column per asset, one row per timestamp.
# Only the Target column is pivoted (instead of pivoting every column and then
# selecting Target), and it is pivoted straight into the orientation corr() needs.
corr_target = (
    all_same_time.reset_index()
    .pivot(index='timestamp', columns='asset_name', values='Target')
    .corr()
)

# Correlation between assets over all remaining columns stacked together:
# after the transpose each row is a (column, timestamp) pair and each column an asset.
corr_matrix = (
    all_same_time.reset_index()
    .drop(['Target', 'Asset_ID'], axis=1)
    .pivot(index='asset_name', columns='timestamp')
    .transpose()
    .corr()
)

fig, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.heatmap(np.round(corr_target, 2), annot=True, ax=ax[0], square=True)
sns.heatmap(np.round(corr_matrix, 2), annot=True, ax=ax[1], square=True)
ax[0].set_title('Asset Targets correlation')
ax[1].set_title('Asset Features correlation')



In [None]:
# 7 x 2 grid with one panel per asset, titled with the asset name.
fig = make_subplots(rows=7, cols=2, shared_xaxes=True, vertical_spacing=0.03,
                    subplot_titles=tuple(assets_names[i] for i in range(14)))

# Work on an independent copy: adding a column to a slice of `all_same_time`
# triggers pandas' SettingWithCopyWarning and may not write where expected.
data = all_same_time[1000:2400].copy()
# The index holds epoch seconds; convert the whole index in one vectorised call.
data['time'] = pd.to_datetime(data.index, unit='s')

for i in range(14):
    coin = data[data.Asset_ID == i]
    name = assets_names[i]

    # Fill the grid row by row: assets 0 and 1 on row 1, assets 2 and 3 on row 2, ...
    fig.add_trace(go.Scatter(x=coin['time'], y=coin['VWAP'], name=name + ', VWAP'),
                  row=i // 2 + 1, col=i % 2 + 1)

fig.update_layout(height=1000, title_text=' Weighted average prices')
fig.show()

### Data preparation

In [None]:
# Discard every row that has a missing value in any column, then show the result.
train = train.dropna(axis=0, how='any')
train

In [None]:
# Largest and smallest finite VWAP; used later as replacements for +inf / -inf.
vwap_finite = train.loc[np.isfinite(train.VWAP), 'VWAP']
VWAP_max, VWAP_min = vwap_finite.max(), vwap_finite.min()
del vwap_finite
print(VWAP_max, "\n", VWAP_min)

In [None]:
# Pull the asset ids and the regression target out as arrays before `train`
# is narrowed to the feature columns.
# Note: `assets` previously held the asset_details DataFrame and is rebound here.
assets = train.Asset_ID.to_numpy()
targets = train['Target'].to_numpy()

# Model inputs: the raw columns plus both engineered shadow features.
# ('Upper_Shadow' used to be listed twice, which duplicated that column and left
# 'Lower_Shadow' computed by add_features but never used.)
features = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Upper_Shadow', 'Lower_Shadow']
train = add_features(train)[features]

In [None]:
# Replace infinite VWAP values with the finite extremes computed earlier
# (np.nan_to_num also maps NaN to 0.0 by default).
vwap_clean = np.nan_to_num(train['VWAP'], posinf=VWAP_max, neginf=VWAP_min)
train['VWAP'] = vwap_clean

# Feature scaler. An alternative that was tried and left disabled:
#   QuantileTransformer(n_quantiles=10000, output_distribution='normal', random_state=0)
scaler = RobustScaler()

# From here on `train` is the scaled feature matrix returned by fit_transform,
# no longer a DataFrame.
train = scaler.fit_transform(train)
train.shape

### Model

In [None]:
# Adapted from https://github.com/tensorflow/tensorflow/issues/37495
def MaxCorrelation(y_true,y_pred):
    """Loss: negated absolute correlation between predictions and targets.

    Minimising this value maximises |corr(y_pred, y_true)|, where the correlation
    is computed by ``tfp.stats.correlation`` with ``sample_axis=None`` and
    ``event_axis=None``.

    NOTE(review): because of the absolute value, a strongly *negative* correlation
    scores as well as a strongly positive one - confirm that is intended.
    """
    pearson = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    return tf.math.negative(tf.math.abs(pearson))

def Correlation(y_true,y_pred):
    """Metric: absolute correlation between predictions and targets.

    Uses ``tfp.stats.correlation`` with ``sample_axis=None`` and ``event_axis=None``.

    NOTE(review): the sign is discarded, so this metric cannot distinguish a
    positively correlated model from a negatively correlated one - confirm intended.
    """
    pearson = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    magnitude = tf.math.abs(pearson)
    return magnitude

def get_model(n_features=None, n_asset_ids=15, embedding_dim=16):
    """Build and compile the two-input regression network.

    Inputs
        * asset id, shape ``(1,)``, passed through an embedding layer;
        * feature vector, shape ``(n_features,)``.
    The flattened embedding is concatenated with the features and fed through
    Dense(512) -> BatchNormalization -> Dense(192) -> Dense(96) -> Dense(1).

    Parameters
    ----------
    n_features : int, optional
        Width of the feature input. Defaults to the last dimension of the
        module-level ``train`` array, which is what the original code always used.
    n_asset_ids : int, default 15
        Size of the embedding table (``input_dim`` of the Embedding layer).
    embedding_dim : int, default 16
        Length of each asset embedding vector.

    Returns
    -------
    keras.Model
        Model compiled with Adam, the 'cosine_similarity' loss and the
        ``Correlation`` metric.
    """
    if n_features is None:
        n_features = train.shape[-1]

    asset_input = keras.Input(shape=(1,))
    feat_input = keras.Input(shape=(n_features,))

    # `input_length` is deprecated in current Keras and unnecessary here: the
    # sequence length is already fixed by the (1,) input shape.
    x = layers.Embedding(n_asset_ids, embedding_dim)(asset_input)

    x = keras.layers.Flatten()(x)
    combined = keras.layers.Concatenate()([x, feat_input])

    # NOTE(review): none of these Dense layers specifies an activation, so the
    # stack is a chain of affine maps - confirm whether non-linearities were intended.
    x = layers.Dense(units=512)(combined)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(units=192)(x)
    x = layers.Dense(units=96)(x)
    out = layers.Dense(units=1)(x)

    model = keras.Model(inputs=[asset_input, feat_input], outputs=out)

    # The learning rate given here is only the initial value: the training cell
    # attaches a LearningRateScheduler callback that sets it at every epoch.
    # NOTE(review): 'cosine_similarity' is evaluated along the last axis; with a
    # single output unit verify that it yields a useful training signal on the
    # installed Keras version. Alternatives tried by the author: 'mse', MaxCorrelation.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='cosine_similarity',
                  metrics=[Correlation])

    return model

model = get_model()
model.summary()

In [None]:
# Row positions of the scaled feature matrix. np.arange builds the index array
# directly instead of materialising a Python list with one int object per row
# and converting it afterwards.
size = np.arange(len(train))

# Random 85 / 15 split of the row positions (fixed seed for repeatability).
# NOTE(review): the split is shuffled across time, so validation rows are
# interleaved with training rows - confirm this is acceptable for time-series data.
train_ind, test_ind = train_test_split(size, shuffle=True, random_state=42, test_size=0.15)

In [None]:
# Training partition: features, targets and asset ids at the training positions.
X_train = train[train_ind]
y_train = targets[train_ind]
assets_train = assets[train_ind]

# Validation partition: the same three arrays at the held-out positions.
X_valid = train[test_ind]
y_valid = targets[test_ind]
assets_valid = assets[test_ind]

In [None]:
# Seed TensorFlow's global random generator.
# NOTE(review): `model` was already built in an earlier cell, so its initial
# weights were drawn before this seed was set - confirm whether that matters.
tf.random.set_seed(0)
BATCH_SIZE=2**15

# Stop when the validation loss has not improved for 5 epochs and restore the
# best weights seen. (Disabled alternative: monitor the Correlation metric instead.)
#estop = keras.callbacks.EarlyStopping(monitor='val_Correlation', patience=7, verbose=0, mode='max',restore_best_weights=True)
estop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min',restore_best_weights=True)
# Exponential decay: initial rate 1e-4, decay rate 1e-3, decay_steps set to 5% of
# the number of batches per epoch.
# NOTE(review): LearningRateScheduler evaluates its schedule once per epoch, so
# presumably the "step" fed to ExponentialDecay is the epoch index rather than a
# batch count; passing a schedule object (which returns a tensor) may also be
# rejected by newer Keras versions - TODO confirm on the installed version.
scheduler = keras.optimizers.schedules.ExponentialDecay(1e-4, (5e-2*(len(X_train))/BATCH_SIZE), 1e-3)
#scheduler = keras.optimizers.schedules.ExponentialDecay(1e-3, (1e-4*(len(X_train))/BATCH_SIZE), 1e-3)
lr = keras.callbacks.LearningRateScheduler(scheduler, verbose = 1)
        
# Train for up to 20 epochs on [asset ids, features] -> target, validating on the
# held-out split; early stopping may end training sooner.
model.fit([assets_train, X_train], y_train, 
          validation_data = ([assets_valid, X_valid], y_valid), 
          epochs = 20, batch_size = BATCH_SIZE, 
          shuffle=True, callbacks = [lr, estop])


### Submission

In [None]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows and the frame to fill with predictions.
for (test_df, sample_prediction_df) in iter_test:

    asset = test_df.Asset_ID.to_numpy()

    # Same preprocessing as for training. The explicit copy makes the VWAP
    # assignment below write to an independent frame rather than to a column
    # subset of test_df (which raises pandas' SettingWithCopyWarning).
    test = add_features(test_df)[features].copy()
    test['VWAP'] = np.nan_to_num(test['VWAP'], posinf=VWAP_max, neginf=VWAP_min)
    test = scaler.transform(test)

    # reshape(-1) always yields a 1-D vector, even for a single-row batch, where
    # squeeze() would collapse the result to a 0-d array. verbose=0 keeps Keras
    # from printing a progress bar on every iteration.
    y_pred = model.predict([asset, test], verbose=0).reshape(-1)

    # assumes the rows of sample_prediction_df are in the same order as test_df - TODO confirm
    sample_prediction_df['Target'] = y_pred
    env.predict(sample_prediction_df)