In [None]:
import numpy as np 
import pandas as pd 
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow import keras
from scipy import stats # stats.pearsonr
from tqdm import notebook
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Names of the feature columns: f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)] 
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
print(train.shape)
train.head()

In [None]:
# Split the id columns and the label off `train`. DataFrame.pop removes the
# column in place, so running this cell a second time without reloading
# `train` raises KeyError.
investment_id = train.pop("investment_id")
time_id = train.pop("time_id")

target = train.pop("target")
target.head()

In [None]:
# Quick look at the target values, plotted against the Series index.
fig, ax = plt.subplots()
ax.plot(target)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Training target', xlabel='row index', ylabel='target');

In [None]:
# Preview `train` again now that investment_id, time_id and target have been popped off.
train.head()

In [None]:
# --- Training configuration ---
BATCH = 512 
FOLD = 5
SEED = 42
EPOCHS = 3

# Shuffled, seeded K-fold splitter; the training loop stratifies on investment_id.
skfolds = StratifiedKFold(n_splits=FOLD, 
                          random_state=SEED, 
                          shuffle = True)

# Loss histories and running-mean metrics used by train_step / test_step.
LOSS_HISTORY = [] 
TEST_LOSS_HISTORY = []
TRAIN_LOSS = tf.keras.metrics.Mean(name='TRAIN_LOSS', dtype=tf.float32) 
TEST_LOSS = tf.keras.metrics.Mean(name='TEST_LOSS', dtype=tf.float32)    

# Loss function and optimizer are created once here and shared by every fold.
LOSS_FN = keras.losses.MeanSquaredError()
OPTIMIZER = tf.keras.optimizers.Adam()

# Distinct investment ids seen in the training data.
# The +1 presumably reserves a slot for the lookup layer's out-of-vocabulary
# index -- TODO confirm against the IntegerLookup defaults.
investment_ids = list(investment_id.unique())
investment_id_size = len(investment_ids) + 1 

# Lookup layer whose vocabulary is learned from the distinct training ids;
# the model feeds its output into the embedding layer.
investment_id_lookup_layer = tf.keras.layers.IntegerLookup(max_tokens=investment_id_size) 
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

def ret(a):
    """Identity function: hand the argument back unchanged.

    Used as the callable wrapped by the model's Lambda input layers.
    """
    return a

# Shows whether TensorFlow is executing eagerly; the value is only displayed, not used.
tf.executing_eagerly()

In [None]:
class Model(keras.Model):
    """Two-branch regressor over the feature vector and the investment id.

    One branch passes the feature vector through three Dense(256) layers.
    The other looks up the investment id, embeds it, flattens the embedding
    and passes it through three Dense(64) layers. Both branches are
    concatenated and fed through a 512 -> 128 -> 32 Dense head that ends in a
    single linear output unit.

    Args:
        investment_id_size: Vocabulary size, used as the embedding's input
            dimension.
        investment_id_lookup_layer: Lookup layer (adapted by the caller) that
            turns raw investment ids into embedding indices.
    """

    def __init__(self, investment_id_size, investment_id_lookup_layer):
        super(Model, self).__init__() 
        self.investment_id_size = investment_id_size
        self.investment_id_lookup_layer = investment_id_lookup_layer
        
        # Investment-id branch: identity input layer -> lookup -> embedding -> MLP.
        self.investment_id_inputs = tf.keras.layers.Lambda(ret, input_shape = [1], dtype=tf.uint16)
        self.investment_id_emb = tf.keras.layers.Embedding(investment_id_size, 32, input_length=1)
        self.investment_id_res = tf.keras.layers.Reshape((-1, ))
        self.investment_id_1 = tf.keras.layers.Dense(64, activation='swish')
        self.investment_id_2 = tf.keras.layers.Dense(64, activation='swish')
        self.investment_id_3 = tf.keras.layers.Dense(64, activation='swish')
        
        # Feature branch: identity input layer -> MLP.
        self.inputs = tf.keras.layers.Lambda(ret, input_shape = [300], dtype=tf.float16)
        self.d1 = tf.keras.layers.Dense(256, activation = 'swish')
        self.d2 = tf.keras.layers.Dense(256, activation = 'swish')
        self.d3 = tf.keras.layers.Dense(256, activation = 'swish')
        
        self.conc = tf.keras.layers.Concatenate(axis=1)
        
        # Regularised head applied to the concatenated branches.
        self.dd5 = tf.keras.layers.Dense(512, kernel_regularizer="l2", activation = 'swish')
        self.dd6 = tf.keras.layers.Dense(128, kernel_regularizer="l2", activation = 'swish')
        self.dd7 = tf.keras.layers.Dense(32, kernel_regularizer="l2", activation = 'swish')
        self.out = tf.keras.layers.Dense(1)

    def call(self, input): # x - X_train, b - investment_id_train
        """Run the forward pass.

        Args:
            input: Pair ``(features, investment_ids)``. The parameter name
                shadows the builtin but is kept so the signature is unchanged.

        Returns:
            The output of the final Dense(1) layer.
        """
        x, b = input
        x = self.inputs(x)
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        
        b = self.investment_id_inputs(b)
        b = self.investment_id_lookup_layer(b)
        b = self.investment_id_emb(b)
        b = self.investment_id_res(b)
        b = self.investment_id_1(b)
        b = self.investment_id_2(b)
        b = self.investment_id_3(b)
        
        c = self.conc([x,b])
        
        # Chain the head: each layer consumes the previous layer's output.
        # Feeding `c` to dd6 and dd7 as well would discard dd5/dd6 entirely.
        y = self.dd5(c)
        y = self.dd6(y)
        y = self.dd7(y)
        return self.out(y)

In [None]:
# Single global model instance; train_step, test_step and the prediction loop all use it.
model = Model(investment_id_size, investment_id_lookup_layer)

In [None]:
@tf.function
def train_step(investment, labels):
    """Run one optimisation step on a batch and update the TRAIN_LOSS metric.

    Args:
        investment: Pair ``(features, investment_ids)`` for the batch, passed
            to the model as a single input.
        labels: Target values for the batch.

    Returns:
        The batch loss tensor, so eager callers can record it if they want a
        per-step history.
    """
    with tf.GradientTape() as tape:
        predictions = model(investment) # investment[0] - X_train : investment[1] - investment_id_train
        loss_value = LOSS_FN(labels, predictions)
        # NOTE(review): the model's Dense layers declare kernel_regularizer="l2",
        # but those penalties (exposed via model.losses) are not added to the
        # loss here -- confirm whether that is intended.

    grads = tape.gradient(loss_value, model.trainable_variables)
    OPTIMIZER.apply_gradients(zip(grads, model.trainable_variables))

    # Update the running mean exactly once per step. Appending to a Python
    # list inside a tf.function only runs while the function is traced and
    # stores a symbolic tensor, so no history is recorded here.
    TRAIN_LOSS(loss_value)
    return loss_value

In [None]:
@tf.function
def test_step(investment, labels):
    """Evaluate one validation batch and update the TEST_LOSS metric.

    Args:
        investment: Pair ``(features, investment_ids)`` for the batch, passed
            to the model as a single input.
        labels: Target values for the batch.

    Returns:
        The batch loss tensor, so eager callers can record it if they want a
        per-step history.
    """
    predictions = model(investment)
    test_loss_value = LOSS_FN(labels, predictions)

    # Update the running mean exactly once per step. Appending to a Python
    # list inside a tf.function only runs while the function is traced and
    # stores a symbolic tensor, so no history is recorded here.
    TEST_LOSS(test_loss_value)
    return test_loss_value

In [None]:
# K-fold training loop, stratified on investment_id.
# NOTE(review): `model` and OPTIMIZER are created once outside this loop, so
# every fold keeps training the same weights and later folds validate on rows
# the model already trained on in earlier folds. Re-create the model per fold
# if independent fold scores are needed.
for num_fold, (train_index, valid_index) in enumerate(skfolds.split(train, investment_id)):
    print('num_fold:', num_fold+1)

    # Release the previous fold's datasets before building the new ones.
    if num_fold > 0:
        del train_dataset
        del test_dataset

    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    Y_train, Y_valid = target.iloc[train_index], target.iloc[valid_index]
    print(X_train.shape, Y_train.shape)
    print(X_valid.shape, Y_valid.shape)
    # split() yields row positions, so index positionally (as for train and
    # target above) instead of relying on the Series index labels.
    investment_id_train = investment_id.iloc[train_index]
    investment_id_val = investment_id.iloc[valid_index]
    print(investment_id_train.shape, investment_id_val.shape)

    train_dataset = tf.data.Dataset.from_tensor_slices(
        ((X_train, investment_id_train), Y_train))
    train_dataset = train_dataset.shuffle(4096).batch(BATCH)

    # Validation data is only averaged over, so it does not need shuffling.
    test_dataset = tf.data.Dataset.from_tensor_slices(
        ((X_valid, investment_id_val), Y_valid))
    test_dataset = test_dataset.batch(BATCH)

    # Drop the per-fold pandas slices once they are no longer referenced here.
    del X_train, Y_train, X_valid, Y_valid
    del train_index, valid_index

    for epoch in notebook.tqdm(range(EPOCHS)):

        # Metrics are running means; restart them for each epoch.
        TRAIN_LOSS.reset_states()
        TEST_LOSS.reset_states()

        for investment, labels in train_dataset:
            train_step(investment, labels)

        for investment, labels in test_dataset:
            test_step(investment, labels)

        print(
        f'Epoch {epoch + 1}, '
        f'Loss: {TRAIN_LOSS.result()}, '
        f'Test Loss: {TEST_LOSS.result()}')

In [None]:
# Drop the references to the last fold's datasets before inference.
del train_dataset
del test_dataset

In [None]:
def preprocess_test(feature, investment_id):
    """Pair one test row's inputs with a dummy label.

    Returns:
        ``((feature, investment_id), 0)`` -- the model's input pair followed
        by a constant 0 in the label position.
    """
    model_inputs = (feature, investment_id)
    dummy_label = 0
    return model_inputs, dummy_label


def make_test_dataset(feature, in_id,  batch_size=512):
    """Build a batched tf.data dataset for inference.

    Args:
        feature: Feature rows, sliced along the first axis.
        in_id: Investment ids aligned row-for-row with ``feature``.
        batch_size: Number of rows per batch (default 512).

    Returns:
        A dataset whose elements are ``((feature, investment_id), 0)``
        batches, as produced by ``preprocess_test``.
    """
    ds = tf.data.Dataset.from_tensor_slices((feature, in_id))
    ds = ds.map(preprocess_test)
    # Use the caller's batch size rather than the global BATCH constant.
    ds = ds.batch(batch_size)
    return ds

In [None]:
import ubiquant
# Competition inference API: iter_test is consumed by the prediction loop,
# which unpacks (test_df, sample_prediction_df) pairs from it.
env = ubiquant.make_env()
iter_test = env.iter_test() 

In [None]:
# Predict each test batch served by the competition API and submit it.
for (test_df, sample_prediction_df) in iter_test:
    print("shape", test_df.shape)
    ds = make_test_dataset(test_df[features], test_df["investment_id"])

    preds = model.predict(ds)
    # The model ends in Dense(1), so predictions have a single column; take
    # that column so `target` is assigned a flat vector.
    sample_prediction_df['target'] = preds[:, 0]
    env.predict(sample_prediction_df)