In [None]:
import datatable as dt
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SequentialSampler
from torch.nn import CrossEntropyLoss
from sklearn.metrics import log_loss, accuracy_score

In [None]:
# Read the competition training CSV with datatable, then hand it to pandas.
# The intermediate datatable Frame is never bound to a name, so nothing has
# to be deleted afterwards.
print('Loading data...')

df = dt.fread('/kaggle/input/jane-street-market-prediction/train.csv').to_pandas()

display(df)

In [None]:
def utility_score_bincount(date, weight, resp, action):
    '''
    Compute the competition utility score for a set of trading decisions.

    Credits to Kaggle user Lindada for this implementation of the utility score for the competition.

    Parameters
    ----------
    date : 1-D array of non-negative integers used as the bin index for np.bincount
    weight, resp, action : 1-D arrays with the same length as ``date``

    Returns
    -------
    float
        ``clip(t, 0, 6) * sum(Pi)`` where ``Pi`` is the per-date sum of
        ``weight * resp * action`` and
        ``t = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / n_dates)``.
        Returns 0.0 when there are no rows or when every ``Pi`` is zero
        (for example, when no trade is taken), instead of NaN.
    '''

    count_i = len(np.unique(date))
    if count_i == 0:
        # No rows at all: 250 / count_i would divide by zero
        return 0.0

    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # Every daily P&L is zero, so t would be 0 / 0 (NaN); the utility is zero
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

Preprocessing data

In [None]:
def preprocessing(df, split_date=450, end_date=500, valid_path='/kaggle/working/val.csv'):
    '''
    Build the training and validation frames from the raw data.

    The input frame is left untouched: missing values are filled on a copy and
    the ``action`` label is added to that copy.

    Parameters
    ----------
    df : DataFrame with at least ``date`` and ``resp`` columns
    split_date : rows with ``date < split_date`` go to the training set
    end_date : rows with ``split_date <= date < end_date`` go to the validation set
    valid_path : where the validation set is written as CSV; pass None to skip saving

    Returns
    -------
    (train, valid) : two DataFrames with a fresh 0..n-1 index
    '''

    # It has been observed that training only on rows with date > 85 works better:
    # train = train.loc[train.date > 85].reset_index(drop=True)

    # NaN values: fill with the column mean. fillna returns a new frame, so the
    # caller's df is not modified.
    # NOTE(review): the means are computed over all rows, validation included.
    filled = df.fillna(df.mean())

    # Add the action column based on resp (taken before filling, so a missing
    # resp still yields action 0)
    filled['action'] = (df['resp'] > 0).astype('int')

    # Split the training and validation data; by default the last 50 dates are
    # kept for validation
    in_valid = (filled.date >= split_date) & (filled.date < end_date)
    in_train = filled.date < split_date
    valid = filled.loc[in_valid].reset_index(drop=True)
    train = filled.loc[in_train].reset_index(drop=True)

    # Save validation set for testing
    if valid_path is not None:
        valid.to_csv(valid_path)

    return train, valid
    

Construct dataset for PyTorch training

In [None]:
class Dataset:
    """
    Indexable wrapper around a frame holding the columns ``feature_0`` ..
    ``feature_129`` and ``action``.

    Each item is a dict with a float tensor under 'features' (the feature
    values of the selected row) and a float tensor under 'label' (its action).
    """

    def __init__(self, data):
        column_names = ['feature_%d' % k for k in range(130)]
        self.features = data[column_names].values
        self.label = data['action'].values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        row = torch.tensor(self.features[idx], dtype=torch.float)
        target = torch.tensor(self.label[idx], dtype=torch.float)
        return {'features': row, 'label': target}

Define the neural network I use.

In [None]:
class Model(nn.Module):
    """
    Feed-forward binary classifier.

    Column 0 of the input is treated as the single binary variable and skips
    the hidden layers; the remaining 129 columns pass through three
    Linear -> LeakyReLU -> Dropout -> BatchNorm1d stages (256, 128, 64 units).
    The binary column is then appended to the 64 hidden outputs and a final
    Linear(65, 1) followed by a sigmoid gives one probability per row.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 129 -> 256
        self.linear1 = nn.Linear(129, 256)
        self.dropout1 = nn.Dropout(p=0.1)
        self.BatchNorm1d_1 = nn.BatchNorm1d(256)

        # Stage 2: 256 -> 128
        self.linear2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(p=0.2)
        self.BatchNorm1d_2 = nn.BatchNorm1d(128)

        # Stage 3: 128 -> 64
        self.linear3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(p=0.1)
        self.BatchNorm1d_3 = nn.BatchNorm1d(64)

        # Output head: 64 hidden units plus the binary column
        self.linear4 = nn.Linear(65, 1)

        self.sigmoid = nn.Sigmoid()

        self.LeakyReLU = nn.LeakyReLU(inplace=True)

    def forward(self, x):
        # Keep the only binary variable aside as an (N, 1) column
        binary_col = x[:, :1]
        hidden = x[:, 1:]

        stages = (
            (self.linear1, self.dropout1, self.BatchNorm1d_1),
            (self.linear2, self.dropout2, self.BatchNorm1d_2),
            (self.linear3, self.dropout3, self.BatchNorm1d_3),
        )
        for dense, drop, norm in stages:
            hidden = norm(drop(self.LeakyReLU(dense(hidden))))

        # Combine the binary variable back in as the last input of the head
        merged = torch.cat((hidden, binary_col), dim=1)
        return self.sigmoid(self.linear4(merged))

In [None]:
def train_one_epoch(model, optimizer, dataloader, loss_fn, device):
    '''
    Train for one epoch and return the mean training loss.

    Parameters
    ----------
    model : module called on each batch's 'features'
    optimizer : optimizer stepping the model's parameters
    dataloader : sized iterable of dict batches with 'features' and 'label' tensors
    loss_fn : callable taking (prediction, target), both flattened to 1-D
    device : device the batch tensors are moved to

    Returns
    -------
    float : sum of the per-batch losses divided by ``len(dataloader)``
    '''

    model.train()
    total_loss = 0.0

    for batch in dataloader:
        optimizer.zero_grad()
        x = batch['features'].to(device)
        y = batch['label'].to(device)

        output = model(x)
        # Flatten both sides to 1-D so the shapes match for any batch size
        loss = loss_fn(output.reshape(-1), y.reshape(-1))
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float; adding the tensor itself would keep
        # autograd history alive across the whole epoch
        total_loss += loss.item()

    return total_loss / len(dataloader)


def validation(model, dataloader, device, valid):
    """
    Function for validation at the end of each epoch.

    Parameters
    ----------
    model : module producing one probability per row
    dataloader : iterable of dict batches with 'features' and 'label' tensors,
        in the same row order as ``valid``
    device : device the batch tensors are moved to
    valid : frame with ``date``, ``weight`` and ``resp`` columns for the same rows

    Returns
    -------
    (logloss, accuracy, utility)
    """

    model.eval()
    prediction = []
    ground_truth = []

    with torch.no_grad():
        for batch in dataloader:
            x = batch['features'].to(device)
            y = batch['label'].to(device)
            output = model(x)
            # reshape(-1) always gives a 1-D array; squeeze() would give a 0-d
            # array for a one-sample batch, which np.concatenate rejects
            prediction.append(output.reshape(-1).cpu().numpy())
            ground_truth.append(y.reshape(-1).cpu().numpy())

    prediction = np.concatenate(prediction)
    ground_truth = np.concatenate(ground_truth)

    logloss = log_loss(ground_truth, prediction) # Calculate log loss of the predicted probability

    pred_label = np.where(prediction >= 0.5, 1, 0).astype(int) # Assign label based on predicted probability

    accuracy = accuracy_score(ground_truth, pred_label) # Calculate accuracy based on predicted label

    # Calculate utility score (cumulative returns)
    utility = utility_score_bincount(date=valid.date.values, weight=valid.weight.values,
                                     resp=valid.resp.values, action=pred_label)

    return logloss, accuracy, utility
    

    
def train(df, n_epochs):
    '''
    Function for training.

    Splits ``df`` with ``preprocessing``, trains a fresh ``Model`` for
    ``n_epochs`` epochs with SGD and binary cross entropy, prints the training
    loss and validation metrics after every epoch, and saves the weights to
    '/kaggle/working/model.pt'.

    Returns the validation Dataset so it can be reused for backtesting.
    '''

    train_df, valid = preprocessing(df)

    train_set = Dataset(train_df)
    val_set = Dataset(valid)

    train_loader = DataLoader(train_set, batch_size=4096, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_set, batch_size=4096, shuffle=False, num_workers=4)

    # Use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model()
    model.to(device)

    #optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
    # Use SGD because it empirically performs better according to my experiments
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
    loss_fn = nn.BCELoss() # Binary cross entropy loss

    for epoch in range(n_epochs):
        epoch_loss = train_one_epoch(model, optimizer, train_loader, loss_fn, device)
        logloss, accuracy, utility = validation(model, val_loader, device, valid)
        print(f"EPOCH: {epoch}")
        print(f"Training loss: {epoch_loss: .5f}  " 
               f"Validation logloss: {logloss: .3f}  "
                f"Validation accuracy: {accuracy: .2f}  "
                 f"Utility score: {utility: .2f}")
        print('\n')

    torch.save(model.state_dict(), '/kaggle/working/model.pt')
    print('Training finished, model saved!')

    backtest_set = val_set

    return backtest_set # Return the validation dataset as the backtest dataset
    

Training

In [None]:
# Train for 8 epochs; train() returns the validation Dataset, kept here for backtesting.
backtest_set = train(df, n_epochs=8)

Define a function to run inference: it predicts the labels for the validation dataset sequentially, in the original row order.

In [None]:
def inference(backtest_data, model_path, batch_size=1):
    '''
    Predict 0/1 labels for every item of ``backtest_data``, in order.

    Parameters
    ----------
    backtest_data : indexable dataset whose items are dicts with a 'features' tensor
    model_path : path of a state dict saved from ``Model``
    batch_size : number of rows per forward pass; the default of 1 feeds the
        rows one at a time

    Returns
    -------
    1-D integer array with one label per item (1 where the predicted
    probability is >= 0.5, else 0); empty when the dataset is empty.
    '''
    # Use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model()
    # map_location lets weights saved on a GPU load on a CPU-only machine
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    inf_sampler = SequentialSampler(backtest_data)
    inf_loader = DataLoader(backtest_data, sampler=inf_sampler, batch_size=batch_size)

    prediction = []

    with torch.no_grad():
        for data in inf_loader:
            x = data['features'].to(device)
            output = model(x)
            # reshape(-1) keeps every chunk 1-D whatever the batch size
            prediction.append(output.reshape(-1).cpu().numpy())

    if not prediction:
        return np.zeros(0, dtype=int)

    prediction = np.concatenate(prediction)
    pred_label = np.where(prediction >= 0.5, 1, 0).astype(int)

    return pred_label

In [None]:
# Predict labels for the backtest set using the weights stored at the given path.
pred_label = inference(backtest_set, '/kaggle/working/model.pt')

Save the results for backtesting

In [None]:
# Write the predicted labels to a text file in the Kaggle working directory.
np.savetxt('/kaggle/working/prediction_NN.csv', pred_label, delimiter=',')