In [None]:
import os
import janestreet
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd
import numpy as np
import time
import torch
from torch.utils.data import Dataset, DataLoader

# Print the full path of every file found anywhere below the Kaggle input
# directory, in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
def load_data(PATH):
    """Read a CSV and convert it into a table with a binary ``action`` target.

    Args:
        PATH: anything ``pandas.read_csv`` accepts (path or file-like object).
            The CSV must contain the columns ``resp``, ``date`` and ``ts_id``.

    Returns:
        A DataFrame holding every original column except ``resp``, ``date``
        and ``ts_id``, plus a trailing integer column ``action`` that is 1
        where ``resp > 0`` and 0 otherwise.

    NOTE(review): only the three columns above are removed. Any other
    outcome-like column present in the CSV stays in the frame and will be
    treated as a feature downstream - confirm against the data schema.
    """
    frame = pd.read_csv(PATH)
    # Binarise the response: strictly positive -> 1, everything else -> 0.
    frame['action'] = frame['resp'].gt(0).astype('int')
    return frame.drop(columns=['resp', 'date', 'ts_id'])

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing feature rows with their class labels.

    Args:
        dataset: array indexed by row that exposes ``.shape`` (e.g. a 2-D
            NumPy array); row ``i`` holds the features of sample ``i``.
        target: labels aligned row-by-row with ``dataset``. Either 2-D
            (one row per sample) or 1-D (one scalar per sample).

    Raises:
        ValueError: if ``dataset`` and ``target`` differ in length.
    """

    def __init__(self, dataset, target):
        if len(dataset) != len(target):
            raise ValueError(
                'dataset and target must have the same number of rows '
                '(got %d and %d)' % (len(dataset), len(target))
            )
        self.dataset = dataset
        self.target = target

    def __len__(self):
        """Number of samples (rows of ``dataset``)."""
        return self.dataset.shape[0]

    def __getitem__(self, item):
        """Return sample ``item`` as ``{'x': float tensor, 'y': long tensor}``."""
        # Index along the first axis only: identical to ``[item, :]`` for 2-D
        # arrays, but also valid for 1-D targets, where ``[item, :]`` raises
        # IndexError.
        return {
            'x': torch.tensor(self.dataset[item], dtype=torch.float),
            'y': torch.tensor(self.target[item], dtype=torch.long),
        }

In [None]:
class MLP(torch.nn.Module):
    """Multilayer perceptron with two hidden layers and batch normalisation.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU. All linear weights
    are drawn from N(0, 0.1^2) and all linear biases start at zero.

    Args:
        number_features: size of each input vector.
        number_classes: number of output logits.
        num_hidden_1: units in the first hidden layer.
        num_hidden_2: units in the second hidden layer.
    """

    def __init__(self, number_features, number_classes,
                 num_hidden_1=128, num_hidden_2=64):
        super(MLP, self).__init__()

        # Layer sizes come from the constructor arguments, not from
        # module-level globals, so the class works wherever it is created.

        ### 1st hidden layer
        self.linear_1 = torch.nn.Linear(number_features, num_hidden_1)
        self._init_linear(self.linear_1)
        self.linear_1_bn = torch.nn.BatchNorm1d(num_hidden_1)

        ### 2nd hidden layer
        self.linear_2 = torch.nn.Linear(num_hidden_1, num_hidden_2)
        self._init_linear(self.linear_2)
        self.linear_2_bn = torch.nn.BatchNorm1d(num_hidden_2)

        ### Output layer
        self.linear_out = torch.nn.Linear(num_hidden_2, number_classes)
        self._init_linear(self.linear_out)

    @staticmethod
    def _init_linear(layer):
        """Re-initialise ``layer`` in place: weights ~ N(0, 0.1), bias = 0."""
        torch.nn.init.normal_(layer.weight, mean=0.0, std=0.1)
        torch.nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``(logits, probas)`` for a batch ``x``.

        ``probas`` is the softmax of ``logits`` over dimension 1.
        """
        out = self.linear_1(x)
        # note that batchnorm is in the classic
        # sense placed before the activation
        out = self.linear_1_bn(out)
        out = F.relu(out)

        out = self.linear_2(out)
        out = self.linear_2_bn(out)
        out = F.relu(out)

        logits = self.linear_out(out)
        probas = F.softmax(logits, dim=1)
        return logits, probas

In [None]:
def compute_accuracy(net, data_loader):
    """Return the classification accuracy of ``net`` on ``data_loader`` in percent.

    Args:
        net: a ``torch.nn.Module`` whose forward pass returns
            ``(logits, probas)``. It is switched to eval mode and left there.
        data_loader: iterable of batches, each a mapping with keys ``'x'``
            (features) and ``'y'`` (integer labels, any shape that flattens
            to one label per sample).

    Returns:
        A 0-dim float tensor: ``100 * correct / total``.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    net.eval()
    # Evaluate on the device the model lives on instead of relying on a
    # module-level ``device`` variable; fall back to CPU for a
    # parameter-free module.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Tensor accumulator so the return value is a tensor even when the
    # count is zero.
    correct_pred = torch.zeros((), dtype=torch.long, device=run_device)
    num_examples = 0
    with torch.no_grad():
        for batch in data_loader:
            features = batch['x'].to(run_device)
            # reshape(-1) always gives a 1-D label vector; squeeze() would
            # turn a batch of one into a 0-dim tensor and break size(0).
            targets = batch['y'].to(run_device).reshape(-1)
            logits, probas = net(features)
            _, predicted_labels = torch.max(probas, 1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()

    if num_examples == 0:
        raise ValueError('compute_accuracy received an empty data_loader')
    return correct_pred.float() / num_examples * 100

In [None]:
def training(model, optimizer, number_epochs):
    """Train ``model`` with cross-entropy loss and print progress.

    Reads the module-level names ``train_loader`` (batches with keys ``'x'``
    and ``'y'``), ``device`` and ``random_seed``, and calls
    ``compute_accuracy`` on the training data after every epoch.

    Args:
        model: module whose forward pass returns ``(logits, probas)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        number_epochs: number of full passes over ``train_loader``.
    """
    start_time = time.time()
    torch.manual_seed(random_seed)
    num_batches = len(train_loader)
    for epoch in range(number_epochs):
        model.train()
        for iteration, batch in enumerate(train_loader):
            features = batch['x'].to(device)
            # Flatten labels to shape (batch,). reshape(-1) keeps that shape
            # for a batch of one, where squeeze() would give a 0-dim tensor.
            targets = batch['y'].to(device).reshape(-1)

            # FORWARD AND BACK PROP
            logits, probas = model(features)
            cost = F.cross_entropy(logits, targets)
            optimizer.zero_grad()
            cost.backward()

            # UPDATE MODEL PARAMETERS
            optimizer.step()

            # LOGGING
            # Report against the ``number_epochs`` argument, not an unrelated
            # global, so the denominator always matches the loop.
            if not iteration % 50:
                print('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f'
                      % (epoch + 1, number_epochs, iteration, num_batches, cost.item()))

        with torch.no_grad():
            print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
                epoch + 1, number_epochs, compute_accuracy(model, train_loader)))

        print('Time elapsed: %.2f min' % ((time.time() - start_time) / 60))
    print('Total Training Time: %.2f min' % ((time.time() - start_time) / 60))

In [None]:
def predict(net, threshold=0.5):
    """Run ``net`` over the ``janestreet`` test iterator and submit actions.

    For every ``(test_df, pred_df)`` pair yielded by ``env.iter_test()`` the
    frame is restricted to the module-level ``feature_columns``, missing
    values are filled with -1, and the model's class-1 probability is turned
    into a 0/1 action that is passed to ``env.predict``.

    Reads the module-level names ``feature_columns`` and ``device``.
    ``feature_columns`` must include ``'weight'``.

    Args:
        net: module whose forward pass returns ``(logits, probas)``.
        threshold: minimum class-1 probability required to act.
    """
    net.eval()
    env = janestreet.make_env()
    env_iter = env.iter_test()
    # Inference only: skip autograd bookkeeping for the whole loop.
    with torch.no_grad():
        for (test_df, pred_df) in tqdm(env_iter):
            test_df = pd.DataFrame(test_df, columns=feature_columns)
            test_df.fillna(-1, inplace=True)
            X_test = torch.FloatTensor(np.array(test_df)).to(device)
            logits, probas = net(X_test)
            probas = probas.cpu().numpy()
            # Softmax outputs are positive, so ``weight * p > 0`` reduces to
            # ``weight > 0`` and ignores the model. Act only when the row has
            # positive weight AND the model favours class 1.
            has_weight = test_df['weight'].values > 0
            action = (has_weight & (probas[:, 1] > threshold)).astype('int')
            pred_df['action'] = action
            env.predict(pred_df)

In [None]:
if __name__ == "__main__":
    # --- Environment and data ---------------------------------------------
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_path = "/kaggle/input/jane-street-market-prediction/train.csv"
    data = load_data(train_path)
    # Missing values are encoded as -1, the same fill value predict() uses.
    data.fillna(-1, inplace=True)
    target_column = 'action'
    feature_columns = data.columns[~data.columns.isin([target_column])]

    # --- Hyperparameters ---------------------------------------------------
    random_seed = 1
    learning_rate = 0.1
    num_epochs = 1
    batch_size = 2048
    num_features = len(feature_columns)
    num_hidden_1 = 128
    num_hidden_2 = 64
    num_classes = 2

    # --- Train / validation split (first 75% of rows / last 25%) -----------
    split_at = int(len(data) * 0.75)
    train, validation = data[:split_at], data[split_at:]
    # Select targets with a list so both stay 2-D: CustomDataset indexes the
    # target as ``target[item, :]``, which fails on a 1-D array.
    train_data, train_target = train[feature_columns], train[[target_column]]
    validation_data, validation_target = validation[feature_columns], validation[[target_column]]
    train_dataset = CustomDataset(dataset=train_data.values, target=train_target.values)
    validation_dataset = CustomDataset(dataset=validation_data.values, target=validation_target.values)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    # NOTE(review): validation_loader is built but nothing below consumes it.
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    # --- Model --------------------------------------------------------------
    # Seed before constructing the model so the random weight initialisation
    # is reproducible; training() only seeds after the weights already exist.
    torch.manual_seed(random_seed)
    model_nn = MLP(number_features=num_features, number_classes=num_classes)
    model_nn = model_nn.to(device)
    optimizer_nn = torch.optim.SGD(model_nn.parameters(), lr=learning_rate)

    # --- Train, then submit predictions -------------------------------------
    training(model_nn, optimizer_nn, num_epochs)
    predict(model_nn)