In [1]:
import numpy as np 
import pandas as pd 
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

from tqdm import tqdm
from google.colab import drive 
drive.mount('/content/drive/')
import os
os.chdir("/content/drive/My Drive/jane_street")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim.lr_scheduler import ReduceLROnPlateau
from maf import MAF, RealNVP
from sklearn.metrics import  precision_score, recall_score, f1_score
from IPython import embed
from sklearn.preprocessing import MinMaxScaler
import statistics 

LATENT_DIM = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = MinMaxScaler()

## 2. Preprocessing

In [3]:
class CustomDataset:
    def __init__(self, dataset, target):
        self.dataset = dataset
        self.target = target

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, item):
        return {
            'x': torch.tensor(self.dataset[item, :], dtype=torch.float),
            'y': torch.tensor(self.target[item, :], dtype=torch.long)
        }

def load_data(PATH):
    dt = pd.read_csv(PATH)
    dt = pd.DataFrame(dt)
    dt['action'] = (dt['resp'] > 0).astype('int')
    dt.drop(columns=['resp', 'date', 'ts_id'], inplace=True)
    
    return dt

data = load_data('train.csv')
data.fillna(-1, inplace=True)
target_column = 'action'
feature_columns = data.columns[~data.columns.isin([target_column])]

random_seed = 1
learning_rate = 0.1
num_epochs = 1
batch_size = 2048
num_features = len(feature_columns)
num_hidden_1 = 128
num_hidden_2 = 64
num_classes = 2


# data = scaler.fit_transform(data)


train, validation = data[:int(len(data) * 0.75)], data[int(len(data) * 0.75):]
train_data, train_target = train[feature_columns], train[[target_column]]
validation_data, validation_target = validation[feature_columns], validation[[target_column]]
train_dataset = CustomDataset(dataset=train_data.values, target=train_target.values)
validation_dataset = CustomDataset(dataset=validation_data.values, target=validation_target.values)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

## 3. AutoEncoder
THX for sharing [this great work](https://www.kaggle.com/snippsy/bottleneck-encoder-mlp-keras-tuner)

In [15]:
def softclip(tensor, min):
    """ Clips the tensor values at the minimum value min in a softway. Taken from Handful of Trials """
    result_tensor = min + F.softplus(tensor - min)
    return result_tensor

class CNN_sigmaVAE(nn.Module):

    def __init__(self,latent_dim=LATENT_DIM, window_size=20, use_probabilistic_decoder=False):
        super(CNN_sigmaVAE, self).__init__()
        
        self.window_size=window_size
        self.latent_dim = latent_dim
        self.prob_decoder = use_probabilistic_decoder
        
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=5, stride=1, padding=0)
        self.bn1 = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=5, stride=1, padding=0)
        self.bn2 = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=4, kernel_size=5, stride=1, padding=0)
        self.bn3 = nn.BatchNorm1d(4)
        
        
        self.fc41 = nn.Linear(4*123, self.latent_dim)
        self.fc42 = nn.Linear(4*123, self.latent_dim)

        self.defc1 = nn.Linear(self.latent_dim, 4*123)
        
        self.deconv1 = nn.ConvTranspose1d(in_channels=4, out_channels=16, kernel_size=5, stride=1, padding=0, output_padding=0)
        self.debn1 = nn.BatchNorm1d(16)
        self.deconv2 = nn.ConvTranspose1d(in_channels=16, out_channels=8, kernel_size=5, stride=1, padding=0, output_padding=0)
        self.debn2 = nn.BatchNorm1d(8)
        self.deconv3 = nn.ConvTranspose1d(in_channels=8, out_channels=1, kernel_size=5, stride=1, padding=0, output_padding=0)

        self.log_sigma = 0
        self.log_sigma = torch.nn.Parameter(torch.full((1,), 0.0)[0], requires_grad=True)
        
        
        self.decoder_fc41 = nn.Linear(self.window_size, self.window_size)
        self.decoder_fc42 = nn.Linear(self.window_size, self.window_size)
        
        self.decoder_fc43 = nn.Linear(self.window_size, self.window_size)
        self.decoder_fc44 = nn.Linear(self.window_size, self.window_size)

        self.flow = MAF(n_blocks=1, input_size=2, cond_label_size=latent_dim, hidden_size=50, n_hidden=1)
        
    def encoder(self, x):
        concat_input = x #torch.cat([x, c], 1)
        h = self.bn1(F.relu(self.conv1(concat_input)))
        h = self.bn2(F.relu(self.conv2(h)))
        h = self.bn3(F.relu(self.conv3(h)))
        
        self.saved_dim = [h.size(1), h.size(2)]
        
        h = h.view(h.size(0), h.size(1) * h.size(2))
        # from IPython import embed
        # embed()
        return self.fc41(h), self.fc42(h)
    
    
    def sampling(self, mu, log_var):
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        return eps.mul(std).add(mu) # return z sample
    
    def decoder(self, z):
        concat_input = z #torch.cat([z, c], 1)
        concat_input = self.defc1(concat_input)
        concat_input = concat_input.view(concat_input.size(0), self.saved_dim[0], self.saved_dim[1])

        h = self.debn1(F.relu(self.deconv1(concat_input)))
        h = self.debn2(F.relu(self.deconv2(h)))     
        out = torch.sigmoid(self.deconv3(h))
        
        if self.prob_decoder:
            rec_mu = self.decoder_fc43(out).tanh()
            rec_sigma = self.decoder_fc44(out).tanh()
            return out, rec_mu, rec_sigma
        
        else:
            return out, 0, 0
    
    def forward(self, x):

        mu, log_var = self.encoder(x)
        z = self.sampling(mu, log_var)
        output, rec_mu, rec_sigma = self.decoder(z)

        kl_div = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        
        return output, rec_mu, rec_sigma, kl_div


    def gaussian_nll(self, mu, log_sigma, x):
        return 0.5 * torch.pow((x - mu) / log_sigma.exp(), 2) + log_sigma + 0.5 * np.log(2 * np.pi)

    
    def reconstruction_loss(self, x_hat, x):

        log_sigma = self.log_sigma
        log_sigma = softclip(log_sigma, -6)
        
        rec_comps = self.gaussian_nll(x_hat, log_sigma, x)
        rec = rec_comps.sum()

        return rec_comps, rec

    
    def loss_function(self, recon_x, x, rec_mu, rec_sigma, kl):
        
        rec_comps, rec = self.reconstruction_loss(recon_x, x)
        #kl = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        rec_mu_sigma_loss = 0
        if self.prob_decoder:
            rec_mu_sigma_loss = self.gaussian_nll(rec_mu, rec_sigma, x).sum()
        
        return rec_comps, rec, rec_mu_sigma_loss, kl


def train_flow_model(model, num_epochs, learning_rate, dataloader):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    epochs=num_epochs
    tq = tqdm(range(epochs))

    torch.manual_seed(random_seed)
    for epoch in range(num_epochs):
        model.train()
        for iteration, batch in enumerate(dataloader):
            inputs = batch['x'].to(device)
            labels = batch['y'].to(device)
            labels = torch.squeeze(labels)

            inputs = inputs.unsqueeze(1)

            optimizer.zero_grad()

            mu, _ = model.encoder(inputs)
            labels = labels.unsqueeze(1).repeat(1, 2).float()
            zk, loss = model.flow.log_prob(x=labels, y=mu)

            # VAE Training
            # outputs, rec_mu, rec_sigma, kl = model(inputs)
            # _, rec, _, kl = model.loss_function(outputs, inputs, rec_mu, rec_sigma, kl)

            loss = -loss.mean()

            if(np.isnan(loss.item())):
                print("Noped out at", epoch, j, kl, rec_comps)

            loss.backward()
            optimizer.step()

        print(epoch, 'total :' + str(loss.item()))

        torch.save(model, 'bitch_street_VAEflow.pth')
        torch.save(model.state_dict(), 'bitch_street_VAEflow_state_dict.pth')

        #break
    return model

def test_flow_model(model, dataloader):

    torch.manual_seed(random_seed)
    model.eval()

    precision = []
    recall = []
    f1 = []

    for iteration, batch in enumerate(dataloader):
      inputs = batch['x'].to(device)
      labels = batch['y'].to(device)
      labels = torch.squeeze(labels)
      # inputs = torch.from_numpy(validation_data.to_numpy()).float().to(device)
      # labels = torch.from_numpy(validation_target.to_numpy()).float().to(device)

      if inputs.shape[0] < batch_size:
        break

      labels = torch.squeeze(labels)
      inputs = inputs.unsqueeze(1)

      mu, _ = model.encoder(inputs)
      zero_test = torch.zeros([batch_size, 2]).to(device)
      ones_test = torch.ones_like(zero_test).to(device)

      _, zero_log_prob = model.flow.log_prob(x=zero_test, y=mu)
      _, one_log_prob = model.flow.log_prob(x=ones_test, y=mu)

      z = torch.zeros([batch_size, 2]).cpu()
      norm_zeros = scaler.fit_transform(zero_log_prob.cpu().detach().numpy().reshape(-1, 1))
      norm_ones = scaler.fit_transform(one_log_prob.cpu().detach().numpy().reshape(-1, 1))

      norm_zeros = torch.from_numpy(norm_zeros).squeeze(1)
      norm_ones = torch.from_numpy(norm_ones).squeeze(1)
      z[:, 0] = norm_zeros
      z[:, 1] = norm_ones

      preds = torch.argmax(z, dim=1)
      labels = labels.cpu()
      precision.append(precision_score(labels, preds))
      recall.append(recall_score(labels, preds))
      f1.append(f1_score(labels, preds))
    print('iter: ' + str(iteration), 'precision : ' + str(statistics.mean(precision)) + ' recall : ' + str(statistics.mean(recall)) + ' f1 : ' + str(statistics.mean(f1)))


### 3-1  AutoEncoder Training

In [16]:
latent_dim=8
model = CNN_sigmaVAE(latent_dim=latent_dim)
model.to(device)
model.cuda() if torch.cuda.is_available() else model.cpu()

model = torch.load('bitch_street_VAEflow.pth')
# model = train_flow_model(model, 1000, .001, train_loader)
test_flow_model(model, validation_loader)

iter: 291 precision : 0.49791471534798704 recall : 0.5631578941185342 f1 : 0.5224703074519649


## Submission

In [None]:
import janestreet
env = janestreet.make_env()
env_iter = env.iter_test()

In [None]:
opt_th = 0.5

if not train_mode:
    for (test_df, pred_df) in env_iter:
        
        if test_df['weight'].item() > 0:
            test_df = test_df.loc[:, features].values
            if np.isnan(test_df[:, 1:].sum()):
                test_df[:, 1:] = np.nan_to_num(test_df[:, 1:]) + np.isnan(test_df[:, 1:]) * f_mean

            pred_vector = np.mean([fit(model, test_df) for model in models],axis=0)
            pred = np.mean(pred_vector)
            pred_df.action = (pred_vector > opt_th).astype(int) 
            

        else:
            pred_df.action = 0
        env.predict(pred_df)