<a href="https://colab.research.google.com/github/ruygonzalez/CS155KaggleCompetition1/blob/master/NeuralNetworkForOrderbookPredictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.utils.data as data_utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Import and Process the Data
First, we import the data into dataframes and change the data a little bit.

In [0]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [0]:
def process_dataframe(df):
  '''Given the training or test data frame, process it to
     remove the NaN values'''

  # Add new columns!
  df["bid_ask_spread1"] = df["bid1"].subtract(df["ask1"])
  df["bid_ask_spread2"] = df["bid2"].subtract(df["ask2"])
  df["bid_ask_spread3"] = df["bid3"].subtract(df["ask3"])
  df["bid_ask_spread4"] = df["bid4"].subtract(df["ask4"])
  df["bid_ask_spread5"] = df["bid5"].subtract(df["ask5"])

  df["bid_ask_div1"] = df["bid1"].div(df["ask1"])
  df["bid_ask_div2"] = df["bid2"].div(df["ask2"])
  df["bid_ask_div3"] = df["bid3"].div(df["ask3"])
  df["bid_ask_div4"] = df["bid4"].div(df["ask4"])
  df["bid_ask_div5"] = df["bid5"].div(df["ask5"])

  df["mid_price2"] = (df["bid2"].add(df["ask2"])).div(2)
  df["mid_price3"] = (df["bid3"].add(df["ask3"])).div(2)
  df["mid_price4"] = (df["bid4"].add(df["ask4"])).div(2)
  df["mid_price5"] = (df["bid5"].add(df["ask5"])).div(2)

  df["last_div_mid"] = df["last_price"].div(df["mid"])
  df["last_div_mid2"] = df["last_price"].div(df["mid_price2"])
  df["last_div_mid3"] = df["last_price"].div(df["mid_price3"])
  df["last_div_mid4"] = df["last_price"].div(df["mid_price4"])
  df["last_div_mid5"] = df["last_price"].div(df["mid_price5"])

  df["transact_div_mid"] = df["transacted_qty"].div(df["mid"])
  df["transact_div_last"] = df["transacted_qty"].div(df["last_price"])

  df["bid1_vs_last"] = df["bid1"].div(df["last_price"])
  df["bid2_vs_last"] = df["bid2"].div(df["last_price"])
  df["bid1vol_vs_last"] = df["bid1vol"].div(df["last_price"])
  df["bid2vol_vs_last"] = df["bid2vol"].div(df["last_price"])

  df["bid3_vs_last"] = df["bid3"].div(df["last_price"])
  df["bid4_vs_last"] = df["bid4"].div(df["last_price"])
  df["bid3vol_vs_last"] = df["bid3vol"].div(df["last_price"])
  df["bid4vol_vs_last"] = df["bid4vol"].div(df["last_price"])
  df["bid5_vs_last"] = df["bid5"].div(df["last_price"])
  df["bid5vol_vs_last"] = df["bid5vol"].div(df["last_price"])

  df["ask1_vs_last"] = df["ask1"].div(df["last_price"])
  df["ask2_vs_last"] = df["ask2"].div(df["last_price"])
  df["ask1vol_vs_last"] = df["ask1vol"].div(df["last_price"])
  df["ask2vol_vs_last"] = df["ask2vol"].div(df["last_price"])

  df["ask3_vs_last"] = df["ask3"].div(df["last_price"])
  df["ask4_vs_last"] = df["ask4"].div(df["last_price"])
  df["ask3vol_vs_last"] = df["ask3vol"].div(df["last_price"])
  df["ask4vol_vs_last"] = df["ask4vol"].div(df["last_price"])
  df["ask5_vs_last"] = df["ask5"].div(df["last_price"])
  df["ask5vol_vs_last"] = df["ask5vol"].div(df["last_price"])

  df["bid1_vs_mid"] = df["bid1"].div(df["mid"])
  df["bid2_vs_mid"] = df["bid2"].div(df["mid"])
  df["bid1vol_vs_mid"] = df["bid1vol"].div(df["mid"])
  df["bid2vol_vs_mid"] = df["bid2vol"].div(df["mid"])

  df["bid3_vs_mid"] = df["bid3"].div(df["mid"])
  df["bid4_vs_mid"] = df["bid4"].div(df["mid"])
  df["bid3vol_vs_mid"] = df["bid3vol"].div(df["mid"])
  df["bid4vol_vs_mid"] = df["bid4vol"].div(df["mid"])
  df["bid5_vs_mid"] = df["bid5"].div(df["mid"])
  df["bid5vol_vs_mid"] = df["bid5vol"].div(df["mid"])

  df["ask1_vs_mid"] = df["ask1"].div(df["mid"])
  df["ask2_vs_mid"] = df["ask2"].div(df["mid"])
  df["ask1vol_vs_mid"] = df["ask1vol"].div(df["mid"])
  df["ask2vol_vs_mid"] = df["ask2vol"].div(df["mid"])

  df["ask3_vs_mid"] = df["ask3"].div(df["mid"])
  df["ask4_vs_mid"] = df["ask4"].div(df["mid"])
  df["ask3vol_vs_mid"] = df["ask3vol"].div(df["mid"])
  df["ask4vol_vs_mid"] = df["ask4vol"].div(df["mid"])
  df["ask5_vs_mid"] = df["ask5"].div(df["mid"])
  df["ask5vol_vs_mid"] = df["ask5vol"].div(df["mid"])

  df["transact_div_bid1"] = df["transacted_qty"].div(df["bid1"])
  df["transact_div_bid1vol"] = df["transacted_qty"].div(df["bid1vol"])
  df["transact_div_bid2"] = df["transacted_qty"].div(df["bid2"])
  df["transact_div_bid2vol"] = df["transacted_qty"].div(df["bid2vol"])
  df["transact_div_bid3"] = df["transacted_qty"].div(df["bid3"])
  df["transact_div_bid3vol"] = df["transacted_qty"].div(df["bid3vol"])
  df["transact_div_bid4"] = df["transacted_qty"].div(df["bid4"])
  df["transact_div_bid4vol"] = df["transacted_qty"].div(df["bid4vol"])
  df["transact_div_bid5"] = df["transacted_qty"].div(df["bid5"])
  df["transact_div_bid5vol"] = df["transacted_qty"].div(df["bid5vol"])

  df["transact_div_ask1"] = df["transacted_qty"].div(df["ask1"])
  df["transact_div_ask1vol"] = df["transacted_qty"].div(df["ask1vol"])
  df["transact_div_ask2"] = df["transacted_qty"].div(df["ask2"])
  df["transact_div_ask2vol"] = df["transacted_qty"].div(df["ask2vol"])
  df["transact_div_ask3"] = df["transacted_qty"].div(df["ask3"])
  df["transact_div_ask3vol"] = df["transacted_qty"].div(df["ask3vol"])
  df["transact_div_ask4"] = df["transacted_qty"].div(df["ask4"])
  df["transact_div_ask4vol"] = df["transacted_qty"].div(df["ask4vol"])
  df["transact_div_ask5"] = df["transacted_qty"].div(df["ask5"])
  df["transact_div_ask5vol"] = df["transacted_qty"].div(df["ask5vol"])

  df["last_vs_mid"] = df["last_price"].div(df["mid"])

  df["sum_of_asks"] = df["ask1"].add(df["ask2"]).add(df["ask3"]).add(df["ask4"]).add(df["ask5"])
  df["sum_of_bids"] = df["bid1"].add(df["bid2"]).add(df["bid3"]).add(df["bid4"]).add(df["bid5"])
  df["total_volume_bids"] = df["bid1vol"].add(df["bid2vol"]).add(df["bid3vol"]).add(df["bid4vol"]).add(df["bid5vol"])
  df["total_volume_asks"] = df["ask1vol"].add(df["ask2vol"]).add(df["ask3vol"]).add(df["ask4vol"]).add(df["ask5vol"])

  df["total_ask_bid_difference"] = df["sum_of_asks"].sub(df["sum_of_bids"])
  df["total_volume_ask_bid_difference"] = df["total_volume_asks"].sub(df["total_volume_bids"])

  df["price_diff_bid_12"] = df["bid1"].sub(df["bid2"])
  df["price_diff_bid_13"] = df["bid1"].sub(df["bid3"])
  df["price_diff_bid_14"] = df["bid1"].sub(df["bid4"])
  df["price_diff_bid_15"] = df["bid1"].sub(df["bid5"])

  df["price_diff_bid_23"] = df["bid2"].sub(df["bid3"])
  df["price_diff_bid_24"] = df["bid2"].sub(df["bid4"])
  df["price_diff_bid_25"] = df["bid2"].sub(df["bid5"])  
  df["price_diff_bid_34"] = df["bid3"].sub(df["bid4"])
  df["price_diff_bid_35"] = df["bid3"].sub(df["bid5"])   
  df["price_diff_bid_45"] = df["bid4"].sub(df["bid5"])

  df["price_diff_ask_12"] = df["ask2"].sub(df["ask1"])
  df["price_diff_ask_13"] = df["ask3"].sub(df["ask1"])
  df["price_diff_ask_14"] = df["ask4"].sub(df["ask1"])
  df["price_diff_ask_15"] = df["ask5"].sub(df["ask1"])

  df["price_diff_ask_23"] = df["ask3"].sub(df["ask2"])
  df["price_diff_ask_24"] = df["ask4"].sub(df["ask2"])
  df["price_diff_ask_25"] = df["ask5"].sub(df["ask2"])
  df["price_diff_ask_34"] = df["ask4"].sub(df["ask3"])
  df["price_diff_ask_35"] = df["ask5"].sub(df["ask3"])
  df["price_diff_ask_45"] = df["ask5"].sub(df["ask4"])

  df["open_interest_vs_mid"] = df["d_open_interest"].div(df["mid"])
  df["open_interest_vs_last_price"] = df["d_open_interest"].div(df["last_price"])
  df["open_interest_vs_ask1"] = df["d_open_interest"].div(df["ask1"])
  df["open_interest_vs_bid1"] = df["d_open_interest"].div(df["bid1"])
  df["open_interest_vs_ask1vol"] = df["d_open_interest"].div(df["ask1vol"])
  df["open_interest_vs_bid1vol"] = df["d_open_interest"].div(df["bid1vol"])
  
  df["open_interest_vs_ask2"] = df["d_open_interest"].div(df["ask2"])
  df["open_interest_vs_bid2"] = df["d_open_interest"].div(df["bid2"])
  df["open_interest_vs_ask2vol"] = df["d_open_interest"].div(df["ask2vol"])
  df["open_interest_vs_bid2vol"] = df["d_open_interest"].div(df["bid2vol"])

  df["open_interest_vs_ask3"] = df["d_open_interest"].div(df["ask3"])
  df["open_interest_vs_bid3"] = df["d_open_interest"].div(df["bid3"])
  df["open_interest_vs_ask3vol"] = df["d_open_interest"].div(df["ask3vol"])
  df["open_interest_vs_bid3vol"] = df["d_open_interest"].div(df["bid3vol"])

  df["open_interest_vs_ask4"] = df["d_open_interest"].div(df["ask4"])
  df["open_interest_vs_bid4"] = df["d_open_interest"].div(df["bid4"])
  df["open_interest_vs_ask4vol"] = df["d_open_interest"].div(df["ask4vol"])
  df["open_interest_vs_bid4vol"] = df["d_open_interest"].div(df["bid4vol"])

  df["open_interest_vs_ask5"] = df["d_open_interest"].div(df["ask5"])
  df["open_interest_vs_bid5"] = df["d_open_interest"].div(df["bid5"])
  df["open_interest_vs_ask5vol"] = df["d_open_interest"].div(df["ask5vol"])
  df["open_interest_vs_bid5vol"] = df["d_open_interest"].div(df["bid5vol"])

  # Remove columns
  df_new = df.drop(columns=['id'])
  # Replace NaN with 0
  df_new = df_new.fillna(0)

  return df_new

In [0]:
train_df_processed = process_dataframe(train_df)
test_df_processed = process_dataframe(test_df)

In [0]:
X = train_df_processed.drop(columns=['y'])
y = train_df_processed['y']

# Scale the X data
X_scaled = preprocessing.scale(X)
train_dataset = data_utils.TensorDataset(torch.tensor(X_scaled), torch.tensor(y))

test_x = test_df_processed
# Scale the X data
test_x_scaled = preprocessing.scale(test_x)
test_y = np.zeros(len(test_x))
actual_test_dataset = data_utils.TensorDataset(torch.tensor(test_x_scaled), torch.tensor(test_y))

In [0]:
model = nn.Sequential(
    nn.Linear(156, 128),
    nn.ReLU(),
    nn.Dropout(0.01),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Dropout(0.01),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Dropout(0.01),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Dropout(0.01),
    nn.Linear(128, 2),
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

train_loader = data_utils.DataLoader(train_dataset, batch_size=128, shuffle=False)
actual_test_loader = data_utils.DataLoader(actual_test_dataset, batch_size=191859, shuffle=False)

for epoch in range(25):
    # First set to train
    model.train()
    print(f'Epoch {epoch+1}/10:')
    train_correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data.float())

        pred = output.argmax(dim=1, keepdim=True)
        train_correct += pred.eq(target.view_as(pred)).sum().item()

        # Calculate loss
        loss = loss_fn(output, target)

        # Backward pass
        loss.backward()
        
        # Weight update
        optimizer.step()


    print('Train Loss: %.4f' % loss.item())
    print('Train Accuracy: %d/%d (%.4f)' % (train_correct, 
                                            len(train_loader.dataset),
                                            100. * train_correct /
                                            len(train_loader.dataset)))

Epoch 1/10:
Train Loss: 0.6657
Train Accuracy: 392798/592380 (66.3085)
Epoch 2/10:
Train Loss: 0.6628
Train Accuracy: 394229/592380 (66.5500)
Epoch 3/10:
Train Loss: 0.6593
Train Accuracy: 394615/592380 (66.6152)
Epoch 4/10:
Train Loss: 0.6550
Train Accuracy: 394946/592380 (66.6711)
Epoch 5/10:
Train Loss: 0.6472
Train Accuracy: 395213/592380 (66.7161)
Epoch 6/10:
Train Loss: 0.6490
Train Accuracy: 395192/592380 (66.7126)
Epoch 7/10:
Train Loss: 0.6524
Train Accuracy: 395458/592380 (66.7575)
Epoch 8/10:
Train Loss: 0.6460
Train Accuracy: 395612/592380 (66.7835)
Epoch 9/10:
Train Loss: 0.6518
Train Accuracy: 395657/592380 (66.7911)
Epoch 10/10:
Train Loss: 0.6554
Train Accuracy: 395878/592380 (66.8284)
Epoch 11/10:
Train Loss: 0.6521
Train Accuracy: 396007/592380 (66.8502)
Epoch 12/10:
Train Loss: 0.6526
Train Accuracy: 396088/592380 (66.8638)
Epoch 13/10:
Train Loss: 0.6563
Train Accuracy: 396102/592380 (66.8662)
Epoch 14/10:
Train Loss: 0.6484
Train Accuracy: 395927/592380 (66.8367)
E

In [0]:
model.eval()

test_loss = 0
correct = 0
probabilities = []

with torch.no_grad():
  for data, y in actual_test_loader:
      output = model(data.float())
      probs = F.softmax(output, dim=1).tolist()
      probabilities += probs

probabilities[:5:]

[[0.4118609130382538, 0.5881391167640686],
 [0.7737354636192322, 0.22626450657844543],
 [0.5571276545524597, 0.4428722560405731],
 [0.5880458354949951, 0.4119541049003601],
 [0.7005999684333801, 0.2994000315666199]]

In [0]:
test_df['Predicted'] = np.array(probabilities)[:,1]
test_df[['id','Predicted']].to_csv("submission7.csv", index=False)