In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load the training table and discard every row that contains a missing value.
df = pd.read_csv("/mnt/scratch/tairaeli/cse_dat/train.csv").dropna()

# Binary label: 0 where the target is negative, 1 otherwise
# (stored with the same dtype as the original target column).
target_is_negative = df["target"] < 0
df["target_change"] = np.where(target_is_negative, 0, 1).astype(df["target"].dtype)

# Remove the raw target and the row id, plus the stock/time/date id columns
# (dropping the id columns is an experiment).
df = df.drop(columns=["target", "row_id", "stock_id", "time_id", "date_id"])

df.head()

Unnamed: 0,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target_change
5730,300,0.0,0,1.000241,26670118.88,1.000241,1.000241,1.000026,19319.31,1.000241,16149.55,1.000143,0.0
5731,300,242332.96,-1,1.000073,3242054.27,0.981974,0.99449,0.999544,43205.4,1.000308,2042.76,1.000273,0.0
5732,300,0.0,0,1.000193,4671376.0,1.000193,1.000193,0.999035,18971.0,1.001036,59688.26,0.999518,1.0
5733,300,2914730.16,1,0.99987,41057776.66,1.00387,1.002279,0.999827,25569.5,1.000042,37897.5,0.999914,0.0
5734,300,3396923.02,1,0.998496,38356174.88,1.007894,1.007894,0.998357,14446.0,0.998703,2601.18,0.99865,1.0


In [3]:
# Separate the features from the binary label.
features = df.drop("target_change", axis=1)
y = df["target_change"].values

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std applied to the training data (data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same statistics
# to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics; kept so that the
# name `X` remains available to any later cell.
X = scaler.transform(features)

# CrossEntropyLoss expects float inputs and integer (long) class indices.
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# NOTE(review): with the ~1.87M training rows reported in this cell's output,
# a batch size of 2 means ~937k optimizer steps per epoch; consider a much
# larger batch (e.g. 256+) if training time matters.
batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# The test loader is not shuffled, so its order matches X_test / y_test.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

X_train_tensor.shape, y_train_tensor.shape

(torch.Size([1874910, 12]), torch.Size([1874910]))

In [4]:

class Net(nn.Module):
    """Fully connected classifier with two ReLU hidden layers and a linear output.

    The output layer applies no activation: it returns raw per-class scores
    (logits), which is the input ``nn.CrossEntropyLoss`` expects.

    Args:
        in_features: Number of input features per sample (default 12, the
            number of feature columns used in this notebook).
        hidden1: Width of the first hidden layer (default 64).
        hidden2: Width of the second hidden layer (default 32).
        num_classes: Number of output classes / logits (default 2).
    """

    def __init__(self, in_features=12, hidden1=64, hidden2=32, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden1)  # Input layer to hidden layer 1
        self.fc2 = nn.Linear(hidden1, hidden2)      # Hidden layer 1 to hidden layer 2
        self.fc3 = nn.Linear(hidden2, num_classes)  # Hidden layer 2 to output layer

    def forward(self, x):
        """Map a batch of feature rows to a batch of per-class logits."""
        x = F.relu(self.fc1(x))  # Activation function for hidden layer 1
        x = F.relu(self.fc2(x))  # Activation function for hidden layer 2
        return self.fc3(x)       # No activation for the output layer

# Default sizes match the 12-feature, 2-class setup used in this notebook.
net = Net()


In [5]:

# Cross-entropy over the two output logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Ten full passes over the training data; after each pass, report the loss
# averaged over the number of batches.
for epoch in range(10):
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}')

print('Finished Training')

: 

In [None]:
def predict(model, data_loader):
    """Collect true labels and predicted class indices for every batch.

    The model is expected to return one row of per-class logits per sample
    (shape ``(batch, num_classes)``); the predicted class is the index of the
    largest logit. Thresholding the raw logits at 0.5 would instead produce
    one value per class per sample, which is not a label.

    Args:
        model: A ``torch.nn.Module`` mapping a batch of inputs to logits.
        data_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        ``(actuals, predictions)``: two flat lists of integer class labels,
        in the order the loader yielded the samples.
    """
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in data_loader:
            logits = model(inputs)
            # argmax over the class dimension; no squeeze, so a final batch
            # of size 1 keeps its batch dimension.
            predicted = logits.argmax(dim=1)
            predictions.extend(predicted.tolist())
            actuals.extend(labels.tolist())
    return actuals, predictions

# Predict on the test set
y_true, y_pred = predict(net, test_loader)

# ROC-AUC ranks samples by a continuous score, so feed it the softmax
# probability of class 1 instead of the hard 0/1 predictions.
# X_test_tensor is row-aligned with y_test, so the scores and labels match up.
net.eval()
with torch.no_grad():
    y_score = F.softmax(net(X_test_tensor), dim=1)[:, 1].numpy()

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")
