<a href="https://colab.research.google.com/github/MonsterOnLeash/PID/blob/master/Baseline_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import display, clear_output


import torch
from torch import nn
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from torch.nn import functional as F

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_auc_score

from tqdm import trange

import time

In [None]:
data = pd.read_csv("PID-dataset-subsample.csv")

In [None]:
le = LabelEncoder()

In [None]:
X = data.drop(columns = ["Label"])
y_init = data["Label"]
le.fit(y_init)
y = le.transform(y_init)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.65, random_state=42, stratify=y)

In [None]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
device = "cpu"

In [None]:
oh = OneHotEncoder()
oh.fit(y.reshape(-1, 1));


In [None]:
y_train6 = oh.transform(y_train.reshape(-1, 1)).toarray()
y_test6 =  oh.transform(y_test.reshape(-1, 1)).toarray()

In [None]:
X_trainT = torch.from_numpy(np.array(X_train)).float()
X_testT = torch.from_numpy(np.array(X_test)).float()
y_trainT = torch.from_numpy(np.array(y_train6)).float()
y_testT = torch.from_numpy(np.array(y_test6)).float()

In [None]:
class OneLayer(nn.Module):
    def __init__(self, input_shape, output_shape, hidden_layer, activation = None):
        super(OneLayer, self).__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_shape, hidden_layer),
            activation,
            nn.Linear(hidden_layer, output_shape)
        )

    def forward(self, x):
        x = self.sequential(x)
        return x

In [None]:
torch.manual_seed(42);

# One NN with 6 outputs

In [None]:
model = OneLayer(49, 6, 140, nn.ReLU())
model.to(device)
criterion = nn.MSELoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min')

n_epoches = 5000
debug_frequency = 100

pbar = trange(n_epoches, leave=True, position=0)
for epoch in pbar:
    opt.zero_grad()
    preds = model(X_trainT).squeeze()
    loss = criterion(preds, y_trainT)
    loss.backward()
    scheduler.step(loss)
    opt.step()
    loss_train = float(criterion(preds, y_trainT).detach().cpu().numpy())
    preds = model(X_testT).squeeze()
    loss_test = float(criterion(preds, y_testT).detach().cpu().numpy())
    pbar.set_description('MSE (train): %.3f\tMSE (test): %.3f' %
                         (loss_train, loss_test))
    pbar.update()

MSE (train): 0.056	MSE (test): 0.061: 100%|██████████| 5000/5000 [07:39<00:00, 10.88it/s]


In [None]:
start_time = time.time()
pred = model(X_testT)
print(time.time() - start_time)
pred = pd.DataFrame(pred.detach().numpy())
pred = pred.idxmax(axis = 1)
pred = le.inverse_transform(pred)
y_testL = le.inverse_transform(y_test)
lb = LabelBinarizer().fit(y_testL)

0.015720129013061523


In [None]:
for label in ['Electron', 'Ghost', 'Kaon', 'Muon', 'Pion', 'Proton']:
    class_id = np.flatnonzero(lb.classes_ == label)[0]
    print(f"{label} score is {roc_auc_score(lb.transform(y_testL)[:, class_id], lb.transform(pred)[:, class_id])}")

Electron score is 0.9220534553822972
Ghost score is 0.8609794746237357
Kaon score is 0.7691175843910755
Muon score is 0.9362265414848623
Pion score is 0.851251136686017
Proton score is 0.7348779354562212


# 6 independent binary classification NNs

In [None]:
LABELS = ['Electron', 'Ghost', 'Kaon', 'Muon', 'Pion', 'Proton']

In [None]:
y_train1 = dict()
y_test1 = dict()

In [None]:
for label in LABELS:
    label_code = le.transform([label])[0]
    y_train1[label] = torch.from_numpy(np.array((y_train == label_code).astype(int))).float()
    y_test1[label] = torch.from_numpy(np.array((y_test == label_code).astype(int))).float()

In [None]:
models = []
for i in range(6):
    models.append(OneLayer(49, 1, 35, nn.ReLU()))
    models[i].to(device)
    criterion = nn.MSELoss()
    opt = torch.optim.Adam(models[i].parameters(), lr=1e-2)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min')

    n_epoches = 3000
    debug_frequency = 100

    pbar = trange(n_epoches, leave=True, position=0)
    for epoch in pbar:
        opt.zero_grad()
        preds = models[i](X_trainT).squeeze()
        loss = criterion(preds, y_train1[le.inverse_transform([i])[0]])
        loss.backward()
        scheduler.step(loss)
        opt.step()
        loss_train = float(criterion(preds, y_train1[le.inverse_transform([i])[0]]).detach().cpu().numpy())
        preds = models[i](X_testT).squeeze()
        loss_test = float(criterion(preds, y_test1[le.inverse_transform([i])[0]]).detach().cpu().numpy())
        pbar.set_description('MSE (train): %.3f\tMSE (test): %.3f' %
                            (loss_train, loss_test))
        pbar.update()

MSE (train): 0.032	MSE (test): 0.037: 100%|██████████| 3000/3000 [01:27<00:00, 34.47it/s]
MSE (train): 0.056	MSE (test): 0.062: 100%|██████████| 3000/3000 [01:27<00:00, 34.18it/s]
MSE (train): 0.080	MSE (test): 0.086: 100%|██████████| 3000/3000 [01:27<00:00, 34.41it/s]
MSE (train): 0.025	MSE (test): 0.029: 100%|██████████| 3000/3000 [01:27<00:00, 34.33it/s]
MSE (train): 0.063	MSE (test): 0.068: 100%|██████████| 3000/3000 [01:26<00:00, 34.54it/s]
MSE (train): 0.081	MSE (test): 0.088: 100%|██████████| 3000/3000 [01:27<00:00, 34.46it/s]


In [None]:
pred = pd.DataFrame(columns=[0,1,2,3,4,5])
set_time = time.time()
for i in range(6):
    pr = pd.Series(models[i](X_testT).detach().numpy().reshape(-1))
    pred[i] = pr
print(time.time() - set_time)

0.03463149070739746


In [None]:
pred = pred.idxmax(axis = 1)
pred = le.inverse_transform(pred)
y_testL = le.inverse_transform(y_test)
lb = LabelBinarizer().fit(y_testL)

In [None]:
for label in ['Electron', 'Ghost', 'Kaon', 'Muon', 'Pion', 'Proton']:
    class_id = np.flatnonzero(lb.classes_ == label)[0]
    print(f"{label} score is {roc_auc_score(lb.transform(y_testL)[:, class_id], lb.transform(pred)[:, class_id])}")

Electron score is 0.9236496348513912
Ghost score is 0.8608007959805553
Kaon score is 0.7604310108665875
Muon score is 0.9337392381577901
Pion score is 0.8485518915022743
Proton score is 0.7437610698634946


In [None]:
set_time = time.time()
models[0](X_testT)
models[1](X_testT)
models[2](X_testT)
models[3](X_testT)
models[4](X_testT)
models[5](X_testT)
print(time.time() - set_time)

0.028741836547851562
