In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import math
from nn_model import *
import torch.optim as optim

In [2]:
X_train = pd.read_csv("trainset.csv", index_col=0)
X_test = pd.read_csv("testset.csv", index_col=0)
X_val = pd.read_csv("valset.csv", index_col=0)

y_train = X_train.label
y_test = X_test.label
y_val = X_val.label

X_train = X_train.drop(columns=["label","seq"])
X_test = X_test.drop(columns=["label", "seq"])
X_val = X_val.drop(columns=["label", "seq"])

In [3]:
X_train.shape

(109661, 38)

In [4]:
X_train.columns

Index(['dwell_time_-1', 'sd_-1', 'mean_-1', 'dwell_time_0', 'sd_0', 'mean_0',
       'dwell_time_1', 'sd_1', 'mean_1', 'pwm_score', 'A_1', 'C_1', 'G_1',
       'T_1', 'A_2', 'C_2', 'G_2', 'T_2', 'A_3', 'C_3', 'G_3', 'T_3', 'A_4',
       'C_4', 'G_4', 'T_4', 'A_5', 'C_5', 'G_5', 'T_5', 'A_6', 'C_6', 'G_6',
       'T_6', 'A_7', 'C_7', 'G_7', 'T_7'],
      dtype='object')

## DataLoader

In [5]:
from nn_dataset import GeneDataset

In [6]:
train = GeneDataset('trainset.csv')
test = GeneDataset('testset.csv')
val = GeneDataset('valset.csv')

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test, batch_size=16, shuffle=False)
val_dataloader = DataLoader(val, batch_size=16, shuffle=False)

In [8]:
clf = ClassificationNN()
clf = clf.float()

In [9]:
criterion = nn.BCELoss()
optimizer = optim.Adam(clf.parameters(), lr=0.001)

In [11]:
for epoch in range(10):

    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs (data is in a list of [X_train, y_train])
        inputs, labels = data
        inputs = inputs.float()
        labels = labels[:, None].float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print verbose
        running_loss += loss.item()
        if i % 1000 == 999:
            print(f"Epoch: {epoch+1}, batch: {i + 1:4d}, loss: {running_loss:.3f}")

print('Finish training')

Epoch: 1, batch: 1000, loss: 246.649
Epoch: 1, batch: 2000, loss: 423.533
Epoch: 1, batch: 3000, loss: 643.419
Epoch: 1, batch: 4000, loss: 812.730
Epoch: 1, batch: 5000, loss: 981.854
Epoch: 1, batch: 6000, loss: 1156.514
Epoch: 2, batch: 1000, loss: 190.568
Epoch: 2, batch: 2000, loss: 361.307
Epoch: 2, batch: 3000, loss: 576.740
Epoch: 2, batch: 4000, loss: 742.641
Epoch: 2, batch: 5000, loss: 908.278
Epoch: 2, batch: 6000, loss: 1080.555
Epoch: 3, batch: 1000, loss: 187.465
Epoch: 3, batch: 2000, loss: 356.123
Epoch: 3, batch: 3000, loss: 568.593
Epoch: 3, batch: 4000, loss: 732.387
Epoch: 3, batch: 5000, loss: 896.015
Epoch: 3, batch: 6000, loss: 1066.992
Epoch: 4, batch: 1000, loss: 185.613
Epoch: 4, batch: 2000, loss: 352.518
Epoch: 4, batch: 3000, loss: 562.995
Epoch: 4, batch: 4000, loss: 725.702
Epoch: 4, batch: 5000, loss: 888.055
Epoch: 4, batch: 6000, loss: 1058.157
Epoch: 5, batch: 1000, loss: 184.294
Epoch: 5, batch: 2000, loss: 350.324
Epoch: 5, batch: 3000, loss: 559.6

In [12]:
dataiter = iter(test_dataloader)


In [19]:
x_test, y_test = next(dataiter)

In [26]:
y_test.tolist()

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [21]:
output = clf(x_test.float())

In [22]:
output

tensor([[0.0078],
        [0.0078],
        [0.0113],
        [0.0192],
        [0.0065],
        [0.0146],
        [0.0308],
        [0.0134],
        [0.0425],
        [0.0357],
        [0.0462],
        [0.0196],
        [0.0098],
        [0.0210],
        [0.0122],
        [0.0190]], grad_fn=<SigmoidBackward>)

In [27]:
preds = torch.flatten(output).tolist()

In [28]:
preds = list(map(lambda x: 1 if x >= 0.5 else 0, preds))

In [30]:
target = y_test.tolist()

In [31]:
preds, target

([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [40]:
list(map(lambda x: 1 if x[0]==x[1] else 0, zip(preds, target)))

[1, 1, 1, 1, 1, 1, 1]

In [41]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_dataloader:
        x_test, y_test = data
        x_test = x_test.float()

        output = clf(x_test)
        preds = torch.flatten(output).tolist()
        preds = list(map(lambda x: 1 if x >= 0.5 else 0, preds))
        y_test = y_test.tolist()
        total += len(y_test)
        correct += sum(list(map(lambda x: 1 if x[0]==x[1] else 0, zip(preds, target))))

In [42]:
print(f"Accuracy: {100 * correct//total}")

Accuracy: 93
