In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import math
from nn_model import *
from nn_dataset import GeneDataset
import torch.optim as optim
from torch.utils.data import DataLoader

## Resampling

In [2]:
df = pd.read_csv('merged_data.csv')

In [3]:
df.drop(columns=['gene_id', 'transcript_pos'], inplace=True)

In [4]:
all_genes = df.transcript_id.unique().tolist()
train_genes_count = int(len(all_genes)*0.8)
train_genes = all_genes[:train_genes_count]
val_genes = all_genes[train_genes_count:]

In [5]:
len(train_genes), len(val_genes)

(4266, 1067)

In [6]:
train_df = df[df.transcript_id.isin(train_genes)]
val_df = df[df.transcript_id.isin(val_genes)]

In [7]:
X_train = train_df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
y_train = train_df.label

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
for_scaler = df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
scaler.fit(for_scaler)

In [10]:
clf = ClassificationNN()
clf = clf.float()

criterion = nn.BCELoss()
optimizer = optim.Adam(clf.parameters(), lr=0.001)

In [11]:
strtfdKFold = StratifiedKFold(n_splits=5)
kfold = strtfdKFold.split(X_train, y_train)
accuracy = []
auc_roc = []
pr_roc = []

In [12]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()

In [13]:
for k, (train, test) in enumerate(kfold):
    curr_X_train = X_train.iloc[train, :]
    curr_y_train = y_train.iloc[train]

    balanced_X_train, balanced_y_train = oversample.fit_resample(curr_X_train, curr_y_train)
    balanced_X_train = scaler.transform(balanced_X_train)
    train_dataset = GeneDataset(balanced_X_train, balanced_y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    train_loop(train_dataloader, optimizer, criterion, clf, epochs=3)

    curr_X_test = X_train.iloc[test, :]
    curr_X_test = scaler.transform(curr_X_test)
    curr_y_test = y_train.iloc[test]
    test_dataset = GeneDataset(curr_X_test, curr_y_test)
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
    
    preds, probas, labels = eval_loop(test_dataloader, clf)
    
    auc_score = roc_auc_score(labels, probas)
    ap = average_precision_score(labels, probas)

    acc_score = accuracy_score(labels, preds)
    auc_score = roc_auc_score(labels, probas)
    pr_score = average_precision_score(labels, probas)

    accuracy.append(acc_score)
    auc_roc.append(auc_score)
    pr_roc.append(pr_score)

    print(f"Fold {k+1} | Accuracy: {acc_score} | AUC ROC: {auc_score} | PR ROC {pr_score} || label 0: {len(balanced_y_train[balanced_y_train==0])}, label 1: {len(balanced_y_train[balanced_y_train==1])}")

print(f"Cross Val Accuracy: {np.mean(accuracy)} | AUC ROC: {np.mean(auc_roc)} | PR ROC: {np.mean(pr_roc)}")

Epoch: 0, final loss: 5985.104
Epoch: 1, final loss: 5967.990
Epoch: 2, final loss: 5965.389
Finish training
Fold 1 | Accuracy: 0.6636626340794436 | AUC ROC: 0.727265182004766 | PR ROC 0.1011346905645211
Epoch: 0, final loss: 5921.627
Epoch: 1, final loss: 5921.762
Epoch: 2, final loss: 5920.376
Finish training
Fold 2 | Accuracy: 0.648266065830721 | AUC ROC: 0.7052192032371926 | PR ROC 0.09644445661549494
Epoch: 0, final loss: 5948.629
Epoch: 1, final loss: 5948.370
Epoch: 2, final loss: 5948.354
Finish training
Fold 3 | Accuracy: 0.6413597178683386 | AUC ROC: 0.721695126595331 | PR ROC 0.10330524849060814
Epoch: 0, final loss: 5955.149
Epoch: 1, final loss: 5956.887
Epoch: 2, final loss: 5955.924
Finish training
Fold 4 | Accuracy: 0.6557112068965517 | AUC ROC: 0.723866003082891 | PR ROC 0.10156738417988977
Epoch: 0, final loss: 5931.233
Epoch: 1, final loss: 5931.881
Epoch: 2, final loss: 5931.574
Finish training
Fold 5 | Accuracy: 0.6487558777429467 | AUC ROC: 0.7139447011774811 | PR

In [15]:
X_val = val_df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
y_val = val_df.label

In [16]:
X_val = scaler.transform(X_val)

In [19]:
val_dataset = GeneDataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
val_preds, val_proba, val_labels = eval_loop(val_dataloader, clf)

In [20]:
acc_score = accuracy_score(val_labels, val_preds)
auc_score = roc_auc_score(val_labels, val_proba)
pr_score = average_precision_score(val_labels, val_preds)

print("Accuracy:", acc_score)
print("AUC-ROC:", auc_score)
print("PR-ROC:", ap)

Accuracy: 0.6416966138583793
AUC-ROC: 0.7093601452664723
PR-ROC: 0.09361120684335912


## V1

In [5]:
from nn_dataset import GeneDataset

In [6]:
train = GeneDataset(X_train, y_train)
test = GeneDataset(X_test, y_test)
val = GeneDataset(X_val, y_val)

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val, batch_size=16, shuffle=True)

In [8]:
clf = ClassificationNN()
clf = clf.float()

In [9]:
criterion = nn.BCELoss()
optimizer = optim.Adam(clf.parameters(), lr=0.001)

In [10]:
for epoch in range(10):

    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs (data is in a list of [X_train, y_train])
        inputs, labels = data
        inputs = inputs.float()
        labels = labels[:, None].float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = clf(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print verbose
        running_loss += loss.item()
        if i % 1000 == 999:
            print(f"Epoch: {epoch+1}, batch: {i + 1:4d}, loss: {running_loss:.3f}")

print('Finish training')

Epoch: 1, batch: 1000, loss: 186.017
Epoch: 1, batch: 2000, loss: 351.833
Epoch: 1, batch: 3000, loss: 523.749
Epoch: 1, batch: 4000, loss: 699.231
Epoch: 1, batch: 5000, loss: 863.191
Epoch: 1, batch: 6000, loss: 1033.188
Epoch: 2, batch: 1000, loss: 167.041
Epoch: 2, batch: 2000, loss: 331.576
Epoch: 2, batch: 3000, loss: 499.482
Epoch: 2, batch: 4000, loss: 671.408
Epoch: 2, batch: 5000, loss: 844.963
Epoch: 2, batch: 6000, loss: 1015.327
Epoch: 3, batch: 1000, loss: 170.410
Epoch: 3, batch: 2000, loss: 342.344
Epoch: 3, batch: 3000, loss: 506.097
Epoch: 3, batch: 4000, loss: 675.651
Epoch: 3, batch: 5000, loss: 841.802
Epoch: 3, batch: 6000, loss: 1012.435
Epoch: 4, batch: 1000, loss: 165.142
Epoch: 4, batch: 2000, loss: 332.557
Epoch: 4, batch: 3000, loss: 503.062
Epoch: 4, batch: 4000, loss: 675.893
Epoch: 4, batch: 5000, loss: 841.435
Epoch: 4, batch: 6000, loss: 1014.391
Epoch: 5, batch: 1000, loss: 171.448
Epoch: 5, batch: 2000, loss: 339.938
Epoch: 5, batch: 3000, loss: 506.2

In [64]:
clf.eval()
preds = []
probas = []
labels = []
with torch.no_grad():
    for data in test_dataloader:
        x_test, y_test = data
        x_test = x_test.float()

        output = clf(x_test)
        y_test = torch.flatten(y_test).tolist()
        proba = torch.flatten(output).tolist()
        pred = list(map(lambda x: 1 if x >= 0.5 else 0, proba))
        preds += pred
        probas += proba
        labels += y_test

In [65]:
len(preds), len(probas), len(labels), len(test)

(12177, 12177, 12177, 12177)

In [66]:
test_results = pd.DataFrame(data={'labels': labels,
                                  'preds': preds,
                                  'probas': probas})

In [67]:
test_results

Unnamed: 0,labels,preds,probas
0,0,0,0.027629
1,1,0,0.087624
2,0,0,0.062826
3,1,0,0.092178
4,0,0,0.061673
...,...,...,...
12172,0,0,0.028079
12173,0,0,0.022819
12174,0,0,0.018578
12175,0,0,0.020540


In [68]:
auc_score = roc_auc_score(labels, probas)
ap = average_precision_score(labels, probas)

print("Accuracy:", accuracy_score(labels, preds))
print("AUC-ROC:", auc_score)
print("PR-ROC:", ap)

Accuracy: 0.9462921901946292
AUC-ROC: 0.6734661245253146
PR-ROC: 0.11185075734590805
