In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import math
from nn_model import *
from nn_dataset import GeneDataset
import torch.optim as optim
from torch.utils.data import DataLoader

## Resampling

In [2]:
df = pd.read_csv('merged_data.csv')

In [3]:
df.drop(columns=['gene_id', 'transcript_pos'], inplace=True)

In [4]:
all_genes = df.transcript_id.unique().tolist()
train_genes_count = int(len(all_genes)*0.8)
train_genes = all_genes[:train_genes_count]
val_genes = all_genes[train_genes_count:]

In [5]:
len(train_genes), len(val_genes)

(4266, 1067)

In [6]:
train_df = df[df.transcript_id.isin(train_genes)]
val_df = df[df.transcript_id.isin(val_genes)]

In [7]:
X_train = train_df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
y_train = train_df.label

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
for_scaler = df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
scaler.fit(for_scaler)

In [10]:
clf = ClassificationNN()
clf = clf.float()

criterion = nn.BCELoss()
optimizer = optim.Adam(clf.parameters(), lr=0.001)

In [11]:
strtfdKFold = StratifiedKFold(n_splits=5)
kfold = strtfdKFold.split(X_train, y_train)
accuracy = []
auc_roc = []
pr_roc = []

In [12]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()

## Train loop

In [13]:
# K fold cross validation
for k, (train, test) in enumerate(kfold):

    # current Kth fold splits
    curr_X_train = X_train.iloc[train, :]
    curr_y_train = y_train.iloc[train]

    # balance dataset (oversample label 1)
    balanced_X_train, balanced_y_train = oversample.fit_resample(curr_X_train, curr_y_train)
    balanced_X_train = scaler.transform(balanced_X_train)

    # Load data into pytorch dataloader
    train_dataset = GeneDataset(balanced_X_train, balanced_y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    # training
    train_loop(train_dataloader, optimizer, criterion, clf, epochs=3)

    # 
    curr_X_test = X_train.iloc[test, :]
    curr_X_test = scaler.transform(curr_X_test)
    curr_y_test = y_train.iloc[test]
    test_dataset = GeneDataset(curr_X_test, curr_y_test)
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
    
    preds, probas, labels = eval_loop(test_dataloader, clf)
    
    auc_score = roc_auc_score(labels, probas)
    ap = average_precision_score(labels, probas)

    acc_score = accuracy_score(labels, preds)
    auc_score = roc_auc_score(labels, probas)
    pr_score = average_precision_score(labels, probas)

    accuracy.append(acc_score)
    auc_roc.append(auc_score)
    pr_roc.append(pr_score)

    print(f"Fold {k+1} | Accuracy: {acc_score} | AUC ROC: {auc_score} | PR ROC {pr_score} || label 0: {len(balanced_y_train[balanced_y_train==0])}, label 1: {len(balanced_y_train[balanced_y_train==1])}")

print(f"Cross Val Accuracy: {np.mean(accuracy)} | AUC ROC: {np.mean(auc_roc)} | PR ROC: {np.mean(pr_roc)}")

Epoch: 0, final loss: 6000.627
Epoch: 1, final loss: 5985.886
Epoch: 2, final loss: 5982.136
Finish training
Fold 1 | Accuracy: 0.645981290101386 | AUC ROC: 0.7262552380708753 | PR ROC 0.10116539040063995 || label 0: 78008, label 1: 78008
Epoch: 0, final loss: 5923.384
Epoch: 1, final loss: 5922.386
Epoch: 2, final loss: 5921.727
Finish training
Fold 2 | Accuracy: 0.6459149686520376 | AUC ROC: 0.7045805434980915 | PR ROC 0.09637595170751365 || label 0: 78009, label 1: 78009
Epoch: 0, final loss: 5956.953
Epoch: 1, final loss: 5958.072
Epoch: 2, final loss: 5957.460
Finish training
Fold 3 | Accuracy: 0.640967868338558 | AUC ROC: 0.720694359575307 | PR ROC 0.10365868713148574 || label 0: 78009, label 1: 78009
Epoch: 0, final loss: 5957.770
Epoch: 1, final loss: 5957.091
Epoch: 2, final loss: 5957.325
Finish training
Fold 4 | Accuracy: 0.6521845611285266 | AUC ROC: 0.7238027766663443 | PR ROC 0.10138597725352012 || label 0: 78009, label 1: 78009
Epoch: 0, final loss: 5925.969
Epoch: 1, fi

## Test dataset

In [14]:
X_val = val_df.drop(columns=['transcript_id', 'nucleo_seq', 'transcript_position', 'label'])
X_val = scaler.transform(X_val)
y_val = val_df.label

In [15]:
val_dataset = GeneDataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
val_preds, val_proba, val_labels = eval_loop(val_dataloader, clf)

In [16]:
acc_score = accuracy_score(val_labels, val_preds)
auc_score = roc_auc_score(val_labels, val_proba)
pr_score = average_precision_score(val_labels, val_preds)

print("Accuracy:", acc_score)
print("AUC-ROC:", auc_score)
print("PR-ROC:", ap)

Accuracy: 0.6183124968365643
AUC-ROC: 0.7107275573733403
PR-ROC: 0.09065117949166407
