In [11]:
! pip install optuna  scikit-learn gensim imbalanced-learn xgboost torch shap

Collecting shap
  Downloading shap-0.42.1-cp310-cp310-win_amd64.whl (462 kB)
                                              0.0/462.3 kB ? eta -:--:--
     ------------------------------------  460.8/462.3 kB 14.5 MB/s eta 0:00:01
     -------------------------------------- 462.3/462.3 kB 9.6 MB/s eta 0:00:00
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Collecting numba (from shap)
  Downloading numba-0.57.1-cp310-cp310-win_amd64.whl (2.5 MB)
                                              0.0/2.5 MB ? eta -:--:--
     --------------                           0.9/2.5 MB 28.7 MB/s eta 0:00:01
     -------------------------------          2.0/2.5 MB 25.7 MB/s eta 0:00:01
     ----------------------------------       2.2/2.5 MB 23.2 MB/s eta 0:00:01
     ---------------------------------------- 2.5/2.5 MB 14.7 MB/s eta 0:00:00
Collecting cloudpickle (from shap)
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting llvmlite<0.41,>=0.40


[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
 #Import All Libraries Here
import pandas as pd
from sklearn.metrics import accuracy_score ,  roc_curve, auc , classification_report

import numpy as np
from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import mean_squared_error
import optuna
from torch.utils.data import WeightedRandomSampler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import time
from collections import Counter
# PyTorch Import

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm

import pickle

from sklearn.preprocessing import LabelEncoder

import shap


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## Load dataset and Encode with n-mer encoding techniques 
## train Model 
## test model 
## get summary report


In [10]:

# from google.colab import drive
# drive.mount('/content/drive')
# BINARY_DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/Atol_with_ROS.csv"
#ENCODING_FILE = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/3-mer-dictionary.pkl"

# Record Constants
# Record Constants

BINARY_DATA_PATH = "../../../data/hAm_with_ROS.csv"
ENCODING_FILE = "3-mer-dictionary.pkl"

# INPUT_TRAIN_IN = "../../../data/train_in.csv"
# INPUT_TRAIN_OUT = "../../../data/train_out.csv"
# INPUT_TEST_IN = "../../../data/test_in.csv"
# INPUT_TEST_OUT = "../../../data/test_out.csv"
# INPUT_VALIDATION_IN = "../../../data/valid_in_nucleo.csv"
# INPUT_VALIDATION_OUT  = "../../../data/valid_out.csv"

# TARGET_MODEL_PATH = '../../webapp/model_files'




### Model Class , Train and Valid Function

In [11]:
import torch.nn as nn

class RNADataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

class RNATransformerModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim, dropout=0.5):
        super(RNATransformerModel, self).__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # If batch size first is true then it should be batch size , sequence lenght , embedding dimension
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=8, dim_feedforward=hidden_dim , batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.long()
        #print("Shape of Original X  ", x.shape)
        x_embedded = self.embedding(x)
        #print("Shape of X embedded" , x_embedded.shape)
        x_transformed = self.transformer_encoder(x_embedded)
        #print("Shape of Transformed X" , x_transformed.shape)
        x_transformed = x_transformed[:, -1, :]  # taking the last token's output

        output = self.dropout(x_transformed)
        out = self.fc(output)
        return out.squeeze()


def validate_model(model, test_dataloader , device ,loss_function):
    model.eval()
    running_loss = 0.0
    total = 0
    correct = 0
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.float().to(device)
            outputs = model(inputs)
            if outputs.size() != labels.size(): # skip if batch size mismatch
              continue
            #print(f"Shape of Label : {labels.shape} and output shape : {outputs.shape} ")
            loss = loss_function(outputs, labels)
            running_loss += loss.item()

            predicted = (torch.sigmoid(outputs) > 0.5).float() # Set threshold at 0.5
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            true_labels.extend(labels.cpu().numpy())  # Capture True Labels for Summary Report
            predicted_labels.extend(predicted.cpu().numpy()) # Capture Predicted Labels Lables for Summary Report

    validation_loss = running_loss / len(test_dataloader)
    validation_accuracy = correct / total

    return validation_loss , validation_accuracy , true_labels , predicted_labels



def train_model(model, train_dataloader, test_dataloader, device, epochs, optimizer, loss_function):
    best_val_loss = float('inf')
    no_improvement_count = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        start_time = time.time()
        #progress_bar = tqdm(train_dataloader, desc='Epoch {:03d}'.format(epoch + 1), leave=False, disable=False)
        for i, (inputs, labels) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            if outputs.size() != labels.size(): # skip if batch size mismatch
              continue
            #print(f"Shape of Label : {labels.shape} and output shape : {outputs.shape} ")
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            #progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(inputs))})

        epoch_loss = running_loss / len(train_dataloader)
        val_loss,  validation_accuracy , true_labels , predicted_labels = validate_model(model, test_dataloader, device, loss_function)
        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Test Accuracy: {validation_accuracy:.4f} , Time Taken : {elapsed_time}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0
        else:
            no_improvement_count += 1
            if no_improvement_count == 10:
                print("No improvement in validation loss for 5 epochs. Training stopped.")
                break



### Load dataset 

In [27]:
#Read X Varaibles and Y Varaibles

binary_raw_dataframe =  pd.read_csv(BINARY_DATA_PATH, header=None  , skiprows=1)
print(f"Shape of Dataset  : {binary_raw_dataframe.shape}")


Shape of Dataset  : (309214, 102)


In [28]:
binary_raw_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,T,T,G,C,C,A,C,A,C,T,...,C,A,G,T,A,T,C,T,C,hAm
1,T,T,T,G,A,A,A,A,A,A,...,T,C,A,T,C,G,T,G,C,hAm
2,A,G,A,A,A,C,A,T,T,C,...,T,T,C,T,G,T,T,C,A,hAm
3,T,T,A,G,T,T,T,T,A,C,...,A,A,A,A,A,T,T,T,C,hAm
4,C,A,A,C,A,G,A,A,G,T,...,A,A,A,A,T,G,T,A,C,hAm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309209,A,A,C,A,C,T,A,T,A,C,...,G,T,T,T,T,A,A,A,C,hAm
309210,A,T,T,C,A,G,T,C,C,T,...,A,C,C,T,G,A,A,G,G,hAm
309211,C,T,T,G,A,G,T,C,G,T,...,A,A,G,T,T,A,A,C,C,hAm
309212,G,T,T,A,A,T,G,G,A,G,...,A,T,A,A,C,T,C,A,G,hAm


### Calculate Sequence Positions to extracted from Original Sequence

In [29]:
x_data = binary_raw_dataframe.iloc[:,0:101] # Select data
y_data = binary_raw_dataframe.iloc[:,101] # Select Target

In [30]:
x_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,T,T,G,C,C,A,C,A,C,T,...,G,C,A,G,T,A,T,C,T,C
1,T,T,T,G,A,A,A,A,A,A,...,C,T,C,A,T,C,G,T,G,C
2,A,G,A,A,A,C,A,T,T,C,...,T,T,T,C,T,G,T,T,C,A
3,T,T,A,G,T,T,T,T,A,C,...,G,A,A,A,A,A,T,T,T,C
4,C,A,A,C,A,G,A,A,G,T,...,T,A,A,A,A,T,G,T,A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309209,A,A,C,A,C,T,A,T,A,C,...,A,G,T,T,T,T,A,A,A,C
309210,A,T,T,C,A,G,T,C,C,T,...,T,A,C,C,T,G,A,A,G,G
309211,C,T,T,G,A,G,T,C,G,T,...,C,A,A,G,T,T,A,A,C,C
309212,G,T,T,A,A,T,G,G,A,G,...,G,A,T,A,A,C,T,C,A,G


In [31]:
y_data

0         hAm
1         hAm
2         hAm
3         hAm
4         hAm
         ... 
309209    hAm
309210    hAm
309211    hAm
309212    hAm
309213    hAm
Name: 101, Length: 309214, dtype: object

### Load Fixed Numerical Encoding From File For Model Interpretability 

In [32]:

concatenated_column= x_data.apply(lambda row: ''.join(map(str, row)), axis=1)
x_data_with_complete_sequence = x_data.assign(Sequence=concatenated_column)


with open(ENCODING_FILE, 'rb') as f:
    kmer_dict = pickle.load(f)

In [46]:
x_data_with_complete_sequence

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,Sequence
0,T,T,G,C,C,A,C,A,C,T,...,C,A,G,T,A,T,C,T,C,TTGCCACACTGCTGGACGCCTGCAAGGCCAAGGGTACGGAGGTCAT...
1,T,T,T,G,A,A,A,A,A,A,...,T,C,A,T,C,G,T,G,C,TTTGAAAAAATATTAGCAATGTGAGGACACTTAAGCAGTTTTGTCA...
2,A,G,A,A,A,C,A,T,T,C,...,T,T,C,T,G,T,T,C,A,AGAAACATTCAACCTCCCTTCTTTTTATTCCAGTTGTCCTTTTCTC...
3,T,T,A,G,T,T,T,T,A,C,...,A,A,A,A,A,T,T,T,C,TTAGTTTTACTATGGAATCATAATAACCCACATAGAAGACTGATAT...
4,C,A,A,C,A,G,A,A,G,T,...,A,A,A,A,T,G,T,A,C,CAACAGAAGTTTCTCATCTATAATCAGTAGCACTAAACTCTTGGTT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309209,A,A,C,A,C,T,A,T,A,C,...,G,T,T,T,T,A,A,A,C,AACACTATACACTTAAGGCTACACTAAATTCATTTTTTAAAATTTT...
309210,A,T,T,C,A,G,T,C,C,T,...,A,C,C,T,G,A,A,G,G,ATTCAGTCCTCTGAATTAAGATATTAGGTTATAAGGCCATGTACAA...
309211,C,T,T,G,A,G,T,C,G,T,...,A,A,G,T,T,A,A,C,C,CTTGAGTCGTGATCACACCACTGTACTCCAGCTTGTCTCCAAATAA...
309212,G,T,T,A,A,T,G,G,A,G,...,A,T,A,A,C,T,C,A,G,GTTAATGGAGAAGACATATACATTACTTGAATAATTTAAGTCTGAA...


### 3 mer coding

In [33]:
kmer_dict = {}
encoding = 0
k = 3

# for sequence in x_data_with_complete_sequence['Sequence']:
#     for i in range(len(sequence) - k + 1):
#         kmer = sequence[i:i+k]
#         if kmer not in kmer_dict:
#             kmer_dict[kmer] = encoding
#             encoding += 1



number_of_unique_kmers = set()
def encode_seq(kmer_token):

    # A 1 0 0 0
    # C 0 1 0 0
    # T/U 0 0 0 1
    # G 0 0 1 0
    # N 0 0 0 0

    encoding_dict = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        'U': [0, 0, 0, 1],
        'N': [0, 0, 0, 0],
    }

    encoded_sequence = []
    number_of_unique_kmers.add(kmer_token)
    for  base in kmer_token:
        encoded_sequence.append(encoding_dict[base])
    return np.array(encoded_sequence).flatten()

def applyOneHotEncoding(tokenized_sequences):
    encoded_sequences = []
    for seq in tokenized_sequences:
        encoded_sequences.append(encode_seq(seq)) 
 
    return np.array(encoded_sequences).flatten()


def encode_with_k_mer_codon(sequence):
    encoded_sequence = []
    for i in range(len(sequence) - k + 1):
        encoded_sequence.append(kmer_dict[sequence[i:i+k]] )

    return np.array(encoded_sequence)

def convertToTensor(x):
    return torch.tensor(x)

### Encode X

In [34]:
X_encoded  =  x_data_with_complete_sequence['Sequence'].apply(encode_with_k_mer_codon)
X_encoded = torch.tensor(X_encoded , dtype =  torch.long)

In [35]:
X_encoded.shape

torch.Size([309214, 99])

### Encode Y

In [36]:

# encoder = LabelEncoder()
# y_data = pd.Series(y_data.squeeze())
# y_encoded = encoder.fit_transform(y_data)

print("Unique Values : " , y_data.value_counts())
y_data = pd.Series(y_data.squeeze())
y_encoded = y_data.where(y_data=='NonMoD', other=1).replace('NonMoD', 0)


unique, counts = np.unique(y_encoded, return_counts=True)
value_counts = dict(zip(unique, counts))
print(value_counts)

Unique Values :  hAm       154607
NonMoD    154607
Name: 101, dtype: int64
{0: 154607, 1: 154607}


In [37]:
y_encoded  = np.array(y_encoded)
y_encoded.shape

(309214,)

In [38]:
print("Generate Train and Split..")
# Train set
X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

# Test and Validation set
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


Generate Train and Split..


In [20]:
X_encoded = None
y_encoded = None
x_data_filtered , y_data_filtered = None,None
x_data , y_data = None , None

x_data_with_complete_sequence = None

print("Train Y Count : " ,Counter(y_train))
print("Test Y Count : " ,Counter(y_test))


Train Y Count :  Counter({1: 108282, 0: 108167})
Test Y Count :  Counter({1: 23252, 0: 23131})


### Balance Datset

In [39]:
y_train = torch.tensor(y_train , dtype=torch.long)
y_test = torch.tensor(y_test , dtype=torch.long)
y_valid = torch.tensor(y_valid , dtype=torch.long)

In [40]:
print(X_train.shape)
print(y_train.shape)

torch.Size([216449, 99])
torch.Size([216449])


In [23]:
print(f"Shape of X Train : {X_train.shape}")

Shape of X Train : torch.Size([216449, 99])


In [23]:
hyperparameter = {}
hyperparameter['INPUT_DIMENSION'] = len(kmer_dict) # For One Hot Encoding Input Dimension would be 4 as there only 4 unique nucleocide
hyperparameter['HIDDEN_DIMENSION'] = 32
hyperparameter['NO_OF_LAYERS'] = 4
hyperparameter['BATCH_SIZE'] = 32
hyperparameter['OUTPUT_DIMENSION'] = 1
hyperparameter['EMBEDDING_DIMENSION'] = 256 # if you are using Word2Vec Encoding then this should be same as Word2Vec Embedding Dim
hyperparameter['DROP_OUT'] = 0.4
hyperparameter['LEARNING_RATE'] = 0.00001




train_dataset = RNADataset(X_train, y_train)
test_dataset = RNADataset(X_test, y_test)
valid_dataset = RNADataset(X_valid, y_valid)


train_dataloader = DataLoader(train_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle=False)

In [25]:
# Check data is in correct shape - batch size , sequece len , embedding dimension size
for inputs, labels in train_dataloader:
    print(inputs.shape)
    break

torch.Size([32, 99])


In [None]:
model = RNATransformerModel(input_dim=hyperparameter['INPUT_DIMENSION'],
                            embedding_dim=hyperparameter['EMBEDDING_DIMENSION'],
                            hidden_dim=hyperparameter['HIDDEN_DIMENSION'] ,
                            num_layers = hyperparameter['NO_OF_LAYERS'],
                            output_dim=hyperparameter['OUTPUT_DIMENSION'],
                            dropout=hyperparameter['DROP_OUT'] )


loss_function = nn.BCEWithLogitsLoss()  ## MSELoss of Regression problem  # BCELoss for binary classification
optimizer = optim.Adam(model.parameters() ,  lr=hyperparameter['LEARNING_RATE'])

# Number of Parameters for Model
total_parameters = []
for p in model.parameters():
    total_parameters.append(p.numel())

print(f"Total Number of Parameters for Model Training : { sum(total_parameters)} " )

# Train the model
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Model Parameters  : " , hyperparameter)

# Train Model with configured Parameter
train_model(model, train_dataloader ,test_dataloader, device ,num_epochs,optimizer,loss_function)

In [None]:
model

In [None]:
# Evaluate the model on the test dataset
_, final_accuracy, true_labels, predicted_labels = validate_model(model, valid_dataloader,device,loss_function)

# Print the final accuracy
print(f"Final Accuracy: {final_accuracy:.4f}")

# Print the classification summary
print("\n Classification Summary:")
print(classification_report(true_labels, predicted_labels))

## Automate Train and Test for All Class

### Prepare Dataset

In [8]:
k = 1

if k != 1 :
    with open(ENCODING_FILE, 'rb') as f:
        kmer_dict = pickle.load(f)

hyperparameter = {}
hyperparameter['INPUT_DIMENSION'] = len(kmer_dict) # For One Hot Encoding Input Dimension would be 4 as there only 4 unique nucleocide
hyperparameter['HIDDEN_DIMENSION'] = 32
hyperparameter['NO_OF_LAYERS'] = 4
hyperparameter['BATCH_SIZE'] = 32
hyperparameter['OUTPUT_DIMENSION'] = 1
hyperparameter['EMBEDDING_DIMENSION'] = 256 # if you are using Word2Vec Encoding then this should be same as Word2Vec Embedding Dim
hyperparameter['DROP_OUT'] = 0.4
hyperparameter['LEARNING_RATE'] = 0.00001


number_of_unique_kmers = set()
def encode_seq(kmer_token):

    # A 1 0 0 0
    # C 0 1 0 0
    # T/U 0 0 0 1
    # G 0 0 1 0
    # N 0 0 0 0

    encoding_dict = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        'U': [0, 0, 0, 1],
        'N': [0, 0, 0, 0],
    }

    encoded_sequence = []
    number_of_unique_kmers.add(kmer_token)
    for  base in kmer_token:
        encoded_sequence.append(encoding_dict[base])
    return np.array(encoded_sequence).flatten()

def applyOneHotEncoding(tokenized_sequences):
    encoded_sequences = []
    for seq in tokenized_sequences:
        encoded_sequences.append(encode_seq(seq)) 
 
    return np.array(encoded_sequences).flatten()

def encode_with_k_mer_codon(sequence):
    encoded_sequence = []
    for i in range(len(sequence) - k + 1):
        encoded_sequence.append(kmer_dict[sequence[i:i+k]] )

    return np.array(encoded_sequence)

def convertToTensor(x):
    return torch.tensor(x)


def prepare_dataset(x_data , y_data):

    concatenated_column= x_data.apply(lambda row: ''.join(map(str, row)), axis=1)
    x_data_with_complete_sequence = x_data.assign(Sequence=concatenated_column)

    X_encoded  =  x_data_with_complete_sequence['Sequence'].apply(encode_with_k_mer_codon)
    X_encoded = torch.tensor(X_encoded , dtype =  torch.long)


    print("Unique Values : " , y_data.value_counts())
    y_data = pd.Series(y_data.squeeze())
    y_encoded = y_data.where(y_data=='NonMoD', other=1).replace('NonMoD', 0)
    y_encoded  = np.array(y_encoded)

    print("Generate Train and Split..")
    # Train set
    X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

    # Test and Validation set
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


    X_encoded = None
    y_encoded = None
    x_data_filtered , y_data_filtered = None,None
    x_data , y_data = None , None

    x_data_with_complete_sequence = None

    #print("Train Y Count : " ,Counter(y_train))
    #print("Test Y Count : " ,Counter(y_test))


    y_train = torch.tensor(y_train , dtype=torch.long)
    y_test = torch.tensor(y_test , dtype=torch.long)
    y_valid = torch.tensor(y_valid , dtype=torch.long)

    train_dataset = RNADataset(X_train, y_train)
    test_dataset = RNADataset(X_test, y_test)
    valid_dataset = RNADataset(X_valid, y_valid)


    train_dataloader = DataLoader(train_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle = True)
    test_dataloader = DataLoader(test_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle=False)
    valid_dataloader = DataLoader(valid_dataset, batch_size=hyperparameter['BATCH_SIZE'], shuffle=False)


    return train_dataloader , test_dataloader ,  valid_dataloader



In [6]:
def train(train_dataloader ,test_dataloader):
    model = RNATransformerModel(input_dim=hyperparameter['INPUT_DIMENSION'],
                            embedding_dim=hyperparameter['EMBEDDING_DIMENSION'],
                            hidden_dim=hyperparameter['HIDDEN_DIMENSION'] ,
                            num_layers = hyperparameter['NO_OF_LAYERS'],
                            output_dim=hyperparameter['OUTPUT_DIMENSION'],
                            dropout=hyperparameter['DROP_OUT'] )


    loss_function = nn.BCEWithLogitsLoss()  ## MSELoss of Regression problem  # BCELoss for binary classification
    optimizer = optim.Adam(model.parameters() ,  lr=hyperparameter['LEARNING_RATE'])

    # Number of Parameters for Model
    total_parameters = []
    for p in model.parameters():
        total_parameters.append(p.numel())

    print(f"Total Number of Parameters for Model Training : { sum(total_parameters)} " )

    # Train the model
    num_epochs = 20
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)


    print("Model Parameters  : " , hyperparameter)

    # Train Model with configured Parameter
    train_model(model, train_dataloader ,test_dataloader, device ,num_epochs,optimizer,loss_function)

    return model , loss_function

In [7]:
def calculate_accuracy(valid_dataloader , model , loss_function):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Evaluate the model on the test dataset
    _, final_accuracy, true_labels, predicted_labels = validate_model(model, valid_dataloader,device,loss_function)

    # Print the final accuracy
    print(f"Final Accuracy: {final_accuracy:.4f}")

    # Print the classification summary
    print("\n Classification Summary:")
    print(classification_report(true_labels, predicted_labels))

    return final_accuracy

In [None]:
#BINARY_DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/Atol_with_ROS.csv"
#ENCODING_FILE = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/3-mer-dictionary.pkl"

# 'hAm', 'hCm', 'hGm','hTm','hm1A', 'hm5C', 'hm5U',

class_list = ['hm6A','hm6Am','hm7G','hPsi','Atol']
class_accuracy_dict = {}

for mclass in class_list:

    BINARY_DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/"+mclass+"_with_ROS.csv"
    #ENCODING_FILE = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/3-mer-dictionary.pkl"

    #BINARY_DATA_PATH = "../../../data/"+mclass+"_with_ROS.csv"
    print("Processing File : " , BINARY_DATA_PATH)
    binary_raw_dataframe =  pd.read_csv(BINARY_DATA_PATH, header=None  , skiprows=1)
    print(f"Shape of Dataset  : {binary_raw_dataframe.shape}")

    x_data = binary_raw_dataframe.iloc[:,0:101] # Select data
    y_data = binary_raw_dataframe.iloc[:,101] # Select Target

    train_dataloader , test_dataloader ,  valid_dataloader = prepare_dataset(x_data,y_data)

    model , loss_function  = train(train_dataloader, test_dataloader)

    print("Accuracy for Class " , mclass)
    final_accuracy  = calculate_accuracy(valid_dataloader , model , loss_function)

    # Nullify Object
    x_data = None
    y_data = None
    binary_raw_dataframe = None

    class_accuracy_dict[mclass] = final_accuracy

    model_path =  '/content/drive/My Drive/Colab Notebooks/Capstone Project/model/Binary/'+mclass+'_model.pt'
    torch.save(model, model_path)

    break # Reduce Unnecessary GPU usage

print("Final Result : " , class_accuracy_dict)



### Load Pre Trained Model and Perform Model Interpretability

In [3]:

class RNATransformerModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim, dropout=0.5):
        super(RNATransformerModel, self).__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # If batch size first is true then it should be batch size , sequence lenght , embedding dimension
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=8, dim_feedforward=hidden_dim , batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.long()
        #print("Shape of Original X  ", x.shape)
        x_embedded = self.embedding(x)
        #print("Shape of X embedded" , x_embedded.shape)
        x_transformed = self.transformer_encoder(x_embedded)
        #print("Shape of Transformed X" , x_transformed.shape)
        x_transformed = x_transformed[:, -1, :]  # taking the last token's output

        output = self.dropout(x_transformed)
        out = self.fc(output)
        return out.squeeze()

model_path =  "../../../models/Transfomer_3Mer/hAm_model.pt"
model = torch.load(model_path ,map_location=torch.device('cpu'))
model

RNATransformerModel(
  (embedding): Embedding(94, 256)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=32, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=32, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=32, bias=True)
        (dropout):

In [67]:
'''
Helper Function for Feature Encoding.
'''

import pickle
import torch
import numpy as np


def encode_with_k_mer_codon(sequence, kmer_dict, k):
    encoded_sequence = []
    for i in range(len(sequence) - k + 1):
        encoded_sequence.append(kmer_dict[sequence[i:i + k]])
    return np.array(encoded_sequence)


def encode_sequence(sequence: str, encoding_file: str):
    k = 3
    kmer_dict = {}
    try:
        with open(encoding_file, 'rb') as f:
            kmer_dict = pickle.load(f)
    except FileNotFoundError:
        raise ValueError("File not found! Please ensure the file path is correct.")
    except Exception as e:
        raise ValueError("An error occurred while loading the file: " + str(e))

    print(f"Encoding file successfully loaded.")

    if len(sequence) != 101:
        raise ValueError('Invalid Sequence Length. Expected Sequence Length is 101.')

    x_encoded = encode_with_k_mer_codon(sequence, kmer_dict, k)
    X_encoded = torch.tensor([x_encoded], dtype=torch.long)

    return X_encoded # Adding extra dimension for batch size 



encoding_file = 'C:/Users/shashi.vish/Documents/Shashi/Education/HigherEducation/NUS/Capstone Project/Git/RNA-ModX/RNAModXApp//notebooks/model_building/LSTM//3-mer-dictionary.pkl'
sequence = 'TTGCCACACTGCTGGACGCCTGCAAGGCCAAGGGTACGGAGGTCATCATCATCACCACCGATACCTCGCCCTCAGGCACCAAGAAGACCCGGCAGTATCTC'
x_train = encode_sequence(sequence, encoding_file)
print(x_train)

Encoding file successfully loaded.
tensor([[35, 28, 29,  1, 36, 37, 36, 58, 25, 28, 24, 25, 45, 51, 57, 41, 30, 29,
          6, 25, 28, 18, 60, 48, 20, 17, 29,  1, 60, 48, 20, 40, 21, 46,  4, 41,
         16, 51, 39, 20, 21, 10, 11,  2, 12, 11,  2, 12, 11,  2, 12, 11, 36,  5,
          1, 36,  5, 23, 38, 55,  3,  4,  5,  6, 14, 15, 30, 29,  0,  6, 14, 11,
         19, 20, 17, 18, 36,  5,  1, 60, 48, 56, 53, 48, 56, 57,  5,  0, 23, 16,
         17, 18, 19,  9, 46, 32, 12, 22, 14]])


In [68]:
# Make Predictoin

# 0  - Non Modified RNA Nucleoside
# 1  - Corresponding Modified Nucleoside

print("Shape of X_train " , x_train.shape)

model.eval()
with torch.no_grad():
    output = model(x_train)
    print(output.unsqueeze(-1))
    print("Raw Output : " , output ," Shape : " , output.shape )
    probabilities = torch.sigmoid(output)
    print("Probabilities : " , probabilities)
    predicted_class = (probabilities > 0.5).float()  
    print("Predicted Class : " , predicted_class)  

Shape of X_train  torch.Size([1, 99])
tensor([6.1201])
Raw Output :  tensor(6.1201)  Shape :  torch.Size([])
Probabilities :  tensor(0.9978)
Predicted Class :  tensor(1.)


In [1]:
class WrapperModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x).unsqueeze(-1)
        
wrapped_model = WrapperModel(model)

X_train = X_train.float()
X_test = X_test.float()



# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(wrapped_model,  X_train[:1000])

# explain the first 10 predictions
# explaining each prediction requires 2 * background dataset size runs
shap_values = explainer.shap_values(x_train)

print(shap_values)

# # plot the feature attributions
shap.summary_plot(shap_values, x_train)




NameError: name 'nn' is not defined

In [1]:
!pip install flask 

Collecting flask
  Downloading Flask-2.3.2-py3-none-any.whl (96 kB)
                                              0.0/96.9 kB ? eta -:--:--
     ----------------                         41.0/96.9 kB 2.0 MB/s eta 0:00:01
     ------------------------------------   92.2/96.9 kB 880.9 kB/s eta 0:00:01
     -------------------------------------- 96.9/96.9 kB 794.7 kB/s eta 0:00:00
Collecting itsdangerous>=2.1.2 (from flask)
  Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Installing collected packages: itsdangerous, flask
Successfully installed flask-2.3.2 itsdangerous-2.1.2



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip
