In [1]:
! pip install torch



In [2]:
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import csv
import pandas as pd
import numpy as np
from numpy import array

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score ,  roc_curve, auc , classification_report


In [3]:
# BINARY_DATA_PATH = "../../../data/hAm_with_ROS.csv"

# If you running with dataset on drive . Use below code to mount on drive.

from google.colab import drive
drive.mount('/content/drive')
BINARY_DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Capstone Project/data/hAm_with_ROS.csv"



x_train_raw =  pd.read_csv(BINARY_DATA_PATH, header=None , skiprows=1 )
# x_train_raw


Mounted at /content/drive


In [4]:
print(f"Shape of Input Data : {x_train_raw.shape}")

Shape of Input Data : (309214, 102)


In [5]:
number_of_unique_kmers = set()
def one_hot_encode_sequence(kmer_token):

    # A 1 0 0 0
    # C 0 1 0 0
    # T/U 0 0 0 1
    # G 0 0 1 0
    # N 0 0 0 0

    encoding_dict = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        'U': [0, 0, 0, 1],
        'N': [0, 0, 0, 0],
    }

    encoded_sequence = []
    number_of_unique_kmers.add(kmer_token)
    for  base in kmer_token:
        encoded_sequence.append(encoding_dict[base])
    return np.array(encoded_sequence).flatten()

def applyOneHotEncoding(tokenized_sequences):
    encoded_sequences = []
    for seq in tokenized_sequences:
        encoded_sequences.append(one_hot_encode_sequence(seq))
    return np.array(encoded_sequences).flatten()

def applyKmersAndEncoding(seq):
    k=3
    tokens = [seq[i:i+k] for i in range(0, len(seq)-k+1)]
    return tokens


def encode_x_with_k_mer_one_hot_encoding(truncated_df):
    truncated_df['Sequence'] = truncated_df.apply(lambda row: ''.join(map(str, row)), axis=1)
    tokenized_sequences =  truncated_df['Sequence'].apply(applyKmersAndEncoding).tolist()
    #print(tokenized_sequences)
    # The result, tokenized_sequences, is a list of lists, where each inner list
    #  contains the k-mers of the corresponding RNA sequence from the truncated_df list.


    result = []
    for seq in tokenized_sequences:
        embedding = applyOneHotEncoding(seq)
        result.append(embedding)
    return np.array(result)

# def generate_3mers(sequence):
#     k = 3  # Length of k-mer
#     three_mers = []
#     for i in range(len(sequence) - k + 1):
#         kmer = sequence[i:i+k]
#         three_mers.append(''.join(kmer))
#     return three_mers


In [6]:
y = x_train_raw.iloc[: , -1]
x = x_train_raw.iloc[: , :-1]
x_train_raw = None

In [7]:
x_3mers = encode_x_with_k_mer_one_hot_encoding(x)
x_3mers


array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 1]])

In [8]:
print(f"Shape of Encoded Data : {x_3mers.shape} ")

Shape of Encoded Data : (309214, 1188) 


In [9]:
y

0         hAm
1         hAm
2         hAm
3         hAm
4         hAm
         ... 
309209    hAm
309210    hAm
309211    hAm
309212    hAm
309213    hAm
Name: 101, Length: 309214, dtype: object

In [10]:
# y_encoded_df = pd.get_dummies(y)
# y_encoded_df

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_data = pd.Series(y.squeeze())
y_encoded = encoder.fit_transform(y_data)

unique, counts = np.unique(y_encoded, return_counts=True)
value_counts = dict(zip(unique, counts))
print(value_counts)

{0: 154607, 1: 154607}


In [11]:
# First Convert it to Tensor then create Train/Test/Validation
x_data = torch.tensor(x_3mers , dtype = torch.float32)
y_data = torch.tensor(y_encoded , dtype = torch.float32)

print("Generate Train and Split..")
# Train set
X_train, X_temp, y_train, y_temp = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

# Test and Validation set
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


Generate Train and Split..


In [12]:
x_data = None
y_data = None
y_encoded_df = None
x_3mers = None
x  = None
y = None
X_temp = None
y_temp = None

In [13]:
X_train.shape

torch.Size([216449, 1188])

In [14]:
# define a class to build trainloader
class MyDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [15]:
batch_size = 32

train_dataset = MyDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, shuffle=True,  batch_size=batch_size)

# get testloader
test_dataset = MyDataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, shuffle=False,  batch_size=batch_size)

valid_dataset = MyDataset(X_valid, y_valid)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size)


In [16]:
print("Shape of X train : " , len(X_train[0]))
for inputs, labels in test_dataloader:
    print(inputs.shape)
    break

Shape of X train :  1188
torch.Size([32, 1188])


In [17]:
class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob , embedding_dim):
        super(GRUModel, self).__init__()

        # Defining the number of layers and the nodes in each layer
        self.layer_dim = layer_dim #layer of GRU
        self.hidden_dim = hidden_dim #32,64...
        # input_dim is the number of feature dimension (X)

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # GRU layers
        self.gru = nn.GRU(
            embedding_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout_prob)


    def forward(self, x): #what to do when 1 epoch layer is done
        # Initializing hidden state for first input with zeros
        #h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        x  = x.long()
        #print(f"Shape of X  : {x.shape}")

        x_embedded =  self.embedding(x)

        #print(f"Shape of X  : {x_embedded.shape}")
        # Forward propagation by passing in the input and hidden state into the model
        gru_out , h = self.gru(x_embedded)

        #print(f"Shape of gru_out  : {gru_out.shape}")

        # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
        # so that it can fit into the fully connected layer

        #print(f"Shape of Ouput Layer : {h.shape}")
        out = gru_out[:,-1,:] # pick up last hidden state

        # Convert the final state to our desired output shape (batch_size, output_dim)
        out = self.fc(out)

        # Adding drop layer to avoid over fitting
        out = self.dropout(out)

        return out.squeeze() # Squeeze to remove extra dimension

In [18]:
def get_model(model, model_params):
    models = {
        "gru": GRUModel,
    }
    return models.get("gru")(**model_params)

In [24]:

def validate_model(model, test_dataloader , device ,loss_function):
    model.eval()
    running_loss = 0.0
    total = 0
    correct = 0
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for inputs, labels in test_dataloader:

            inputs = inputs.to(device)
            labels = labels.float().to(device)
            outputs = model(inputs)
            if outputs.size() != labels.size(): # skip if batch size mismatch
              print(f"Test Mismatch Found {outputs.size()} , and {labels.size()}")
              #break
              continue
            #print(f"Shape of Label : {labels.shape} and output shape : {outputs.shape} ")
            loss = loss_function(outputs, labels)
            running_loss += loss.item()

            predicted = (torch.sigmoid(outputs) > 0.5).float() # Set threshold at 0.5
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            true_labels.extend(labels.cpu().numpy())  # Capture True Labels for Summary Report
            predicted_labels.extend(predicted.cpu().numpy()) # Capture Predicted Labels Lables for Summary Report

    validation_loss = running_loss / len(test_dataloader)
    validation_accuracy = correct / total

    return validation_loss , validation_accuracy , true_labels , predicted_labels



def train_model(model, train_dataloader, test_dataloader, device, epochs, optimizer, loss_function):
    best_val_loss = float('inf')
    no_improvement_count = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        start_time = time.time()
        #progress_bar = tqdm(train_dataloader, desc='Epoch {:03d}'.format(epoch + 1), leave=False, disable=False)
        for i, (inputs, labels) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            if outputs.size() != labels.size(): # skip if batch size mismatch
              print(f"Train Mismatch Found {outputs.size()} , and {labels.size()}")
              #break
              continue
            #print(f"Shape of Label : {labels.shape} and output shape : {outputs.shape} ")
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            #progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(inputs))})

        epoch_loss = running_loss / len(train_dataloader)
        val_loss,  validation_accuracy , true_labels , predicted_labels = validate_model(model, test_dataloader, device, loss_function)
        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Test Accuracy: {validation_accuracy:.4f} , Time Taken : {elapsed_time}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0
        else:
            no_improvement_count += 1
            if no_improvement_count == 10:
                print("No improvement in validation loss for 5 epochs. Training stopped.")
                break



In [22]:
import torch.optim as optim

input_dim = len(X_train[0])
output_dim = 1
embedding_dim = 32
hidden_dim = 7
layer_dim = 3
batch_size = 32
# batch size dhould be 32,64,128 ..
dropout = 0.2
n_epochs = 10
learning_rate = 1e-3
weight_decay = 1e-6
device = torch.device("cpu")

model_params = {'input_dim': input_dim,
                'embedding_dim' : embedding_dim,
                'hidden_dim' : hidden_dim,
                'layer_dim' : layer_dim,
                'output_dim' : output_dim,
                'dropout_prob' : dropout}

model = get_model('gru', model_params)

loss_fn = nn.BCEWithLogitsLoss()  ## MSELoss of Regression problem  # BCELoss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


# Number of Parameters for Model
total_parameters = []
for p in model.parameters():
    total_parameters.append(p.numel())

print(f"Total Number of Parameters for Model Training : { sum(total_parameters)} " )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Model Parameters  : " , model_params)

# Train Model with configured Parameter
train_model(model, train_dataloader ,test_dataloader, device ,n_epochs,optimizer,loss_fn)


Total Number of Parameters for Model Training : 39557 
Model Parameters  :  {'input_dim': 1188, 'embedding_dim': 32, 'hidden_dim': 7, 'layer_dim': 3, 'output_dim': 1, 'dropout_prob': 0.2}
Train Mismatch Found torch.Size([]) , and torch.Size([1])
torch.Size([32, 1188])
Epoch 1, Train Loss: 0.6908, Val Loss: 0.6857, Test Accuracy: 0.5497 , Time Taken : 62.693838119506836
Train Mismatch Found torch.Size([]) , and torch.Size([1])
torch.Size([32, 1188])
Epoch 2, Train Loss: 0.6885, Val Loss: 0.6952, Test Accuracy: 0.4925 , Time Taken : 63.1167151927948
Train Mismatch Found torch.Size([]) , and torch.Size([1])
torch.Size([32, 1188])
Epoch 3, Train Loss: 0.6922, Val Loss: 0.6906, Test Accuracy: 0.5401 , Time Taken : 62.62605285644531
Train Mismatch Found torch.Size([]) , and torch.Size([1])
torch.Size([32, 1188])
Epoch 4, Train Loss: 0.6871, Val Loss: 0.6819, Test Accuracy: 0.5731 , Time Taken : 62.77819013595581
Train Mismatch Found torch.Size([]) , and torch.Size([1])
torch.Size([32, 1188])

In [23]:
# Evaluate the model on the test dataset
_, final_accuracy, true_labels, predicted_labels = validate_model(model, valid_dataloader,device,loss_fn)

# Print the final accuracy
print(f"Final Accuracy: {final_accuracy:.4f}")

# Print the classification summary
print("\n Classification Summary:")
print(classification_report(true_labels, predicted_labels))

torch.Size([32, 1188])
Final Accuracy: 0.6406

 Classification Summary:
              precision    recall  f1-score   support

         0.0       0.65      0.60      0.63     23309
         1.0       0.63      0.68      0.65     23073

    accuracy                           0.64     46382
   macro avg       0.64      0.64      0.64     46382
weighted avg       0.64      0.64      0.64     46382

