In [1]:
%pip install transformers
%pip install einops

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig


# Load the lncRNA table and discard every row that has a missing value.
data = pd.read_csv("LncRNA_Dataset.csv")
data = data.dropna()

# DNABERT-2 tokenizer / config / masked-LM model (remote code from the hub).
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("zhihan1996/DNABERT-2-117M", config=config, trust_remote_code=True)

# Cap every sequence at 8000 characters.
# This is done with a single vectorised column assignment rather than a
# `data['Sequence'][i] = ...` loop: after dropna() the index labels are no
# longer 0..len(data)-1, so a label-based loop over range(len(data)) skips
# rows (and had to swallow KeyError), and chained assignment may write to a
# temporary copy instead of `data`.
# assumes the 'Sequence' column holds plain strings -- TODO confirm
MAX_SEQUENCE_CHARS = 8000
data['Sequence'] = data['Sequence'].str[:MAX_SEQUENCE_CHARS]

In [3]:
# Number of sequences embedded before their vectors are flushed to disk.
batch_size = 10

# True until the first chunk has been written; that chunk creates the file
# (with a header row), every later chunk is appended without a header.
first_batch = True

for i in range(0, len(data), batch_size):
    batch_data = data['Sequence'][i:i+batch_size]
    all_embeddings = []

    for cnt, dna in enumerate(batch_data, start=i):
        # Token ids for a single sequence, as a (1, n_tokens) tensor.
        inputs = tokenizer(dna, return_tensors='pt')["input_ids"]

        # Forward pass without gradient tracking; keep the first model output.
        # NOTE(review): `model` is loaded with AutoModelForMaskedLM, so this is
        # presumably the LM-head output rather than encoder states -- confirm.
        with torch.no_grad():
            hidden_states = model(inputs)[0]

        # Max-pool over the token axis to get one fixed-size vector.
        embedding_max = hidden_states[0].max(dim=0).values
        all_embeddings.append(embedding_max.numpy())

        print(f"Processed sequence {cnt+1}/{len(data)}")

    # One row per sequence of this chunk.
    embeddings_df = pd.DataFrame(all_embeddings)

    # First chunk: overwrite and emit the header. Later chunks: append, no header.
    embeddings_df.to_csv(
        "embeddings.csv",
        mode='w' if first_batch else 'a',
        header=first_batch,
        index=False,
    )
    first_batch = False

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [5]:
# Re-read the raw table and keep only rows without missing values.
data = pd.read_csv("LncRNA_Dataset.csv").dropna()

In [6]:
def focal_binary_cross_entropy(outputs, targets, gamma = 2, num_classes = 4):
    """Focal variant of binary cross-entropy on already-sigmoided outputs.

    Both tensors are flattened. For each element the probability assigned to
    the true side is taken (``outputs`` where ``targets >= 0.5``, otherwise
    ``1 - outputs``), its negative log is weighted by ``(1 - p) ** gamma``,
    and the mean over all elements is multiplied by ``num_classes``.

    Args:
        outputs: tensor of predicted probabilities.
        targets: tensor of the same number of elements; values >= 0.5 count
            as positive.
        gamma: focusing exponent applied to ``1 - p``.
        num_classes: scalar multiplier applied to the mean loss.

    Returns:
        A scalar tensor holding the loss.
    """
    flat_probs = outputs.reshape(-1)
    flat_truth = targets.reshape(-1)

    # Probability the model gave to whichever side is correct.
    p_correct = torch.where(flat_truth >= 0.5, flat_probs, 1 - flat_probs)

    # Clamp only inside the log so it stays finite at 0 and 1.
    eps = 1e-4
    neg_log = -torch.log(torch.clamp(p_correct, eps, 1 - eps))

    # Down-weight elements that are already predicted confidently.
    focal_terms = neg_log * ((1 - p_correct) ** gamma)
    return num_classes * focal_terms.mean()

In [None]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

# Feature matrix: one row of the CSV per sequence.
embeddings_df = pd.read_csv("embeddings.csv")
X_embeddings = torch.tensor(embeddings_df.values, dtype=torch.float32).to(device)

# One 0/1 indicator column per compartment; a row is positive when the
# compartment name occurs (case-sensitively) in 'SubCellular_Localization'.
label_columns = ['Nucleus', 'Cytoplasm', 'Chromatin', 'Insoluble cytoplasm']
for label_name in label_columns:
    data[label_name] = data['SubCellular_Localization'].apply(
        lambda text, needle=label_name: int(needle in text)
    )

# Sequential (unshuffled) 80/20 split by row position.
split_idx = int(0.8 * len(X_embeddings))
X_train = X_embeddings[:split_idx]
X_test = X_embeddings[split_idx:]

label_frame = data[label_columns]
y_train = torch.tensor(label_frame[:split_idx].values, dtype=torch.float32).to(device)
y_test = torch.tensor(label_frame[split_idx:].values, dtype=torch.float32).to(device)


In [8]:
class MLP(nn.Module):
    """Fully connected classifier: 4096 -> 2048 -> 512 -> 64 -> 4.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout(0.3); the
    final layer is followed by a sigmoid, so every one of the 4 outputs lies
    in [0, 1].
    """

    def __init__(self):
        super().__init__()
        # Stage 1
        self.fc1 = nn.Linear(4096, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.dropout1 = nn.Dropout(0.3)
        # Stage 2
        self.fc2 = nn.Linear(2048, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.3)
        # Stage 3
        self.fc3 = nn.Linear(512, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.3)
        # Output head
        self.fc4 = nn.Linear(64, 4)

    def forward(self, x):
        """Flatten ``x`` to (batch, -1) and return (batch, 4) probabilities."""
        hidden = x.view(x.size(0), -1)
        stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
        )
        for linear, norm, drop in stages:
            # Normalisation is applied after the ReLU.
            hidden = drop(norm(torch.relu(linear(hidden))))
        return torch.sigmoid(self.fc4(hidden))

In [9]:
class TextCNN(nn.Module):
    """Three parallel 1-D convolutions over a 4096-long vector, then an MLP.

    The layer sizes imply a 2-D input of shape (batch, 4096): the convolutions
    take one input channel, and ``LayerNorm([4096, 64])`` requires the
    transposed conv output to be (batch, 4096, 64). Each branch is pooled to
    (batch, 4096); the three branches are concatenated to (batch, 12288) and
    pushed through five linear layers ending in a sigmoid over 4 outputs.
    """

    def __init__(self):
        super().__init__()
        # Same-length convolutions with kernel sizes 1, 3 and 5.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=5, padding=2)
        # One LayerNorm per branch, normalising over the last two dims.
        self.layer_norm1 = nn.LayerNorm([4096, 64])
        self.layer_norm2 = nn.LayerNorm([4096, 64])
        self.layer_norm3 = nn.LayerNorm([4096, 64])
        self.relu = nn.ReLU()
        self.adaptive_max_pool = nn.AdaptiveMaxPool1d(1)
        # Dense head: 3 * 4096 = 12288 inputs down to 4 outputs.
        self.fc1 = nn.Linear(12288, 4000)
        self.fc2 = nn.Linear(4000, 1600)
        self.fc3 = nn.Linear(1600, 400)
        self.fc4 = nn.Linear(400, 64)
        self.fc_out = nn.Linear(64, 4)

    def _branch(self, conv, norm, signal):
        """Run one conv branch: (batch, 1, 4096) in, (batch, 4096) out."""
        feats = conv(signal).permute(0, 2, 1)   # (batch, 4096, 64)
        feats = self.relu(norm(feats))
        # The pool collapses the last axis, i.e. the 64 conv channels.
        return self.adaptive_max_pool(feats).squeeze(dim=2)

    def forward(self, x):
        """Return (batch, 4) sigmoid scores for a (batch, 4096) input."""
        # (batch, 4096) -> (batch, 4096, 1) -> (batch, 1, 4096) for Conv1d.
        signal = x.unsqueeze(2).permute(0, 2, 1)

        pooled = [
            self._branch(self.conv1, self.layer_norm1, signal),
            self._branch(self.conv2, self.layer_norm2, signal),
            self._branch(self.conv3, self.layer_norm3, signal),
        ]
        hidden = torch.cat(pooled, dim=1)       # (batch, 12288)

        for dense in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.relu(dense(hidden))

        # Sigmoid keeps each of the 4 outputs independent (multi-label).
        return torch.sigmoid(self.fc_out(hidden))

In [None]:
class BiLSTM_MLP(nn.Module):
    """Bidirectional LSTM followed by a two-layer MLP with 4 sigmoid outputs.

    Args:
        input_size: feature size of each time step fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        mlp_hidden_size: width of the MLP's hidden layer.
        dropout: dropout probability inside the MLP.
    """

    def __init__(self, input_size, hidden_size, num_layers, mlp_hidden_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Batch-first recurrent encoder reading the sequence in both directions.
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Classifier head; its input is 2 * hidden_size because the forward
        # and backward LSTM outputs are concatenated.
        head_layers = [
            nn.Linear(hidden_size * 2, mlp_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden_size, 4),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Encode ``x`` and classify from the LSTM output at the last step."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)


# Hyperparameters intended for BiLSTM_MLP.
input_size = 64    # embedding dimension
hidden_size = 64
num_layers = 1
mlp_hidden_size = 64

In [None]:
class MyDataset(Dataset):
    """Pairs two indexable containers so that item ``i`` is ``(X[i], y[i])``."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target


# Wrap the pre-split tensors; only the training loader shuffles.
train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Classifier under training (swap in the MLP by changing the line below).
# model = MLP().to(device)
model = TextCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard step: clear grads, forward, focal loss, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = focal_binary_cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {mean_loss:.4f}")

In [None]:
# Precision@1: fraction of test samples whose single highest-scoring class
# is one of that sample's true labels.
model.eval()
correct_at_rank_1 = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)

        # (batch, 1) indices of the top-scoring class per sample.
        top_pred_indices = outputs.topk(1, dim=1).indices

        # Look up each sample's label at its top-ranked class; a hit is a 1.
        hits = labels.gather(1, top_pred_indices) == 1
        correct_at_rank_1 += int(hits.sum().item())

# Normalise by the total number of test samples.
precision_at_1 = correct_at_rank_1 / len(test_loader.dataset)

print(f"Precision@1: {precision_at_1:.5f}")