In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
data= pd.read_csv('/kaggle/input/promoter-gene-prediction/promoters.data',names=['Class', 'Instance_Name', 'Sequence'])
data

Unnamed: 0,Class,Instance_Name,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
...,...,...,...
101,-,799,\t\tcctcaatggcctctaaacgggtcttgaggggttttttgctga...
102,-,987,\t\tgtattctcaacaagattaaccgacagattcaatctcgtggat...
103,-,1226,\t\tcgcgactacgatgagatgcctgagtgcttccgttactggatt...
104,-,794,\t\tctcgtcctcaatggcctctaaacgggtcttgaggggtttttt...


In [5]:
data['Sequence'] = [item.split(r"\\")[0].strip() for item in data['Sequence']]
data['Sequence'] = data['Sequence'].str.upper()
data['Sequence']

0      TACTAGCAATACGCTTGCGTTCGGTGGTTAAGTATGTATAATGCGC...
1      TGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAA...
2      GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTA...
3      AATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA...
4      TCGATAATTAACTATTGACGAAAAGCTGAAAACCACTAGAATGCGC...
                             ...                        
101    CCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTGAAAGG...
102    GTATTCTCAACAAGATTAACCGACAGATTCAATCTCGTGGATGGAC...
103    CGCGACTACGATGAGATGCCTGAGTGCTTCCGTTACTGGATTGTCA...
104    CTCGTCCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCTG...
105    TAACATTAATAAATAAGGAGGCTCTAATGGCACTCATTAGCCAATC...
Name: Sequence, Length: 106, dtype: object

In [6]:
data['Class_Label'] = pd.factorize(data['Class'])[0]
data['Class_Label']

0      0
1      0
2      0
3      0
4      0
      ..
101    1
102    1
103    1
104    1
105    1
Name: Class_Label, Length: 106, dtype: int64

In [7]:
!huggingface-cli download InstaDeepAI/nucleotide-transformer-v2-250m-multi-species --local-dir /kaggle/working

Fetching 11 files:   0%|                                 | 0/11 [00:00<?, ?it/s]Downloading 'esm_config.py' to '/kaggle/working/.cache/huggingface/download/GRchU6ChDc-ljPnvCbORaVfYBzg=.23313afb28fe512badf134e9d1ce08e405e3656c.incomplete'
Downloading 'modeling_esm.py' to '/kaggle/working/.cache/huggingface/download/C_0n0S8TTevl5Ia3SB4cm37Ir6A=.967189e3be48c42bb5a6af4b8243b54590cbe929.incomplete'
Downloading 'README.md' to '/kaggle/working/.cache/huggingface/download/Xn7B-BWUGOee2Y6hCZtEhtFu4BE=.a0ce6da1d1d4f17e33f0853b2add1a67cdf6bb32.incomplete'
Downloading 'pytorch_model.bin' to '/kaggle/working/.cache/huggingface/download/Q1p2l2BzM1m6P5jKvr8WTq1TUio=.ad9be6f4ca8c744dde5794a0cdc443eb2d76d776d9da2377f6d9fe4f4d96fcd9.incomplete'

esm_config.py: 100%|███████████████████████| 14.9k/14.9k [00:00<00:00, 98.9MB/s][A
Download complete. Moving file to /kaggle/working/esm_config.py

README.md: 100%|███████████████████████████| 6.34k/6.34k [00:00<00:00, 40.8MB/s][A
Download complete. Moving fi

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

class DNADataset(Dataset):
    def __init__(self, sequences, labels, tokenizer, max_length=512):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        print(f"Index requested: {idx}")
        print(f"Raw sequence: {self.sequences[idx]}")
        print(f"Raw label: {self.labels[idx]}")
        sequence = self.sequences[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            sequence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class DNAClassifier(nn.Module):
    def __init__(self, base_model, num_classes, dropout_rate=0.1):
        super(DNAClassifier, self).__init__()
        
        self.base_model = base_model
        
        hidden_size = base_model.config.hidden_size
        
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_classes)
        

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model.esm(input_ids=input_ids, attention_mask=attention_mask)
    
        hidden_states = outputs[0] 
    
        pooled_output = hidden_states[:, 0, :]
    
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
    
        return logits


from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

def train_model(model, train_loader, val_loader, device, num_epochs=10, lr=2e-5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()  
    
    best_val_acc = 0.0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            with autocast():  
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        
        train_acc = train_correct / train_total
        
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                with autocast():  
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        val_acc = val_correct / val_total
        
        print(f'Epoch {epoch+1}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.4f}')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_dna_classifier.pth')



In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
base_model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)

In [None]:
sequences = list(data['Sequence'].reset_index(drop=True))
labels = list(data['Class_Label'].reset_index(drop=True))

train_seqs, val_seqs, train_labels, val_labels = train_test_split(
    sequences, labels, test_size=0.2, random_state=42
)

train_dataset = DNADataset(train_seqs, train_labels, tokenizer)
val_dataset = DNADataset(val_seqs, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

num_classes = len(set(labels)) 
model = DNAClassifier(base_model, num_classes)
model = model.to(device)

train_model(model, train_loader, val_loader, device)

  scaler = GradScaler()  # Enables mixed precision training
  with autocast():  # Enables mixed precision computation
Epoch 1/10 - Training:   0%|          | 0/6 [00:00<?, ?it/s]

Index requested: 79
Raw sequence: TTAGAGAGCATGTCAGCCTCGACAACTTGCATAAATGCTTTCTTGTAGACGTGCCCT
Raw label: 1
Index requested: 64
Raw sequence: CCGTTTATTTTTTCTACCCATATCCTTGAAGCGGTGTTATAATGCCGCGCCCTCGAT
Raw label: 0
Index requested: 0
Raw sequence: CGACCGAAGCGAGCCTCGTCCTCAATGGCCTCTAAACGGGTCTTGAGGGGTTTTTTG
Raw label: 1
Index requested: 51
Raw sequence: TAACATTAATAAATAAGGAGGCTCTAATGGCACTCATTAGCCAATCAATCAAGAACT
Raw label: 1
Index requested: 72
Raw sequence: CAGCGGCAGCACGTTTCCACGCGGTGAGAGCCTCAGGATTCATGTCGATGTCTTCCG
Raw label: 1
Index requested: 70
Raw sequence: GTACTAGAGAACTAGTGCATTAGCTTATTTTTTTGTTATCATGCTAACCACCCGGCG
Raw label: 0
Index requested: 62
Raw sequence: CTACGGTGGGTACAATATGCTGGATGGAGATGCGTTCACTTCTGGTCTACTGACTCG
Raw label: 1
Index requested: 42
Raw sequence: AACGAGTCAATCAGACCGCTTTGACTCTGGTATTACTGTGAACATTATTCGTCTCCG
Raw label: 1
Index requested: 14
Raw sequence: GATCGCACGATCTGTATACTTATTTGAGTAAATTAACCCACGATCCCAGCCATTCTT
Raw label: 0
Index requested: 32
Raw sequence: AAGTGCTTAGCTTCAAGGTCAC




OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 120.12 MiB is free. Process 5200 has 14.62 GiB memory in use. Of the allocated memory 14.46 GiB is allocated by PyTorch, and 37.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [18]:
torch.cuda.empty_cache()