In [None]:
import torch
from transformers import BertConfig, AutoModel, AutoTokenizer

model_name = "zhihan1996/DNABERT-2-117M"

config = BertConfig.from_pretrained(model_name, trust_remote_code=True)

model = AutoModel.from_pretrained(model_name, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

bert_layers.py:   0%|          | 0.00/40.7k [00:00<?, ?B/s]

bert_padding.py:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


flash_attn_triton.py:   0%|          | 0.00/42.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- bert_layers.py
- bert_padding.py
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/468M [00:00<?, ?B/s]

In [None]:
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = tokenizer(dna, return_tensors='pt')["input_ids"]
hidden_states = model(inputs)[0] # [1, sequence_length, 768]

# embedding with mean pooling
embedding_mean = torch.mean(hidden_states[0], dim=0)
print(embedding_mean.shape) # expect to be 768

# embedding with max pooling
embedding_max = torch.max(hidden_states[0], dim=0)[0]
print(embedding_max.shape) # expect to be 768

print(hidden_states[0].shape)



AssertionError: 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls /content/drive/MyDrive/dataset_sqa/dataset_0/new_test_sampled

test_sampled_0.csv   test_sampled_19.csv  test_sampled_28.csv  test_sampled_37.csv
test_sampled_10.csv  test_sampled_1.csv   test_sampled_29.csv  test_sampled_38.csv
test_sampled_11.csv  test_sampled_20.csv  test_sampled_2.csv   test_sampled_3.csv
test_sampled_12.csv  test_sampled_21.csv  test_sampled_30.csv  test_sampled_4.csv
test_sampled_13.csv  test_sampled_22.csv  test_sampled_31.csv  test_sampled_5.csv
test_sampled_14.csv  test_sampled_23.csv  test_sampled_32.csv  test_sampled_6.csv
test_sampled_15.csv  test_sampled_24.csv  test_sampled_33.csv  test_sampled_7.csv
test_sampled_16.csv  test_sampled_25.csv  test_sampled_34.csv  test_sampled_8.csv
test_sampled_17.csv  test_sampled_26.csv  test_sampled_35.csv  test_sampled_9.csv
test_sampled_18.csv  test_sampled_27.csv  test_sampled_36.csv


In [None]:
!pip uninstall triton

Found existing installation: triton 3.1.0
Uninstalling triton-3.1.0:
  Would remove:
    /usr/local/bin/proton
    /usr/local/bin/proton-viewer
    /usr/local/lib/python3.11/dist-packages/triton-3.1.0.dist-info/*
    /usr/local/lib/python3.11/dist-packages/triton/*
Proceed (Y/n)? y
  Successfully uninstalled triton-3.1.0


In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertConfig, AutoModel, AutoTokenizer, BertModel

class SequenceDataset(Dataset):
    def __init__(self, sequences, questions, labels, tokenizer, tokenizer_q):
        self.sequences = sequences
        self.questions = questions
        self.labels = labels
        self.tokenizer = tokenizer
        self.tokenizer_q = tokenizer_q

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        question = self.questions[idx]
        inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', max_length=512, truncation=True)["input_ids"]
        dna_attention_mask = inputs != self.tokenizer.pad_token_id
        dna_attention_mask = dna_attention_mask.squeeze(1)  # 去掉多余的维度，使形状变为 (256, 512)

        encoded_q = self.tokenizer_q(
            text=question,
            padding='max_length',
            max_length=16,
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=True,
            return_attention_mask=True,
        )
        question = {"input_ids": encoded_q['input_ids'].squeeze(),
                    "token_type_ids": encoded_q['token_type_ids'].squeeze(),
            "attention_mask": encoded_q['attention_mask'].squeeze()}
        return inputs.squeeze(0), torch.tensor(label, dtype=torch.long), question["input_ids"], question['attention_mask'], question["token_type_ids"], dna_attention_mask

# df = pd.read_csv('modified_expanded_processed_sequences.csv')

train_dataset_path = '/content/drive/MyDrive/dataset_sqa/dataset_0/new_train_sampled.csv'
test_dataset_path = '/content/drive/MyDrive/dataset_sqa/dataset_0/new_test_sampled.csv'

df_train = pd.read_csv(train_dataset_path)
df_test = pd.read_csv(test_dataset_path)
train_seqs = df_train['ans'].tolist()
# test_seqs = df_test['ans'].tolist()
unique_answers = set(df_train['ans'].unique()).union(set(df_test['ans'].unique()))
answer_to_label = {answer: idx for idx, answer in enumerate(unique_answers)}

sequences_train = df_train['seq'].tolist()
questions_train = df_train['question'].tolist()
labels_train = [answer_to_label[str(answer)] for answer in df_train['ans'].tolist()]

# sequences_test = df_test['seq'].tolist()
# questions_test = df_test['question'].tolist()
# labels_test = [answer_to_label[answer] for answer in df_test['ans'].tolist()]

# model_name = "zhihan1996/DNABERT-2-117M"
# config = BertConfig.from_pretrained(model_name, trust_remote_code=True)
# dna_model = AutoModel.from_pretrained(model_name, config=config, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model_name = "InstaDeepAI/nucleotide-transformer-500m-human-ref"
dna_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer_q = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

dataset_train = SequenceDataset(sequences_train, questions_train, labels_train, tokenizer, tokenizer_q)
# dataset_test = SequenceDataset(sequences_test, questions_test, labels_test, tokenizer, tokenizer_q)
train_size = len(dataset_train)
# test_size = len(dataset_test)


train_loader = DataLoader(dataset_train, batch_size=256, shuffle=True)
# test_loader = DataLoader(dataset_test, batch_size=256, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
import torch.nn as nn
import torch.optim as optim

# class Classifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(Classifier, self).__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_dim, output_dim)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         return x

class Classifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(Classifier, self).__init__()
        self.base_model = dna_model

        self.qs_base_model = BertModel.from_pretrained('dmis-lab/biobert-v1.1')

        self.fc1 = nn.Linear(1280, 256)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.1)

        self.qs_fc1 = nn.Linear(768, 256)
        self.qs_relu1 = nn.ReLU()
        self.qs_dropout1 = nn.Dropout(0.1)

        # Fusion layer after concatenation of seq and qs branches
        self.fc_fusion = nn.Linear(512, 256)
        self.relu_fusion = nn.ReLU()
        self.dropout_fusion = nn.Dropout(0.1)

        self.classifier = nn.Linear(256, num_labels)

    def forward(self, input_ids, question, attention_mask=None, token_type_ids=None, dna_attention_mask=None):
        with torch.no_grad():
            outputs = self.base_model(input_ids=input_ids, attention_mask=dna_attention_mask, encoder_attention_mask=dna_attention_mask)
        last_hidden_state = outputs[0]

        with torch.no_grad():
            qs_outputs = self.qs_base_model(input_ids=question, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)

        seq_x = self.fc1(last_hidden_state[:, 0, :])
        seq_x = self.relu1(seq_x)
        seq_x = self.dropout1(seq_x)

        qs_x = self.qs_fc1(qs_outputs['pooler_output'])
        qs_x = self.qs_relu1(qs_x)
        qs_x = self.qs_dropout1(qs_x)

        # Fusion: concatenate the two branches' outputs
        combined_x = torch.cat((seq_x, qs_x), dim=1)

        # Pass through fusion layer
        x = self.fc_fusion(combined_x)
        x = self.relu_fusion(x)
        x = self.dropout_fusion(x)

        logits = self.classifier(x)
        return logits

output_dim = 37
model = Classifier(model_name, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

log_interval = 10

for epoch in range(num_epochs):
    model.train()
    print('begin')
    cnt = 0
    for batch_idx, (inputs, labels, question, attention_mask, token_type_ids, dna_attention_mask) in enumerate(train_loader):
        input_ids = inputs.to(device)
        labels = labels.to(device)
        question, attention_mask, token_type_ids = question.to(device), attention_mask.to(device), token_type_ids.to(device)
        dna_attention_mask = dna_attention_mask.squeeze(1)
        dna_attention_mask = dna_attention_mask.to(device)
        # print(input_ids.shape)
        # print(dna_attention_mask.shape)

        optimizer.zero_grad()

        outputs = model(input_ids, question, attention_mask, token_type_ids, dna_attention_mask)

        loss = criterion(outputs, labels.squeeze())
        # with torch.no_grad():
        #     hidden_states = model(inputs)[0]  # [batch_size, sequence_length, 768]
        #     embeddings = torch.mean(hidden_states, dim=1)
        # optimizer.zero_grad()
        # outputs = classifier(embeddings)

        # loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(input_ids), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    model.eval()
    for i in range(0, 39):
        dataset_name_t = f'/content/drive/MyDrive/dataset_sqa/dataset_0/new_test_sampled/test_sampled_{i}.csv'
        # dataset_test_t = GenomicBenchmarkDataset(
        #     max_length = max_length,
        #     d_output = 37,
        #     use_padding = use_padding,
        #     tokenizer_q = tokenizer_q,
        #     tokenizer=tokenizer,
        #     dataset_name=dataset_name_t,
        #     rc_aug=rc_aug,
        #     add_eos=add_eos,
        #     answer_to_label=answer_to_label
        # )
        sub_df_test = pd.read_csv(dataset_name_t)
        sub_test_seqs = sub_df_test['ans'].tolist()
        sub_sequences_test = sub_df_test['seq'].tolist()
        sub_questions_test = sub_df_test['question'].tolist()
        sub_labels_test = [answer_to_label[str(answer)] for answer in sub_df_test['ans'].tolist()]
        sub_dataset_test = SequenceDataset(sub_sequences_test, sub_questions_test, sub_labels_test, tokenizer, tokenizer_q)
        sub_test_size = len(sub_dataset_test)
        test_loader = DataLoader(sub_dataset_test, batch_size=256, shuffle=False)
        # t_loader = DataLoader(dataset_test_t, batch_size=batch_size, shuffle=False)
        print(i)
        # test(model, device, t_loader, loss_fn)
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for (inputs, labels, question, attention_mask, token_type_ids, dna_attention_mask) in test_loader:
                input_ids = inputs.to(device)
                labels = labels.to(device)
                question, attention_mask, token_type_ids = question.to(device), attention_mask.to(device), token_type_ids.to(device)
                dna_attention_mask = dna_attention_mask.squeeze(1)
                dna_attention_mask = dna_attention_mask.to(device)
                outputs = model(input_ids, question, attention_mask, token_type_ids, dna_attention_mask)
                test_loss += criterion(outputs, labels.squeeze()).item()  # sum up batch loss
                pred = outputs.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(labels.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)

        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
    #     for inputs, labels in test_loader:

    #         hidden_states = model(inputs)[0]
    #         embeddings = torch.mean(hidden_states, dim=1)
    #         outputs = classifier(embeddings)

    #         _, predicted = torch.max(outputs.data, 1)
    #         total += labels.size(0)
    #         correct += (predicted == labels).sum().item()

    # print(f'Accuracy: {100 * correct / total:.2f}%')


pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

begin
0

Test set: Average loss: 0.0030, Accuracy: 503/1000 (50.30%)

1

Test set: Average loss: 0.0030, Accuracy: 527/1000 (52.70%)

2

Test set: Average loss: 0.0034, Accuracy: 388/1000 (38.80%)

3

Test set: Average loss: 0.0029, Accuracy: 510/1000 (51.00%)

4

Test set: Average loss: 0.0029, Accuracy: 505/1000 (50.50%)

5

Test set: Average loss: 0.0030, Accuracy: 533/1000 (53.30%)

6

Test set: Average loss: 0.0031, Accuracy: 528/1000 (52.80%)

7

Test set: Average loss: 0.0130, Accuracy: 0/999 (0.00%)

8

Test set: Average loss: 0.0030, Accuracy: 396/1000 (39.60%)

9

Test set: Average loss: 0.0026, Accuracy: 691/1000 (69.10%)

10

Test set: Average loss: 0.0030, Accuracy: 372/1000 (37.20%)

11

Test set: Average loss: 0.0028, Accuracy: 590/1000 (59.00%)

12

Test set: Average loss: 0.0027, Accuracy: 751/1000 (75.10%)

13

Test set: Average loss: 0.0028, Accuracy: 589/1000 (58.90%)

14

Test set: Average loss: 0.0027, Accuracy: 743/1000 (74.30%)

15

Test set: Average loss: 0.010

KeyboardInterrupt: 