In [20]:
%reload_ext autoreload
%autoreload 0
from pathlib import Path
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, TrainingArguments, Trainer
#from fastai import *
#from fastai.text.all import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, roc_curve, recall_score, f1_score, classification_report
from torch.utils.data import Dataset, DataLoader, SequentialSampler


In [2]:
model_dir = "/home/sm1073/.cache/huggingface/hub/models--zhihan1996--DNABERT-2-117M/DNABERT-2-117M"
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

In [3]:
model = AutoModelForSequenceClassification.from_pretrained(model_dir, trust_remote_code=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/sm1073/.cache/huggingface/hub/models--zhihan1996--DNABERT-2-117M/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
path = Path('/home/sm1073/Documents/independent_project/')

In [5]:
df = pd.read_csv(path/'x_inactivation_genes.csv')
df.head()

Unnamed: 0,GeneName,Sequence,Label
0,ASMTL,AAGTGCGGACGCCCGGCTCCCGGCGTGGACGCCATGGTGCTGTGCC...,1
1,RIBC1,CGGGCGACCGGCAAATGTCGCGAGAATACGTCCAGGCCTAACGGGA...,1
2,MPC1L,CTGTGGCGGAAGATGAGAGATAACTTCCAGAGCAAGGAGTTCCGGG...,1
3,DDX3X,CTTTCCCCTTACTCCGCTCCCCTCTTTTCCCTCCCTCTCCTCCCCT...,1
4,HDHD1,CTCAGTGCGCGTGCGCGGGGCGGGCGGGTGCGCGCGCACTTCCTCC...,1


In [6]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
class DNADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Sequence
        self.labels = dataframe.Label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        dna_sequence = str(self.text.iloc[index])
        inputs = self.tokenizer(dna_sequence, return_tensors='pt', max_length=self.max_len, padding='max_length', truncation=True)
        labels = self.labels.iloc[index]
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50
)

In [9]:
train_dataset = DNADataset(train_df, tokenizer, max_len=512)
val_dataset = DNADataset(val_df, tokenizer, max_len=512)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [10]:
trainer.train()
results = trainer.evaluate()
print(results)

Step,Training Loss,Validation Loss
50,0.6732,0.699614


{'eval_loss': 0.6989232897758484, 'eval_runtime': 34.8368, 'eval_samples_per_second': 1.148, 'eval_steps_per_second': 0.144, 'epoch': 3.0}


In [15]:
class DNADatasetForPrediction(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Sequence
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        dna_sequence = str(self.text.iloc[index])
        inputs = self.tokenizer(dna_sequence, return_tensors='pt', max_length=self.max_len, padding='max_length', truncation=True)
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten()
        }


In [16]:
#dataset
predict_dataset = DNADatasetForPrediction(df, tokenizer, max_len=512)
#dataloader
predict_dataloader = DataLoader(predict_dataset, sampler=SequentialSampler(predict_dataset), batch_size=8)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_features=768, out_f

In [18]:
predictions = []
with torch.no_grad():
    for batch in predict_dataloader:
        inputs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask']
        }
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.softmax(logits, dim=1).cpu().numpy())

predictions = np.array(predictions)
pred_labels = np.argmax(predictions, axis=1)


In [31]:
print(predictions)

[[0.486771   0.513229  ]
 [0.5210752  0.47892484]
 [0.49427083 0.50572914]
 [0.4994406  0.5005594 ]
 [0.50485295 0.49514702]
 [0.50629497 0.4937051 ]
 [0.49804747 0.50195247]
 [0.49493086 0.50506914]
 [0.5103201  0.48967984]
 [0.5359552  0.46404478]
 [0.5162862  0.48371384]
 [0.5280626  0.47193748]
 [0.48489118 0.51510876]
 [0.51309896 0.48690102]
 [0.50312674 0.4968733 ]
 [0.5176405  0.4823595 ]
 [0.54898036 0.45101956]
 [0.5208816  0.47911844]
 [0.54039896 0.45960099]
 [0.5050372  0.4949628 ]
 [0.5163076  0.48369244]
 [0.48166674 0.51833326]
 [0.49509546 0.50490457]
 [0.5396035  0.46039647]
 [0.5252875  0.4747125 ]
 [0.4948651  0.5051349 ]
 [0.46449602 0.535504  ]
 [0.5044936  0.49550644]
 [0.50871044 0.49128956]
 [0.51682687 0.48317304]
 [0.513368   0.48663193]
 [0.50653946 0.4934605 ]
 [0.51430655 0.48569348]
 [0.5223847  0.4776153 ]
 [0.5034893  0.49651074]
 [0.4955704  0.50442964]
 [0.5208514  0.4791486 ]
 [0.5202584  0.47974154]
 [0.50790006 0.4921    ]
 [0.5547467  0.44525322]


In [27]:
true_labels = df['Label'].values

In [32]:
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels, average='weighted')
recall = recall_score(true_labels, pred_labels, average='weighted')
f1 = f1_score(true_labels, pred_labels, average='weighted')
#extract probabilities for escape
escape_prob = predictions[:,1]
roc_auc = roc_auc_score(true_labels, escape_prob)

In [35]:
# Print the classification report
print(classification_report(true_labels, pred_labels))

# Print the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC score:{roc_auc}')

              precision    recall  f1-score   support

           0       0.58      0.95      0.72       101
           1       0.86      0.30      0.45        99

    accuracy                           0.63       200
   macro avg       0.72      0.63      0.58       200
weighted avg       0.72      0.63      0.59       200

Accuracy: 0.63
Precision: 0.718103896103896
Recall: 0.63
F1 Score: 0.5861530692402649
ROC AUC score:0.7772777277727773


NameError: name 'label_binarize' is not defined