<a href="https://colab.research.google.com/github/quang-m-nguyen/DeepPGD/blob/main/bert_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy tensorflow transformers



In [None]:
# Mount Google Drive into the Colab runtime; the dataset TSV paths and the
# model save path used later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFBertForSequenceClassification
import os

def load_data(tsv_file):
    """
    Read a tab-separated file and return its 'text' and 'label' columns.

    Args:
        tsv_file (str): Path to a TSV file that has a header row containing
            the columns 'text' and 'label'.
    Returns:
        tuple: (sequences, labels), each a plain Python list holding the
        values of the 'text' and 'label' columns respectively, in file order.
    """
    frame = pd.read_csv(tsv_file, sep='\t')
    # Column access order (text, then label) mirrors the returned tuple.
    return frame['text'].tolist(), frame['label'].tolist()

def tokenize_sequences(sequences, tokenizer, max_length=64):
    """
    Encode sequences with the given tokenizer at a fixed length.

    Args:
        sequences (List[str]): Sequences to encode.
        tokenizer: Callable tokenizer; it is invoked with the sequences plus
            padding/truncation options and must return a mapping that has
            'input_ids' and 'attention_mask' entries.
        max_length (int): Length every sequence is padded or truncated to.
    Returns:
        tuple: (input_ids, attention_mask) taken from the tokenizer output.
    """
    # Pad to max_length and truncate anything longer; request TensorFlow
    # tensors from the tokenizer.
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    encoded = tokenizer(sequences, **tokenizer_options)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

def build_dnabert_classification_model(pretrained_model_name_or_path, max_length=64):
    """
    Build and compile a single-logit classifier for binary labels.

    Args:
        pretrained_model_name_or_path (str): Name or path of the pretrained
            checkpoint; it is loaded from PyTorch weights (from_pt=True).
        max_length (int): Unused by this function; kept so existing callers
            that pass it keep working.
    Returns:
        model (TFBertForSequenceClassification): A compiled Keras model ready
        for training. It has one output unit that emits a raw logit.
    """
    # NOTE(review): the run log captured in this notebook reports that some
    # PyTorch weights were not used when initialising the TF model. Confirm
    # that the checkpoint architecture really matches
    # TFBertForSequenceClassification before trusting the fine-tuned results.
    model = TFBertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path,
        num_labels=1,
        from_pt=True
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        # The head outputs a raw logit, so the loss applies the sigmoid itself.
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # The predictions reaching the metric are logits, so the decision
        # boundary is 0.0 (sigmoid(0) == 0.5). The plain 'accuracy' string
        # would threshold the logits at 0.5 and misreport accuracy. The metric
        # keeps the name 'accuracy' so logs and evaluate() output stay in the
        # same place.
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
    )
    return model

def _load_tokenized_split(tsv_file, tokenizer, max_length):
    """Load one TSV split and return (input_ids, attention_mask, float32 labels)."""
    sequences, labels = load_data(tsv_file)
    input_ids, attention_mask = tokenize_sequences(sequences, tokenizer, max_length)
    # Labels are cast to float32 to match the binary cross-entropy loss.
    return input_ids, attention_mask, np.array(labels).astype('float32')


def train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=64,
    epochs=3,
    batch_size=16
):
    """
    Fine-tune DNABERT2 for binary classification.

    Args:
        train_tsv (str): Path to the training TSV file.
        val_tsv (str): Path to the validation TSV file.
        pretrained_model_name_or_path (str): DNABERT2 model identifier.
        max_length (int): Maximum sequence length for tokenization.
        epochs (int): Number of training epochs.
        batch_size (int): Training batch size.
    Returns:
        model (TFBertForSequenceClassification): The trained model. The Keras
        training history is not returned.
    Raises:
        FileNotFoundError: If train_tsv or val_tsv is not an existing file.
    """
    # Fail fast on bad paths before downloading the tokenizer or the model.
    if not os.path.isfile(train_tsv):
        raise FileNotFoundError(f"Training file not found: {train_tsv}")
    if not os.path.isfile(val_tsv):
        raise FileNotFoundError(f"Validation file not found: {val_tsv}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True
    )

    # Load and tokenize both splits the same way.
    X_train_input_ids, X_train_attention_mask, y_train = _load_tokenized_split(
        train_tsv, tokenizer, max_length
    )
    X_val_input_ids, X_val_attention_mask, y_val = _load_tokenized_split(
        val_tsv, tokenizer, max_length
    )

    # Build the model
    model = build_dnabert_classification_model(
        pretrained_model_name_or_path, max_length
    )

    # Train the model. The inputs are in-memory tensors, so the
    # generator-only options `workers` / `use_multiprocessing` are not passed:
    # they have no effect here and newer Keras versions reject them.
    model.fit(
        x={
            'input_ids': X_train_input_ids,
            'attention_mask': X_train_attention_mask
        },
        y=y_train,
        validation_data=(
            {
                'input_ids': X_val_input_ids,
                'attention_mask': X_val_attention_mask
            },
            y_val
        ),
        epochs=epochs,
        batch_size=batch_size
    )
    return model

In [None]:
# Dataset locations on the mounted Drive.
train_tsv = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train.tsv'  # Update this path
val_tsv = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test.tsv'  # Update this path

# Hyperparameters for this fine-tuning run.
training_options = {
    'pretrained_model_name_or_path': 'zhihan1996/DNABERT-2-117M',
    'max_length': 64,
    'epochs': 30,
    'batch_size': 1028,
}
model = train_dnabert_finetuning_model(train_tsv, val_tsv, **training_options)

# Persist the fine-tuned model to Drive.
model.save('/content/drive/MyDrive/deepPGD/dnabert_finetuned_model')
print("Model training completed and saved.")

# Optional: re-tokenize the validation split and evaluate the trained model on it.
X_val_sequences, y_val = load_data(val_tsv)
tokenizer = AutoTokenizer.from_pretrained(
    'zhihan1996/DNABERT-2-117M',
    trust_remote_code=True
)
X_val_input_ids, X_val_attention_mask = tokenize_sequences(
    X_val_sequences, tokenizer, max_length=64
)
y_val = np.array(y_val).astype('float32')

val_inputs = {
    'input_ids': X_val_input_ids,
    'attention_mask': X_val_attention_mask,
}
# evaluate() returns [loss, accuracy] for the model compiled in this notebook.
evaluation = model.evaluate(x=val_inputs, y=y_val)
print(f"Validation Loss: {evaluation[0]:.4f}")
print(f"Validation Accuracy: {evaluation[1]:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.11.attention.self.Wqkv.weight', 'bert.encoder.layer.9.attention.self.Wqkv.weight', 'bert.encoder.layer.11.mlp.wo.bias', 'bert.encoder.layer.3.mlp.layernorm.bias', 'bert.encoder.layer.2.mlp.wo.weight', 'bert.encoder.layer.5.mlp.layernorm.weight', 'bert.encoder.layer.5.attention.self.Wqkv.weight', 'bert.encoder.layer.3.attention.self.Wqkv.bias', 'bert.encoder.layer.5.mlp.layernorm.bias', 'bert.encoder.layer.11.mlp.layernorm.bias', 'bert.encoder.layer.9.mlp.layernorm.bias', 'bert.encoder.layer.10.mlp.layernorm.bias', 'bert.encoder.layer.8.mlp.wo.weight', 'bert.encoder.layer.8.attention.self.Wqkv.weight', 'bert.encoder.layer.3.mlp.layernorm.weight', 'bert.encoder.layer.10.mlp.wo.weight', 'bert.encoder.layer.5.mlp.gated_layers.weight', 'bert.encoder.layer.6.attention.self.Wqkv.bias', 'bert.encoder.layer.4.attention.self.Wqkv.weight', 'bert.encoder.layer.11

Epoch 1/30


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported