In [None]:
!pip install pandas numpy tensorflow transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel, AutoConfig

def load_data(tsv_file):
    """
    Read a tab-separated file and return its sequences and labels.

    Args:
        tsv_file (str): Path to a TSV file whose header row contains
            'text' and 'label' columns.
    Returns:
        tuple: ``(sequences, labels)`` -- the 'text' column and the
            'label' column, each as a plain Python list in row order.
    """
    table = pd.read_csv(tsv_file, sep='\t')
    # Pull both columns out in one pass; 'text' is read before 'label'.
    sequences, labels = (
        table[column].tolist() for column in ('text', 'label')
    )
    return sequences, labels

def tokenize_sequences(sequences, tokenizer, max_length=512):
    """
    Encode DNA sequences with the supplied tokenizer.

    The tokenizer is asked to pad to ``max_length``, truncate longer
    inputs, and return TensorFlow tensors.

    Args:
        sequences (List[str]): DNA sequences to encode.
        tokenizer: Callable tokenizer (e.g. the DNABERT2 tokenizer) whose
            result can be indexed by 'input_ids' and 'attention_mask'.
        max_length (int): Length passed to the tokenizer for padding
            and truncation.
    Returns:
        tuple: ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    batch = tokenizer(sequences, **tokenizer_options)
    token_ids = batch['input_ids']
    mask = batch['attention_mask']
    return token_ids, mask

def build_dnabert_classification_model(pretrained_model_name_or_path, max_length=512):
    """
    Assemble and compile a binary classifier on top of a DNABERT2 encoder.

    The encoder's first-position hidden state feeds a small dense head
    (Dense 128 / ReLU -> Dropout 0.1 -> Dense 1 / sigmoid), so the model
    outputs a probability rather than a raw logit.

    Args:
        pretrained_model_name_or_path (str): Name or path of the pretrained
            DNABERT2 checkpoint.
        max_length (int): Fixed token length of both model inputs.
    Returns:
        model (tf.keras.Model): Compiled Keras model taking 'input_ids' and
            'attention_mask' and trained with binary cross-entropy.
    """
    # NOTE(review): assumes TFAutoModel can convert this checkpoint from its
    # PyTorch weights (from_pt=True) -- confirm for the chosen model.
    encoder_config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path,
        num_labels=1,
        finetuning_task='classification',
        problem_type='single_label_classification',
    )
    encoder = TFAutoModel.from_pretrained(
        pretrained_model_name_or_path,
        config=encoder_config,
        from_pt=True,
        trust_remote_code=True,
    )

    # Symbolic inputs; the names match the keys used when feeding the model.
    token_ids_in = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='input_ids'
    )
    mask_in = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='attention_mask'
    )

    encoded = encoder(input_ids=token_ids_in, attention_mask=mask_in)
    # Position 0 of the sequence axis is used as the summary vector ([CLS]).
    cls_embedding = encoded.last_hidden_state[:, 0, :]

    # Classification head, applied layer by layer.
    hidden = tf.keras.layers.Dense(128, activation='relu')(cls_embedding)
    hidden = tf.keras.layers.Dropout(0.1)(hidden)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    classifier = tf.keras.Model(
        inputs=[token_ids_in, mask_in],
        outputs=probability,
    )
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=512,
    epochs=3,
    batch_size=16
):
    """
    Run the full DNABERT2 fine-tuning pipeline for binary classification.

    Loads the tokenizer, prepares the training and validation splits,
    builds the classifier and fits it.

    Args:
        train_tsv (str): Path to the training TSV file.
        val_tsv (str): Path to the validation TSV file.
        pretrained_model_name_or_path (str): DNABERT2 model identifier.
        max_length (int): Maximum sequence length for tokenization.
        epochs (int): Number of training epochs.
        batch_size (int): Training batch size.
    Returns:
        model (tf.keras.Model): The trained model.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True
    )

    def prepare_split(tsv_path):
        """Load one TSV split and return (model inputs, float32 labels)."""
        sequences, labels = load_data(tsv_path)
        input_ids, attention_mask = tokenize_sequences(
            sequences, tokenizer, max_length
        )
        features = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
        # Labels are cast to float32 to match the sigmoid/BCE output.
        return features, np.array(labels).astype('float32')

    # Training data is prepared before validation data, then the model is built.
    train_features, train_labels = prepare_split(train_tsv)
    val_features, val_labels = prepare_split(val_tsv)

    model = build_dnabert_classification_model(
        pretrained_model_name_or_path, max_length
    )
    model.fit(
        x=train_features,
        y=train_labels,
        validation_data=(val_features, val_labels),
        epochs=epochs,
        batch_size=batch_size
    )
    return model

In [None]:
# Placeholder locations of the training and validation TSV files.
train_tsv = '/path/to/train.tsv'  # Update this path
val_tsv = '/path/to/validation.tsv'  # Update this path
# Fine-tune the model. The keyword values below repeat the defaults of
# train_dnabert_finetuning_model so they can be adjusted here in one place.
model = train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=512,
    epochs=3,
    batch_size=16
)
# Save the trained model
# NOTE(review): the path has no file extension, so the on-disk format depends
# on the installed Keras version -- confirm a bare path is accepted (newer
# Keras releases expect a `.keras` or `.h5` suffix).
model.save('dnabert_finetuned_model')