In [2]:
# !pip install pandas numpy tensorflow transformers

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [10]:
import tensorflow as tf
from transformers import TFBertModel, AutoTokenizer
import pandas as pd
import numpy as np

def load_data(tsv_file):
    """
    Load sequences and labels from a TSV file.
    Args:
        tsv_file (str): Path to the TSV file.
    Returns:
        sequences (List[str]): List of DNA sequences.
        labels (List[int]): List of labels (0 or 1).
    """
    df = pd.read_csv(tsv_file, sep='\t')
    sequences = df['text'].tolist()
    labels = df['label'].tolist()
    return sequences, labels

def tokenize_sequences(sequences, tokenizer, max_length=64):
    """
    Tokenize DNA sequences using DNABERT2 tokenizer.
    Args:
        sequences (List[str]): List of DNA sequences.
        tokenizer: Tokenizer object.
        max_length (int): Maximum sequence length.
    Returns:
        input_ids, attention_mask
    """
    encodings = tokenizer(
        sequences,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )
    return encodings['input_ids'], encodings['attention_mask']

class DNABERTClassifier(tf.keras.Model):
    def __init__(self, pretrained_model_name_or_path, max_length=64):
        super(DNABERTClassifier, self).__init__()
        self.bert = TFBertModel.from_pretrained(
            pretrained_model_name_or_path,
            from_pt=True,
            trust_remote_code=True
        )
        self.dropout1 = tf.keras.layers.Dropout(0.3)
        self.dense1 = tf.keras.layers.Dense(512, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(0.3)
        self.dense2 = tf.keras.layers.Dense(256, activation='relu')
        self.dropout3 = tf.keras.layers.Dropout(0.3)
        self.dense3 = tf.keras.layers.Dense(64, activation='relu')
        self.dropout4 = tf.keras.layers.Dropout(0.3)
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        
        logits = self.output_layer(pooled_output)

        return logits

def build_dnabert_classification_model(pretrained_model_name_or_path, max_length=64):
    """
    Build a DNABERT2 model for binary classification with additional layers
    using the Keras Subclassing API.

    Args:
        pretrained_model_name_or_path (str): Name or path of the pretrained DNABERT2 model.
        max_length (int): Maximum sequence length.

    Returns:
        model (tf.keras.Model): A compiled Keras model ready for training.
    """
    # Instantiate the model
    model = DNABERTClassifier(pretrained_model_name_or_path, max_length)

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy']
    )

    return model

def train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=64,
    epochs=30,
    batch_size=256
):
    """
    Fine-tune DNABERT2 for binary classification with additional layers.

    Args:
        train_tsv (str): Path to the training TSV file.
        val_tsv (str): Path to the validation TSV file.
        pretrained_model_name_or_path (str): DNABERT2 model identifier.
        max_length (int): Maximum sequence length for tokenization.
        epochs (int): Number of training epochs.
        batch_size (int): Training batch size.

    Returns:
        model (tf.keras.Model): The trained model.
    """
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True
    )

    # Load and tokenize training data
    X_train_sequences, y_train = load_data(train_tsv)
    X_train_input_ids, X_train_attention_mask = tokenize_sequences(
        X_train_sequences, tokenizer, max_length
    )
    y_train = np.array(y_train).astype('float32')

    # Load and tokenize validation data
    X_val_sequences, y_val = load_data(val_tsv)
    X_val_input_ids, X_val_attention_mask = tokenize_sequences(
        X_val_sequences, tokenizer, max_length
    )
    y_val = np.array(y_val).astype('float32')

    # Build the model
    model = build_dnabert_classification_model(
        pretrained_model_name_or_path, max_length
    )

    # Callbacks
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

    early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(
        'best_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        mode='max'
    )

    # Train the model
    history = model.fit(
        x={
            'input_ids': X_train_input_ids,
            'attention_mask': X_train_attention_mask
        },
        y=y_train,
        validation_data=(
            {
                'input_ids': X_val_input_ids,
                'attention_mask': X_val_attention_mask
            },
            y_val
        ),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, model_checkpoint],
    )

    # Load the best model
    model.load_weights('best_model.keras')

    return model

In [11]:
train_tsv = '/teamspace/studios/this_studio/train-data/train.tsv'  # Update this path
val_tsv = '/teamspace/studios/this_studio/train-data/test.tsv'  # Update this path
model = train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=64,
    epochs=30,
    batch_size=256
)
# Save the trained model
model.save('/teamspace/studios/this_studio/DeepPGD/dnabert_finetuned_model.keras')


print("Model training completed and saved.")

# Optional: Evaluate the model on the validation set
X_val_sequences, y_val = load_data(val_tsv)
tokenizer = AutoTokenizer.from_pretrained('zhihan1996/DNABERT-2-117M', trust_remote_code=True)
X_val_input_ids, X_val_attention_mask = tokenize_sequences(X_val_sequences, tokenizer, max_length=64)
y_val = np.array(y_val).astype('float32')

evaluation = model.evaluate(
    x={'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask},
    y=y_val
)
print(f"Validation Loss: {evaluation[0]:.4f}")
print(f"Validation Accuracy: {evaluation[1]:.4f}")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['bert.encoder.layer.8.mlp.wo.weight', 'cls.predictions.transform.dense.bias', 'bert.encoder.layer.5.attention.self.Wqkv.weight', 'bert.encoder.layer.6.mlp.wo.bias', 'bert.encoder.layer.4.mlp.layernorm.bias', 'bert.encoder.layer.6.mlp.gated_layers.weight', 'bert.encoder.layer.3.mlp.layernorm.weight', 'bert.encoder.layer.2.attention.self.Wqkv.weight', 'bert.encoder.layer.3.mlp.wo.bias', 'bert.encoder.layer.0.attention.self.Wqkv.bias', 'bert.encoder.layer.5.mlp.gated_layers.weight', 'bert.encoder.layer.1.mlp.gated_layers.weight', 'bert.encoder.layer.1.mlp.wo.weight', 'cls.predictions.transform.dense.weight', 'bert.encoder.layer.11.mlp.layernorm.bias', 'bert.encoder.layer.3.attention.self.Wqkv.bias', 'bert.encoder.layer.8.mlp.gated_layers.weight', 'bert.encoder.layer.11.mlp.wo.bias', 'bert.en

Epoch 1/30


W0000 00:00:1727505246.668894   60184 assert_op.cc:38] Ignoring Assert operator dnabert_classifier_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert


[1m61/62[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 198ms/step - accuracy: 0.5067 - loss: 0.7565

W0000 00:00:1727505263.503624   60187 assert_op.cc:38] Ignoring Assert operator dnabert_classifier_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert


[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - accuracy: 0.5067 - loss: 0.7565

W0000 00:00:1727505269.543502   60183 assert_op.cc:38] Ignoring Assert operator dnabert_classifier_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1727505284.790949   60185 assert_op.cc:38] Ignoring Assert operator dnabert_classifier_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert


[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 596ms/step - accuracy: 0.5066 - loss: 0.7564 - val_accuracy: 0.5000 - val_loss: 0.7271
Epoch 2/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 403ms/step - accuracy: 0.5005 - loss: 0.7385 - val_accuracy: 0.5000 - val_loss: 0.7120
Epoch 3/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 412ms/step - accuracy: 0.4976 - loss: 0.7268 - val_accuracy: 0.4999 - val_loss: 0.7030
Epoch 4/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 419ms/step - accuracy: 0.4951 - loss: 0.7191 - val_accuracy: 0.4998 - val_loss: 0.6979
Epoch 5/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 425ms/step - accuracy: 0.5048 - loss: 0.7121 - val_accuracy: 0.5041 - val_loss: 0.6953
Epoch 6/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 430ms/step - accuracy: 0.5011 - loss: 0.7117 - val_accuracy: 0.5028 - val_loss: 0.6939
Epoch 7/30
[1m62/62[0m [32m━━━