In [None]:
# !pip install pandas numpy tensorflow transformers

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFBertForSequenceClassification
import os

def load_data(tsv_file):
    """
    Read a tab-separated file and extract its 'text' and 'label' columns.
    Args:
        tsv_file (str): Location of the TSV file. It must contain a header row
            with a 'text' column and a 'label' column.
    Returns:
        tuple: ``(sequences, labels)`` -- two plain Python lists with the
            values of the 'text' and 'label' columns, in file order.
    Raises:
        KeyError: If either the 'text' or the 'label' column is missing.
    """
    table = pd.read_csv(tsv_file, sep='\t')
    # Pull both columns out as ordinary lists ('text' first, then 'label').
    text_column, label_column = (table[key].tolist() for key in ('text', 'label'))
    return text_column, label_column

def tokenize_sequences(sequences, tokenizer, max_length=64):
    """
    Run a batch of sequences through a tokenizer with fixed-length padding.
    Args:
        sequences (List[str]): Sequences to encode.
        tokenizer: Callable tokenizer; it is invoked once on the whole batch
            and must return a mapping with 'input_ids' and 'attention_mask'.
        max_length (int): Length every encoded sequence is padded or
            truncated to.
    Returns:
        tuple: ``(input_ids, attention_mask)`` taken from the tokenizer output
            (requested as TensorFlow tensors via ``return_tensors='tf'``).
    """
    # Every sequence is padded up to, or cut down to, exactly `max_length`.
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    batch = tokenizer(sequences, **tokenizer_options)
    return batch['input_ids'], batch['attention_mask']

def build_dnabert_classification_model(pretrained_model_name_or_path, max_length=64):
    """
    Build and compile a two-class sequence classifier from pretrained weights.
    Args:
        pretrained_model_name_or_path (str): Name or path of the pretrained
            checkpoint. The weights are loaded from a PyTorch checkpoint
            (``from_pt=True``).
        max_length (int): Not used when building the model; accepted only so
            existing callers that pass it keep working.
    Returns:
        model (TFBertForSequenceClassification): A compiled Keras model ready
            for training with integer class labels (0 or 1).
    """
    # Load the pretrained encoder with a 2-unit classification head.
    model = TFBertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path,
        num_labels=2,
        from_pt=True
    )

    # The head produces two raw logits per example (num_labels=2), so the
    # labels must be treated as class indices. BinaryCrossentropy expects a
    # single probability per example and does not match that output shape;
    # sparse categorical cross-entropy on logits is the matching loss.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    return model

def _load_and_tokenize_split(tsv_file, tokenizer, max_length):
    """Load one TSV split and return ``(model_inputs, labels)`` for Keras."""
    sequences, labels = load_data(tsv_file)
    input_ids, attention_mask = tokenize_sequences(
        sequences, tokenizer, max_length
    )
    model_inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }
    # Labels are handed to Keras as a float32 NumPy array, as before.
    return model_inputs, np.array(labels).astype('float32')


def train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path='zhihan1996/DNABERT-2-117M',
    max_length=64,
    epochs=30,
    batch_size=300
):
    """
    Fine-tune a pretrained sequence classifier on TSV training data.
    Args:
        train_tsv (str): Path to the training TSV file.
        val_tsv (str): Path to the validation TSV file.
        pretrained_model_name_or_path (str): Checkpoint identifier used for
            BOTH the tokenizer and the model.
        max_length (int): Maximum sequence length for tokenization.
        epochs (int): Number of training epochs.
        batch_size (int): Training batch size.
    Returns:
        model (TFBertForSequenceClassification): The trained model.
    Raises:
        FileNotFoundError: If either TSV path does not point to a file.
    """
    # Fail fast, before downloading a tokenizer or model, if data is missing.
    if not os.path.isfile(train_tsv):
        raise FileNotFoundError(f"Training file not found: {train_tsv}")
    if not os.path.isfile(val_tsv):
        raise FileNotFoundError(f"Validation file not found: {val_tsv}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True
    )

    # Load and tokenize both splits the same way.
    train_inputs, y_train = _load_and_tokenize_split(
        train_tsv, tokenizer, max_length
    )
    val_inputs, y_val = _load_and_tokenize_split(
        val_tsv, tokenizer, max_length
    )

    # Build the model
    model = build_dnabert_classification_model(
        pretrained_model_name_or_path, max_length
    )

    # Train the model. The inputs are in-memory tensors, so the generator-only
    # options (workers / use_multiprocessing / max_queue_size) are not passed:
    # they have no effect on tensor inputs and newer Keras releases reject them.
    model.fit(
        x=train_inputs,
        y=y_train,
        validation_data=(val_inputs, y_val),
        epochs=epochs,
        batch_size=batch_size
    )
    return model

2024-10-02 07:58:23.175789: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-02 07:58:23.223553: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-02 07:58:23.237208: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-02 07:58:23.334426: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Single source of truth for the checkpoint and sequence length, so that the
# tokenizer used for evaluation always matches the one used for training.
# To fine-tune a different checkpoint (e.g. 'zhihan1996/DNABERT-2-117M'),
# change MODEL_NAME here and nowhere else.
MODEL_NAME = 'bert-large-cased'
MAX_LENGTH = 64

train_tsv = '/teamspace/studios/this_studio/train-data/train.tsv'  # Update this path
val_tsv = '/teamspace/studios/this_studio/train-data/test.tsv'  # Update this path
model = train_dnabert_finetuning_model(
    train_tsv,
    val_tsv,
    pretrained_model_name_or_path=MODEL_NAME,
    max_length=MAX_LENGTH,
    epochs=30,
    batch_size=256
)
# Save the trained model
model.save('/teamspace/studios/this_studio/DeepPGD/dnabert_finetuned_model')


print("Model training completed and saved.")

# Optional: Evaluate the model on the validation set.
# The tokenizer must come from the same checkpoint the model was trained with;
# token ids produced by a different vocabulary would be meaningless to it.
X_val_sequences, y_val = load_data(val_tsv)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
X_val_input_ids, X_val_attention_mask = tokenize_sequences(
    X_val_sequences, tokenizer, max_length=MAX_LENGTH
)
y_val = np.array(y_val).astype('float32')

# evaluate() returns [loss, accuracy] because the model is compiled with a
# single 'accuracy' metric.
evaluation = model.evaluate(
    x={'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask},
    y=y_val
)
print(f"Validation Loss: {evaluation[0]:.4f}")
print(f"Validation Accuracy: {evaluation[1]:.4f}")

NameError: name 'train_dnabert_finetuning_model' is not defined