<a href="https://colab.research.google.com/github/quang-m-nguyen/DeepPGD/blob/main/build_methylation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

def build_methylation_model(input_shape, output_units=1):
    """
    Build and compile a small feed-forward classifier on top of precomputed
    sequence embeddings (the notebook feeds it DNABERT2 embeddings).

    Architecture: Input -> Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu)
    -> Dense(output_units, sigmoid).

    Parameters:
    input_shape (tuple): Shape of one embedding, without the batch dimension
        (e.g., (768,)).
    output_units (int): Number of sigmoid output units. Default is 1 for binary
        classification (methylated or not). With more than one unit each output
        is an independent sigmoid scored with binary cross-entropy
        (multi-label), not a softmax over classes.

    Returns:
    model (tf.keras.Model): Compiled Keras model ready for training.
    """
    model = models.Sequential()

    # Declare the input with `Input(shape=...)`: the `input_shape` keyword of
    # `InputLayer` is deprecated in Keras 3, while `Input(shape=...)` is
    # accepted by both tf.keras 2.x and Keras 3.
    model.add(layers.Input(shape=input_shape))

    # First hidden layer
    model.add(layers.Dense(128, activation='relu'))

    # Dropout for regularization (only active during training)
    model.add(layers.Dropout(0.3))

    # Second hidden layer
    model.add(layers.Dense(64, activation='relu'))

    # Sigmoid output so every unit yields a probability in [0, 1]
    model.add(layers.Dense(output_units, activation='sigmoid'))

    # Adam optimizer + binary cross-entropy match the sigmoid output above
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

import pickle
import logging
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO)

def _embedding_to_numpy(value):
    """
    Convert a single stored embedding to a NumPy array.

    Tensor-like objects (anything exposing ``detach``) are detached and copied
    to host memory before conversion; anything else (NumPy arrays, lists, ...)
    is passed through ``np.asarray``.
    """
    if hasattr(value, 'detach'):
        # .cpu() is needed when the tensor lives on an accelerator and is
        # harmless when it is already on the CPU.
        return value.detach().cpu().numpy()
    return np.asarray(value)


def load_embeddings(file_path):
    """
    Load the precomputed embeddings from a pickle file and log the shape.

    The pickle is expected to hold an iterable of mappings, each with an
    'embedding' entry (a tensor, NumPy array or plain sequence).

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
    you produced yourself.

    Parameters:
    file_path (str): Path to the embeddings pickle file.

    Returns:
    embeddings (np.array): The embeddings stacked along a new first axis.

    Raises:
    KeyError: If a record has no 'embedding' entry.
    """
    with open(file_path, 'rb') as f:
        records = pickle.load(f)

    # Stack the per-record embeddings into one array
    embedding_data = np.array([_embedding_to_numpy(item['embedding']) for item in records])

    # Log the shape of the embeddings
    logging.info(f"Loaded embeddings shape: {embedding_data.shape}")

    return embedding_data

def map_labels_to_embeddings(df, embeddings):
    """
    Pair each embedding with the label found at the same position in `df`.

    No reordering or lookup is performed: the pairing is purely positional, so
    the only check made is that both inputs have the same length.

    Parameters:
    df (pd.DataFrame): DataFrame providing a 'label' column.
    embeddings (np.array): Precomputed embeddings, one per DataFrame row.

    Returns:
    X (np.array): The embeddings, returned unchanged.
    y (np.array): The values of the 'label' column.

    Raises:
    ValueError: If the number of labels differs from the number of embeddings.
    """
    label_values = df['label'].values
    n_labels, n_embeddings = len(label_values), len(embeddings)

    # Same count on both sides: hand the pair back as-is
    if n_labels == n_embeddings:
        return embeddings, label_values

    raise ValueError("The number of labels and embeddings must be the same.")

# Load the embeddings from the pickle file.
# NOTE(review): 'combined_embeddings.pkl' is the file name written by
# join_pkl_files() further down in this notebook, so that cell presumably has
# to be run before this one -- confirm.
embeddings_file_path = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/combined_embeddings.pkl'
embeddings = load_embeddings(embeddings_file_path)

# Load the dataset from a TSV file.
# NOTE(review): 'combined_train_test.tsv' is the file name written by
# join_tsv_files() further down; row i is assumed to describe embedding i --
# confirm both files were combined in the same order.
df = pd.read_csv('/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/combined_train_test.tsv', sep='\t')

# Map labels to embeddings (raises ValueError if the counts differ)
X, y = map_labels_to_embeddings(df, embeddings)

# Split data into training and validation sets: 10% is held out for
# validation and the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Now, X_train, y_train, X_val, y_val are ready for model training

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
import time

def train_model(X_train, y_train, X_val, y_val, model, batch_size=32, epochs=50,
                log_root="/content/drive/MyDrive/deepPGD/logs/fit",
                model_save_path="best_model.keras", patience=10):
    """
    Trains the given model using the training data and logs performance metrics.

    Three callbacks are attached: a checkpoint that keeps only the model with
    the lowest validation loss, early stopping on validation loss, and
    TensorBoard logging into a fresh timestamped sub-directory of `log_root`.

    Parameters:
    X_train (np.array): Training embeddings.
    y_train (np.array): Training labels.
    X_val (np.array): Validation embeddings.
    y_val (np.array): Validation labels.
    model (tf.keras.Model): The model to train.
    batch_size (int): Batch size for training. Default is 32.
    epochs (int): Maximum number of epochs to train. Default is 50.
    log_root (str): Directory under which the TensorBoard run directory
        (named after the current local time) is placed.
    model_save_path (str): Where the best model is saved. The default is a
        relative path, so the file lands in the current working directory.
    patience (int): Number of epochs without val_loss improvement before
        training stops early. Default is 10.

    Returns:
    history: Training history object returned by `model.fit`.
    """
    # One TensorBoard run directory per call, e.g. <log_root>/20240131-120000
    log_dir = os.path.join(log_root, time.strftime("%Y%m%d-%H%M%S"))

    # Callbacks
    callbacks = [
        # Overwrite the saved model only when val_loss improves
        ModelCheckpoint(filepath=model_save_path, monitor='val_loss', save_best_only=True, verbose=1),
        # Stop once val_loss has not improved for `patience` epochs
        EarlyStopping(monitor='val_loss', patience=patience, verbose=1),
        # histogram_freq=1 records weight histograms every epoch
        TensorBoard(log_dir=log_dir, histogram_freq=1)
    ]

    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=callbacks,
        verbose=2  # Verbose = 2 prints one line per epoch
    )

    return history



# Build the classifier.
# NOTE(review): 768 is assumed to equal the width of the loaded embeddings
# (see the shape logged by load_embeddings) -- confirm.
model = build_methylation_model(input_shape=(768,))
# Train the model and log performance; batch_size and epochs are passed
# explicitly here instead of relying on train_model's defaults.
history = train_model(X_train, y_train, X_val, y_val, model, batch_size=500, epochs=50)

In [None]:
import pandas as pd
import os

def _frames_match(left, right):
    """
    Return True when two DataFrames hold equal values position by position.

    Column names and index labels are ignored. Two missing values (NaN/None)
    in the same position count as equal, unlike a plain list comparison where
    NaN never equals NaN.
    """
    if left.shape != right.shape:
        return False
    left_values = left.to_numpy(dtype=object)
    right_values = right.to_numpy(dtype=object)
    both_missing = pd.isna(left_values) & pd.isna(right_values)
    return bool(((left_values == right_values) | both_missing).all())


def join_tsv_files(file_path1, file_path2):
    """
    Joins two TSV files (rows of the first followed by rows of the second,
    with a single header) and saves the result as 'combined_train_test.tsv'
    in the directory of the first file.

    Before saving, the combined table is validated against its inputs (row
    order and size). If a check fails, an error is printed and nothing is
    written. The spot checks on the 5th and 5th-last rows are skipped for
    files with fewer than 5 rows.

    Parameters:
      file_path1 (str): Path to the first TSV file.
      file_path2 (str): Path to the second TSV file.

    Returns:
      None
    """
    df1 = pd.read_csv(file_path1, sep='\t')
    df2 = pd.read_csv(file_path2, sep='\t')

    combined_df = pd.concat([df1, df2], ignore_index=True)  # Combine the dataframes

    # Validation: the 5th row of df1 must still be the 5th row of the result
    if len(df1) >= 5 and not _frames_match(df1.iloc[[4]], combined_df.iloc[[4]]):
        print("Error: Incorrect 5th item in combined file (from df1).")
        return  # Terminate if error

    # Validation: the 5th-last row of df2 must be the 5th-last row of the result
    if len(df2) >= 5 and not _frames_match(df2.iloc[[-5]], combined_df.iloc[[-5]]):
        print("Error: Incorrect 5th last item in combined file (from df2).")
        return  # Terminate if error

    # Additional validation: everything after df1's rows must equal df2
    if not _frames_match(combined_df.iloc[len(df1):], df2):
        print("Error: The bottom of the combined file does not match df2.")
        return  # Terminate if error

    # Validate the size of the combined dataframe
    if len(combined_df) != len(df1) + len(df2):
        print("Error: Incorrect size of combined file.")
        return  # Terminate if error

    # Save next to the first input file
    directory = os.path.dirname(file_path1)
    combined_file_path = os.path.join(directory, 'combined_train_test.tsv')
    combined_df.to_csv(combined_file_path, sep='\t', index=False)

# Example usage: combine the train and test TSVs; on success the result is
# written next to train.tsv as 'combined_train_test.tsv'.
file_path1 = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train.tsv'
file_path2 = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test.tsv'
join_tsv_files(file_path1, file_path2)

In [None]:
import pickle
import os

def join_pkl_files(file_path1, file_path2, output_dir=None):
    """
    Joins two pickle files and saves the combined data to a new pickle file
    named 'combined_embeddings.pkl' in the specified output directory. If
    output_dir is not provided, the directory of the first file is used.

    The two payloads are concatenated with `+` (first file, then second).
    The result is validated against the inputs before saving; if a check
    fails, an error is printed and nothing is written.

    NOTE: `pickle.load` can execute arbitrary code; only use this on files
    you produced yourself.

    Parameters:
        file_path1 (str): Path to the first pickle file.
        file_path2 (str): Path to the second pickle file.
        output_dir (str, optional): Path to the directory where the combined file will be saved.
                                    Defaults to the directory of file_path1.

    Returns:
        None
    """
    # Load data from the first pickle file
    with open(file_path1, 'rb') as f1:
        data1 = pickle.load(f1)
    len1 = len(data1)

    # Load data from the second pickle file
    with open(file_path2, 'rb') as f2:
        data2 = pickle.load(f2)
    len2 = len(data2)

    # Combine the data
    combined_data = data1 + data2

    # Validation checks
    # The 5th item of the first file must still be the 5th item overall
    if len1 >= 5 and data1[4] != combined_data[4]:
        print("Error: 5th item in combined data does not match the 5th item from the first file.")
        return

    # The 5th-last item of the second file must be the 5th-last item overall
    if len2 >= 5 and data2[-5] != combined_data[-5]:
        print("Error: 5th last item in combined data does not match the 5th last item from the second file.")
        return

    # Everything after the first file's items must equal the second file.
    # Slicing from len1 (rather than [-len2:]) stays correct when the second
    # file is empty, because [-0:] would select the whole list.
    if combined_data[len1:] != data2:
        print("Error: The bottom part of the combined data does not match the second file.")
        return

    # Validate the length of the combined data
    if len(combined_data) != len1 + len2:
        print("Error: Length of combined data is incorrect.")
        return

    # Set output directory to the directory of file_path1 if not provided
    if output_dir is None:
        output_dir = os.path.dirname(file_path1)

    # Create the output directory if needed. An empty string means "current
    # directory" (file_path1 had no directory part); makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the combined data to a new pickle file after successful validation
    combined_file_path = os.path.join(output_dir, 'combined_embeddings.pkl')
    with open(combined_file_path, 'wb') as f:
        pickle.dump(combined_data, f)

    print(f"Combined file created successfully at {combined_file_path} with correct length.")

# Example usage: combine the train and test embedding pickles; on success the
# result is written next to the first file as 'combined_embeddings.pkl'.
# Note that this rebinds the module-level file_path1/file_path2 names.
file_path1 = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train_embeddings_data.pkl'
file_path2 = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test_embeddings_data.pkl'
join_pkl_files(file_path1, file_path2)