<a href="https://colab.research.google.com/github/quang-m-nguyen/DeepPGD/blob/main/deep_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... dataset
# paths used by the cells below can be opened (requires the Colab runtime).
drive.mount('/content/drive')

In [None]:
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import Callback

# from keras_self_attention import SeqSelfAttention
# from keras.utils import to_categorical
from keras.layers import (
    Bidirectional,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    LayerNormalization,
    concatenate,
    LSTM
)
from keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from tensorflow.keras import initializers, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def get_y_train(
    train_filename='/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train.tsv',
    test_filename='/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test.tsv',
):
    """Load the label column of the train and test TSV files.

    Despite its name, this function returns the labels of BOTH splits.

    Both files are parsed with ``header=None``, so the first physical line of
    each file is treated as data. If the files start with a header line, that
    header cell is therefore the first element of each returned array.

    Args:
        train_filename (str): Path to the tab-separated training file.
            Defaults to the F. vesca 4mC training set on the mounted Drive.
        test_filename (str): Path to the tab-separated test file.
            Defaults to the F. vesca 4mC test set on the mounted Drive.

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: ``(y_train, y_test)``, the raw
        contents of column 1 of each file, in file order.
    """
    train_data = pd.read_csv(train_filename, header=None, sep="\t")
    test_data = pd.read_csv(test_filename, header=None, sep="\t")

    # Column 1 carries the 0/1 class label.
    return train_data[1].to_numpy(), test_data[1].to_numpy()



In [None]:
def make_test_labels(y_data):
  """One-hot encode string class labels.

  ``"1"`` becomes ``[1, 0]`` and ``"0"`` becomes ``[0, 1]``. Any other value
  (for example a header cell such as ``"label"``) is skipped, so the result
  can be shorter than ``y_data``.

  Every element is inspected, including the first one: header cells are
  already filtered out by the "anything else is skipped" rule, so a leading
  valid label is no longer dropped.

  Args:
      y_data: Iterable of label values (strings ``"1"`` / ``"0"``).

  Returns:
      list[list[int]]: One two-element one-hot row per recognised label, in
      input order.
  """
  test_labels = []
  for label in y_data:
    if label == "1":
      test_labels.append([1, 0])
    elif label == "0":
      test_labels.append([0, 1])
    # Unrecognised values (headers, blanks, ...) are intentionally ignored.

  return test_labels

In [None]:
import pickle

def get_embeddings_array(pkl_file_path):
    """
    Reads a pickle file containing embeddings data and returns the embeddings as a NumPy array.

    The pickle is expected to hold an iterable of mappings, each with an
    ``'embedding'`` entry that is a tensor supporting
    ``.detach().cpu().numpy()`` (i.e. a PyTorch tensor).

    Args:
        pkl_file_path (str): Path to the pickle file.

    Returns:
        numpy.ndarray: Array containing the embeddings, one row per item, in
        the order stored in the file.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(pkl_file_path, 'rb') as f:
        embeddings_data = pickle.load(f)

    # detach() drops the autograd graph; cpu() makes the conversion work for
    # tensors stored on an accelerator (numpy() only accepts CPU tensors) and
    # is a no-op for tensors that are already on the CPU.
    embeddings_list = [
        item['embedding'].detach().cpu().numpy() for item in embeddings_data
    ]
    return np.array(embeddings_list)





In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def prepare_training_data_v1(test_size=0.10, random_state=None):
    """Build a stratified train/test split from the pre-computed embeddings.

    The train and test embedding pickles and the train and test label columns
    are each concatenated into one pool, the labels are one-hot encoded with
    ``make_test_labels``, and the pool is re-split with ``train_test_split``.
    The original train/test file boundary is therefore NOT preserved.

    Args:
        test_size (float): Fraction of the pooled data held out for testing.
            Defaults to 0.10.
        random_state (int | None): Seed forwarded to ``train_test_split``.
            ``None`` (the default) gives a different split on every call.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``. The ``x_*`` entries are
        NumPy arrays of embeddings; the ``y_*`` entries are lists of one-hot
        rows (``[1, 0]`` for label "1", ``[0, 1]`` for label "0").

    Raises:
        ValueError: If the number of encoded labels differs from the number of
            embeddings.
    """
    pkl_file_path_train = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train_embeddings_data.pkl'
    x_train = get_embeddings_array(pkl_file_path_train)

    print('prepare_training_data_v1: x_train')
    print(len(x_train))

    pkl_file_path_test = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test_embeddings_data.pkl'
    x_test = get_embeddings_array(pkl_file_path_test)

    y_train, y_test = get_y_train()

    # Pool train and test data before re-splitting.
    x_data = np.concatenate((x_train, x_test), axis=0)
    y_data = np.concatenate((y_train, y_test), axis=0)

    # One-hot encode; values that are not "1"/"0" (e.g. header cells) are dropped.
    test_labels = make_test_labels(y_data)

    # make_test_labels drops rows silently, so fail loudly here rather than
    # train on features and labels that no longer line up.
    if len(test_labels) != len(x_data):
        raise ValueError(
            f"Found {len(x_data)} embeddings but {len(test_labels)} usable labels; "
            "features and labels must have the same length."
        )

    # Stratified split keeps the class ratio the same in both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        x_data,
        test_labels,
        test_size=test_size,
        stratify=test_labels,
        random_state=random_state,
    )

    return x_train, x_test, y_train, y_test

In [None]:
def _sequence_to_kmers(sequence, k):
    """Return every overlapping length-``k`` substring of one sequence, in order.

    The value is converted with ``str()`` and leading/trailing ``[``, ``]`` and
    ``'`` characters are stripped before slicing. A sequence shorter than ``k``
    yields an empty list.

    Example: ``"ATCGATCG"`` with ``k=3`` gives
    ``["ATC", "TCG", "CGA", "GAT", "ATC", "TCG"]``.
    """
    text = str(sequence).strip('[]\'')
    return [text[start:start + k] for start in range(len(text) - k + 1)]


def prepare_training_data():
    """Load the raw TSV data and encode it as padded k-mer token sequences.

    Steps:
        1. Read the train and test TSV files (no header parsing).
        2. Take the sequences from column 2, skipping row 0 of each file, and
           the labels from column 1.
        3. One-hot encode the labels ("1" -> ``[1, 0]``, "0" -> ``[0, 1]``);
           any other value, such as a header cell, is skipped.
        4. Split each sequence into overlapping 3-mers so local sequence
           context is captured.
        5. Map each distinct k-mer to an integer id with a Keras ``Tokenizer``
           (vocabulary capped via ``num_words=30000``) and pad each id list
           with trailing zeros to a fixed length of 48.
        6. Cut the encoded data back into train and test parts at the number
           of training sequences.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``. The ``x_*`` entries are
        NumPy integer arrays with 48 columns; the ``y_*`` entries are lists of
        one-hot rows.
    """
    train_filename = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train.tsv'
    test_filename = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/test.tsv'

    K_MER = 3

    train_data = pd.read_csv(train_filename, header=None, sep="\t")
    test_data = pd.read_csv(test_filename, header=None, sep="\t")

    # Column 2: nucleotide sequences (ACTG); row 0 of each file is skipped.
    pro_x_train = train_data[2][1:]
    # Column 1: 0/1 labels; row 0 is kept here and filtered out while encoding.
    y_train = train_data[1]
    print(y_train)

    # Number of training sequences; used to split the encoded data again below.
    pos_train_len = len(pro_x_train)

    pro_x_test = test_data[2][1:]
    y_test = test_data[1]

    pro_x_data = pd.concat([pro_x_train, pro_x_test], ignore_index=True)
    pro_y_data = pd.concat([y_train, y_test], ignore_index=True)

    # One-hot encode the labels. Position 0 is skipped to mirror the sequence
    # slicing above; any other non-"1"/"0" cell is dropped as well.
    test_labels = []
    for raw_label in pro_y_data.iloc[1:]:
        if raw_label == "1":
            test_labels.append([1, 0])
        elif raw_label == "0":
            test_labels.append([0, 1])

    # K-mer encoding: one list of overlapping K_MER-length chunks per sequence,
    # e.g. [["ATC", "TCG", "CGA", "GAT", "ATC", "TCG"], ...].
    three_er_list = [_sequence_to_kmers(seq, K_MER) for seq in pro_x_data]

    # Turn k-mers into integer ids and make every id list the same length (48)
    # by appending zeros (padding="post"); longer lists are cut by
    # pad_sequences' default truncation rule.
    tokenizer = Tokenizer(num_words=30000)
    tokenizer.fit_on_texts(three_er_list)
    sequences = tokenizer.texts_to_sequences(three_er_list)
    sequences = pad_sequences(sequences, maxlen=48, padding="post")
    sequences = np.array(sequences)

    # Restore the original train/test boundary.
    x_train, x_test = sequences[:pos_train_len], sequences[pos_train_len:]
    y_train, y_test = test_labels[:pos_train_len], test_labels[pos_train_len:]

    return x_train, x_test, y_train, y_test





In [None]:
# x_train, x_test, y_train, y_test = prepare_training_data()
# print(x_train.shape)
# print(x_test.shape)

# x_train, x_test, y_train, y_test = prepare_training_data_v1()
# print(x_train.shape)
# print(x_test.shape)
# print(y_train.shape)
# print(y_test.shape)

In [None]:
def create_masked_data(x_train, mask_percentage):
    """
    Return a copy of ``x_train`` in which random entries are replaced by 0.0.

    Each entry is masked independently: a uniform draw in [0, 1) is made per
    entry (from NumPy's global RNG) and the entry is zeroed when its draw is
    below ``mask_percentage``.

    Args:
    x_train (numpy.ndarray): Data to mask; its ``shape`` drives the draws.
    mask_percentage (float): Per-entry masking probability, between 0 and 1.

    Returns:
    numpy.ndarray: New array with the same shape as ``x_train``.

    Raises:
    ValueError: If ``mask_percentage`` is below 0 or above 1.
    """
    if mask_percentage > 1.0 or mask_percentage < 0.0:
        raise ValueError("mask_percentage must be between 0 and 1.")

    # One uniform sample per entry decides whether that entry survives.
    draws = np.random.random(x_train.shape)
    keep = ~(draws < mask_percentage)

    # Surviving entries pass through unchanged; the rest become 0.0.
    return np.where(keep, x_train, 0.0)

In [None]:
import time
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, LSTM, Conv1D, GlobalAveragePooling1D,
    concatenate, Bidirectional, LayerNormalization, Reshape
)
from tensorflow.keras.models import Model

def make_model_architecture():
    """Build and compile the embedding classifier.

    The network takes one 768-dimensional vector per sample, reshapes it to a
    length-1 sequence and passes it through, in order: a multi-head
    self-attention block, a feed-forward block (both with residual adds and
    layer normalisation), two bidirectional LSTM blocks, three parallel
    Conv1D branches (kernel sizes 3, 5, 7) that are concatenated, global
    average pooling, and two dense layers before a 2-way softmax.

    NumPy's global RNG is re-seeded from the wall clock on every call.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, Adam with
        learning rate 1e-5, accuracy metric).
    """
    # Seed NumPy from the current time in milliseconds, folded into 31 bits.
    millis = int(round(time.time() * 1000))
    np.random.seed(millis % 2147483648)

    def layer_norm(tensor):
        return LayerNormalization(epsilon=1e-6)(tensor)

    def dropout(tensor):
        return Dropout(0.1)(tensor)

    sequence_input = Input(shape=(768,))

    # Add a time axis of length 1 so sequence layers can be applied.
    hidden = Reshape((1, 768))(sequence_input)
    hidden = dropout(layer_norm(hidden))

    # Self-attention with a residual connection.
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=96)
    attended = self_attention(hidden, hidden)
    hidden = layer_norm(hidden + attended)

    # Position-wise feed-forward network with a residual connection.
    expanded = Dense(3072, activation="gelu")(hidden)
    projected = Dense(768)(expanded)
    hidden = layer_norm(hidden + projected)

    # Two identical BiLSTM -> dropout -> layer-norm stages.
    for _ in range(2):
        hidden = Bidirectional(LSTM(384, return_sequences=True))(hidden)
        hidden = layer_norm(dropout(hidden))

    # Parallel convolutions with different receptive fields, joined on channels.
    conv_branches = [
        Conv1D(filters=256, kernel_size=width, activation="gelu", padding="same")(hidden)
        for width in (3, 5, 7)
    ]
    hidden = concatenate(conv_branches, axis=-1)
    hidden = dropout(layer_norm(hidden))

    # Collapse the time axis.
    hidden = GlobalAveragePooling1D()(hidden)

    # Classification head.
    for units in (512, 256):
        hidden = dropout(Dense(units, activation="gelu")(hidden))

    output = Dense(2, activation="softmax")(hidden)

    model = Model(inputs=sequence_input, outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [None]:
class AUCMCCCallback(Callback):
    """Keras callback that reports AUC, MCC and accuracy on a validation set.

    After every epoch the model is run on the validation data, the three
    metrics are printed, and the best ``val_accuracy`` seen so far (taken from
    the Keras epoch logs) is tracked in ``best_val_accuracy``.

    The evaluation runs in ``on_epoch_end`` so that every epoch, including the
    last one, is reported and counted towards ``best_val_accuracy``. Epochs
    are printed 1-based.

    Args:
        validation_data: ``(x_val, y_val)`` pair; ``y_val`` is expected to be
            one-hot encoded with two columns, matching the model output.
    """

    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        # Logs of the most recently finished epoch.
        self.last_epoch_logs = {}
        # Highest "val_accuracy" value observed in the epoch logs.
        self.best_val_accuracy = 0.0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model on the validation data and print the metrics."""
        # Keras may pass None; copy so later mutation by Keras cannot affect us.
        self.last_epoch_logs = dict(logs or {})

        x_val, y_val = self.validation_data
        y_pred = self.model.predict(x_val)

        # AUC is computed on the raw class probabilities.
        auc = roc_auc_score(y_val, y_pred)

        # MCC and accuracy need hard class indices.
        y_pred_binary = tf.argmax(y_pred, axis=1)
        y_val_binary = tf.argmax(y_val, axis=1)
        mcc = matthews_corrcoef(y_val_binary.numpy(), y_pred_binary.numpy())  # MCC needs NumPy arrays

        accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred_binary, y_val_binary), tf.float32))

        print(f"\nValidation AUC: {auc:.4f} - MCC: {mcc:.4f} - ACC: {accuracy.numpy():.4f}")

        # Track and print best validation accuracy.
        val_accuracy = self.last_epoch_logs.get("val_accuracy", 0)
        if val_accuracy > self.best_val_accuracy:
            self.best_val_accuracy = val_accuracy

        # Keras epochs are 0-based; report the number of completed epochs.
        finished_epoch = epoch + 1
        print("epoch[", finished_epoch, "].val_accuracy:", val_accuracy)
        print("epoch[", finished_epoch, "].best_accuracy:", self.best_val_accuracy)

In [None]:
# Usage: load the pooled embedding dataset and wrap each split
# (x_train, x_test, y_train, y_test) as a TensorFlow tensor.
x_train, x_test, y_train, y_test = (
    tf.convert_to_tensor(split) for split in prepare_training_data_v1()
)
# print(x_train)

# Masking is currently disabled; the value is kept for the commented-out call.
mask_percentage = 0.15  # Mask 15% of the data
# x_train = create_masked_data(x_train, mask_percentage)

model = make_model_architecture()

# Report AUC / MCC / accuracy on the held-out split during training.
auc_mcc_callback = AUCMCCCallback(validation_data=(x_test, y_test))

model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    batch_size=1000,
    callbacks=[auc_mcc_callback],
    validation_data=(x_test, y_test),
)

# accuracy_best_list[number] = best_acc

# for i in accuracy_best_list.keys():
#     print("best_acc[", i, "] = ", accuracy_best_list[i])
# avg = float(sum(accuracy_best_list.values())) / len(accuracy_best_list)
# print()
# print("best_acc[avg] = ", avg)

In [None]:
# # Draw architecture

# from tensorflow.keras.utils import plot_model

# plot_model(
#     model,
#     to_file='model_plot.png',         # Save the plot to a file
#     show_shapes=True,                 # Show input and output shapes
#     show_layer_names=True,            # Show layer names
#     dpi=96,                           # Set the resolution of the plot
# )


In [None]:
import pickle

def get_pkl_length(pkl_file_path):
    """
    Return ``len()`` of the object stored in a pickle file.

    Args:
        pkl_file_path (str): Location of the pickle file to inspect.

    Returns:
        int: Number of items in the unpickled object.
    """
    with open(pkl_file_path, mode='rb') as handle:
        return len(pickle.load(handle))

# Example usage: print how many items the training-embeddings pickle contains.
pkl_file_path = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train_embeddings_data.pkl'
length = get_pkl_length(pkl_file_path)
print(f"Length of the data in the pickle file: {length}")

In [None]:
import pandas as pd

def get_tsv_length(tsv_file_path):
    """
    Count the rows of a tab-separated file.

    The file is parsed without a header, so its first line is counted as a
    row like any other.

    Args:
        tsv_file_path (str): Location of the TSV file.

    Returns:
        int: Number of rows read from the file.
    """
    frame = pd.read_csv(tsv_file_path, header=None, sep='\t')
    row_count, _ = frame.shape
    return row_count

# Example usage: print the row count of the training TSV (the file is read
# without a header, so every line is counted).
tsv_file_path = '/content/drive/MyDrive/deepPGD/4mC/4mC_F.vesca/train.tsv'
length = get_tsv_length(tsv_file_path)
print(f"Number of rows in the TSV file: {length}")