# Data Loading and Preprocessing for RNA Sequences

### Description:
### Load Data Function (load_data):
- This function takes a file path and a column name as inputs. It reads the data from the specified CSV file into a dataframe (a table-like structure in Python).
- It checks if there are any missing values in the specified column that contains RNA sequences. If missing values are found, it removes those rows from the dataframe to ensure that the analysis is done only on complete data.
### One-Hot Encoding Function (one_hot_encode):
- This function takes an RNA sequence as input. RNA sequences are made up of nucleotides represented by the letters A, C, G, and U.
- It checks if the input sequence is in the correct format (i.e., it's a string). If it’s not, the function returns None, which is a way of indicating missing or incorrect data.
- The sequence is then converted to uppercase and any occurrence of 'T' (thymine, which is found in DNA but not RNA) is replaced with 'U' (uracil, which is specific to RNA).
- The function then converts the sequence into a one-hot encoded format. This means each nucleotide (A, C, G, U) is represented as a vector (a list of numbers). For example, 
A is represented as [1, 0, 0, 0], C as [0, 1, 0, 0], G as [0, 0, 1, 0], and U as [0, 0, 0, 1]. This numerical representation is useful for computational models that require numeric input.
### Loading Specific Datasets (ENCORI and LncBase):
- The code then loads two datasets from CSV files: one called ENCORI_miRNA_lncRNA.csv and another called lncbase_with_sequences.csv. These datasets contain RNA sequences along with other biological information.
- It then applies the one-hot encoding function to the columns of these datasets that contain RNA sequences. This prepares the data for further analysis or modeling, where understanding the relationships between sequences can be crucial, such as predicting RNA interactions or functions.

In [1]:
import numpy as np
import pandas as pd

def load_and_encode_data(file_path, sequence_column, structure_column):
    """Read a CSV file and attach encoded sequence and structure columns.

    Rows with a missing value in either ``sequence_column`` or
    ``structure_column`` are dropped before encoding.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.
        sequence_column: Column encoded with ``one_hot_encode``.
        structure_column: Column encoded with ``encode_structure``.

    Returns:
        The DataFrame with two added columns, ``encoded_sequence`` and
        ``encoded_structure``, each holding one array per row.
    """
    frame = pd.read_csv(file_path)
    required_columns = [sequence_column, structure_column]
    frame.dropna(subset=required_columns, inplace=True)

    # (target column, source column, encoder) -- applied in this order.
    encoding_plan = (
        ('encoded_sequence', sequence_column, one_hot_encode),
        ('encoded_structure', structure_column, encode_structure),
    )
    for target, source, encoder in encoding_plan:
        frame[target] = frame[source].apply(encoder)
    return frame

def one_hot_encode(sequence):
    """One-hot encode a nucleotide string as a ``(len(sequence), 4)`` array.

    The sequence is upper-cased and every 'T' is rewritten to 'U' before
    encoding, so DNA-style input is accepted. Channels are ordered
    A, C, G, U; any other character becomes an all-zero row.

    Args:
        sequence: Nucleotide string.

    Returns:
        float32 ``np.ndarray`` of shape ``(len(sequence), 4)``. An empty
        string yields shape ``(0, 4)`` so the feature axis is always present.
    """
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'U': [0, 0, 0, 1]}
    unknown = [0, 0, 0, 0]
    normalized = sequence.upper().replace('T', 'U')
    rows = [mapping.get(nucleotide, unknown) for nucleotide in normalized]
    # reshape keeps the trailing feature axis even when `rows` is empty;
    # np.array([]) alone would collapse to shape (0,).
    return np.array(rows, dtype=np.float32).reshape(-1, 4)

def encode_structure(structure):
    """One-hot encode a dot-bracket structure string as a ``(len, 3)`` array.

    Channels are ordered '.', '(', ')'; any other character becomes an
    all-zero row.

    Args:
        structure: Structure string made of '.', '(' and ')' characters.

    Returns:
        float32 ``np.ndarray`` of shape ``(len(structure), 3)``. An empty
        string yields shape ``(0, 3)`` so the feature axis is always present.
    """
    mapping = {'.': [1, 0, 0], '(': [0, 1, 0], ')': [0, 0, 1]}
    unknown = [0, 0, 0]
    rows = [mapping.get(char, unknown) for char in structure]
    # reshape keeps the trailing feature axis even when `rows` is empty;
    # np.array([]) alone would collapse to shape (0,).
    return np.array(rows, dtype=np.float32).reshape(-1, 3)

def pad_encoded_data(encoded_data, max_length):
    """Zero-pad or truncate encoded sequences to a common length.

    Args:
        encoded_data: Sequence of 2-D ``(length, features)`` arrays (or nested
            lists). Empty entries are allowed and become all-zero rows.
        max_length: Target length along the position axis. Longer sequences
            are truncated, shorter ones are padded with zeros at the end.

    Returns:
        float32 ``np.ndarray`` of shape
        ``(len(encoded_data), max_length, features)``. ``features`` is taken
        from the first entry that has a feature axis; it is 0 when no entry
        has one (for example when ``encoded_data`` is empty).
    """
    arrays = [np.asarray(seq, dtype=np.float32) for seq in encoded_data]
    # Do not index encoded_data[0][0]: that raises IndexError when the batch
    # is empty or its first sequence has length zero.
    feature_dim = next((arr.shape[-1] for arr in arrays if arr.ndim == 2), 0)

    padded = np.zeros((len(arrays), max_length, feature_dim), dtype=np.float32)
    for i, arr in enumerate(arrays):
        length = min(len(arr), max_length)
        if length:
            padded[i, :length, :] = arr[:length]
    return padded

def create_overlapping_windows(data, window_size, overlap):
    """Slice ``data`` into fixed-size windows along its first axis.

    Consecutive windows start ``window_size - overlap`` positions apart. Only
    full windows are produced: trailing positions that do not fill a complete
    window are dropped, and data shorter than ``window_size`` yields no
    windows at all.

    Args:
        data: Array or list sliced along its first axis.
        window_size: Number of positions per window.
        overlap: Number of positions shared by consecutive windows; must be
            smaller than ``window_size``.

    Returns:
        ``np.ndarray`` of shape ``(num_windows, window_size, ...)``. When no
        window fits, the result has shape ``(0, window_size, ...)`` so the
        trailing axes stay consistent with the non-empty case.

    Raises:
        ValueError: If ``overlap >= window_size`` (non-positive step).
    """
    step = window_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than window_size")

    last_start = len(data) - window_size
    if last_start < 0:
        # Nothing fits: return a correctly shaped empty stack rather than a
        # bare shape-(0,) array.
        trailing_shape = tuple(np.shape(data)[1:])
        return np.empty((0, window_size) + trailing_shape, dtype=np.asarray(data).dtype)

    starts = range(0, last_start + 1, step)
    return np.array([data[start:start + window_size] for start in starts])


def prepare_dataset(file_path, sequence_column, structure_column, window_size=500, overlap=250, label_column=None, threshold=None):
    """Load a CSV and derive padded, windowed sequence/structure features.

    Columns added to the returned DataFrame:
        * ``encoded_sequence`` / ``encoded_structure`` (from
          ``load_and_encode_data``)
        * ``padded_sequences`` / ``padded_structures``: each row zero-padded
          to one common length
        * ``sequence_windows`` / ``structure_windows``: overlapping windows
          over the padded arrays
        * ``integrated_data``: sequence and structure windows concatenated
          along the last axis
        * ``labels``: only when ``label_column`` is given

    Args:
        file_path: Path of the CSV file.
        sequence_column: Name of the column holding nucleotide strings.
        structure_column: Name of the column holding structure strings.
        window_size: Positions per window.
        overlap: Positions shared by consecutive windows; must be smaller
            than ``window_size``.
        label_column: Optional column used to derive binary labels.
        threshold: When given, a row is labelled 1 if its ``label_column``
            value is ``>= threshold``; otherwise a row is labelled 1 only if
            the value equals the string ``'positive'``.

    Returns:
        The DataFrame with the columns listed above.

    Raises:
        ValueError: If ``overlap >= window_size``.
    """
    step = window_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than window_size")

    df = load_and_encode_data(file_path, sequence_column, structure_column)

    if df.empty:
        longest = 0
    else:
        longest = int(max(df['encoded_sequence'].apply(len).max(), df['encoded_structure'].apply(len).max()))

    # Pad to window_size + k * step, the smallest such length that is at least
    # the longest row. Padding only to `longest` produced zero windows when
    # every row was shorter than window_size and silently dropped the tail
    # whenever (longest - window_size) was not a multiple of step.
    overhang = max(longest - window_size, 0)
    max_length = window_size + ((overhang + step - 1) // step) * step

    df['padded_sequences'] = df['encoded_sequence'].apply(lambda x: pad_encoded_data([x], max_length)[0])
    df['padded_structures'] = df['encoded_structure'].apply(lambda x: pad_encoded_data([x], max_length)[0])

    df['sequence_windows'] = df['padded_sequences'].apply(lambda x: create_overlapping_windows(x, window_size, overlap))
    df['structure_windows'] = df['padded_structures'].apply(lambda x: create_overlapping_windows(x, window_size, overlap))

    df['integrated_data'] = df.apply(lambda row: np.concatenate((row['sequence_windows'], row['structure_windows']), axis=-1), axis=1)

    if label_column:
        if threshold is not None:
            df['labels'] = df[label_column].apply(lambda x: 1 if x >= threshold else 0)
        else:
            df['labels'] = df[label_column].apply(lambda x: 1 if x == 'positive' else 0)

    return df


In [2]:
# Build the miRNA dataset. A threshold is supplied, so prepare_dataset labels a
# row 1 when its clipExpNum value is >= 10 and 0 otherwise. window_size and
# overlap are left at their defaults (500 and 250).
mirna_dataset = prepare_dataset(
    'dataset/mirna_sequences.csv', 'miRseq', 'miRseq_structure', 
    label_column='clipExpNum', threshold=10
)

# Build the lncRNA dataset. No threshold is supplied, so a row is labelled 1
# only when its positive_negative value equals the string 'positive'.
lncrna_dataset = prepare_dataset(
    'dataset/lncbase_with_sequences.csv', 'Sequence', 'Sequence_structure', 
    label_column='positive_negative'
)


: 

In [None]:
# Inspect which columns the miRNA DataFrame holds after preprocessing.
mirna_dataset.columns

In [None]:
# Inspect which columns the lncRNA DataFrame holds after preprocessing.
lncrna_dataset.columns

## miRNA Dataset: Structure and Sequence Combined Transformer

In [None]:
import tensorflow as tf
import numpy as np

def get_positional_encoding(seq_length, model_size):
    """Build a sinusoidal positional-encoding tensor.

    Angles are ``position / 10000 ** (2 * (i // 2) / model_size)`` for feature
    index ``i``. The sines of the even-indexed feature columns are placed
    first and the cosines of the odd-indexed columns after them (concatenated,
    not interleaved).

    Args:
        seq_length: Number of positions.
        model_size: Number of feature columns.

    Returns:
        float32 tensor of shape ``(1, seq_length, model_size)``; the leading
        axis is there so the encoding broadcasts over a batch.
    """
    positions = np.arange(seq_length)[:, np.newaxis]
    feature_index = np.arange(model_size)
    exponents = (2 * (feature_index // 2)) / np.float32(model_size)
    angles = positions * (1 / np.power(10000, exponents))

    sin_part = np.sin(angles[:, 0::2])
    cos_part = np.cos(angles[:, 1::2])
    encoding = np.concatenate([sin_part, cos_part], axis=-1)
    return tf.cast(np.expand_dims(encoding, 0), dtype=tf.float32)

def transformer_encoder(inputs, model_size, num_heads, ff_dim, dropout_rate):
    """Apply one Transformer encoder block to ``inputs``.

    Steps: add the sinusoidal positional encoding, run multi-head
    self-attention with dropout, add the residual and layer-normalize, then
    run a two-layer feed-forward network with dropout, add the residual and
    layer-normalize again.

    Args:
        inputs: Tensor indexed as ``shape[1]`` (sequence length) and
            ``shape[2]`` (feature width) -- presumably
            ``(batch, seq_length, features)``; confirm against the caller.
        model_size: Ignored. It is overwritten with ``inputs.shape[2]``, which
            is then used as the attention ``key_dim`` and the feed-forward
            output width.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate for attention and feed-forward paths.

    Returns:
        Tensor with the same feature width as ``inputs``.
    """
    layers = tf.keras.layers

    seq_length = inputs.shape[1]
    # The argument value is discarded here; the feature width of `inputs` wins.
    model_size = inputs.shape[2]
    inputs += get_positional_encoding(seq_length, model_size)

    # Self-attention sub-block with residual connection.
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=model_size, dropout=dropout_rate)
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended)
    attended = layers.Add()([inputs, attended])
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)

    # Position-wise feed-forward sub-block with residual connection.
    hidden = layers.Dense(ff_dim, activation='relu')(attended)
    hidden = layers.Dropout(dropout_rate)(hidden)
    projected = layers.Dense(model_size)(hidden)
    merged = layers.Add()([attended, projected])
    return layers.LayerNormalization(epsilon=1e-6)(merged)


In [None]:
import tensorflow as tf

def build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout):
    """Build and compile a binary classifier from stacked encoder blocks.

    Args:
        input_shape: Shape of one sample (without the batch axis), passed to
            ``tf.keras.Input``.
        num_layers: Number of ``transformer_encoder`` blocks to stack.
        head_size: Forwarded to ``transformer_encoder`` as ``model_size``.
        num_heads: Number of attention heads per block.
        ff_dim: Hidden width of each block's feed-forward network.
        dropout: Dropout rate used inside each block.

    Returns:
        A compiled ``tf.keras.Model`` with one sigmoid output, Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs

    # Keyword arguments keep this call correct regardless of the positional
    # parameter order of whichever transformer_encoder definition is live in
    # the notebook.
    for _ in range(num_layers):
        x = transformer_encoder(
            x,
            model_size=head_size,
            num_heads=num_heads,
            ff_dim=ff_dim,
            dropout_rate=dropout,
        )

    # Average over the sequence axis. A length-1 sequence needs no special
    # case: pooling over a single step already returns that step, whereas
    # squeezing the axis first would hand the pooling layer a rank-2 tensor
    # that it rejects.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
# Stack the per-row window arrays and the labels of the miRNA dataset.
# NOTE(review): each 'sequence_windows' entry is itself a stack of windows, so X
# presumably has more than three axes -- confirm before passing it to a model
# that reads inputs.shape[1] / inputs.shape[2] as (seq_length, features).
X = np.array(mirna_dataset['sequence_windows'].tolist())
y = np.array(mirna_dataset['labels'].tolist())

In [None]:
import numpy as np
import pandas as pd

# Simulating the loading and processing of sequence data
def simulate_data_processing(num_sequences=424, seq_length=100, seed=None):
    """Generate random one-hot encoded sequences for smoke-testing a model.

    Args:
        num_sequences: Number of sequences to generate (default 424).
        seq_length: Length of every sequence (default 100).
        seed: Optional seed for reproducible output. When ``None`` the global
            ``np.random`` state is used.

    Returns:
        float64 ``np.ndarray`` of shape ``(num_sequences, seq_length, 4)`` in
        which every position holds exactly one 1.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Draw a class index in [0, 4) for every position.
    sequence_data = rng.randint(0, 4, (num_sequences, seq_length))
    # Indexing the identity matrix yields (num_sequences, seq_length, 4)
    # directly, so no reshape is needed.
    return np.eye(4)[sequence_data]

# Load and process data
# Load and process data
# NOTE: this rebinds X and y to simulated data, replacing any X / y assigned by
# an earlier cell.
X = simulate_data_processing()
y = np.random.randint(0, 2, 424)  # Random binary labels

print("Shape of X after processing:", X.shape)
print("Shape of y:", y.shape)

# Define model parameters
input_shape = X.shape[1:]  # Dynamic input shape based on data
num_layers = 4
head_size = 64
num_heads = 4
ff_dim = 256
dropout = 0.1

from sklearn.model_selection import train_test_split

# Build the model, ensuring the input shape is correctly passed
# Each stage below prints a short label for the failing step and then re-raises,
# so the original exception still propagates.
try:
    sequence_model = build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout)
except Exception as e:
    print("Error in building the model:", e)
    raise

# Split data for training
# 80/20 split with a fixed random_state so the partition is repeatable.
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except Exception as e:
    print("Error during train-test split:", e)
    raise

# Train the model
# The held-out split doubles as validation data during fitting.
try:
    history = sequence_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
except Exception as e:
    print("Error during model training:", e)
    raise


In [None]:
import tensorflow as tf
import numpy as np

def get_positional_encoding(seq_length, model_size):
    """Build a sinusoidal positional-encoding tensor.

    Angles are ``position / 10000 ** (2 * (i // 2) / model_size)`` for feature
    index ``i``. The sines of the even-indexed feature columns come first,
    followed by the cosines of the odd-indexed columns (concatenated, not
    interleaved).

    Args:
        seq_length: Number of positions; must not be zero.
        model_size: Number of feature columns.

    Returns:
        float32 tensor of shape ``(1, seq_length, model_size)``; the leading
        axis lets the encoding broadcast over a batch.

    Raises:
        ValueError: If ``seq_length`` is zero.
    """
    if seq_length == 0:
        raise ValueError("Sequence length is zero, which is not valid for positional encoding.")

    positions = np.arange(seq_length)[:, np.newaxis]
    feature_index = np.arange(model_size)
    exponents = (2 * (feature_index // 2)) / np.float32(model_size)
    angles = positions * (1 / np.power(10000, exponents))

    halves = [np.sin(angles[:, 0::2]), np.cos(angles[:, 1::2])]
    encoding = np.expand_dims(np.concatenate(halves, axis=-1), 0)
    return tf.cast(encoding, dtype=tf.float32)

def transformer_encoder(inputs, num_heads, ff_dim, dropout_rate, model_size):
    """Apply one Transformer encoder block to ``inputs``.

    Steps: add the sinusoidal positional encoding, run multi-head
    self-attention with dropout, add the residual and layer-normalize, then
    run a two-layer feed-forward network with dropout, add the residual and
    layer-normalize again.

    Args:
        inputs: Tensor indexed as ``shape[1]`` (sequence length) and
            ``shape[2]`` (feature width) -- presumably
            ``(batch, seq_length, features)``; confirm against the caller.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate for attention and feed-forward paths.
        model_size: Ignored. It is overwritten with ``inputs.shape[2]``, which
            is then used as the attention ``key_dim`` and the feed-forward
            output width.

    Returns:
        Tensor with the same feature width as ``inputs``.

    Raises:
        ValueError: If ``inputs.shape[1]`` is zero.
    """
    layers = tf.keras.layers

    seq_length = inputs.shape[1]
    if seq_length == 0:
        raise ValueError("Input sequence length is zero, check your data preprocessing.")
    # The argument value is discarded here; the feature width of `inputs` wins.
    model_size = inputs.shape[2]
    inputs += get_positional_encoding(seq_length, model_size)

    # Self-attention sub-block with residual connection.
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=model_size, dropout=dropout_rate)
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended)
    attended = layers.Add()([inputs, attended])
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)

    # Position-wise feed-forward sub-block with residual connection.
    hidden = layers.Dense(ff_dim, activation='relu')(attended)
    hidden = layers.Dropout(dropout_rate)(hidden)
    projected = layers.Dense(model_size)(hidden)
    merged = layers.Add()([attended, projected])
    return layers.LayerNormalization(epsilon=1e-6)(merged)

def build_combined_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout):
    """Build and compile a binary classifier over combined features.

    Args:
        input_shape: Shape of one sample (without the batch axis), passed to
            ``tf.keras.Input``.
        num_layers: Number of ``transformer_encoder`` blocks to stack.
        head_size: Forwarded to ``transformer_encoder`` as ``model_size``.
        num_heads: Number of attention heads per block.
        ff_dim: Hidden width of each block's feed-forward network.
        dropout: Dropout rate used inside each block.

    Returns:
        A compiled ``tf.keras.Model`` with one sigmoid output, Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs

    # Keyword arguments keep this call independent of the positional parameter
    # order of transformer_encoder.
    for _ in range(num_layers):
        x = transformer_encoder(
            x,
            num_heads=num_heads,
            ff_dim=ff_dim,
            dropout_rate=dropout,
            model_size=head_size,
        )

    # Average over the sequence axis. A length-1 sequence needs no special
    # case: pooling over a single step already returns that step, whereas
    # squeezing the axis first would hand the pooling layer a rank-2 tensor
    # that it rejects.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [None]:
# Train the combined sequence + structure model on the miRNA dataset.
# NOTE(review): the except clause below only prints the message and does not
# re-raise, so a failure here will not stop the notebook -- confirm that is
# intended.
try:
    # Assuming the data loading and preprocessing have been done correctly
    sequence_data = np.array(mirna_dataset['sequence_windows'].tolist())
    structure_data = np.array(mirna_dataset['structure_windows'].tolist())
    # Join sequence and structure channels feature-wise for every position.
    X = np.concatenate([sequence_data, structure_data], axis=-1)  # Concatenate along the last dimension
    y = np.array(mirna_dataset['labels'].tolist())

    # Guard against preprocessing that produced no data along axis 1.
    if X.shape[1] == 0:
        raise ValueError("Sequence length is zero after concatenation, check your data.")

    # Everything after the leading (sample) axis becomes the model input shape.
    input_shape = X.shape[1:]  # (window_size, num_features)
    model = build_combined_model(input_shape, num_layers=4, head_size=64, num_heads=4, ff_dim=256, dropout=0.1)

    # 80/20 split with a fixed random_state; train_test_split must already be
    # imported by an earlier cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
    print("Training complete.")
except Exception as e:
    print("Error encountered:", e)