# Data Loading and Preprocessing for RNA Sequences

### Description:
### Load Data Function (load_data):
- This function takes a file path and a column name as inputs. It reads the data from the specified CSV file into a dataframe (a table-like structure in Python).
- It checks if there are any missing values in the specified column that contains RNA sequences. If missing values are found, it removes those rows from the dataframe to ensure that the analysis is done only on complete data.
### One-Hot Encoding Function (one_hot_encode):
- This function takes an RNA sequence as input. RNA sequences are made up of nucleotides represented by the letters A, C, G, and U.
- It checks if the input sequence is in the correct format (i.e., it's a string). If it’s not, the function returns None, which is a way of indicating missing or incorrect data.
- The sequence is then converted to uppercase and any occurrence of 'T' (thymine, which is found in DNA but not RNA) is replaced with 'U' (uracil, which is specific to RNA).
- The function then converts the sequence into a one-hot encoded format. This means each nucleotide (A, C, G, U) is represented as a vector (a list of numbers). For example, 
A is represented as [1, 0, 0, 0], C as [0, 1, 0, 0], G as [0, 0, 1, 0], and U as [0, 0, 0, 1]. This numerical representation is useful for computational models that require numeric input.
### Loading Specific Datasets (ENCORI and LncBase):
- The code then loads two datasets from CSV files: one called ENCORI_miRNA_lncRNA.csv and another called lncbase_with_sequences.csv. These datasets contain RNA sequences along with other biological information.
It applies the one-hot encoding function to specific columns in these datasets that contain RNA sequences. This prepares the data for further analysis or modeling, where understanding the relationships between sequences can be crucial, such as predicting RNA interactions or functions.

In [1]:
import numpy as np
import pandas as pd

def load_and_encode_data(file_path, sequence_column, structure_column):
    df = pd.read_csv(file_path)
    df.dropna(subset=[sequence_column, structure_column], inplace=True)
    df['encoded_sequence'] = df[sequence_column].apply(one_hot_encode)
    df['encoded_structure'] = df[structure_column].apply(encode_structure)
    return df

def one_hot_encode(sequence):
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'U': [0, 0, 0, 1]}
    return np.array([mapping.get(nucleotide, [0, 0, 0, 0]) for nucleotide in sequence.upper().replace('T', 'U')], dtype=np.float32)

def encode_structure(structure):
    mapping = {'.': [1, 0, 0], '(': [0, 1, 0], ')': [0, 0, 1]}
    return np.array([mapping.get(char, [0, 0, 0]) for char in structure], dtype=np.float32)

def pad_encoded_data(encoded_data, max_length):
    padded = np.zeros((len(encoded_data), max_length, len(encoded_data[0][0])), dtype=np.float32)
    for i, seq in enumerate(encoded_data):
        length = min(len(seq), max_length)
        padded[i, :length, :] = seq[:length]
    return padded

def create_overlapping_windows(data, window_size, overlap):
    step = window_size - overlap
    num_windows = max(1, (len(data) - window_size) // step + 1)
    windows = [data[i * step: i * step + window_size] for i in range(num_windows) if i * step + window_size <= len(data)]
    return np.array(windows)

def prepare_dataset(file_path, sequence_column, structure_column, min_window_size=50, max_window_size=500, overlap_ratio=0.5, label_column=None, threshold=None):
    df = load_and_encode_data(file_path, sequence_column, structure_column)
    
    # Determine appropriate window size based on the shortest sequence
    shortest_seq_length = df['encoded_sequence'].apply(len).min()
    window_size = min(max_window_size, max(min_window_size, shortest_seq_length))
    overlap = int(window_size * overlap_ratio)
    
    df['padded_sequences'] = df['encoded_sequence'].apply(lambda x: pad_encoded_data([x], window_size)[0])
    df['padded_structures'] = df['encoded_structure'].apply(lambda x: pad_encoded_data([x], window_size)[0])
    
    df['sequence_windows'] = df['padded_sequences'].apply(lambda x: create_overlapping_windows(x, window_size, overlap))
    df['structure_windows'] = df['padded_structures'].apply(lambda x: create_overlapping_windows(x, window_size, overlap))
    
    df['integrated_data'] = df.apply(lambda row: np.concatenate((row['sequence_windows'], row['structure_windows']), axis=-1), axis=1)

    if label_column:
        df['labels'] = df[label_column].apply(lambda x: 1 if (threshold is not None and x >= threshold) else 0)

    return df

In [2]:
mirna_dataset = prepare_dataset(
    'dataset/mirna_sequences.csv', 'miRseq', 'miRseq_structure', 
    label_column='clipExpNum', threshold=10
)

lncrna_dataset = prepare_dataset(
    'dataset/lncbase_with_sequences.csv', 'Sequence', 'Sequence_structure', 
    label_column='positive_negative'
)


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/lncbase_with_sequences.csv'

In [None]:
mirna_dataset.columns

In [None]:
lncrna_dataset.columns

## Mirna Dataset Sequence  transformer

In [3]:
import tensorflow as tf
import numpy as np

def get_positional_encoding(seq_length, model_size):
    angle_rates = 1 / np.power(10000, (2 * (np.arange(model_size) // 2)) / np.float32(model_size))
    angle_rads = np.arange(seq_length)[:, np.newaxis] * angle_rates
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = np.expand_dims(pos_encoding, 0)  # Add batch dimension for broadcasting
    return tf.cast(pos_encoding, dtype=tf.float32)

def transformer_encoder(inputs, model_size, num_heads, ff_dim, dropout_rate):
    seq_length = inputs.shape[1]  # Ensure this is correct
    model_size = inputs.shape[2]  # Ensure this matches the last dimension of inputs
    pos_encoding = get_positional_encoding(seq_length, model_size)
    inputs += pos_encoding  # Add positional encoding to inputs

    attention_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=model_size, dropout=dropout_rate)
    attention_output = attention_layer(inputs, inputs)
    attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)
    attention_output = tf.keras.layers.Add()([inputs, attention_output])
    attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output)

    ff_layer_one = tf.keras.layers.Dense(ff_dim, activation='relu')
    ff_layer_two = tf.keras.layers.Dense(model_size)
    ff_output = ff_layer_one(attention_output)
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)
    ff_output = ff_layer_two(ff_output)
    ff_output = tf.keras.layers.Add()([attention_output, ff_output])
    ff_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(ff_output)

    return ff_output


2024-05-05 23:22:11.659235: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 23:22:11.659298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 23:22:11.660289: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-05 23:22:11.665588: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import tensorflow as tf

def build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout):
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs

    # Applying multiple layers of the transformer encoder
    for _ in range(num_layers):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    # Removing any singleton dimensions and applying pooling
    if x.shape[1] == 1:
        x = tf.squeeze(x, axis=1)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
X = np.array(mirna_dataset['sequence_windows'].tolist())
y = np.array(mirna_dataset['labels'].tolist())

In [5]:
import numpy as np
import pandas as pd

# Simulating the loading and processing of sequence data
def simulate_data_processing():
    # Simulate some sequence data
    sequence_data = np.random.randint(0, 4, (424, 100))  # Example: 424 sequences of length 100
    # One-hot encode the sequence data
    sequence_encoded = np.eye(4)[sequence_data]  # Example: one-hot encoding
    return sequence_encoded.reshape(424, -1, 4)  # Reshape for model input: (batch, sequence_length, features)

# Load and process data
X = simulate_data_processing()
y = np.random.randint(0, 2, 424)  # Random binary labels

print("Shape of X after processing:", X.shape)
print("Shape of y:", y.shape)

# Define model parameters
input_shape = X.shape[1:]  # Dynamic input shape based on data
num_layers = 4
head_size = 64
num_heads = 4
ff_dim = 256
dropout = 0.1

from sklearn.model_selection import train_test_split

# Build the model, ensuring the input shape is correctly passed
try:
    sequence_model = build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout)
except Exception as e:
    print("Error in building the model:", e)
    raise

# Split data for training
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except Exception as e:
    print("Error during train-test split:", e)
    raise

# Train the model
try:
    history = sequence_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
except Exception as e:
    print("Error during model training:", e)
    raise


Shape of X after processing: (424, 100, 4)
Shape of y: (424,)


2024-05-05 23:22:15.111302: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38375 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:21:00.0, compute capability: 8.0
2024-05-05 23:22:15.113640: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38375 MB memory:  -> device: 1, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:81:00.0, compute capability: 8.0
2024-05-05 23:22:15.115822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38375 MB memory:  -> device: 2, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:e2:00.0, compute capability: 8.0


Epoch 1/10


2024-05-05 23:22:19.778144: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-05-05 23:22:20.984309: I external/local_xla/xla/service/service.cc:168] XLA service 0x151aa55d5690 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-05 23:22:20.984352: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2024-05-05 23:22:20.984358: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2024-05-05 23:22:20.984362: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (2): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2024-05-05 23:22:20.988751: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-05 23:22:2

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## miRNA Structure train 

In [6]:
X = np.array(mirna_dataset['structure_windows'].tolist())
y = np.array(mirna_dataset['labels'].tolist())

In [7]:
import numpy as np
import pandas as pd

# Simulating the loading and processing of sequence data
def simulate_data_processing():
    # Simulate some sequence data
    sequence_data = np.random.randint(0, 4, (424, 100))  # Example: 424 sequences of length 100
    # One-hot encode the sequence data
    sequence_encoded = np.eye(4)[sequence_data]  # Example: one-hot encoding
    return sequence_encoded.reshape(424, -1, 4)  # Reshape for model input: (batch, sequence_length, features)

# Load and process data
X = simulate_data_processing()
y = np.random.randint(0, 2, 424)  # Random binary labels

print("Shape of X after processing:", X.shape)
print("Shape of y:", y.shape)

# Define model parameters
input_shape = X.shape[1:]  # Dynamic input shape based on data
num_layers = 4
head_size = 64
num_heads = 4
ff_dim = 256
dropout = 0.1

from sklearn.model_selection import train_test_split

# Build the model, ensuring the input shape is correctly passed
try:
    sequence_model = build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout)
except Exception as e:
    print("Error in building the model:", e)
    raise

# Split data for training
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except Exception as e:
    print("Error during train-test split:", e)
    raise

# Train the model
try:
    history = sequence_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
except Exception as e:
    print("Error during model training:", e)
    raise


Shape of X after processing: (424, 100, 4)
Shape of y: (424,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# miRNA Structure + Sequence Transformer model - intergrated_data

In [8]:
X = np.array(mirna_dataset['integrated_data'].tolist())
y = np.array(mirna_dataset['labels'].tolist())

In [9]:
import numpy as np
import pandas as pd

# Simulating the loading and processing of sequence data
def simulate_data_processing():
    # Simulate some sequence data
    sequence_data = np.random.randint(0, 4, (424, 100))  # Example: 424 sequences of length 100
    # One-hot encode the sequence data
    sequence_encoded = np.eye(4)[sequence_data]  # Example: one-hot encoding
    return sequence_encoded.reshape(424, -1, 4)  # Reshape for model input: (batch, sequence_length, features)

# Load and process data
X = simulate_data_processing()
y = np.random.randint(0, 2, 424)  # Random binary labels

print("Shape of X after processing:", X.shape)
print("Shape of y:", y.shape)

# Define model parameters
input_shape = X.shape[1:]  # Dynamic input shape based on data
num_layers = 4
head_size = 64
num_heads = 4
ff_dim = 256
dropout = 0.1

from sklearn.model_selection import train_test_split

# Build the model, ensuring the input shape is correctly passed
try:
    sequence_model = build_sequence_only_model(input_shape, num_layers, head_size, num_heads, ff_dim, dropout)
except Exception as e:
    print("Error in building the model:", e)
    raise

# Split data for training
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except Exception as e:
    print("Error during train-test split:", e)
    raise

# Train the model
try:
    history = sequence_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
except Exception as e:
    print("Error during model training:", e)
    raise


Shape of X after processing: (424, 100, 4)
Shape of y: (424,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
import matplotlib.pyplot as plt

# Evaluate the model on the test set
test_loss, test_acc = sequence_model.evaluate(X_test, y_test, verbose=2)
print(f"Test accuracy: {test_acc}, Test loss: {test_loss}")

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()


3/3 - 0s - loss: 0.6872 - accuracy: 0.5529 - 56ms/epoch - 19ms/step
Test accuracy: 0.5529412031173706, Test loss: 0.6871737241744995


## lncRNA Dataset Transfomer - For structure, sequence, structure + sequence

In [13]:
def transformer_encoder(inputs, num_heads, model_size, ff_dim, dropout_rate):
    # Multi-head self-attention
    attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=model_size)(inputs, inputs)
    attention = tf.keras.layers.Dropout(dropout_rate)(attention)
    attention_out = tf.keras.layers.Add()([inputs, attention])  # Skip connection
    attention_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_out)

    # Feed-forward network
    ff_net = tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),  # First dense layer
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(model_size)  # Ensures output dimension matches model_size
    ])
    ff_out = ff_net(attention_out)
    ff_out = tf.keras.layers.Add()([attention_out, ff_out])  # Skip connection
    ff_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(ff_out)

    return ff_out


In [14]:
def build_transformer_model(input_shape, num_layers, num_heads, model_size, ff_dim, dropout):
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    
    for _ in range(num_layers):
        x = transformer_encoder(x, num_heads, model_size, ff_dim, dropout)
    
    # Output layer: Adjust according to the problem (classification or regression)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [15]:
lncrna_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 5670
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   geneId              5671 non-null   object
 1   geneName            5671 non-null   object
 2   mirna               5671 non-null   object
 3   species             5671 non-null   object
 4   cell_line           4303 non-null   object
 5   tissue              5428 non-null   object
 6   category            5671 non-null   object
 7   method              5671 non-null   object
 8   positive_negative   5671 non-null   object
 9   direct_indirect     5671 non-null   object
 10  condition           2614 non-null   object
 11  Sequence            5671 non-null   object
 12  Sequence_structure  5671 non-null   object
 13  encoded_sequence    5671 non-null   object
 14  encoded_structure   5671 non-null   object
 15  padded_sequences    5671 non-null   object
 16  padded_structures   5671

In [16]:
# Check the label distribution
label_counts = lncrna_dataset['labels'].value_counts()
print(label_counts)


0    5671
Name: labels, dtype: int64


In [19]:
# Example initialization
!pip install imbalanced-learn

input_shape = (1, 59, 7)  # As per your data
model = build_transformer_model(input_shape=(59, 7), num_layers=4, num_heads=8, model_size=7, ff_dim=256, dropout=0.1)

# Training
X = lncrna_dataset['integrated_data']
y = lncrna_dataset['labels']


# import numpy as np

# # Assuming 'X' is a DataFrame where each row might contain nested arrays
# def flatten_data(X):
#     # Flatten each entry if it's an array; otherwise, keep it as is
#     flattened_X = np.array([x.flatten() if isinstance(x, np.ndarray) else x for x in X])
#     return flattened_X

# # Example of flattening the data
# X_flattened = flatten_data(X.values)  # Make sure 'X' is suitable for this operation

# # Now, 'X_flattened' can be fed into train_test_split or SMOTE
# from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE

# # Splitting the data
# X_train, X_test, y_train, y_test = train_test_split(X_flattened, y, test_size=0.2, random_state=42)

# # Applying SMOTE
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# # Check the new label distribution
# print(np.unique(y_resampled, return_counts=True))




Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/opt/apps/gcc11_2/python3/3.9.7/bin/python3.9 -m pip install --upgrade pip' command.[0m


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def convert_if_needed(array):
    # Ensure array is a numpy array for consistent handling
    if isinstance(array, pd.DataFrame) or isinstance(array, pd.Series):
        array = array.to_numpy()
    
    # Check if the first element is an array (use .iloc for DataFrame/Series safety)
    if isinstance(array[0], np.ndarray):
        array = np.vstack(array)  # This might be necessary if elements are arrays
    
    return tf.convert_to_tensor(array, dtype=tf.float32)  # Convert to tensor

# Assuming 'X' and 'y' have been defined and are Pandas DataFrame/Series
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert data to tensors if not already
X_train = convert_if_needed(X_train)
X_test = convert_if_needed(X_test)
y_train = convert_if_needed(y_train)
y_test = convert_if_needed(y_test)

# Training the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)



In [20]:
# Assuming 'lncrna_dataset' is already loaded and preprocessed
sequence_data = np.array(lncrna_dataset['sequence_windows'].tolist())  # Example path
structure_data = np.array(lncrna_dataset['structure_windows'].tolist())  # Example path
X = np.concatenate([sequence_data, structure_data], axis=-1)

# Check input data shape
print("Combined data shape:", X.shape)

# Define model parameters
input_shape = (1, 59, 7)  # As per your data
print("Input shape for the model:", input_shape)

# Build the model
try:
    model = build_transformer_model(input_shape, num_layers=4, model_size=64, num_heads=8, ff_dim=256, dropout=0.1)
    print("Model built successfully.")
except Exception as e:
    print("Model building failed:", str(e))


Combined data shape: (5671, 1, 59, 7)
Input shape for the model: (1, 59, 7)
Model built successfully.


In [23]:
from sklearn.model_selection import train_test_split
import tensorflow as tf


# Simulated data loading function
def load_data():
    # Simulating an issue where 'X' has more entries than 'y'
    X = np.random.rand(5671, 1, 59, 7)  # For example, the combined data shape you mentioned
    y = np.random.randint(2, size=(424,))  # Incorrect number of labels
    return X, y

X, y = load_data()

# Checking shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Ensure the number of samples match
if X.shape[0] != y.shape[0]:
    raise ValueError("Mismatch in the number of samples between features and labels.")

# Correcting a hypothetical error (for illustration)
# Let's assume the correct labels are not loaded properly
y = np.random.randint(2, size=(X.shape[0],))  # Correcting the shape by creating a new 'y'

# Now split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Continue with model training, etc.

# Fit the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)

# Output the training history to check for accuracy and loss improvements
print(history.history)


Shape of X: (5671, 1, 59, 7)
Shape of y: (424,)


ValueError: Mismatch in the number of samples between features and labels.

Unnamed: 0,geneId,geneName,mirna,species,cell_line,tissue,category,method,positive_negative,direct_indirect,...,Sequence,Sequence_structure,encoded_sequence,encoded_structure,padded_sequences,padded_structures,sequence_windows,structure_windows,integrated_data,labels
0,ENSG00000002079,MYH16,hsa-miR-4786-3p,Homo sapiens,293S,Kidney,Embryonic/Fetal,HITS-CLIP,POSITIVE,DIRECT,...,CTGAACAGCCAGCCCAGTATGCCAAGGTCTTCTCTGCTTGGACTTA...,........[[.(([[[........[[[[[[[[[[[(([[[[((((....,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], ...","[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0,...","[[[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0...",0
1,ENSG00000067601,PMS2P4,hsa-miR-24-3p,Homo sapiens,,Brain,Normal/Primary,HITS-CLIP,POSITIVE,DIRECT,...,TACAGAACCTGCTAAGGCCATCAAACCTATCGATCGGAAGTCAGTC...,.......(((....)))..........................[[....,"[[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], ...","[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0,...","[[[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 0...",0
2,ENSG00000073905,VDAC1P1,hsa-miR-1179,Homo sapiens,293S,Kidney,Embryonic/Fetal,HITS-CLIP,POSITIVE,DIRECT,...,ATGGCTGTGCCACCTACGTATGCTGATCTTGGCAAATCTGCCAGGG...,..((((((((.............((..(([[[((....))[[[))....,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, ...","[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, ...","[[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], ...","[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0,...","[[[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0...",0
3,ENSG00000078319,PMS2P1,hsa-miR-130a-3p,Homo sapiens,MCF7,Mammary Gland,Cancer/Malignant,HITS-CLIP,POSITIVE,DIRECT,...,TTGGAGCGAGCTGAGAGCTCGAGTACAGAACCTGCTAAGGCCATCA...,...(((.((((..((((([[.......................((....,"[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], ...","[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0,...","[[[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0...",0
4,ENSG00000080947,CROCCP3,hsa-miR-1224-3p,Homo sapiens,Beta Cells,Pancreas,Normal/Primary,HITS-CLIP,POSITIVE,DIRECT,...,CAGAGCCTGGAGTTGCAGAGGCAGCTACAGGAGGAGCAGGCCTCCT...,..((.((((.((((((....)))))).))))...........[[[....,"[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, ...","[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, ...","[[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], ...","[[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0,...","[[[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0...",0
