In [1]:
# Essential imports
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler


# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, models

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model,load_model,save_model
from keras import layers
from tensorflow.keras.layers import Layer


# Drive mounting
from google.colab import drive
drive.mount('/content/drive', force_remount=True)



files = !ls "/content/drive/My Drive/Data_Final_Code"
for file in files:
  print(file)

pssm_path = '/content/drive/My Drive/Data_Final_Code/PSSM-Data'
RBP_path = '/content/drive/My Drive/Data_Final_Code/RBP-Data/'
# RBP_path = '/content/drive/My Drive/Data_Final_Code/RBP-Data-Sample/'



miRNA_path = '/content/drive/My Drive/Data_Final_Code/miRNA-DATA/clean_miRNA_no_duplicates.txt'
# miRNA_path = '/content/drive/My Drive/Data_Final_Code/miRNA-DATA/miR-223.txt'
# miRNA_path ='/content/drive/My Drive/Data_Final_Code/miRNA-DATA/clean_miRNA_no_duplicates_rna.txt'
# miRNA_path ='/content/drive/My Drive/Data_Final_Code/miRNA-DATA/reversed_miRNA_sequences.txt'

num_files = len(os.listdir(pssm_path))

print(f"Number of files in {pssm_path}: {num_files}")
pssm_path = '/content/drive/My Drive/Data_Final_Code/PSSM-Data'
RBP_path = '/content/drive/My Drive/Data_Final_Code/RBP-Data/'
# RBP_path = '/content/drive/My Drive/Data_Final_Code/RBP-Data-Sample/'
# miRNA_path = '/content/drive/My Drive/Data_Final_Code/miRNA-DATA/clean_miRNA_no_duplicates.txt'
miRNA_path = '/content/drive/My Drive/Data_Final_Code/miRNA-DATA/miR-223.txt'


#Read and Process RBP

Mounted at /content/drive
All_proteins	       mirna_model.h.May15	 model_with_RBP_miRNA.ipynb  split_files2.zip
destination.zip        mirna_model.h.May18	 model_with_RBP_pssm.ipynb   Test
githup		       mirna_model.h.May18.TEST  PSSM-Data		     TEST-Similarity.ipynb
miRNA-DATA	       mirna_model.h.May20	 RBP-Data		     Untitled1.ipynb
mirna_model.h2024-185  mirna_model.h.May21	 RBP-Data-Sample
mirna_model.h2024-old  mirna_model.h.TEST	 rna_model.h1
mirna_model.h.Jun08    mirna_model.N2024	 rna_model.h5
Number of files in /content/drive/My Drive/Data_Final_Code/PSSM-Data: 498


In [2]:
#Read and Process RBP

def process_sequences(sequences):
    if not sequences:  # Check if the sequences list is empty
          return np.array([]), set()  # Return an empty array and set if no sequences to process
    unique_characters = set(''.join(sequences))
    letter2number = {l: i for i, l in enumerate(unique_characters, start=1)}
    processed_seqs = [[letter2number[char] for char in seq] for seq in sequences]
    return pad_sequences(processed_seqs, padding='post'), unique_characters

def clean_data(sequences, labels):
    cleaned_sequences = []
    cleaned_labels = []
    for seq, label in zip(sequences, labels):
        if pd.isna(seq) or pd.isna(label):
            # print("Warning: Missing sequence or label")
            continue
        try:
            cleaned_label = int(float(label))
            if cleaned_label in [0, 1]:
                cleaned_sequences.append(seq)
                cleaned_labels.append(cleaned_label)
            else:
                print(f"Warning: Label not 0 or 1 encountered: '{label}'")
        except ValueError:
            print(f"Warning: Non-numeric label encountered: '{label}'")
    return cleaned_sequences, cleaned_labels


def read_and_process_data(file_path):
    data = pd.read_csv(file_path, sep='\s+', names=['sequence', 'label'])
    sequences, labels = clean_data(data['sequence'].tolist(), data['label'].tolist())
    processed_sequences, unique_characters = process_sequences(sequences)
    labels = np.array(labels).astype(np.float32)
    return processed_sequences, labels, unique_characters

pssm_data_pca_dict = {
    "SampleProtein1": [0.1, 0.2, 0.3],
    "SampleProtein2": [0.4, 0.5, 0.6]
}
def process_all_files(directory_path, pssm_data_pca_dict):
    data_dict = {}
    data_dict_nopssm = {}
    global_unique_characters = set()
    added_items_count = 0
    added_items_nopssm_count = 0  # Counter for items added to data_dict_nopssm

    for file_name in os.listdir(directory_path):
        if file_name.endswith('.fa'):
            rbp_name = file_name.split('.')[0]
            file_path = os.path.join(directory_path, file_name)
            sequences, labels, unique_characters = read_and_process_data(file_path)

            # Attempt to find an exact or similar key
            # key_to_use   = rbp_name if rbp_name in pssm_data_pca_dict else find_similar_key(rbp_name, pssm_data_pca_dict.keys())
            key_to_use = False

            if key_to_use:
                data_dict[rbp_name] = {
                    'sequences': sequences,
                    'pssm_data': pssm_data_pca_dict[key_to_use],
                    'labels': labels
                }
                added_items_count += 1
            else:
                data_dict_nopssm[rbp_name] = {
                    'sequences': sequences,
                    'labels': labels
                }
                added_items_nopssm_count += 1  # Increment counter
                # Print the information for items added to data_dict_nopssm
                print(f"Added to data_dict_nopssm: {rbp_name}, Total Items in data_dict_nopssm: {added_items_nopssm_count}")

            global_unique_characters.update(unique_characters)
            # print(f"Finished processing {file_name}")

    print(f"Items added from pssm_data_pca_dict to data_dict: {added_items_count}")
    print(f"Total items added to data_dict_nopssm: {added_items_nopssm_count}")  # Final count for data_dict_nopssm
    return data_dict, data_dict_nopssm, global_unique_characters

rbp_data_dict, rbp_data_dict_nopssm, global_unique_characters = process_all_files(RBP_path, pssm_data_pca_dict)



Added to data_dict_nopssm: SMNDC1, Total Items in data_dict_nopssm: 1
Added to data_dict_nopssm: EIF3D, Total Items in data_dict_nopssm: 2
Added to data_dict_nopssm: NSUN2, Total Items in data_dict_nopssm: 3
Added to data_dict_nopssm: CPEB4, Total Items in data_dict_nopssm: 4
Added to data_dict_nopssm: BCLAF1, Total Items in data_dict_nopssm: 5
Added to data_dict_nopssm: HNRNPL, Total Items in data_dict_nopssm: 6
Added to data_dict_nopssm: EIF3G, Total Items in data_dict_nopssm: 7
Added to data_dict_nopssm: BCCIP, Total Items in data_dict_nopssm: 8
Added to data_dict_nopssm: AKAP8L, Total Items in data_dict_nopssm: 9
Added to data_dict_nopssm: METAP2, Total Items in data_dict_nopssm: 10
Added to data_dict_nopssm: DDX24, Total Items in data_dict_nopssm: 11
Added to data_dict_nopssm: AKAP1, Total Items in data_dict_nopssm: 12
Added to data_dict_nopssm: UTP18, Total Items in data_dict_nopssm: 13
Added to data_dict_nopssm: EIF4G2, Total Items in data_dict_nopssm: 14
Added to data_dict_nops

In [None]:
pssm_data_pca_dict = {
    "SampleProtein1": [0.1, 0.2, 0.3],
    "SampleProtein2": [0.4, 0.5, 0.6]
}
def find_similar_key(target, keys):
    # Simple heuristic: find a key with the maximum overlap in characters
    # This is a very basic form of similarity and might need adjustment
    similar_key = max(keys, key=lambda k: len(set(k) & set(target)))
    # Define a threshold for similarity, for example, at least half the characters match
    threshold = len(target) // 2
    if len(set(similar_key) & set(target)) >= threshold:
        return similar_key
    return None

def process_all_files(directory_path, pssm_data_pca_dict):
    data_dict = {}
    data_dict_nopssm = {}
    global_unique_characters = set()
    added_items_count = 0

    for file_name in os.listdir(directory_path):
        if file_name.endswith('.fa'):
            rbp_name = file_name.split('.')[0]
            file_path = os.path.join(directory_path, file_name)
            sequences, labels, unique_characters = read_and_process_data(file_path)

            # Attempt to find an exact or similar key
            # key_to_use   = rbp_name if rbp_name in pssm_data_pca_dict else find_similar_key(rbp_name, pssm_data_pca_dict.keys())
            key_to_use = False

            # Check if rbp_name and key_to_use are not the same, then print
            # if rbp_name != key_to_use:
                # print(f"RBP Name: {rbp_name}, Key to Use: {key_to_use}")

            if key_to_use:
                data_dict[rbp_name] = {
                    'sequences': sequences,
                    'pssm_data': pssm_data_pca_dict[key_to_use],
                    'labels': labels
                }
                added_items_count += 1
            else:
                data_dict_nopssm[rbp_name] = {
                    'sequences': sequences,
                    'labels': labels
                }

            global_unique_characters.update(unique_characters)
            print(f"Finished processing {file_name}")

    print(f"Items added from pssm_data_pca_dict to data_dict: {added_items_count}")
    return data_dict, data_dict_nopssm, global_unique_characters

rbp_data_dict, rbp_data_dict_nopssm, global_unique_characters = process_all_files(RBP_path, pssm_data_pca_dict)


import numpy as np
import os

class PSSMProcessor:
    def __init__(self, path):
        self.path = path
        self.final_array = None

    def load_and_adjust_pssms(self):
        def load_pssm(file):
            alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
            with open(file, 'r') as f:
                lines = f.readlines()
            pssm = []
            for line in lines[3:-6]:
                pssm.append([int(x) for x in line.split()[2:22]])
            return np.array(pssm)

        pssm_files = [os.path.join(self.path, f) for f in os.listdir(self.path) if f.endswith('.pssm')]
        pssm_data = []

        for file in pssm_files:
            pssm_matrix = load_pssm(file)
            if pssm_matrix.size == 0:
                print(f"File with zero-size PSSM matrix: {file}")
            else:
                filename = os.path.basename(file)
                pssm_data.append((filename, pssm_matrix))

        min_length, max_length, average_length = self.adjust_pssm_to_average_length(pssm_data)
        print(f"Min length: {min_length}, Max length: {max_length}, Average length: {average_length}")
        # Initialize pssm_dict here, right before filling it
        pssm_dict = {}
        for item in self.final_array:
            if "-" in item[0]:
                key_part = item[0].split('-')[0]
            else:
                key_part = item[0].split('_')[0]
            pssm_data = item[1]
            if len(pssm_data) == 0:
                print(f"Empty PSSM data for file: {item[0]}, Data: {item[1]}")
            else:
                pssm_dict[key_part] = pssm_data

        return pssm_dict  # Ensure this return statement is included

    def adjust_pssm_to_average_length(self, pssm_data):
        lengths = [pssm.shape[0] for _, pssm in pssm_data]
        min_length = min(lengths)
        max_length = max(lengths)
        average_length = sum(lengths) // len(lengths)

        adjusted_pssms = []
        for filename, pssm in pssm_data:
            if pssm.shape[0] > average_length:
                adjusted_pssm = pssm[:average_length]
            elif pssm.shape[0] < average_length:
                repeat_times = average_length // pssm.shape[0] + 1
                extended_pssm = np.tile(pssm, (repeat_times, 1))[:average_length]
                adjusted_pssm = extended_pssm
            else:
                adjusted_pssm = pssm
            adjusted_pssms.append((filename, adjusted_pssm.flatten()))

        self.final_array = np.array(adjusted_pssms, dtype=object)
        return min_length, max_length, average_length


processor = PSSMProcessor(pssm_path)
pssm_data_dict =processor.load_and_adjust_pssms()
# pssm_data_dict = processor.generate_random_rows(300)
print(f"Number of entries in PSSM data dictionary: {len(pssm_data_dict)}")

# Print the first few entries to verify
for key in list(pssm_data_dict.keys())[:10]:  # Adjust the number to print as needed
    print(f"Key: {key}, Length of PSSM row vector: {len(pssm_data_dict[key])}")
    # print(f"Key: {key}, Length of PSSM row vector: {len(pssm_data_dict[key])}, PSSM DATA: {pssm_data_dict[key]}")




Finished processing SMNDC1.both.fa
Finished processing EIF3D.both.fa
Finished processing NSUN2.both.fa
Finished processing CPEB4.both.fa
Finished processing BCLAF1.both.fa
Finished processing HNRNPL.both.fa
Finished processing EIF3G.both.fa
Finished processing BCCIP.both.fa
Finished processing AKAP8L.both.fa
Finished processing METAP2.both.fa
Finished processing DDX24.both.fa
Finished processing AKAP1.both.fa
Finished processing UTP18.both.fa
Finished processing EIF4G2.both.fa
Finished processing UCHL5.both.fa
Finished processing LSM11.both.fa
Finished processing PTBP1.both.fa
Finished processing IGF2BP3.both.fa
Finished processing DDX42.both.fa
Finished processing SND1.both.fa
Finished processing SRSF7.both.fa
Finished processing GPKOW.both.fa
Finished processing RBFOX2.both.fa
Finished processing SAFB2.both.fa
Finished processing STAU2.both.fa
Finished processing TAF15.both.fa
Finished processing ZC3H8.both.fa
Finished processing SF3B4.both.fa
Finished processing XRN2.both.fa
Finishe

In [None]:
processor = PSSMProcessor(pssm_path)
pssm_data_dict =processor.load_and_adjust_pssms()
# pssm_data_dict = processor.generate_random_rows(300)
print(f"Number of entries in PSSM data dictionary: {len(pssm_data_dict)}")

# Print the first few entries to verify
for key in list(pssm_data_dict.keys())[:10]:  # Adjust the number to print as needed
    print(f"Key: {key}, Length of PSSM row vector: {len(pssm_data_dict[key])}")
    # print(f"Key: {key}, Length of PSSM row vector: {len(pssm_data_dict[key])}, PSSM DATA: {pssm_data_dict[key]}")

# prompt: applay PCA to df and give anaysis of elbow curve for optimaration

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Apply PCA to the data
pca = PCA()
pca.fit(df)

# Plot the elbow curve
plt.plot(range(1, len(df.columns) + 1), pca.explained_variance_ratio_, marker='o')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Elbow Curve for PSSM')
plt.show()

# Analyze the elbow curve to determine the optimal number of principal components to retain
# Look for the point where the explained variance ratio starts to level off
# In this case, it appears that retaining around 100 principal components would capture most of the variance in the data

# Perform PCA with the chosen number of components

pca = PCA(n_components=25)
df_pca = pca.fit_transform(df)

# Print the transformed data
print(df_pca[0])
type(df_pca)

import pandas as pd
from sklearn.decomposition import PCA

# Assuming pssm_data_dict is already filled with your data
# Convert the dictionary values (which are the PSSM arrays) into a list of lists
data = [value for key, value in pssm_data_dict.items()]

# Create the DataFrame from this data
df = pd.DataFrame(data)
# Assuming 'num_files' is defined; if not, replace df.columns[:num_files] with appropriate slicing or remove if unnecessary
# df = df[df.columns[:num_files]] # Uncomment if num_files is defined and you need to slice the DataFrame

# Rename the columns as specified
df.columns = ['x' + str(col) for col in df.columns]
# print(df.head())

# Apply PCA to the data
pca = PCA(n_components=25)  # Directly initializing PCA with n_components=25
df_pca = pca.fit_transform(df)


# Print the explained variance ratio
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
# Creating a new dictionary with the same keys and the PCA-transformed data as values
keys = list(pssm_data_dict.keys())
pssm_data_pca_dict = {keys[i]: df_pca[i] for i in range(len(keys))}

# Now, pssm_data_pca_dict contains the original keys and the PCA-transformed data
print(f"Number of entries in PSSM data dictionary: {len(pssm_data_pca_dict)}")

# Print the first few entries to verify
for key in list(pssm_data_pca_dict.keys())[:1]:  # Adjust the number to print as needed
    print(f"Key: {key}, Length of PSSM row vector: {len(pssm_data_pca_dict[key])}, PSSM DATA: {pssm_data_pca_dict[key]}")

In [None]:
# Function to normalize data and train an autoencoder
def train_autoencoder(data, encoding_dim):
    # Scale the data to range [-1, 1]
    scaler = MinMaxScaler(feature_range=(-1, 1))
    data_normalized = scaler.fit_transform(data)

    # Define the input layer
    input_img = tf.keras.Input(shape=(data_normalized.shape[1],))
    # Encoder layers
    encoded = layers.Dense(encoding_dim * 2, activation='relu')(input_img)
    encoded = layers.Dense(encoding_dim, activation='relu')(encoded)
    # Decoder layers
    decoded = layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    decoded = layers.Dense(data_normalized.shape[1], activation='tanh')(decoded)  # Using tanh activation

    # Autoencoder model
    autoencoder = models.Model(input_img, decoded)
    # Encoder model
    encoder = models.Model(input_img, encoded)

    # Compile the autoencoder
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    # Train the autoencoder
    autoencoder.fit(data_normalized, data_normalized,
                    epochs=100,
                    batch_size=256,
                    shuffle=True,
                    validation_split=0.2)

    # Use the encoder to reduce the dimensionality of the data
    reduced_data = encoder.predict(data_normalized)

    return reduced_data, encoder, autoencoder, scaler



# Assuming pssm_data_dict is your dictionary of PSSM data
keys = list(pssm_data_dict.keys())
pssm_values = np.array(list(pssm_data_dict.values()))

# Assuming all PSSM vectors are of the same length, otherwise you'll need to adjust them
# Determine the desired encoding dimension
encoding_dim = int(pssm_values.shape[1] * 0.2)  # Example: 20% of the input size

# Train the autoencoder and get the reduced data
reduced_data, encoder, autoencoder, scaler = train_autoencoder(pssm_values, encoding_dim)

# Map the reduced data back to the protein names
reduced_data_dict = {key: reduced_data[i] for i, key in enumerate(keys)}


# Example: Print the first few entries to verify
# for key in list(reduced_data_dict.keys())[:5]:
#     print(f"Protein: {key}, Reduced PSSM Data: {reduced_data_dict[key]}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_and_visualize(model, X_test_seq, X_test_pssm, y_test, protein_name):
    # Generate predictions
    test_pred = (model.predict([X_test_seq, X_test_pssm]) > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred, zero_division=0)
    recall = recall_score(y_test, test_pred, zero_division=0)
    f1 = f1_score(y_test, test_pred, zero_division=0)

    # Print metrics
    print(f"\nResults for {protein_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Plot confusion matrix
    conf_matrix = confusion_matrix(y_test, test_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {protein_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

def plot_training_history(history, title=''):
    # Plot accuracy
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Model Accuracy for {title}')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(loc='lower right')

    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Model Loss for {title}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

class Metrics(Callback):
    def on_epoch_end(self, epoch, logs=None):
        train_pred = (self.model.predict(X_train) > 0.5).astype(int)
        test_pred = (self.model.predict(X_test) > 0.5).astype(int)
        metrics = {
            "Training": (y_train, train_pred),
            "Testing": (y_test, test_pred),
        }
        for key, (y, pred) in metrics.items():
            accuracy = accuracy_score(y, pred)
            precision = precision_score(y, pred, zero_division=0)
            recall = recall_score(y, pred, zero_division=0)
            f1 = f1_score(y, pred, zero_division=0)
            print(f"\n{key} - Epoch: {epoch+1}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

In [None]:
class Attention(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


def create_model_and_embedding_function(max_length, unique_characters, embedding_dim=128, lstm_units=512, dropout_rate=0.5, learning_rate=0.0001):
    sequence_input = Input(shape=(max_length,), name='sequences')
    x = Embedding(input_dim=len(unique_characters) + 1, output_dim=embedding_dim, input_length=max_length)(sequence_input)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Dropout(dropout_rate)(x)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Dropout(dropout_rate)(x)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Dropout(dropout_rate)(x)
    x = Attention()(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(64, activation='relu')(x)
    outputs = Dropout(dropout_rate)(outputs)
    final_output = Dense(1, activation='sigmoid')(outputs)

    model = Model(inputs=[sequence_input], outputs=final_output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate), metrics=['accuracy'])

    # Define a function to get embeddings from the LSTM layer
    embedding_model = Model(inputs=[sequence_input], outputs=x_lstm)

    return model, embedding_model

#Save Model

def get_callbacks(model_filepath):
  callbacks = [
    # EarlyStopping(monitor="val_loss", patience=30),
    ModelCheckpoint(
        filepath=model_filepath,
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        period=1)
    ]

  return callbacks

model_filepath = "/content/drive/My Drive/Data_Final_Code/mirna_model.h.Jun12"
callbacks = get_callbacks(model_filepath)

#Create train and test

from sklearn.model_selection import train_test_split
import numpy as np

# Placeholder for combined data
all_sequences = []
all_labels = []

# Combine data from all keys
for key in rbp_data_dict_nopssm:
    all_sequences.extend(rbp_data_dict_nopssm[key]['sequences'])
    all_labels.extend(rbp_data_dict_nopssm[key]['labels'])

# Convert lists to NumPy arrays
all_sequences = np.array(all_sequences)
all_labels = np.array(all_labels)

# Split into training and test sets
X_train_seq, X_test_seq, y_train, y_test = train_test_split(
    all_sequences, all_labels, test_size=0.1, random_state=42
)
# Print shapes to verify the split
print(f'Training sequences shape: {X_train_seq.shape}')
print(f'Test sequences shape: {X_test_seq.shape}')
print(f'Training labels shape: {y_train.shape}')
print(f'Test labels shape: {y_test.shape}')



# Calculate max_length
max_length = max(len(seq) for key in rbp_data_dict_nopssm for seq in rbp_data_dict_nopssm[key]['sequences'])

print(f"max_length: {max_length}, num_unique_characters: {global_unique_characters}")

# Assuming create_model_with_pssm function is defined as in your snippet
model, embedding_model = create_model_and_embedding_function(max_length, global_unique_characters)
model.summary()


# Save model summary to a file
import sys
from contextlib import redirect_stdout

summary_path = '/content/drive/My Drive/model_summary.txt'

with open(summary_path, 'w') as f:
    with redirect_stdout(f):
        model.summary()




history = model.fit([X_train_seq], y_train, validation_data=([X_test_seq], y_test), batch_size=64, epochs=50,callbacks=callbacks)
# history = model.fit([X_train_seq], y_train, validation_data=([X_test_seq], y_test), batch_size=32, epochs=35,callbacks=callbacks)


import pandas as pd

history_df = pd.DataFrame(history.history)
history_path = '/content/drive/My Drive/training_history.csv'
history_df.to_csv(history_path, index=False)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
import tensorflow as tf
import pandas as pd



# Calculate max_length
max_length = max(len(seq) for key in rbp_data_dict_nopssm for seq in rbp_data_dict_nopssm[key]['sequences'])

print(f"max_length: {max_length}, num_unique_characters: {global_unique_characters}")

# Assuming create_model_with_pssm function is defined as in your snippet
model, embedding_model = create_model_and_embedding_function(max_length, global_unique_characters)
model.summary()


# Save model summary to a file
import sys
from contextlib import redirect_stdout

summary_path = '/content/drive/My Drive/model_summary.txt'

with open(summary_path, 'w') as f:
    with redirect_stdout(f):
        model.summary()



# Define the checkpoint and logger paths
checkpoint_path = '/content/drive/My Drive/model_checkpoints/cp-{epoch:04d}.ckpt'
history_path = '/content/drive/My Drive/training_history.csv'

# Define the checkpoint callback
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1
)

# Define a CSV logger to save training history
csv_logger = CSVLogger(history_path, append=True)

# Create the model
model, embedding_model = create_model_and_embedding_function(max_length, global_unique_characters)

# Load the latest checkpoint if available
latest_checkpoint = tf.train.latest_checkpoint('/content/drive/My Drive/model_checkpoints')
if latest_checkpoint:
    model.load_weights(latest_checkpoint)
    print(f"Resuming training from checkpoint: {latest_checkpoint}")

# Get the initial epoch from the latest checkpoint
initial_epoch = int(latest_checkpoint.split('-')[-1].split('.')[0]) if latest_checkpoint else 0

# Train the model
history = model.fit(
    [X_train_seq],
    y_train,
    validation_data=([X_test_seq], y_test),
    batch_size=64,
    epochs=50,
    initial_epoch=initial_epoch,
    callbacks=[checkpoint_callback, csv_logger]
)

# Manually save the model if needed
model.save('/content/drive/My Drive/model.h500')


In [None]:
def evaluate_and_visualize(model, X_seq, y, dataset_name):
    # Generate predictions
    predictions = (model.predict([X_seq]) > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y, predictions)
    precision = precision_score(y, predictions, zero_division=0)
    recall = recall_score(y, predictions, zero_division=0)
    f1 = f1_score(y, predictions, zero_division=0)

    # Print metrics
    print(f"\nResults for {dataset_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Plot confusion matrix
    conf_matrix = confusion_matrix(y, predictions)
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {dataset_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

# Evaluate and visualize for training dataset
evaluate_and_visualize(model, X_train_seq, y_train, "Training Set")

# Evaluate and visualize for test dataset
evaluate_and_visualize(model, X_test_seq, y_test, "Test Set")

plot_training_history(history, title='Model Performance')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_and_visualize_per_protein(model, X_test_seq, X_test_pssm, y_test, protein_name):
    predictions = (model.predict([X_test_seq, X_test_pssm]) > 0.5).astype(int)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)
    f1 = f1_score(y_test, predictions, zero_division=0)

    print(f"Results for {protein_name}:")
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

    conf_matrix = confusion_matrix(y_test, predictions)
    sns.heatmap(conf_matrix, annot=True, fmt='d')
    plt.title(f'Confusion Matrix for {protein_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
from sklearn.model_selection import train_test_split

# Assuming rbp_data_dict_nopssm is your dataset organized per protein
for protein_name, protein_data in rbp_data_dict_nopssm.items():
    # Prepare the data for the current protein
    sequences = protein_data['sequences']
    labels = protein_data['labels']

    sequences = np.array(sequences)
    labels = np.array(labels)


    # Splitting the data into training and test sets for this specific protein
    X_train_seq, X_test_seq, y_train, y_test = train_test_split(
        sequences, labels, test_size=0.2, random_state=42
    )
    print(f"X_train_seq Sahpe: {X_train_seq.shape}:   y_train: {y_train.shape}")
    print(f"X_test_seq Sahpe:  {X_test_seq.shape}:     y_test: {y_test.shape}" )


    # Assuming you have a function to create the model
    model = create_model_with_pssm(max_length, global_unique_characters)
    # Train the model for the current protein
    # Fit the model on the training data
    history = model.fit([X_train_seq], y_train,
                        validation_data=([X_test_seq], y_test),
                        batch_size=32, epochs=1)

    # Visualize the training process
    plot_training_history(history, title=f'Model Performance for {protein_name}')

    # Evaluate and visualize for the current protein
    evaluate_and_visualize_per_protein(model, X_test_seq, y_test, protein_name)


In [None]:
import random

def sample_positive_sequences(rbp_data_dict_nopssm, sample_size=1000):

    rbp_data_dict_sampled = {}

    for protein_name, data in rbp_data_dict_nopssm.items():
        sequences = data['sequences']
        labels = data['labels']

        # Combine sequences and labels into a single list of tuples
        combined_data = list(zip(sequences, labels))

        # Filter to only include sequences with label 1
        positive_samples = [item for item in combined_data if item[1] == 1]

        # Randomly select up to `sample_size` samples
        num_samples_to_select = min(sample_size, len(positive_samples))
        sampled_positive_sequences = random.sample(positive_samples, num_samples_to_select)

        # Unzip the sequences and labels back into separate lists
        sampled_sequences, sampled_labels = zip(*sampled_positive_sequences) if sampled_positive_sequences else ([], [])

        # Add the sampled data to the new dictionary
        rbp_data_dict_sampled[protein_name] = {
            'sequences': list(sampled_sequences),
            'labels': list(sampled_labels)
        }

    return rbp_data_dict_sampled

# Example usage:
# rbp_data_dict_nopssm is your existing dictionary
sampled_rna_data_dict = sample_positive_sequences(rbp_data_dict_nopssm, 1000)
import random

def sample_positive_sequences(rbp_data_dict_nopssm, sample_size=1000):

    rbp_data_dict_sampled = {}

    for protein_name, data in rbp_data_dict_nopssm.items():
        sequences = data['sequences']
        labels = data['labels']

        # Combine sequences and labels into a single list of tuples
        combined_data = list(zip(sequences, labels))

        # Filter to only include sequences with label 1
        positive_samples = [item for item in combined_data if item[1] == 1]

        # Randomly select up to `sample_size` samples
        num_samples_to_select = min(sample_size, len(positive_samples))
        sampled_positive_sequences = random.sample(positive_samples, num_samples_to_select)

        # Unzip the sequences and labels back into separate lists
        sampled_sequences, sampled_labels = zip(*sampled_positive_sequences) if sampled_positive_sequences else ([], [])

        # Add the sampled data to the new dictionary
        rbp_data_dict_sampled[protein_name] = {
            'sequences': list(sampled_sequences),
            'labels': list(sampled_labels)
        }

    return rbp_data_dict_sampled

# Example usage:
# rbp_data_dict_nopssm is your existing dictionary
sampled_rna_data_dict = sample_positive_sequences(rbp_data_dict_nopssm, 1000)
sampled_rna_data_dict['AATF']['sequences'][0].shape

#Load model
from tensorflow.keras.models import Model
## load model
model_filepath = '/content/drive/My Drive/model.h500'
rna_model = load_model(model_filepath, custom_objects={'Attention': Attention})




for i, layer in enumerate(rna_model.layers):
    print(i, layer.name, layer.output_shape)


embedding_layer_name = 'attention'  # or whatever the correct name is, based on the printed names
embedding_layer_output = rna_model.get_layer(embedding_layer_name).output

# # Create a new model for embeddings
embedding_model = Model(inputs=rna_model.input, outputs=embedding_layer_output)

from tensorflow.keras.models import Model
## load model
model_filepath = '/content/drive/My Drive/model.h500'
checkpoint_path = '/content/drive/My Drive/model_checkpoints'
rna_model = load_model(model_filepath, custom_objects={'Attention': Attention})

# If you need to load the weights from the latest checkpoint
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
if latest_checkpoint:
    rna_model.load_weights(latest_checkpoint)
    print(f"Loaded weights from checkpoint: {latest_checkpoint}")

# Verify the model structure
rna_model.summary()


for i, layer in enumerate(rna_model.layers):
    print(i, layer.name, layer.output_shape)


# or whatever the correct name is, based on the printed names
embedding_layer_output = rna_model.get_layer('attention_1').output

# # Create a new model for embeddings
embedding_model = Model(inputs=rna_model.input, outputs=embedding_layer_output)



In [None]:
def get_embeddings_and_probabilities(sampled_data_dict, model, embedding_model, max_length=101):
    embedding_rna_samples_dict = {}

    for key, data in sampled_data_dict.items():
        sequences = np.array(data['sequences'])

        # Check if reshaping is needed based on your model's input requirements
        # processed_sequences = sequences  # Assume sequences are already in the correct shape
        # Pad sequences to match the input length expected by the model
        processed_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post', dtype='int32')

        # Get embeddings
        embeddings = embedding_model.predict(processed_sequences)

        # Get probabilities
        probabilities = model.predict(processed_sequences).flatten()

        embedding_rna_samples_dict[key] = {
            'embeddings': embeddings,
            'probabilities': probabilities
        }

    return embedding_rna_samples_dict

# Now use the function with your loaded model and embedding model
embedding_rna_samples_dict = get_embeddings_and_probabilities(
    sampled_rna_data_dict, rna_model, embedding_model
)
for key, value in embedding_rna_samples_dict.items():
    print(f"Protein: {key}")
    embeddings = value['embeddings']
    probabilities = value['probabilities']
    print(f"Embeddings Shape: {embeddings.shape}")
    print(f"First Embedding Sample: {embeddings[0]}")
    print(f"Probabilities Shape: {probabilities.shape}")
    print(f"First Probability Sample: {probabilities[0]}")
    print("\n")



In [None]:
import numpy as np

def sum_embeddings(embedding_dict):
    # Initialize a new dictionary to store the results
    summed_embeddings_dict = {}

    # Iterate over each key in the input dictionary
    for key, value in embedding_dict.items():
        print(f"Processing Protein: {key}")
        embeddings = value['embeddings']

        # Sum embeddings along axis 1
        summed_embeddings = np.sum(embeddings, axis=1)

        # Save the summed embeddings in the new dictionary
        summed_embeddings_dict[key] = {
            'summed_embeddings': summed_embeddings
        }

        # Print the shape to verify
        print(f"Summed Embeddings Shape: {summed_embeddings.shape}")

    return summed_embeddings_dict

# Call the function with the example input
summed_embeddings_dict = sum_embeddings(embedding_rna_samples_dict)

# Check the results
for key, value in summed_embeddings_dict.items():
    print(f"Protein: {key}")
    summed_embeddings = value['summed_embeddings']
    print(f"Summed Embeddings Shape: {summed_embeddings.shape}")
summed_embeddings_dict['AATF']['summed_embeddings'][0].shape

In [None]:
# Check if 'AKAP8L' exists in final_data
if 'AKAP8L' in embedding_rna_samples_dict:
    bclaf1_data = embedding_rna_samples_dict['AKAP8L']  # Retrieve data for 'BCLAF1'
    embeddings = bclaf1_data['embeddings']
    probabilities = bclaf1_data['probabilities']

    # Ensure there's data to print
    num_rows_to_print = min(5, len(embeddings))  # Print for 2 rows or fewer if not enough data

    for i in range(num_rows_to_print):
        print(f"Row {i+1} for 'BCLAF1':")
        print("embeddings:", embeddings[i])
        print("probabilities:", probabilities[i])
        print()  # Print a newline for better readability between rows
else:
    print("Key 'BCLAF1' not found in final_data.")

In [None]:


# Read the file into a pandas DataFrame
df = pd.read_csv(miRNA_path, sep='\t')

# Now df contains your data and you can work with it as needed
# For example, to display the first few rows of the DataFrame, you can use:
print(df.head(20))

# If you need to use the DataFrame in other parts of your code, it's now stored in the variable 'df'
miRNA_sequences, miRNA_labels, miRNA_unique_characters = read_and_process_data(miRNA_path)


# Creating the dictionary as specified by the user
protein_name = "AGO"
miRNA_data_dict_sampled = {
    protein_name: {
        'sequences': miRNA_sequences.tolist(),  # Converting the numpy array to a list
        'labels': miRNA_labels.tolist()  # Converting the numpy array to a list
    }
}

# Confirming the keys and types to ensure it's correctly structured
structure_confirmation = {protein_name: {'sequences_type': type(miRNA_data_dict_sampled[protein_name]['sequences']),
                                         'labels_type': type(miRNA_data_dict_sampled[protein_name]['labels'])}}

structure_confirmation

for key in list(miRNA_data_dict_sampled.keys())[:5]:  # Adjust the number to print as needed
    print(f"Key: {key}, Length of data: {len(miRNA_data_dict_sampled[key]['sequences'])}")

# Now use the function with your loaded model and embedding model
embedding_miRNA_dict = get_embeddings_and_probabilities(
    miRNA_data_dict_sampled, rna_model, embedding_model
)


for key, value in embedding_miRNA_dict.items():
    print(f"Protein: {key}")
    embeddings = value['embeddings']
    probabilities = value['probabilities']
    print(f"Embeddings Shape: {embeddings[1]}")
    print(f"First Embedding Sample: {embeddings[3]}")
    print(f"Probabilities Shape: {probabilities.shape}")
    print(f"First Probability Sample: {probabilities[1]}")
    print("\n")


In [None]:
#READ miRNA2 with using the differend MODEL
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# miRNA_path = '/content/drive/My Drive/Data_Final_Code/miRNA-DATA/complementary_miRNA_sequences_ago.txt'
# miRNA_path ='/content/drive/My Drive/Data_Final_Code/miRNA-DATA/miR-223.txt'
miRNA_path ='/content/drive/My Drive/Data_Final_Code/miRNA-DATA/let2-7d.txt'
# miRNA_path ='/content/drive/My Drive/Data_Final_Code/miRNA-DATA/complementary-let-7d.txt'

def process_miRNA_sequences(file_path, RBP_name, max_length=None):
    # Load miRNA sequences
    miRNA_df = pd.read_csv(file_path, sep='\t')

    # Tokenize the sequences
    tokenizer = Tokenizer(char_level=True)  # Use char_level for nucleotide sequences
    tokenizer.fit_on_texts(miRNA_df['sequence'])

    # Convert sequences to numerical format
    sequences = tokenizer.texts_to_sequences(miRNA_df['sequence'])

    # Determine max length if not specified
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)

    # Pad sequences to have the same length
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Calculate lengths of each sequence
    lengths = [len(seq) for seq in sequences]

    # # Labels
    # labels = miRNA_df['label'].values

    # Create dictionary
    miRNA_dic = {
        RBP_name: {
            'miRNA_Sequence': padded_sequences,
            'miRNA_length': lengths
        }
    }

    # Return the dictionary and other relevant information
    return miRNA_dic, max_length, tokenizer
miRNA_sequences_dic, max_length, tokenizer = process_miRNA_sequences(miRNA_path,"AGO",101)
miRNA_sequences_dic['AGO']['miRNA_length'][0]


def get_embeddings_and_probabilities_mirna(sequences, model, embedding_model):
    probabilities = model.predict(sequences).flatten()
    embeddings = embedding_model.predict(sequences)

    # Instead of returning a dictionary with generic keys, use 'AGO' as the key for your data
    miRNA_embedding_dict = {
        'AGO': {
            "embeddings": embeddings,
            "probabilities": probabilities
        }
    }

    return miRNA_embedding_dict

# Assuming miRNA_sequences, miRNA_labels, miRNA_model, and miRNA_embedding_model are already defined
embedding_miRNA_dict = get_embeddings_and_probabilities_mirna(miRNA_sequences_dic['AGO']['miRNA_Sequence'], rna_model, embedding_model)

embedding_miRNA_dict['AGO']['embeddings'].shape

import numpy as np

def sum_embeddings_mirna(embedding_dict, miRNA_dict):
    # Initialize a new dictionary to store the results
    summed_embeddings_dict = {}

    # Iterate over each key in the input dictionary
    for key in embedding_dict.keys():
        print(f"Processing Protein: {key}")
        embeddings = embedding_dict[key]['embeddings']

        # Get the corresponding miRNA lengths
        miRNA_lengths = miRNA_dict[key]['miRNA_length']

        # Initialize a list to store summed embeddings for this key
        summed_embeddings_list = []

        # Sum embeddings for each sequence using the correct length
        for i, miRNA_length in enumerate(miRNA_lengths):
            summed_embedding = np.sum(embeddings[i, :miRNA_length, :], axis=0)
            summed_embeddings_list.append(summed_embedding)

        # Convert the list to a numpy array
        summed_embeddings = np.array(summed_embeddings_list)

        # Save the summed embeddings in the new dictionary
        summed_embeddings_dict[key] = {
            'summed_embeddings': summed_embeddings
        }

        # Print the shape to verify
        print(f"Summed Embeddings Shape: {summed_embeddings.shape}")

    return summed_embeddings_dict


# import numpy as np

# def sum_embeddings_mirna(embedding_dict, miRNA_dict):
#     # Initialize a new dictionary to store the results
#     summed_embeddings_dict = {}
#     i=0

#     # Iterate over each key in the input dictionary
#     for key, value in embedding_dict.items():
#         print(f"Processing Protein: {key}")
#         embeddings = value['embeddings']

#         # Get the corresponding miRNA lengths
#         miRNA_lengths = miRNA_dict[key]['miRNA_length']
# #
#         # Initialize a list to store summed embeddings for this key
#         summed_embeddings_list = []
#         miRNA_length = miRNA_lengths[i]
#         print(f"miRNA_length: {miRNA_length}")
#         # Sum embeddings along axis 1
#         summed_embeddings = np.sum(embeddings[:, :miRNA_length, :], axis=1)

#         # Save the summed embeddings in the new dictionary
#         summed_embeddings_dict[key] = {
#             'summed_embeddings': summed_embeddings
#         }

#         # Print the shape to verify
#         print(f"Summed Embeddings Shape: {summed_embeddings.shape}")
#         i+=1

#     return summed_embeddings_dict

embeddings_dict_miRNA = sum_embeddings_mirna(embedding_miRNA_dict,miRNA_sequences_dic)

for key, value in embeddings_dict_miRNA.items():
    print(f"Protein: {key}")
    embeddings = value['summed_embeddings']
    print(f"Embeddings Shape: {embeddings.shape}")
    print(f"First Embedding Sample: {embeddings[1]}")

In [None]:
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def flatten_embeddings(embeddings):ab, how I can conver my code ther  to .py file and
    # Normalize embeddings to unit length
    return normalize(embeddings, axis=1, norm='l2') #l1

def calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict):
    protein_similarity_dict = {}

    # Extract and normalize the AGO miRNA embeddings
    miRNA_embeddings = embedding_miRNA_dict['AGO']['embeddings']
    # miRNA_embeddings = embedding_miRNA_dict['AGO']['summed_embeddings']
    miRNA_embeddings_flat = flatten_embeddings(miRNA_embeddings)

    for protein, rna_data in embedding_rna_samples_dict.items():
        rna_embeddings = rna_data['embeddings']
        # rna_embeddings = rna_data['summed_embeddings']
        rna_embeddings_flat = flatten_embeddings(rna_embeddings)

        # Calculate the cosine similarity matrix
        similarity_matrix = cosine_similarity(rna_embeddings_flat, miRNA_embeddings_flat)

        # Calculate the average similarity for the entire matrix
        average_similarity = np.mean(similarity_matrix)

        # Store the average similarity score for each protein
        protein_similarity_dict[protein] = average_similarity

    # Sort the dictionary by similarity score in descending order
    sorted_protein_average_similarity_dict = dict(sorted(protein_similarity_dict.items(), key=lambda item: item[1], reverse=True))

    return sorted_protein_average_similarity_dict

# Assuming that embedding_rna_samples_dict and embedding_miRNA_dict are properly defined and populated
#Attention
protein_average_similarity_scores = calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict)
#bi-directional
# protein_average_similarity_scores = calculate_average_similarity(summed_embeddings_dict, embeddings_dict_miRNA)
protein_average_similarity_scores


def flatten_embeddings(embeddings):
    # Ensures the output is 2D: (num_samples, 1)
    return np.mean(embeddings, axis=1).reshape(-1, 1)
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict):
    protein_similarity_dict = {}

    # Extract and flatten the AGO miRNA embeddings
    miRNA_embeddings = embedding_miRNA_dict['AGO']['embeddings']
    # miRNA_embeddings = embedding_miRNA_dict['AGO']['summed_embeddings']
    miRNA_embeddings_flat = flatten_embeddings(miRNA_embeddings)

    for protein, rna_data in embedding_rna_samples_dict.items():
        rna_embeddings = rna_data['embeddings']
        # rna_embeddings = rna_data['summed_embeddings']
        # Flatten the RNA embeddings
        rna_embeddings_flat = flatten_embeddings(rna_embeddings)

        # Calculate the cosine similarity matrix
        similarity_matrix = cosine_similarity(rna_embeddings_flat, miRNA_embeddings_flat)

        # Calculate the average similarity for the entire matrix
        average_similarity = np.mean(similarity_matrix)

        # Store the average similarity score for each protein
        protein_similarity_dict[protein] = average_similarity

    # Sort the dictionary by similarity score in descending order
    sorted_protein_average_similarity_dict = dict(sorted(protein_similarity_dict.items(), key=lambda item: item[1], reverse=True))

    return sorted_protein_average_similarity_dict


#Attention layer
protein_average_similarity_scores = calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict)
#bi-Direction layer
# protein_average_similarity_scores = calculate_average_similarity(summed_embeddings_dict, embeddings_dict_miRNA)
protein_average_similarity_scores


from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def flatten_embeddings(embeddings):
    # Averages embeddings across the sequence length dimension
    # This transforms the shape from (num_samples, sequence_length, embedding_dim)
    # to (num_samples, embedding_dim)
    return np.mean(embeddings, axis=1)

def calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict):
    protein_similarity_dict = {}

    # Assuming 'AGO' miRNA embeddings as the comparison base
    miRNA_embeddings = embedding_miRNA_dict['AGO']['embeddings']
    # Flatten the miRNA embeddings
    miRNA_embeddings_flat = flatten_embeddings(miRNA_embeddings)

    for protein, rna_data in embedding_rna_samples_dict.items():
        rna_embeddings = rna_data['embeddings']
        # Flatten the RNA embeddings
        rna_embeddings_flat = flatten_embeddings(rna_embeddings)

        # Calculate the cosine similarity matrix between flattened embeddings
        similarity_matrix = cosine_similarity(rna_embeddings_flat, miRNA_embeddings_flat)

        # Calculate the average similarity for the entire matrix
        average_similarity = np.mean(similarity_matrix)

        # Store the average similarity score for the protein
        protein_similarity_dict[protein] = average_similarity

        # Sort the dictionary by similarity in descending order
        sorted_protein_avrage__similarity_dict = dict(sorted(protein_similarity_dict.items(), key=lambda item: item[1], reverse=True))

    return sorted_protein_avrage__similarity_dict
protein_average_similarity_scores = calculate_average_similarity(embedding_rna_samples_dict, embedding_miRNA_dict)
protein_average_similarity_scores

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def flatten_embeddings(embeddings):
    """
    Averages embeddings across the sequence length dimension, converting from
    3D shape (num_samples, sequence_length, embedding_dim) to 2D (num_samples, embedding_dim).
    """
    return np.mean(embeddings, axis=1)

def calculate_max_similarity(embedding_rna_samples_dict, embedding_miRNA_dict):
    protein_similarity_dict = {}

    # We are only comparing with 'AGO' miRNA embeddings in the current setup
    miRNA_embeddings = embedding_miRNA_dict['AGO']['embeddings']
    # Flatten the miRNA embeddings
    miRNA_embeddings_flat = flatten_embeddings(miRNA_embeddings)

    for protein, rna_data in embedding_rna_samples_dict.items():
        rna_embeddings = rna_data['embeddings']
        # Flatten the RNA embeddings
        rna_embeddings_flat = flatten_embeddings(rna_embeddings)

        # Calculate the cosine similarity matrix between flattened embeddings
        similarity_matrix = cosine_similarity(rna_embeddings_flat, miRNA_embeddings_flat)

        # Calculate the maximum similarity for the entire matrix
        max_similarity = np.max(similarity_matrix)

        # Store the maximum similarity score for the protein
        protein_similarity_dict[protein] = max_similarity

    # Sort the dictionary by similarity in descending order
    sorted_protein_similarity_dict = dict(sorted(protein_similarity_dict.items(), key=lambda item: item[1], reverse=True))

    return sorted_protein_similarity_dict
protein_max_similarity_scores = calculate_max_similarity(embedding_rna_samples_dict, embedding_miRNA_dict)
protein_max_similarity_scores

In [None]:
import numpy as np
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Concatenate
from keras.optimizers import Adam
from keras import backend as K

# Define a custom layer for cosine similarity
class CosineSimilarityLayer(Layer):
    def __init__(self, **kwargs):
        super(CosineSimilarityLayer, self).__init__(**kwargs)

    def call(self, inputs):
        x, y = inputs
        x = K.l2_normalize(x, axis=-1)
        y = K.l2_normalize(y, axis=-1)
        return K.batch_dot(x, y, axes=-1)

def create_cnn_model(input_shape, conv_filters=(64, 128), kernel_sizes=(3, 3), pool_size=2, dense_units=256, dropout_rate=0.5, learning_rate=0.0001):
    # Input layers for PSSM and PSCM
    pssm_input = Input(shape=input_shape, name='pssm_input')
    pscm_input = Input(shape=input_shape, name='pscm_input')

    # Convolutional layers for PSSM
    x_pssm = Conv2D(conv_filters[0], kernel_size=kernel_sizes, activation='relu', padding='same')(pssm_input)
    x_pssm = MaxPooling2D(pool_size=(pool_size, pool_size))(x_pssm)
    x_pssm = Conv2D(conv_filters[1], kernel_size=kernel_sizes, activation='relu', padding='same')(x_pssm)
    x_pssm = MaxPooling2D(pool_size=(pool_size, pool_size))(x_pssm)
    x_pssm = Flatten()(x_pssm)
    x_pssm = Dense(dense_units, activation='relu')(x_pssm)
    x_pssm = Dropout(dropout_rate)(x_pssm)

    # Convolutional layers for PSCM
    x_pscm = Conv2D(conv_filters[0], kernel_size=kernel_sizes, activation='relu', padding='same')(pscm_input)
    x_pscm = MaxPooling2D(pool_size=(pool_size, pool_size))(x_pscm)
    x_pscm = Conv2D(conv_filters[1], kernel_size=kernel_sizes, activation='relu', padding='same')(x_pscm)
    x_pscm = MaxPooling2D(pool_size=(pool_size, pool_size))(x_pscm)
    x_pscm = Flatten()(x_pscm)
    x_pscm = Dense(dense_units, activation='relu')(x_pscm)
    x_pscm = Dropout(dropout_rate)(x_pscm)

    # Concatenate PSSM and PSCM encoded representations
    concatenated = Concatenate()([x_pssm, x_pscm])

    # Dense and dropout layers
    x = Dense(dense_units, activation='relu')(concatenated)
    x = Dropout(dropout_rate)(x)

    # Output layer for similarity score
    similarity_score = Dense(1, activation='sigmoid')(x)

    # Model for training
    model = Model(inputs=[pssm_input, pscm_input], outputs=similarity_score)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate), metrics=['accuracy'])

    # Define a function to get embeddings
    embedding_model = Model(inputs=[pssm_input, pscm_input], outputs=concatenated)

    return model, embedding_model

# Example usage
input_shape = (100, 100, 1)  # Example input shape, should be adapted to your data
model, embedding_model = create_cnn_model(input_shape)
model.summary()
