## Environment and Working Directory Configuration

In [None]:
!pip install tensorflow-gpu==2.10.0
!pip install cudnn==8.4.1
!pip install cudatoolkit==11.8.0
!pip install pillow
!pip install scikit-learn
!pip install openpyxl
!pip install opencv
!pip install pandas
!pip install matplotlib

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# notebook can read its data and write results under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Make the project folder on the mounted Drive the working directory; the
# relative paths used later in the notebook are resolved against it.
os.chdir('/content/drive/MyDrive/rdkit')
# Echo the working directory so a wrong mount/path is visible immediately
print("The current working directory：", os.getcwd())

## Model and Prediction File Path Configuration

In [None]:
# Input/output locations. All three are relative paths, so they resolve against
# the current working directory at the time they are used.
model_path = 'Dataset/23_Antioxidant/best_model3.h5'  # Saved Keras model (.h5) to load

prediction_file = 'Dataset/23_Antioxidant/test.xlsx'  # File holding the sequences to predict

results_output_path = 'results/predictions.csv'  # CSV file the prediction rows are written to

In [None]:
import csv
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import re

# Residue images already loaded, keyed by absolute folder path, so the image
# folder is read from disk once instead of once per sequence.
_RESIDUE_IMAGE_CACHE = {}


def _load_residue_images(folder_path):
    """Load every readable image in folder_path as a normalised 32x32 tensor.

    Returns a dict mapping the file name without its extension to a float16
    tensor scaled to [0, 1].
    """
    cache_key = os.path.abspath(folder_path)
    cached = _RESIDUE_IMAGE_CACHE.get(cache_key)
    if cached is not None:
        return cached

    images = {}
    for file_name in os.listdir(folder_path):
        image = cv2.imread(os.path.join(folder_path, file_name))
        if image is None:
            # cv2.imread yields None for anything it cannot decode
            # (sub-directories, hidden files, ...); skip instead of crashing.
            continue
        image = cv2.resize(image, (32, 32))
        image = tf.cast(image, tf.float16) / 255.0
        image = tf.where(tf.math.is_nan(image), tf.zeros_like(image), image)
        # Key by file stem rather than by a hard-coded character slice, so the
        # key stays correct if the folder or the extension length changes.
        images[os.path.splitext(file_name)[0]] = image

    _RESIDUE_IMAGE_CACHE[cache_key] = images
    return images


def preprocess_sequence(sequence):
    """Convert one (padded) sequence into a stack of residue images.

    Each character is replaced by the image of the same name from
    "residues32/IA". The first character uses the "<char>_N" image and the
    last character before the 'x' padding (or the last character when there
    is no padding) uses the "<char>_C" image; when both positions coincide
    the "_C" image wins.

    Args:
        sequence: scalar string tensor (as passed by tf.py_function), or a
            plain ``str`` / UTF-8 ``bytes`` value.

    Returns:
        float16 tensor with a leading axis of size 1 followed by one 32x32
        frame per character of the sequence.

    Raises:
        ValueError: if the sequence is empty or a required residue image is
            missing from the image folder.
    """
    folder_path = "residues32/IA"
    images = _load_residue_images(folder_path)

    def lookup(image_key):
        image = images.get(image_key)
        if image is None:
            raise ValueError(
                "No residue image named '{}' found in {}".format(image_key, folder_path)
            )
        return tf.convert_to_tensor(image)

    def map_seq(input_str):
        # Index of the last real residue: the position just before the first
        # 'x' padding character, or the final position when nothing is padded.
        pad_start = input_str.find('x')
        if pad_start != -1:
            prev_index = pad_start - 1
        elif len(input_str) > 0:
            prev_index = len(input_str) - 1
        else:
            prev_index = None

        char_images = []
        for n, char in enumerate(input_str):
            if n == prev_index:
                char_images.append(lookup(char + '_C'))
            elif n == 0:
                char_images.append(lookup(char + '_N'))
            else:
                char_images.append(lookup(char))
        return tf.stack(char_images, axis=0)

    # Accept plain strings/bytes as well as the tensor tf.py_function passes in.
    if isinstance(sequence, str):
        input_seq = sequence
    elif isinstance(sequence, bytes):
        input_seq = sequence.decode("utf-8")
    else:
        input_seq = sequence.numpy().decode("utf-8")

    if not input_seq:
        raise ValueError("Cannot preprocess an empty sequence")

    # Add the leading axis of size 1 so per-sequence results can be stacked.
    return tf.expand_dims(map_seq(input_seq), axis=0)

def preprocess_seq(filename, max_length):
    """Read an Excel sheet of labelled sequences and pad each one with 'x'.

    The sheet must provide a 'sequence' and a 'label' column. Every sequence
    is stripped of surrounding whitespace and right-padded with 'x' up to
    max_length; sequences that are already longer are left untouched.

    Args:
        filename: path of the .xlsx file to read.
        max_length: target length for the padded sequences.

    Returns:
        List of (padded_sequence, label) tuples in sheet order.
    """
    # keep_default_na=False with na_values=[''] means only empty cells become NaN.
    frame = pd.read_excel(filename, engine='openpyxl', keep_default_na=False, na_values=[''])
    raw_sequences = frame['sequence'].tolist()
    raw_labels = frame['label'].tolist()

    return [
        (raw_seq.strip().ljust(max_length, 'x'), raw_label)
        for raw_seq, raw_label in zip(raw_sequences, raw_labels)
    ]

def get_max_length(filename1,filename2,max_length):
    """Return the largest of max_length and the longest sequence in two sheets.

    Both files are read as Excel sheets and must provide a 'sequence' column.

    Args:
        filename1: path of the first .xlsx file.
        filename2: path of the second .xlsx file.
        max_length: lower bound for the result (e.g. a length found earlier).

    Returns:
        The maximum of max_length and the longest sequence length found in
        either file.
    """

    def count_max_length(data):
        # Longest entry of the 'sequence' column; 0 for an empty sheet.
        return max((len(seq) for seq in data['sequence'].tolist()), default=0)

    data1 = pd.read_excel(filename1, engine='openpyxl', keep_default_na=False, na_values=[''])
    data2 = pd.read_excel(filename2, engine='openpyxl', keep_default_na=False, na_values=[''])

    # Take the maximum over all three candidates. Comparing the second file
    # only against the first one could return a value below max_length.
    return max(max_length, count_max_length(data1), count_max_length(data2))
def load_and_preprocess_data(sequences, labels, batch_size=16):
    """Build a batched tf.data pipeline that turns sequences into image stacks.

    Args:
        sequences: sequence strings (anything tf.constant accepts as tf.string).
        labels: integer labels aligned with sequences.
        batch_size: number of elements per batch.

    Returns:
        A batched tf.data.Dataset whose elements are
        (processed_sequence, sequence, label) triples, where processed_sequence
        is the float32 output of preprocess_sequence.
    """
    sequences = tf.constant(sequences, dtype=tf.string)
    labels = tf.constant(labels, dtype=tf.int32)

    sequence_dataset = tf.data.Dataset.from_tensor_slices(sequences)
    labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
    dataset = tf.data.Dataset.zip((sequence_dataset, labels_dataset))

    def to_float32_frames(sequence):
        # preprocess_sequence builds float16 frames; cast explicitly so the
        # returned value matches the float32 Tout declared to tf.py_function.
        return tf.cast(preprocess_sequence(sequence), tf.float32)

    def map_fn(sequence, label):
        # The image lookup is plain Python, so it has to run via py_function.
        processed_sequence = tf.py_function(to_float32_frames, [sequence], tf.float32)
        return processed_sequence, sequence, label

    dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)

    return dataset.batch(batch_size)

def _parse_fasta(handle):
    """Return the sequence strings of a FASTA stream, one per '>' header line."""
    records = []
    current = None  # stays None until the first header line has been seen
    for line in handle:
        line = line.strip()
        if line.startswith(">"):
            if current is not None:
                records.append("".join(current))
            current = []
        elif current is not None and line:
            # Sequence data may be wrapped over several lines; join the pieces.
            current.append(line.replace(" ", ""))
    if current is not None:
        records.append("".join(current))
    return records


def read_sequences(filename):
    """
    Detect the file format from the extension and read the sequences in it.

    Supported extensions (matched case-insensitively):
      .fasta - one sequence per '>' record, wrapped lines are joined
      .txt   - one sequence per line, surrounding whitespace stripped
      .xlsx  - values of the 'Sequence' column
      .csv   - values of the 'Sequence' column

    Args:
      filename: path of the file to read

    Returns:
      sequences: list of sequences

    Raises:
      ValueError: if the extension is not one of the supported formats
    """
    _, ext = os.path.splitext(filename)
    ext_key = ext.lower()
    if ext_key == ".fasta":
        # Parsed with a small local reader: no FASTA library is imported here.
        with open(filename, "r") as f:
            return _parse_fasta(f)
    elif ext_key == ".txt":
        with open(filename, "r") as f:
            return [line.strip() for line in f]
    elif ext_key == ".xlsx":
        # Only empty cells are treated as missing values.
        data = pd.read_excel(filename, engine='openpyxl', keep_default_na=False, na_values=[''])
        return data['Sequence'].tolist()
    elif ext_key == ".csv":
        data = pd.read_csv(filename, na_filter=False)
        return data['Sequence'].tolist()
    else:
        raise ValueError("Unsupported file format: {}".format(ext))


In [None]:
# Load the trained model and show its architecture
model = tf.keras.models.load_model(model_path)
model.summary()

# The second dimension of the model input is used as the padded sequence length
input_shape = model.input_shape
max_length = input_shape[1]

# Read the sequences to score from the configured prediction file
sequences = read_sequences(prediction_file)

# Only sequences that fit into the model input can be scored. Longer ones are
# reported in the CSV instead of breaking the batch with a mismatched shape.
preprocess_predictions = [
    seq.strip().ljust(max_length, 'x')
    for seq in sequences
    if len(seq.strip()) <= max_length
]

if preprocess_predictions:
    # preprocess_sequence reads its input via .numpy(), so pass string tensors
    batch_frame = np.vstack(
        [preprocess_sequence(tf.constant(seq)) for seq in preprocess_predictions]
    )
    batch_predictions = model.predict(batch_frame)
else:
    batch_predictions = np.empty((0, 1), dtype="float32")

batch_predictions_binary = (batch_predictions > 0.5).astype("int32")

# Make sure the output folder exists before appending to the results file
os.makedirs(os.path.dirname(results_output_path) or '.', exist_ok=True)

with open(results_output_path, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Append mode: write the header only when the file is still empty
    if csvfile.tell() == 0:
        writer.writerow(["Sequence", "Label", "Prediction"])

    # Predictions exist only for the sequences that were scored, in order
    scored = iter(batch_predictions)
    for seq in sequences:
        seq_str = f'{seq}'
        if len(seq.strip()) > max_length:
            writer.writerow([seq_str, 'out of max length', 'out of max length'])
            continue
        prediction_value = float(next(scored)[0])
        label = 0 if prediction_value < 0.5 else 1
        writer.writerow([seq_str, label, f"{prediction_value:.6f}"])