In [None]:
!pip install tensorflow-gpu==2.10.0
!pip install cudnn==8.4.1
!pip install cudatoolkit==11.8.0
!pip install pillow
!pip install scikit-learn
!pip install openpyxl
!pip install opencv
!pip install pandas
!pip install matplotlib

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Switch to the project directory on the mounted drive; the relative paths used
# in the following cells are resolved against it.
os.chdir('/content/drive/MyDrive/rdkit')
# Print the current working directory to confirm the switch worked
print("当前工作目录：", os.getcwd())

In [None]:
# Saved Keras model (HDF5) that the prediction cell loads with tf.keras.models.load_model.
model_path = 'Dataset/23_Antioxidant/best_model3.h5'

# Excel workbook holding the sequences to run predictions on (assumed from the name - confirm).
prediction_file = 'Dataset/23_Antioxidant/test.xlsx'

# CSV file the prediction cell appends its result rows to (relative to the working directory).
output_file_path = 'predictions.csv'

In [None]:
import csv
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import re

# Decoded residue images keyed by (folder, image size). Filled on first use so the
# image folder is read from disk once per kernel session instead of once per sequence.
# Re-run this cell to drop the cache after changing the image files.
_RESIDUE_IMAGE_CACHE = {}


def _load_residue_images(folder_path, image_size):
    """Load every decodable image in ``folder_path`` as a float32 tensor scaled to [0, 1].

    Returns a dict that maps each file name without its extension to the resized
    image; those names are the lookup keys used by ``preprocess_sequence``.
    """
    cache_key = (folder_path, tuple(image_size))
    cached = _RESIDUE_IMAGE_CACHE.get(cache_key)
    if cached is not None:
        return cached

    images = {}
    for file_name in os.listdir(folder_path):
        image = cv2.imread(os.path.join(folder_path, file_name))
        if image is None:
            # cv2.imread returns None for anything it cannot decode (sub-directories,
            # hidden files, ...); skip those instead of crashing inside cv2.resize.
            continue
        image = cv2.resize(image, tuple(image_size))
        # Key on the file name minus its extension, whatever the folder or the
        # extension length is (the previous fixed slice only handled one layout).
        key = os.path.splitext(file_name)[0]
        # uint8 pixels divided by 255 cannot produce NaN, so no NaN clean-up is needed.
        images[key] = tf.cast(image, tf.float32) / 255.0

    _RESIDUE_IMAGE_CACHE[cache_key] = images
    return images


def preprocess_sequence(sequence, folder_path="residues32/IA", image_size=(32, 32)):
    """Convert one (padded) sequence into a stack of per-character images.

    Every character is replaced by an image looked up by name:
      * the last real character (the one just before the first 'x' pad, or the
        final character when there is no padding) uses the key ``<char>_C``;
      * otherwise the first character uses ``<char>_N``;
      * every other character, including the 'x' padding itself, uses ``<char>``.

    Args:
      sequence: the sequence as a scalar string tensor (what tf.py_function passes
        in), or as plain ``bytes`` / ``str``.
      folder_path: directory containing one image per lookup key.
      image_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
      A float32 tensor of shape ``(1, len(sequence), height, width, channels)``
      with values in [0, 1]; channels are as decoded by ``cv2.imread``.

    Raises:
      ValueError: if no image exists for one of the required keys.
    """
    # Normalise the input to a Python str.
    if hasattr(sequence, "numpy"):
        sequence = sequence.numpy()
    if isinstance(sequence, bytes):
        sequence = sequence.decode("utf-8")
    input_seq = sequence

    images = _load_residue_images(folder_path, image_size)

    # Index of the last real character: right before the first 'x', or the end
    # of the string when it carries no padding.
    pad_start = input_seq.find('x')
    last_residue = (pad_start if pad_start != -1 else len(input_seq)) - 1

    frames = []
    for position, residue in enumerate(input_seq):
        # The C-terminal check comes first, so a one-character sequence uses '_C'.
        if position == last_residue:
            image_key = residue + '_C'
        elif position == 0:
            image_key = residue + '_N'
        else:
            image_key = residue

        frame = images.get(image_key)
        if frame is None:
            raise ValueError(
                f"No image named '{image_key}' found in '{folder_path}' "
                f"(position {position} of sequence '{input_seq}')"
            )
        frames.append(frame)

    seq_frames = tf.stack(frames, axis=0)
    # Leading axis of size 1 so that several results can be concatenated into a batch.
    return tf.expand_dims(seq_frames, axis=0)

def preprocess_seq(filename, max_length):
    """Read an Excel sheet and return its sequences padded to a common length.

    Args:
      filename: path of an .xlsx file with ``sequence`` and ``label`` columns.
      max_length: width to right-pad every sequence to, using 'x' as filler
        (sequences that are already longer are left untouched).

    Returns:
      A list of ``(padded_sequence, label)`` tuples in row order.
    """
    frame = pd.read_excel(filename, engine='openpyxl', keep_default_na=False, na_values=[''])
    raw_sequences = frame['sequence'].tolist()
    raw_labels = frame['label'].tolist()

    # Strip surrounding whitespace first, then pad on the right with 'x'.
    return [
        (raw_seq.strip().ljust(max_length, 'x'), raw_label)
        for raw_seq, raw_label in zip(raw_sequences, raw_labels)
    ]

def get_max_length(filename1, filename2, max_length):
    """Return the longest sequence length across two Excel files, floored at ``max_length``.

    Args:
      filename1: path of the first .xlsx file; must contain a ``sequence`` column.
      filename2: path of the second .xlsx file; must contain a ``sequence`` column.
      max_length: lower bound for the result (pass 0 for "no lower bound").

    Returns:
      The maximum of ``max_length`` and the longest sequence found in either file.
      Lengths are measured on the raw cell text, without stripping whitespace.
    """
    def count_max_length(data):
        """Length of the longest entry in the ``sequence`` column (0 for an empty table)."""
        return max((len(seq) for seq in data['sequence'].tolist()), default=0)

    data1 = pd.read_excel(filename1, engine='openpyxl', keep_default_na=False, na_values=[''])
    data2 = pd.read_excel(filename2, engine='openpyxl', keep_default_na=False, na_values=[''])

    # Take the maximum of all three values. The previous code compared the second
    # file against the first file's length instead of the running maximum, so a
    # larger caller-supplied floor could be overwritten with a smaller value.
    max_length = max(max_length, count_max_length(data1), count_max_length(data2))
    print("数据集中字符最大长度:", max_length)
    return max_length
def load_and_preprocess_data(sequences, labels, batch_size=16):
    """Build a batched tf.data pipeline of (frames, sequence, label) triples.

    Args:
      sequences: sequence strings, converted to a tf.string tensor.
      labels: integer labels, converted to a tf.int32 tensor.
      batch_size: number of elements per batch.

    Returns:
      A ``tf.data.Dataset`` whose elements are batches of
      ``(preprocess_sequence(sequence), sequence, label)``. The data is neither
      shuffled, cached nor prefetched.
    """
    def attach_frames(sequence, label):
        # preprocess_sequence is ordinary Python code, so it is wrapped in
        # tf.py_function to be callable from inside the dataset pipeline.
        frames = tf.py_function(preprocess_sequence, [sequence], tf.float32)
        return frames, sequence, label

    sequence_tensor = tf.constant(sequences, dtype=tf.string)
    label_tensor = tf.constant(labels, dtype=tf.int32)

    # One dataset per tensor, zipped so each element is a (sequence, label) pair.
    paired = tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices(sequence_tensor),
        tf.data.Dataset.from_tensor_slices(label_tensor),
    ))

    with_frames = paired.map(attach_frames, num_parallel_calls=tf.data.AUTOTUNE)
    return with_frames.batch(batch_size)

def read_sequences(filename):
    """
    Read sequences from a file, choosing the parser from the file extension.

    Supported extensions (matched case-insensitively):
      .fasta - every ">" header line starts a record; the following lines, with
               whitespace removed, are joined into that record's sequence.
               Lines before the first header are ignored.
      .txt   - one sequence per line; blank lines are skipped.
      .xlsx  - the "Sequence" column of the first sheet.
      .csv   - the "Sequence" column.
    For .xlsx/.csv a column called "Sequence" is preferred; otherwise any column
    whose name equals "sequence" ignoring case and surrounding spaces is used.

    Args:
      filename: path of the file to read.

    Returns:
      List of sequences in file order.

    Raises:
      ValueError: if the extension is not one of the supported formats.
      KeyError: if a table file has no sequence column.
    """
    def sequence_column(data):
        # Keep the original exact name first for backward compatibility.
        if "Sequence" in data.columns:
            return data["Sequence"].tolist()
        for column in data.columns:
            if str(column).strip().lower() == "sequence":
                return data[column].tolist()
        raise KeyError("Sequence")

    _, ext = os.path.splitext(filename)
    kind = ext.lower()

    if kind == ".fasta":
        # Parsed with the standard library only, so no extra package is required.
        records = []
        current = None  # sequence chunks of the record being read; None before the first header
        with open(filename, "r") as f:
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    if current is not None:
                        records.append("".join(current))
                    current = []
                elif current is not None:
                    current.append("".join(line.split()))
        if current is not None:
            records.append("".join(current))
        return records
    elif kind == ".txt":
        with open(filename, "r") as f:
            stripped = [line.strip() for line in f]
        return [seq for seq in stripped if seq]
    elif kind == ".xlsx":
        data = pd.read_excel(filename, engine='openpyxl', keep_default_na=False, na_values=[''])
        return sequence_column(data)
    elif kind == ".csv":
        # na_filter=False keeps literal texts such as "NA" as sequences.
        data = pd.read_csv(filename, na_filter=False)
        return sequence_column(data)
    else:
        raise ValueError("Unsupported file format: {}".format(ext))


In [None]:
# Load the trained model and predict a label for every sequence in `prediction_file`.
model = tf.keras.models.load_model(model_path)
model.summary()

# Sequences to score; read from the path configured at the top of the notebook.
sequences = read_sequences(prediction_file)

# The padded length must equal the sequence axis the model was built with. The
# batch passed to model.predict below is shaped (N, length, height, width, channels),
# so that axis is input_shape[1].
# NOTE(review): assumes a single-input model with a fixed sequence length - confirm.
# If the axis is not a fixed integer, fall back to the longest sequence in the file.
model_seq_len = model.input_shape[1]
if isinstance(model_seq_len, int):
    max_length = model_seq_len
else:
    max_length = max((len(seq.strip()) for seq in sequences), default=0)
print(max_length)

# Only sequences that fit are sent to the model; longer ones would produce frames
# of a different shape and are reported as 'out of max length' instead.
in_range = [seq for seq in sequences if len(seq.strip()) <= max_length]

batch_predictions = None
if in_range:
    # Strip, right-pad with 'x' to max_length, then map each character to its image.
    # preprocess_sequence expects a string tensor, hence tf.constant.
    batch_frame = np.vstack([
        preprocess_sequence(tf.constant(seq.strip().ljust(max_length, 'x')))
        for seq in in_range
    ])
    batch_predictions = model.predict(batch_frame)

# Append to the results file; the header is written only when the file is still empty.
with open(output_file_path, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    if csvfile.tell() == 0:
        writer.writerow(["Sequence", "Label", "Prediction"])

    # Predictions come back in the order of `in_range`, so they are consumed one
    # by one while walking the full list in its original order.
    scores = iter(batch_predictions) if batch_predictions is not None else iter(())
    for seq in sequences:
        seq_str = f'{seq}'
        if len(seq.strip()) > max_length:
            writer.writerow([seq_str, 'out of max length', 'out of max length'])
            continue
        prediction_value = float(next(scores)[0])
        # Scores of 0.5 and above are labelled 1.
        label = 0 if prediction_value < 0.5 else 1
        writer.writerow([seq_str, label, f"{prediction_value:.6f}"])