In [3]:
import os
import glob
import numpy as np
import time
from tqdm import tqdm
import tensorflow as tf
import gc
from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler

In [10]:
# # FEATURE : "pt" or "esm2" or "esm1b" or "tape"
# FEATURE = "esm1b"

# # MAXSEQ : length
# length = 1000

# # NUM_FEATURE : dim
# dim = 1280

# ch_train = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/ionchannels/train/*")
# ch_test = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/ionchannels/test/*")
# tr_train = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/iontransporters/train/*")
# tr_test = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/iontransporters/test/*")
# me_train = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/membraneproteins/train/*")
# me_test = glob.glob(f"ESM_out/{FEATURE}_d{dim}_L{length}/membraneproteins/test/*")

# print(len(ch_train))
# print(len(ch_test))
# print(len(tr_train))
# print(len(tr_test))
# print(len(me_train))
# print(len(me_test))

# Concat

In [4]:
def MCNN_add_data_load(pos_path, neg_path, imb_mod="None", NUM_CLASSES=2):
    """Load positive and negative feature files into one shuffled dataset.

    Every path is read with ``np.load`` and cast to float16. The arrays are
    concatenated along axis 0, so axis 0 of each file is the sample axis and
    every row of a file receives that file's label (1 for ``pos_path``,
    0 for ``neg_path``).

    Parameters
    ----------
    pos_path : iterable of str
        Paths of the ``.npy`` files labelled 1.
    neg_path : iterable of str
        Paths of the ``.npy`` files labelled 0.
    imb_mod : str, default "None"
        When equal to the string "None" the labels are one-hot encoded with
        ``tf.keras.utils.to_categorical``. Any other value returns the plain
        integer labels.
    NUM_CLASSES : int, default 2
        Number of classes passed to ``to_categorical``.

    Returns
    -------
    x_train : np.ndarray
        float16 array of shape ``(n_samples, 1, d1, d2)`` where ``(d1, d2)``
        are the trailing dimensions of the loaded arrays.
    y_train : np.ndarray
        Labels in the same shuffled order as ``x_train``: shape
        ``(n_samples,)`` when ``imb_mod != "None"``, otherwise one-hot with
        ``NUM_CLASSES`` columns.

    Raises
    ------
    ValueError
        If both path lists are empty, or the concatenated data is not 3-D.
    """
    data_list = []
    label_list = []

    for label, paths in ((1, pos_path), (0, neg_path)):
        for path in tqdm(paths, desc="Loading data", unit="file"):
            data = np.load(path).astype("float16")
            data_list.append(data)
            # One label per row, so a file holding several samples keeps
            # features and labels aligned after concatenation.
            label_list.extend([label] * len(data))

    if not data_list:
        raise ValueError(
            "No feature files to load: pos_path and neg_path are both empty"
        )

    x_train = np.concatenate(data_list, axis=0)
    # Release the per-file arrays before the shuffle below makes another copy.
    del data_list
    y_train = np.array(label_list)

    x_shape = x_train.shape

    print(x_shape)

    if x_train.ndim != 3:
        raise ValueError(
            f"Expected concatenated features to be 3-D, got shape {x_shape}"
        )

    # Build a random permutation of the sample indices.
    num_samples = len(x_train)
    shuffle_indices = np.arange(num_samples)
    np.random.shuffle(shuffle_indices)

    # Apply the same permutation to features and labels; the features also
    # gain a singleton axis at position 1.
    x_train = x_train[shuffle_indices].reshape(x_shape[0], 1, x_shape[1], x_shape[2])
    y_train = y_train[shuffle_indices]

    if imb_mod == "None":
        y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)

    return x_train, y_train

In [5]:
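# Arguments of imb():
# FEATURE     : embedding name used in the feature directory name, e.g. "pt", "esm2", "esm1b" or "tape"
# MAXSEQ      : sequence length (the L{MAXSEQ} part of the feature directory name)
# NUM_FEATURE : feature dimension (the d{NUM_FEATURE} part of the feature directory name)
# DATA_TYPE   : positive class - "ch" (ionchannels), "tr" (iontransporters) or "me" (membraneproteins)
# IMBALANCE   : oversampling method - "None" (default, no oversampling), "SMOTE", "ADASYN" or "RANDOM"
# ClASS       : tag written into the output file name, e.g. "A", "B"
# SET         : which split to build - "train" or "test"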

def imb(FEATURE, MAXSEQ, NUM_FEATURE, DATA_TYPE, ClASS, SET, IMBALANCE="None", NUM_CLASSES=2):
    """Build one train or test ``.npz`` dataset, optionally oversampled.

    Feature files are read from
    ``get_feature/example/{FEATURE}_d{NUM_FEATURE}_L{MAXSEQ}/<category>/<SET>/*``
    where ``<category>`` is ``ionchannels``, ``iontransporters`` or
    ``membraneproteins``. The category selected by ``DATA_TYPE`` is the
    positive class; the other two together form the negative class.

    The result is written with ``np.savez`` (keys ``feature`` and ``label``)
    into the feature directory as
    ``Class_{ClASS}_L{MAXSEQ}_d{NUM_FEATURE}_test_{FEATURE}.npz`` for the test
    split, or ``Class_{ClASS}_L{MAXSEQ}_d{NUM_FEATURE}_{IMBALANCE}_{FEATURE}.npz``
    for the train split.

    Parameters
    ----------
    FEATURE : str
        Embedding name used in the directory and file names.
    MAXSEQ : int
        Sequence length used in the directory name and to restore the
        feature shape after resampling.
    NUM_FEATURE : int
        Feature dimension used in the directory name and to restore the
        feature shape after resampling.
    DATA_TYPE : {"ch", "tr", "me"}
        Which category is the positive class.
    ClASS : str
        Tag written into the output file name.
    SET : {"train", "test"}
        Split to build.
    IMBALANCE : {"None", "SMOTE", "ADASYN", "RANDOM"}, default "None"
        Oversampling method applied to the train split. Ignored for the test
        split.
    NUM_CLASSES : int, default 2
        Number of classes used for the one-hot labels.

    Raises
    ------
    ValueError
        If ``DATA_TYPE`` or ``SET`` is not one of the supported values, or if
        ``IMBALANCE`` is unsupported while building the train split.
    """
    # Insertion order matters: the negative file list is built in this order.
    categories = {
        "ch": "ionchannels",
        "tr": "iontransporters",
        "me": "membraneproteins",
    }
    imbalance_modes = ("None", "SMOTE", "ADASYN", "RANDOM")

    # Fail fast, before any file is loaded, instead of hitting an
    # UnboundLocalError (or silently doing nothing) further down.
    if DATA_TYPE not in categories:
        raise ValueError(
            f"DATA_TYPE must be one of {sorted(categories)}, got {DATA_TYPE!r}"
        )
    if SET not in ("train", "test"):
        raise ValueError(f"SET must be 'train' or 'test', got {SET!r}")
    if SET == "train" and IMBALANCE not in imbalance_modes:
        raise ValueError(
            f"IMBALANCE must be one of {imbalance_modes}, got {IMBALANCE!r}"
        )

    print(f"********************** {FEATURE} **********************")

    base_dir = f"get_feature/example/{FEATURE}_d{NUM_FEATURE}_L{MAXSEQ}"
    positive = categories[DATA_TYPE]

    # glob returns files in arbitrary order; sort so the pre-shuffle order
    # is the same on every machine.
    pos_files = sorted(glob.glob(f"{base_dir}/{positive}/{SET}/*"))
    neg_files = []
    for category in categories.values():
        if category != positive:
            neg_files += sorted(glob.glob(f"{base_dir}/{category}/{SET}/*"))

    print(f"********************** pos : {positive} **********************")

    if SET == "test":
        print("********************** SET : Test loadding **********************")
        x_test, y_test = MCNN_add_data_load(pos_files, neg_files, NUM_CLASSES=NUM_CLASSES)
        print("Class:", x_test.shape, y_test.shape)
        np.savez(
            f"{base_dir}/Class_{ClASS}_L{MAXSEQ}_d{NUM_FEATURE}_test_{FEATURE}.npz",
            feature=x_test,
            label=y_test,
        )
        return

    print("********************** SET : Train loadding **********************")
    x_train, y_train = MCNN_add_data_load(
        pos_files, neg_files, imb_mod=IMBALANCE, NUM_CLASSES=NUM_CLASSES
    )
    print("Class:", x_train.shape, y_train.shape)

    if IMBALANCE != "None":
        # The resamplers work on 2-D input: flatten each sample to one row.
        x_train_2d = x_train.reshape(x_train.shape[0], -1)
        print(x_train_2d.shape)
        print(y_train.shape)

        del x_train
        gc.collect()

        # Create the oversampler.
        if IMBALANCE == "SMOTE":
            imbalance = SMOTE(random_state=42)
        elif IMBALANCE == "ADASYN":
            imbalance = ADASYN(random_state=42)
        else:  # "RANDOM" (validated above)
            imbalance = RandomOverSampler(random_state=42)

        print("imb:", IMBALANCE)

        # Oversample with fit_resample.
        x_train, y_train = imbalance.fit_resample(x_train_2d, y_train)

        del x_train_2d
        gc.collect()

        # Restore the 4-D feature layout.
        x_train = x_train.reshape(x_train.shape[0], 1, MAXSEQ, NUM_FEATURE)

        print(x_train.shape)
        print(y_train.shape)

        # Labels were kept as integers for the resampler; one-hot encode now.
        y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)

        print("不平衡處理 :", x_train.shape, y_train.shape)

    np.savez(
        f"{base_dir}/Class_{ClASS}_L{MAXSEQ}_d{NUM_FEATURE}_{IMBALANCE}_{FEATURE}.npz",
        feature=x_train,
        label=y_train,
    )
    del x_train
    del y_train
    gc.collect()


In [6]:
# FEATURE : embedding name used in the feature directory name
#           ("pt" or "esm2" or "esm1b" or "tape")
FEATURE = "pt"

# MAXSEQ : sequence length
MAXSEQ = 1000

# NUM_FEATURE : feature dimension
NUM_FEATURE = 1024

# Remaining imb() arguments used below:
# DATA_TYPE : "ch", "tr" ("ch" is ionchannels, "tr" is iontransporters)
# IMBALANCE : "SMOTE", "ADASYN", "RANDOM"
# ClASS : "A", "B" ("A" is ionchannels, "B" is iontransporters)
# SET : "train", "test"

# ---------------------------- oversampled training sets (disabled) ----------------------------
# Uncomment to also write SMOTE / ADASYN / RANDOM oversampled training files:
# for data_type, class_id in (("ch", "A"), ("tr", "B")):
#     for method in ("SMOTE", "ADASYN", "RANDOM"):
#         imb(FEATURE, MAXSEQ, NUM_FEATURE, data_type, class_id, "train", method)

# ---------------------------- no oversampling ----------------------------
# Call order: ch/train, tr/train, ch/test, tr/test.
for split in ("train", "test"):
    for data_type, class_id in (("ch", "A"), ("tr", "B")):
        imb(FEATURE, MAXSEQ, NUM_FEATURE, data_type, class_id, split)

********************** pt **********************
********************** pos : ionchannels **********************
********************** SET : Train loadding **********************


Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 127.53file/s]
Loading data: 100%|█████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 141.84file/s]


(30, 1000, 1024)
Class: (30, 1, 1000, 1024) (30, 2)
********************** pt **********************
********************** pos : iontransporters **********************
********************** SET : Train loadding **********************


Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 156.25file/s]
Loading data: 100%|█████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 161.29file/s]


(30, 1000, 1024)
Class: (30, 1, 1000, 1024) (30, 2)
********************** pt **********************
********************** pos : ionchannels **********************
********************** SET : Test loadding **********************


Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 138.89file/s]
Loading data: 100%|█████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 136.99file/s]


(30, 1000, 1024)
Class: (30, 1, 1000, 1024) (30, 2)
********************** pt **********************
********************** pos : iontransporters **********************
********************** SET : Test loadding **********************


Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 156.25file/s]
Loading data: 100%|█████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 165.29file/s]


(30, 1000, 1024)
Class: (30, 1, 1000, 1024) (30, 2)
