#### Setting environments

In [1]:
import torch, os, gc, itertools
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score,\
f1_score, precision_score, recall_score, roc_auc_score, average_precision_score

In [2]:
# ---- Run configuration ----
# Embedding variants; one test CSV per variant is loaded and the variants are
# then combined into ensembles.
embed_ver = ["clstm", "esm2", "bert", "t5"]

# Non-feature columns: sample identifiers plus the `ess` label column.
col_str = ['file_id', 'organism', 'locus_tag', 'ess']

# Classifier options (number of hidden layers, width divisor per layer)
layer_num, unit_decrease = 3, 2

# Mini-batch size used by the evaluation data loaders
batch_size = 256

# Input data, trained weights, and per-model prediction output locations
data_path = "../data/"
model_path = "../model/emb_gen-ensem/"
result_path = "../result/prd-emb_gen-ensem/"

# Make sure the prediction output directory exists before anything is written
os.makedirs(result_path, exist_ok=True)

In [3]:
# Test sets to evaluate: test-set label -> list of `file_id` values it contains
ts_data = dict(
    data1=["C018"],  # Escherichia coli K-12 BW25113
    data2=["C016"],  # Escherichia coli K-12 MG1655
    data3=["O046"],  # synthetic bacterium JCVI-Syn3A
    data4=["C048"],  # Bacteroides thetaiotaomicron VPI-5482
    data5=["C050"],  # Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S
)

In [4]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Define function to record performance result
def record_perform(comb_ver, file_id, organ, y_real, y_conf, y_prd, test_ver=None):
    """Build a one-row DataFrame of binary-classification metrics.

    Parameters
    ----------
    comb_ver : str
        Label of the embedding combination (stored in the ``comb`` column).
    file_id : str
        Identifier(s) of the evaluated file(s) (stored in the ``file`` column).
    organ : str
        Organism name(s) (stored in the ``organism`` column).
    y_real : torch.Tensor
        Ground-truth labels; expected to hold only 0/1 values.
    y_conf : torch.Tensor
        Predicted confidence of the positive class for each sample.
    y_prd : torch.Tensor
        Predicted class (0/1) for each sample.
    test_ver : str, optional
        Test-set label. When given, it is stored in a ``testset`` column placed
        right after ``comb``; when omitted, no such column is created.

    Returns
    -------
    pandas.DataFrame
        A single row with the confusion-matrix counts (tp, fp, tn, fn), MCC,
        accuracy, F1, precision, recall, NPV, TNR, AUC-ROC and AUC-PR.
        ``auc-roc`` and ``auc-pr`` are NaN when ``y_real`` contains fewer than
        two classes.
    """
    # Cast labels/predictions to int so every metric sees one consistent 0/1
    # integer encoding (labels arrive as float32, predictions as int32).
    y_real = y_real.cpu().numpy().astype(int)
    y_conf = y_conf.cpu().numpy()
    y_prd = y_prd.cpu().numpy().astype(int)

    # roc_auc_score raises ValueError when only one class is present, so the
    # threshold-free metrics are reported only when both classes occur.
    if np.unique(y_real).size > 1:
        auc_roc = roc_auc_score(y_real, y_conf)
        auc_pr = average_precision_score(y_real, y_conf)
    else:
        auc_roc = auc_pr = np.nan

    # Pin the label set: without it a single-class input yields a 1x1 matrix
    # and the 4-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_real, y_prd, labels=[0, 1]).ravel()

    record = {"comb": [comb_ver]}
    if test_ver is not None:
        record["testset"] = [test_ver]
    record.update({
        "file": [file_id],
        "organism": [organ],
        "tp": [tp],
        "fp": [fp],
        "tn": [tn],
        "fn": [fn],
        "mcc": [matthews_corrcoef(y_real, y_prd)],
        "acc": [accuracy_score(y_real, y_prd)],
        "f1": [f1_score(y_real, y_prd)],
        "prc": [precision_score(y_real, y_prd)],
        "rec": [recall_score(y_real, y_prd)],
        # NPV / TNR are precision / recall of the negative class, obtained by
        # flipping both label vectors.
        "npv": [precision_score(1 - y_real, 1 - y_prd)],
        "tnr": [recall_score(1 - y_real, 1 - y_prd)],
        "auc-roc": [auc_roc],
        "auc-pr": [auc_pr],
    })

    return pd.DataFrame(record)


In [6]:
# Set model architecture
class Classifier(nn.Module):
    """Feed-forward binary classifier that outputs one raw logit per sample.

    Layer layout (kept in this exact order so saved ``state_dict`` keys match):
    ``BatchNorm1d -> Dropout(0.5) -> num_layers x [Linear -> GELU] -> Linear(., 1)``.

    Parameters
    ----------
    input_size : int
        Number of input features.
    num_layers : int
        Number of hidden ``Linear -> GELU`` stages. ``0`` gives a single linear
        output layer on top of the normalised input.
    unit_decrease : int
        Divisor applied to the hidden width before every hidden layer. The width
        starts from 1024 (so the first hidden layer has ``1024 // unit_decrease``
        units) and is never allowed to drop below 2.
    """

    def __init__(self, input_size, num_layers, unit_decrease):
        super().__init__()
        layers = [nn.BatchNorm1d(input_size), nn.Dropout(0.5)]
        in_dim = input_size
        out_dim = 1024
        for _ in range(num_layers):
            out_dim = max(2, out_dim // unit_decrease)
            hidden = nn.Linear(in_dim, out_dim)
            self.initialize_weights(hidden)
            layers.extend([hidden, nn.GELU()])
            in_dim = out_dim
        # Use in_dim rather than out_dim: they are equal after at least one
        # hidden layer, but with num_layers == 0 only in_dim matches the input.
        layers.append(nn.Linear(in_dim, 1))
        self.cls_block = nn.Sequential(*layers)

    def initialize_weights(self, layer):
        """Kaiming-normal (fan-in, linear gain) weights and zero bias for `layer`."""
        nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the logits produced by the layer stack for input batch `x`."""
        return self.cls_block(x)

#### Evaluation

In [7]:
# Load the test-set table of every embedding variant as (variant name, DataFrame) pairs
dfs = []
for e_ver in embed_ver:
    emb_frame = pd.read_csv(data_path + f"data-emb_gen-{e_ver}-ts.csv")
    dfs.append((e_ver, emb_frame))

In [8]:
def _predict_logits(model, loader):
    """Run `model` over every batch of `loader` and return the logits as one 1-D CPU tensor."""
    logits = []
    with torch.no_grad():
        for X_batch, _ in loader:
            # squeeze only the trailing unit axis: a bare .squeeze() would turn a
            # final batch of size 1 into a 0-d tensor, which torch.cat rejects
            logits.append(model(X_batch.to(device)).squeeze(-1).cpu())
    return torch.cat(logits)


# performance rows are collected here and concatenated once at the end
eval_frames = []

# every file_id that belongs to any test set, in ts_data order
all_ids = [i for ids in ts_data.values() for i in ids]

# evaluate every combination of two or more embedding variants
for r in range(2, len(dfs) + 1):
    for comb in itertools.combinations(dfs, r):
        comb_ver = "_".join(e_ver for e_ver, _ in comb)
        print(f"\n>>>> {comb_ver} <<<<")

        # merge dataset: join the variants on the info columns; clashing feature
        # column names from the right-hand table get the variant name as suffix
        data = comb[0][1]
        for e_ver, emb in comb[1:]:
            data = pd.merge(data, emb, on=col_str, suffixes=("", f"_{e_ver}"))

        display("Raw data:", data)

        # get test datasets
        loc_ts = {}
        data_ts = {}
        org_ts = {}
        for ts_ver, ids in ts_data.items():
            # get test sample locations
            loc_ts[ts_ver] = data['file_id'].isin(ids)
            # get test samples
            data_ts[ts_ver] = data[loc_ts[ts_ver]]
            # get test organism list (first organism name found for each file_id)
            org = []
            for i in ids:
                organ = data_ts[ts_ver].loc[data_ts[ts_ver]['file_id'] == i, 'organism']
                if len(organ) > 0:
                    org.append(organ.iloc[0])
            org_ts[ts_ver] = org
            print("Test dataset(" + ts_ver + "):", data_ts[ts_ver].shape)
        print("Test organism:", org_ts, len(org_ts))

        # split info.& inputs & labels of the test datasets
        info_ts = {}
        y_ts = {}
        test_loader = {}
        for ts_ver, df in data_ts.items():
            info_ts[ts_ver] = df[col_str]
            # features are every column after the leading info columns
            X_ts = torch.tensor(df.iloc[:, len(col_str):].astype('float32').values)
            y_ts[ts_ver] = torch.tensor(df['ess'].astype('float32').values)
            print("Splited test dataset(" + ts_ver + "):", X_ts.shape, y_ts[ts_ver].shape)
            # generate dataloader by the test datasets
            dataset_ts = TensorDataset(X_ts, y_ts[ts_ver])
            test_loader[ts_ver] = DataLoader(dataset_ts, batch_size=batch_size, shuffle=False)

        # get the total test dataset info. (union of all test sets, original row order)
        loc_ts_all = data['file_id'].isin(all_ids)
        data_ts_all = data.loc[loc_ts_all]
        info_ts_all = data_ts_all[col_str]
        X_ts_all = torch.tensor(data_ts_all.iloc[:, len(col_str):].astype('float32').values)
        y_ts_all = torch.tensor(data_ts_all['ess'].astype('float32').values)
        print("Splited test dataset(all):", X_ts_all.shape, y_ts_all.shape)

        # generate dataloader of total test dataset
        test_all_dataset = TensorDataset(X_ts_all, y_ts_all)
        test_all_loader = DataLoader(test_all_dataset, batch_size=batch_size, shuffle=False)

        ## Test model ##
        # set model name
        model_name = f"emb_gen-ensem-{comb_ver}"
        print(f"\n===== Test model: {model_name} ====")
        # generate model instance
        model = Classifier(
            input_size=X_ts_all.shape[-1],
            num_layers=layer_num,
            unit_decrease=unit_decrease
        ).to(device)

        # load model weight; weights_only=True restricts unpickling to tensors
        # and plain containers, which is all a state_dict needs
        state = torch.load(model_path + model_name + ".pt", map_location=device, weights_only=True)
        model.load_state_dict(state)
        model.eval()

        # evaluations by test dataset
        for ts_ver, ids in ts_data.items():
            # convert logits to confidences & classes
            prd_conf = torch.sigmoid(_predict_logits(model, test_loader[ts_ver]))
            prd_cls = (prd_conf >= 0.5).int()
            # performances by testset
            perform = record_perform(
                comb_ver=comb_ver,
                file_id="+".join(ids),
                organ="+".join(org_ts[ts_ver]),
                y_real=y_ts[ts_ver],
                y_conf=prd_conf,
                y_prd=prd_cls,
            )
            # label the row with its test set (second column, right after `comb`)
            perform.insert(1, "testset", ts_ver)
            display(perform)
            eval_frames.append(perform)

        # model evaluation on the total test dataset
        prd_conf = torch.sigmoid(_predict_logits(model, test_all_loader))
        prd_cls = (prd_conf >= 0.5).int()

        # performances on total testset
        perform = record_perform(
            comb_ver=comb_ver,
            file_id="+".join(all_ids),
            organ="+".join([org for orgs in org_ts.values() for org in orgs]),
            y_real=y_ts_all,
            y_conf=prd_conf,
            y_prd=prd_cls
        )
        perform.insert(1, "testset", "test_all")
        display(perform)
        eval_frames.append(perform)

        # concatenate the protein info. & predicted confidences
        df_prd = info_ts_all.assign(conf=prd_conf.numpy())

        # save the model prediction result
        df_prd.to_csv(result_path + f"prd-{model_name}.csv", index=False)

        gc.collect()

# save the model performance result
df_eval = pd.concat(eval_frames, ignore_index=True) if eval_frames else pd.DataFrame()
display("Model performance:", df_eval)
df_eval.to_csv("../result/eval-emb_gen-ensem.csv", index=False)


>>>> clstm_esm2 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,-0.155313,1.499200,-1.622623,1.140054,-0.579188,-0.201200,-0.181841,2.446096,1.391733,-0.146903
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,-0.441345,-0.216852,-0.173726,0.456722,-0.253636,0.488008,-0.237100,0.032918,-0.110359,0.293158
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.291001,-0.496596,-0.240145,0.931340,0.250903,0.500919,-0.206658,0.055057,0.349885,0.051546
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.079341,-0.345971,-0.169741,0.699338,-0.327057,0.669865,-0.155763,0.371676,-0.280873,-0.052240
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,0.157999,-0.030111,0.223549,1.058194,-0.627709,-0.035077,0.126186,0.007433,-0.104928,-0.069931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,-0.052086,-0.375514,-0.349227,1.061728,-0.046756,0.376358,0.299031,0.954350,1.110562,0.166870
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,-0.216774,-0.193204,-0.085630,0.603998,0.047384,0.273780,0.067045,-0.170716,0.290532,0.482953
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,-0.729781,-0.557699,-0.633800,0.923602,-1.082554,0.885507,-1.106922,-1.015008,-0.201546,-0.682305
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,0.835866,-0.616349,-0.738524,-0.103508,0.191622,0.856668,-0.864706,0.502746,0.409977,0.147629


Test dataset(data1): (4313, 2308)
Test dataset(data2): (4333, 2308)
Test dataset(data3): (458, 2308)
Test dataset(data4): (4825, 2308)
Test dataset(data5): (5474, 2308)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2304]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2304]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2304]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2304]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2304]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2304]) torch.Size([19403])

===== Test model: emb_gen-ensem-clstm_esm2 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data1,C018,Escherichia coli K-12 BW25113,249,371,3644,49,0.53722,0.90262,0.542484,0.401613,0.83557,0.986732,0.907597,0.920239,0.645256


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data2,C016,Escherichia coli K-12 MG1655,269,358,3677,29,0.585527,0.910685,0.581622,0.429027,0.902685,0.992175,0.911276,0.961634,0.741339


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data3,O046,synthetic bacterium JCVI-Syn3A,154,0,0,304,0.0,0.336245,0.503268,1.0,0.336245,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data4,C048,Bacteroides thetaiotaomicron VPI-5482,309,182,4318,16,0.754674,0.958964,0.757353,0.629328,0.950769,0.996308,0.959556,0.982608,0.877225


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,96,543,4818,17,0.331339,0.897698,0.255319,0.150235,0.849558,0.996484,0.898713,0.908792,0.110601


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1077,1454,16457,415,0.506815,0.903675,0.535421,0.425524,0.72185,0.975403,0.918821,0.897184,0.508143



>>>> clstm_bert <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_bert,1015_bert,1016_bert,1017_bert,1018_bert,1019_bert,1020_bert,1021_bert,1022_bert,1023_bert
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,-0.013456,-0.048067,-0.110174,-0.008105,0.018130,-0.105259,-0.008767,0.005277,0.042083,0.034597
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,0.007268,-0.119998,0.003547,-0.041535,-0.139431,0.001130,-0.025769,-0.031517,0.014989,-0.006627
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.001732,-0.167638,-0.040140,-0.074581,-0.179380,-0.021140,-0.023059,-0.046827,0.031335,-0.032964
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.032899,-0.134339,0.000210,-0.057801,-0.168538,-0.000947,-0.042856,-0.030568,0.011087,-0.040674
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,0.046949,-0.040834,-0.021078,-0.029033,-0.105917,0.012200,0.013685,-0.042306,-0.009481,-0.002596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,-0.005665,-0.128913,0.003729,-0.027293,-0.126274,-0.013166,-0.049804,-0.017025,-0.028457,-0.000095
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,0.056558,-0.047529,-0.022225,-0.060193,-0.078188,-0.036799,-0.024342,-0.014599,0.013637,-0.009223
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,0.051691,-0.034478,0.004822,0.016618,0.070800,0.009008,-0.033036,-0.031658,-0.041180,-0.027903
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,0.059217,-0.071523,0.001269,-0.009234,-0.043703,0.003266,-0.027528,-0.026730,-0.012166,0.000853


Test dataset(data1): (4313, 2052)
Test dataset(data2): (4333, 2052)
Test dataset(data3): (458, 2052)
Test dataset(data4): (4825, 2052)
Test dataset(data5): (5474, 2052)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2048]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2048]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2048]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2048]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2048]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2048]) torch.Size([19403])

===== Test model: emb_gen-ensem-clstm_bert ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,data1,C018,Escherichia coli K-12 BW25113,248,355,3660,50,0.54395,0.906098,0.550499,0.411277,0.832215,0.986523,0.911582,0.918207,0.625812


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,data2,C016,Escherichia coli K-12 MG1655,267,349,3686,31,0.58661,0.912301,0.584245,0.433442,0.895973,0.99166,0.913507,0.960102,0.703863


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,data3,O046,synthetic bacterium JCVI-Syn3A,158,0,0,300,0.0,0.344978,0.512987,1.0,0.344978,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,data4,C048,Bacteroides thetaiotaomicron VPI-5482,306,175,4325,19,0.755179,0.959793,0.759305,0.636175,0.941538,0.995626,0.961111,0.985062,0.851947


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,93,548,4813,20,0.318737,0.896237,0.246684,0.145086,0.823009,0.995862,0.89778,0.916399,0.113557


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1072,1427,16484,420,0.508101,0.904809,0.537209,0.428972,0.718499,0.975154,0.920328,0.894984,0.51308



>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 2052)
Test dataset(data2): (4333, 2052)
Test dataset(data3): (458, 2052)
Test dataset(data4): (4825, 2052)
Test dataset(data5): (5474, 2052)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2048]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2048]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2048]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2048]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2048]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2048]) torch.Size([19403])

===== Test model: emb_gen-ensem-clstm_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,data1,C018,Escherichia coli K-12 BW25113,251,364,3651,47,0.545167,0.904707,0.549836,0.40813,0.842282,0.98729,0.90934,0.923884,0.626895


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,data2,C016,Escherichia coli K-12 MG1655,271,353,3682,27,0.592426,0.912301,0.587852,0.434295,0.909396,0.99272,0.912515,0.960502,0.710257


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,data3,O046,synthetic bacterium JCVI-Syn3A,159,0,0,299,0.0,0.347162,0.515397,1.0,0.347162,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,305,171,4329,20,0.756859,0.960415,0.761548,0.640756,0.938462,0.995401,0.962,0.981895,0.845802


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,93,541,4820,20,0.32084,0.897516,0.248996,0.146688,0.823009,0.995868,0.899086,0.885047,0.102278


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1079,1429,16482,413,0.51096,0.905066,0.5395,0.430223,0.72319,0.975555,0.920217,0.915912,0.537121



>>>> esm2_bert <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_bert,1015_bert,1016_bert,1017_bert,1018_bert,1019_bert,1020_bert,1021_bert,1022_bert,1023_bert
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-0.025803,2.423989,-1.412047,-0.427782,2.635565,-0.527033,...,-0.013456,-0.048067,-0.110174,-0.008105,0.018130,-0.105259,-0.008767,0.005277,0.042083,0.034597
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,-0.668316,-0.463429,0.103198,-0.088451,0.004220,0.314487,...,0.007268,-0.119998,0.003547,-0.041535,-0.139431,0.001130,-0.025769,-0.031517,0.014989,-0.006627
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,-0.389958,-0.056299,0.404185,-0.130026,0.101402,0.399542,...,-0.001732,-0.167638,-0.040140,-0.074581,-0.179380,-0.021140,-0.023059,-0.046827,0.031335,-0.032964
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,-0.132498,-0.263630,0.180769,-0.201304,-0.002199,0.342121,...,0.032899,-0.134339,0.000210,-0.057801,-0.168538,-0.000947,-0.042856,-0.030568,0.011087,-0.040674
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,-0.286661,-0.111691,0.556191,-0.063293,-0.274769,0.401593,...,0.046949,-0.040834,-0.021078,-0.029033,-0.105917,0.012200,0.013685,-0.042306,-0.009481,-0.002596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,-0.013360,-0.466237,-0.574799,0.004326,-0.280262,0.498790,...,-0.005665,-0.128913,0.003729,-0.027293,-0.126274,-0.013166,-0.049804,-0.017025,-0.028457,-0.000095
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,-0.644403,-0.233777,0.404386,0.101492,-0.407517,0.367913,...,0.056558,-0.047529,-0.022225,-0.060193,-0.078188,-0.036799,-0.024342,-0.014599,0.013637,-0.009223
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,0.311929,-0.638465,0.265130,0.334715,1.060847,0.648867,...,0.051691,-0.034478,0.004822,0.016618,0.070800,0.009008,-0.033036,-0.031658,-0.041180,-0.027903
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,-0.985561,0.844931,-0.040269,-0.267429,-0.631806,-0.615640,...,0.059217,-0.071523,0.001269,-0.009234,-0.043703,0.003266,-0.027528,-0.026730,-0.012166,0.000853


Test dataset(data1): (4313, 2308)
Test dataset(data2): (4333, 2308)
Test dataset(data3): (458, 2308)
Test dataset(data4): (4825, 2308)
Test dataset(data5): (5474, 2308)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2304]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2304]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2304]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2304]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2304]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2304]) torch.Size([19403])

===== Test model: emb_gen-ensem-esm2_bert ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,data1,C018,Escherichia coli K-12 BW25113,269,388,3627,29,0.568886,0.903316,0.563351,0.409437,0.902685,0.992068,0.903362,0.947481,0.693988


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,data2,C016,Escherichia coli K-12 MG1655,291,384,3651,7,0.615037,0.909762,0.59815,0.431111,0.97651,0.998086,0.904833,0.979997,0.770799


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,data3,O046,synthetic bacterium JCVI-Syn3A,291,0,0,167,0.0,0.635371,0.777036,1.0,0.635371,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,data4,C048,Bacteroides thetaiotaomicron VPI-5482,273,430,4070,52,0.52887,0.900104,0.531128,0.388336,0.84,0.987385,0.904444,0.947126,0.68529


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,55,588,4773,58,0.166506,0.881988,0.145503,0.085537,0.486726,0.987994,0.890319,0.857149,0.078805


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1179,1790,16121,313,0.510846,0.891615,0.528581,0.397103,0.790214,0.980954,0.900061,0.913608,0.539222



>>>> esm2_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-0.025803,2.423989,-1.412047,-0.427782,2.635565,-0.527033,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,-0.668316,-0.463429,0.103198,-0.088451,0.004220,0.314487,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,-0.389958,-0.056299,0.404185,-0.130026,0.101402,0.399542,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,-0.132498,-0.263630,0.180769,-0.201304,-0.002199,0.342121,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,-0.286661,-0.111691,0.556191,-0.063293,-0.274769,0.401593,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,-0.013360,-0.466237,-0.574799,0.004326,-0.280262,0.498790,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,-0.644403,-0.233777,0.404386,0.101492,-0.407517,0.367913,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,0.311929,-0.638465,0.265130,0.334715,1.060847,0.648867,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,-0.985561,0.844931,-0.040269,-0.267429,-0.631806,-0.615640,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 2308)
Test dataset(data2): (4333, 2308)
Test dataset(data3): (458, 2308)
Test dataset(data4): (4825, 2308)
Test dataset(data5): (5474, 2308)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2304]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2304]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2304]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2304]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2304]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2304]) torch.Size([19403])

===== Test model: emb_gen-ensem-esm2_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,data1,C018,Escherichia coli K-12 BW25113,270,367,3648,28,0.582312,0.908416,0.57754,0.423862,0.90604,0.992383,0.908593,0.949852,0.748179


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,data2,C016,Escherichia coli K-12 MG1655,293,360,3675,5,0.632394,0.915763,0.616193,0.448698,0.983221,0.998641,0.910781,0.98296,0.845716


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,data3,O046,synthetic bacterium JCVI-Syn3A,344,0,0,114,0.0,0.751092,0.857855,1.0,0.751092,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,294,506,3994,31,0.533874,0.888705,0.522667,0.3675,0.904615,0.992298,0.887556,0.954962,0.686606


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,62,600,4761,51,0.190461,0.881074,0.16,0.093656,0.548673,0.989401,0.888081,0.855292,0.068928


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1263,1833,16078,229,0.541417,0.893728,0.550567,0.407946,0.846515,0.985957,0.897661,0.923488,0.590659



>>>> bert_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,0.042822,-0.126205,-0.050820,-0.129861,0.068868,-0.006040,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,-0.058692,-0.037950,-0.004752,0.014216,-0.023279,0.035774,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,-0.070625,-0.035552,0.038030,0.001899,-0.000246,0.004621,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,-0.037140,-0.036069,0.029161,0.002461,-0.011851,0.017077,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,-0.032631,-0.024123,-0.018862,0.039874,-0.006922,-0.055323,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19398,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,-0.036090,-0.011721,-0.002769,0.041671,0.005540,0.003676,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19399,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,-0.023218,-0.008702,0.014693,0.022481,0.036286,0.010613,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19400,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-0.009309,0.004102,0.003444,0.031261,0.026689,0.051066,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19401,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.007387,-0.008439,0.020698,0.009457,0.050447,-0.007523,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 2052)
Test dataset(data2): (4333, 2052)
Test dataset(data3): (458, 2052)
Test dataset(data4): (4825, 2052)
Test dataset(data5): (5474, 2052)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 2048]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4333, 2048]) torch.Size([4333])
Splited test dataset(data3): torch.Size([458, 2048]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 2048]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 2048]) torch.Size([5474])
Splited test dataset(all): torch.Size([19403, 2048]) torch.Size([19403])

===== Test model: emb_gen-ensem-bert_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,data1,C018,Escherichia coli K-12 BW25113,266,320,3695,32,0.601684,0.918386,0.60181,0.453925,0.892617,0.991414,0.920299,0.943431,0.699757


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,data2,C016,Escherichia coli K-12 MG1655,288,326,3709,10,0.64268,0.922456,0.631579,0.469055,0.966443,0.997311,0.919207,0.981344,0.819989


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,data3,O046,synthetic bacterium JCVI-Syn3A,293,0,0,165,0.0,0.639738,0.780293,1.0,0.639738,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,271,455,4045,54,0.513674,0.894508,0.515699,0.373278,0.833846,0.986826,0.898889,0.941801,0.668267


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,49,567,4794,64,0.147515,0.884728,0.134431,0.079545,0.433628,0.986826,0.894236,0.834219,0.062125


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,bert_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1167,1668,16243,325,0.519733,0.897284,0.539404,0.41164,0.782172,0.980384,0.906873,0.910317,0.533294



>>>> clstm_esm2_bert <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_bert,1015_bert,1016_bert,1017_bert,1018_bert,1019_bert,1020_bert,1021_bert,1022_bert,1023_bert
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,-0.013456,-0.048067,-0.110174,-0.008105,0.018130,-0.105259,-0.008767,0.005277,0.042083,0.034597
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,0.007268,-0.119998,0.003547,-0.041535,-0.139431,0.001130,-0.025769,-0.031517,0.014989,-0.006627
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.001732,-0.167638,-0.040140,-0.074581,-0.179380,-0.021140,-0.023059,-0.046827,0.031335,-0.032964
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.032899,-0.134339,0.000210,-0.057801,-0.168538,-0.000947,-0.042856,-0.030568,0.011087,-0.040674
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,0.046949,-0.040834,-0.021078,-0.029033,-0.105917,0.012200,0.013685,-0.042306,-0.009481,-0.002596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19444,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,-0.005665,-0.128913,0.003729,-0.027293,-0.126274,-0.013166,-0.049804,-0.017025,-0.028457,-0.000095
19445,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,0.056558,-0.047529,-0.022225,-0.060193,-0.078188,-0.036799,-0.024342,-0.014599,0.013637,-0.009223
19446,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,0.051691,-0.034478,0.004822,0.016618,0.070800,0.009008,-0.033036,-0.031658,-0.041180,-0.027903
19447,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,0.059217,-0.071523,0.001269,-0.009234,-0.043703,0.003266,-0.027528,-0.026730,-0.012166,0.000853


Test dataset(data1): (4313, 3332)
Test dataset(data2): (4379, 3332)
Test dataset(data3): (458, 3332)
Test dataset(data4): (4825, 3332)
Test dataset(data5): (5474, 3332)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 3328]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4379, 3328]) torch.Size([4379])
Splited test dataset(data3): torch.Size([458, 3328]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 3328]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 3328]) torch.Size([5474])
Splited test dataset(all): torch.Size([19449, 3328]) torch.Size([19449])

===== Test model: emb_gen-ensem-clstm_esm2_bert ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,data1,C018,Escherichia coli K-12 BW25113,252,349,3666,46,0.555632,0.908416,0.560623,0.419301,0.845638,0.987608,0.913076,0.92395,0.660729


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,data2,C016,Escherichia coli K-12 MG1655,289,339,3720,31,0.608607,0.915506,0.609705,0.460191,0.903125,0.991736,0.916482,0.964183,0.755635


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,data3,O046,synthetic bacterium JCVI-Syn3A,162,0,0,296,0.0,0.353712,0.522581,1.0,0.353712,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,data4,C048,Bacteroides thetaiotaomicron VPI-5482,311,207,4293,14,0.737526,0.954197,0.737841,0.600386,0.956923,0.996749,0.954,0.987794,0.876821


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,91,542,4819,22,0.313108,0.896967,0.243968,0.14376,0.80531,0.995455,0.898899,0.907863,0.108841


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1105,1437,16498,409,0.516448,0.905085,0.544872,0.434697,0.729855,0.975809,0.919877,0.911744,0.548982



>>>> clstm_esm2_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19444,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19445,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19446,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19447,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 3332)
Test dataset(data2): (4379, 3332)
Test dataset(data3): (458, 3332)
Test dataset(data4): (4825, 3332)
Test dataset(data5): (5474, 3332)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 3328]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4379, 3328]) torch.Size([4379])
Splited test dataset(data3): torch.Size([458, 3328]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 3328]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 3328]) torch.Size([5474])
Splited test dataset(all): torch.Size([19449, 3328]) torch.Size([19449])

===== Test model: emb_gen-ensem-clstm_esm2_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,data1,C018,Escherichia coli K-12 BW25113,253,302,3713,45,0.586058,0.919546,0.5932,0.455856,0.848993,0.988026,0.924782,0.931773,0.649498


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,data2,C016,Escherichia coli K-12 MG1655,290,291,3768,30,0.640288,0.926696,0.643729,0.499139,0.90625,0.992101,0.928307,0.968357,0.766234


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,data3,O046,synthetic bacterium JCVI-Syn3A,162,0,0,296,0.0,0.353712,0.522581,1.0,0.353712,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,319,165,4335,6,0.788321,0.96456,0.788628,0.659091,0.981538,0.998618,0.963333,0.991974,0.88557


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,90,538,4823,23,0.310575,0.897516,0.242915,0.143312,0.79646,0.995254,0.899646,0.922761,0.109971


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1114,1296,16639,400,0.539572,0.912798,0.567788,0.462241,0.735799,0.976524,0.927739,0.913241,0.573143



>>>> clstm_bert_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19444,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19445,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19446,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19447,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 3076)
Test dataset(data2): (4379, 3076)
Test dataset(data3): (458, 3076)
Test dataset(data4): (4825, 3076)
Test dataset(data5): (5474, 3076)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 3072]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4379, 3072]) torch.Size([4379])
Splited test dataset(data3): torch.Size([458, 3072]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 3072]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 3072]) torch.Size([5474])
Splited test dataset(all): torch.Size([19449, 3072]) torch.Size([19449])

===== Test model: emb_gen-ensem-clstm_bert_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,data1,C018,Escherichia coli K-12 BW25113,254,372,3643,44,0.546975,0.903547,0.549784,0.405751,0.852349,0.988066,0.907347,0.92614,0.614981


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,data2,C016,Escherichia coli K-12 MG1655,297,363,3696,23,0.610103,0.911852,0.606122,0.45,0.928125,0.993816,0.910569,0.963674,0.720518


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,data3,O046,synthetic bacterium JCVI-Syn3A,164,0,0,294,0.0,0.358079,0.527331,1.0,0.358079,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,312,181,4319,13,0.761138,0.959793,0.762836,0.63286,0.96,0.996999,0.959778,0.987746,0.863702


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,95,554,4807,18,0.324322,0.895506,0.249344,0.146379,0.840708,0.996269,0.896661,0.922569,0.113046


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_bert_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1122,1470,16465,392,0.519602,0.904262,0.546517,0.43287,0.741083,0.976746,0.918037,0.91299,0.552671



>>>> esm2_bert_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-0.025803,2.423989,-1.412047,-0.427782,2.635565,-0.527033,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,-0.668316,-0.463429,0.103198,-0.088451,0.004220,0.314487,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,-0.389958,-0.056299,0.404185,-0.130026,0.101402,0.399542,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,-0.132498,-0.263630,0.180769,-0.201304,-0.002199,0.342121,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,-0.286661,-0.111691,0.556191,-0.063293,-0.274769,0.401593,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19444,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,-0.013360,-0.466237,-0.574799,0.004326,-0.280262,0.498790,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19445,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,-0.644403,-0.233777,0.404386,0.101492,-0.407517,0.367913,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19446,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,0.311929,-0.638465,0.265130,0.334715,1.060847,0.648867,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19447,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,-0.985561,0.844931,-0.040269,-0.267429,-0.631806,-0.615640,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 3332)
Test dataset(data2): (4379, 3332)
Test dataset(data3): (458, 3332)
Test dataset(data4): (4825, 3332)
Test dataset(data5): (5474, 3332)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 3328]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4379, 3328]) torch.Size([4379])
Splited test dataset(data3): torch.Size([458, 3328]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 3328]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 3328]) torch.Size([5474])
Splited test dataset(all): torch.Size([19449, 3328]) torch.Size([19449])

===== Test model: emb_gen-ensem-esm2_bert_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,data1,C018,Escherichia coli K-12 BW25113,264,300,3715,34,0.610203,0.92256,0.612529,0.468085,0.885906,0.990931,0.92528,0.946208,0.667158


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,data2,C016,Escherichia coli K-12 MG1655,306,327,3732,14,0.648109,0.922128,0.642183,0.483412,0.95625,0.996263,0.919438,0.978719,0.744425


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,data3,O046,synthetic bacterium JCVI-Syn3A,314,0,0,144,0.0,0.68559,0.813472,1.0,0.68559,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,296,404,4096,29,0.584286,0.910259,0.577561,0.422857,0.910769,0.99297,0.910222,0.960721,0.734123


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,62,582,4779,51,0.194226,0.884362,0.163804,0.096273,0.548673,0.989441,0.891438,0.843767,0.067567


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,esm2_bert_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1242,1613,16322,272,0.552968,0.90308,0.568551,0.435026,0.820343,0.983609,0.910064,0.924257,0.568979



>>>> clstm_esm2_bert_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,0,1,2,3,4,5,...,1014_t5,1015_t5,1016_t5,1017_t5,1018_t5,1019_t5,1020_t5,1021_t5,1022_t5,1023_t5
0,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0001,0,-1.271414,0.000099,0.076290,-0.000540,-0.002149,-0.002770,...,0.027051,0.021510,0.231686,-0.173741,0.079085,-0.236435,-0.114395,0.054386,-0.030780,-0.075774
1,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0002,0,0.080849,0.000191,0.066948,-0.001360,-0.001139,-0.002485,...,-0.023108,-0.003255,-0.010074,-0.010509,0.021057,-0.041092,-0.026289,0.020529,-0.029514,0.032310
2,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0003,0,0.049742,0.000037,0.063013,-0.003911,-0.002671,-0.000340,...,-0.021937,-0.035736,0.000474,-0.025479,0.003724,-0.044224,-0.078591,-0.024167,-0.032960,-0.003994
3,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0004,0,0.438240,0.000130,0.061992,-0.002766,-0.001883,-0.001800,...,0.005614,-0.032511,-0.037446,-0.013102,0.030071,0.009861,-0.022703,-0.002080,-0.011043,0.036730
4,C050,Salmonella enterica subsp. enterica serovar Ty...,STM14_0005,0,0.647014,0.000091,0.058150,-0.003637,-0.002623,-0.002049,...,-0.020996,0.023913,-0.012189,-0.067514,0.045659,0.010079,-0.014732,-0.013761,-0.029590,0.063286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19554,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0913,1,0.244533,0.000047,0.060919,-0.003986,-0.002723,-0.000765,...,0.013594,0.028046,0.010028,-0.060973,0.093713,-0.003019,-0.053488,-0.045007,-0.043997,-0.002393
19555,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0918,1,0.446268,0.000114,0.060191,-0.002962,-0.002494,-0.002787,...,-0.048624,0.031121,0.011467,-0.048133,-0.033131,-0.060800,-0.055844,-0.098644,-0.041724,0.014361
19556,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0930,1,-1.313249,0.000093,0.077817,-0.000610,-0.002145,-0.002522,...,-0.008057,0.051703,0.032014,-0.148301,0.106521,-0.029680,-0.032345,-0.021705,0.096927,0.099435
19557,O046,synthetic bacterium JCVI-Syn3A,JCVISYN3A_0931,1,0.339057,0.000057,0.060354,-0.003875,-0.002801,-0.001344,...,-0.037994,-0.018542,-0.014736,-0.102566,0.023649,-0.022915,-0.034404,-0.085372,0.044055,0.009911


Test dataset(data1): (4313, 4356)
Test dataset(data2): (4489, 4356)
Test dataset(data3): (458, 4356)
Test dataset(data4): (4825, 4356)
Test dataset(data5): (5474, 4356)
Test organism: {'data1': ['Escherichia coli K-12 BW25113'], 'data2': ['Escherichia coli K-12 MG1655'], 'data3': ['synthetic bacterium JCVI-Syn3A'], 'data4': ['Bacteroides thetaiotaomicron VPI-5482'], 'data5': ['Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S']} 5
Splited test dataset(data1): torch.Size([4313, 4352]) torch.Size([4313])
Splited test dataset(data2): torch.Size([4489, 4352]) torch.Size([4489])
Splited test dataset(data3): torch.Size([458, 4352]) torch.Size([458])
Splited test dataset(data4): torch.Size([4825, 4352]) torch.Size([4825])
Splited test dataset(data5): torch.Size([5474, 4352]) torch.Size([5474])
Splited test dataset(all): torch.Size([19559, 4352]) torch.Size([19559])

===== Test model: emb_gen-ensem-clstm_esm2_bert_t5 ====


  model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,data1,C018,Escherichia coli K-12 BW25113,249,372,3643,49,0.53668,0.902388,0.541893,0.400966,0.83557,0.986728,0.907347,0.92391,0.653797


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,data2,C016,Escherichia coli K-12 MG1655,355,357,3750,27,0.643422,0.914458,0.648995,0.498596,0.929319,0.992851,0.913075,0.964272,0.773961


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,data3,O046,synthetic bacterium JCVI-Syn3A,184,0,0,274,0.0,0.401747,0.573209,1.0,0.401747,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,309,210,4290,16,0.731383,0.953161,0.732227,0.595376,0.950769,0.996284,0.953333,0.979275,0.838159


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,95,548,4813,18,0.326123,0.896602,0.251323,0.147745,0.840708,0.996274,0.89778,0.883667,0.104937


Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2_bert_t5,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1192,1487,16496,384,0.533304,0.904341,0.560282,0.444942,0.756345,0.977251,0.917311,0.915194,0.552855


'Model performance:'

Unnamed: 0,comb,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_esm2,data1,C018,Escherichia coli K-12 BW25113,249,371,3644,49,0.537220,0.902620,0.542484,0.401613,0.835570,0.986732,0.907597,0.920239,0.645256
1,clstm_esm2,data2,C016,Escherichia coli K-12 MG1655,269,358,3677,29,0.585527,0.910685,0.581622,0.429027,0.902685,0.992175,0.911276,0.961634,0.741339
2,clstm_esm2,data3,O046,synthetic bacterium JCVI-Syn3A,154,0,0,304,0.000000,0.336245,0.503268,1.000000,0.336245,0.000000,0.000000,,
3,clstm_esm2,data4,C048,Bacteroides thetaiotaomicron VPI-5482,309,182,4318,16,0.754674,0.958964,0.757353,0.629328,0.950769,0.996308,0.959556,0.982608,0.877225
4,clstm_esm2,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,96,543,4818,17,0.331339,0.897698,0.255319,0.150235,0.849558,0.996484,0.898713,0.908792,0.110601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,clstm_esm2_bert_t5,data2,C016,Escherichia coli K-12 MG1655,355,357,3750,27,0.643422,0.914458,0.648995,0.498596,0.929319,0.992851,0.913075,0.964272,0.773961
62,clstm_esm2_bert_t5,data3,O046,synthetic bacterium JCVI-Syn3A,184,0,0,274,0.000000,0.401747,0.573209,1.000000,0.401747,0.000000,0.000000,,
63,clstm_esm2_bert_t5,data4,C048,Bacteroides thetaiotaomicron VPI-5482,309,210,4290,16,0.731383,0.953161,0.732227,0.595376,0.950769,0.996284,0.953333,0.979275,0.838159
64,clstm_esm2_bert_t5,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,95,548,4813,18,0.326123,0.896602,0.251323,0.147745,0.840708,0.996274,0.897780,0.883667,0.104937
