#### Set environments

In [1]:
import torch, os, gc
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score,\
f1_score, precision_score, recall_score, roc_auc_score, average_precision_score

In [2]:
# ---- Run configuration ----
# Model variants to evaluate; each name is used to build the checkpoint file name.
embed_ver = ["cnn", "lstm", "clstm"]

# Input / output locations (relative to the notebook directory).
data_path = "../data/"
model_path = "../model/seq_gen/"
result_path = "../result/prd-seq_gen/"
os.makedirs(result_path, exist_ok=True)  # prediction CSVs are written here

# Data preprocessing: non-sequence (metadata / label) columns of the dataset.
col_str = [
    'file_id',
    'organism',
    'locus_tag',
    'ess',
]

# Model / loader options.
layer_num, max_len, batch_size = 2, 1600, 256

In [3]:
# Test sets: name -> list of file ids belonging to that set.
ts_data = dict(
    data1=["C018"],  # Escherichia coli K-12 BW25113
    data2=["C016"],  # Escherichia coli K-12 MG1655
    data3=["O046"],  # synthetic bacterium JCVI-Syn3A
    data4=["C048"],  # Bacteroides thetaiotaomicron VPI-5482
    data5=["C050"],  # Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S
)

In [4]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Define function to record performance result
def record_perform(emb_ver, file_id, organ, y_real, y_conf, y_prd):
    """Build a one-row DataFrame of binary-classification metrics.

    Parameters
    ----------
    emb_ver : value stored in the "embed" column (model variant name).
    file_id : value stored in the "file" column.
    organ : value stored in the "organism" column.
    y_real : torch tensor of true 0/1 labels.
    y_conf : torch tensor of predicted confidences (used for the AUC metrics).
    y_prd : torch tensor of predicted 0/1 classes.

    Returns
    -------
    pandas.DataFrame with a single row holding the confusion-matrix counts,
    MCC, accuracy, F1, precision, recall, NPV, TNR, AUC-ROC and AUC-PR.
    AUC-ROC / AUC-PR are NaN when ``y_real`` contains a single class.
    """
    y_real = y_real.cpu().numpy()
    y_conf = y_conf.cpu().numpy()
    y_prd = y_prd.cpu().numpy()

    # The ranking metrics need both classes among the true labels. Checking the
    # labels themselves (instead of a hard-coded file id) covers any
    # single-class test set; NaN keeps the columns numeric.
    if np.unique(y_real).size > 1:
        auc_roc = roc_auc_score(y_real, y_conf)
        auc_pr = average_precision_score(y_real, y_conf)
    else:
        auc_roc = np.nan
        auc_pr = np.nan

    # labels=[0, 1] forces a 2x2 matrix even if only one class occurs in both
    # y_real and y_prd; otherwise the 4-way unpacking below would fail.
    tn, fp, fn, tp = confusion_matrix(y_real, y_prd, labels=[0, 1]).ravel()

    result = pd.DataFrame({
        "embed": [emb_ver],
        "file": [file_id],
        "organism": [organ],
        "tp": [tp],
        "fp": [fp],
        "tn": [tn],
        "fn": [fn],
        "mcc": [matthews_corrcoef(y_real, y_prd)],
        "acc": [accuracy_score(y_real, y_prd)],
        "f1": [f1_score(y_real, y_prd)],
        "prc": [precision_score(y_real, y_prd)],
        "rec": [recall_score(y_real, y_prd)],
        # NPV / TNR are precision / recall computed on the flipped labels
        "npv": [precision_score(1 - y_real, 1 - y_prd)],
        "tnr": [recall_score(1 - y_real, 1 - y_prd)],
        "auc-roc": [auc_roc],
        "auc-pr": [auc_pr]
    })

    return result


In [6]:
# Set model architecture
class ClassifierCNN(nn.Module):
    """Convolutional sequence classifier producing one logit per input sequence.

    Token ids are embedded (16 dims, index 0 is padding), passed through
    ``num_layer`` stages of Conv1d(kernel 8, 1024 channels) -> GELU ->
    adaptive max pooling, pooled once more to ``pool_len`` steps, averaged
    over the length axis, batch-normalised, regularised with dropout and
    projected to a single output.
    """

    def __init__(self, num_layer, pool_len, vocab_size, max_len):
        super().__init__()
        emb_dim, out_dim = 16, 1024
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim, padding_idx=0)

        stages = []
        channels = emb_dim
        for depth in range(1, num_layer + 1):
            conv = nn.Conv1d(channels, out_dim, kernel_size=8)
            self.initialize_weights(conv)
            # pooled length shrinks with depth but never drops below pool_len
            target_len = max(pool_len, max_len // (8 * depth))
            stages += [conv, nn.GELU(), nn.AdaptiveMaxPool1d(target_len)]
            channels = out_dim
        stages.append(nn.AdaptiveMaxPool1d(pool_len))
        self.emb_block = nn.Sequential(*stages)

        self.bn = nn.BatchNorm1d(out_dim)
        self.do = nn.Dropout(0.5)
        self.fc = nn.Linear(out_dim, 1)

    def initialize_weights(self, layer):
        """Kaiming-normal init (fan_in, linear gain) for the weight; zero the bias if present."""
        nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')
        bias = layer.bias
        if bias is None:
            return
        nn.init.zeros_(bias)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to a (batch,) tensor of logits."""
        # embedding: (batch, length, emb) -> (batch, emb, length) for Conv1d
        feats = self.embedding(x).transpose(1, 2)
        feats = self.emb_block(feats)
        # plain average over the pooled length axis (padding is not masked out)
        pooled = self.bn(feats.mean(dim=2))
        # classification
        logits = self.fc(self.do(pooled))
        return logits.squeeze(1)

In [7]:
# Set model architecture
class ClassifierLSTM(nn.Module):
    """Bidirectional-LSTM sequence classifier producing one logit per input sequence.

    Each input row is cropped to a window of at most ``input_len`` tokens taken
    from the middle of its non-padding tokens, embedded (16 dims, index 0 is
    padding) and fed to a bidirectional LSTM. The last layer's final forward and
    backward hidden states are concatenated (1024 dims), batch-normalised,
    regularised with dropout and projected to a single output.
    """

    def __init__(self, num_layer, input_len, vocab_size):
        super(ClassifierLSTM, self).__init__()
        self.input_len = input_len
        
        emb_dim = 16
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim, padding_idx=0)
        
        out_dim = 1024
        self.lstm = nn.LSTM(emb_dim, out_dim // 2, num_layers=num_layer, bidirectional=True, batch_first=True)
        
        self.bn = nn.BatchNorm1d(out_dim)
        self.do = nn.Dropout(0.5)
        self.fc = nn.Linear(out_dim, 1)
    
    def initialize_weights(self, layer):
        """Kaiming-normal init (fan_in, linear gain) for the weight; zero the bias if present."""
        nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to a (batch,) tensor of logits."""
        # extract valid input sequences: centre a window of input_len tokens on
        # the non-padding part of every row (start 0 when the row is shorter)
        val_lens = (x != 0).sum(dim=1)
        start_idx = torch.clamp((val_lens - self.input_len) // 2, min=0)
        # one batched gather instead of a per-row Python slicing loop; the window
        # cannot be wider than the input itself
        win = min(self.input_len, x.size(1))
        offsets = torch.arange(win, device=x.device)
        x = x.gather(1, start_idx.unsqueeze(1) + offsets)
        # embedding
        emb = self.embedding(x)
        # pack_padded_sequence rejects zero lengths, so an all-padding row is
        # treated as a single (padding) step instead of crashing the batch
        val_lens = (x != 0).sum(dim=1).clamp(min=1).cpu()
        emb = pack_padded_sequence(emb, val_lens, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(emb)
        # (layers * 2, batch, hidden) -> (layers, direction, batch, hidden)
        hidden = hidden.view(self.lstm.num_layers, 2, x.size(0), self.lstm.hidden_size)
        emb = torch.cat((hidden[-1, 0, :, :], hidden[-1, 1, :, :]), dim=1)
        emb = self.bn(emb)
        # classification
        x = self.do(emb)
        x = self.fc(x)
        return x.squeeze(1)

In [8]:
# Set model architecture
class ClassifierCNN_LSTM(nn.Module):
    """Conv1d + bidirectional-LSTM sequence classifier producing one logit per sequence.

    Token ids are embedded (16 dims, index 0 is padding), convolved
    (kernel 8, 1024 channels) with GELU, average-pooled to ``pool_len`` steps
    and fed to a single-layer bidirectional LSTM. The final forward and backward
    hidden states are concatenated (1024 dims), batch-normalised, regularised
    with dropout and projected to a single output.
    """

    def __init__(self, pool_len, vocab_size):
        super(ClassifierCNN_LSTM, self).__init__()
        emb_dim = 16        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim, padding_idx=0)

        out_dim = 1024
        self.conv1d = nn.Conv1d(emb_dim, out_dim, kernel_size=8)
        self.initialize_weights(self.conv1d)
        self.pool = nn.AdaptiveAvgPool1d(pool_len)
        self.lstm = nn.LSTM(out_dim, out_dim // 2, bidirectional=True, batch_first=True)
        
        self.bn = nn.BatchNorm1d(out_dim)
        self.do = nn.Dropout(0.5)
        self.fc = nn.Linear(out_dim, 1) 
    
    def initialize_weights(self, layer):
        """Kaiming-normal init (fan_in, linear gain) for the weight; zero the bias if present."""
        nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to a (batch,) tensor of logits."""
        # embedding
        emb = self.embedding(x)
        emb = emb.permute(0, 2, 1)
        emb = F.gelu(self.conv1d(emb))
        emb = self.pool(emb)
        emb = emb.permute(0, 2, 1)
        # mark valid feature steps: a step counts as valid when any of its
        # features is non-zero. pack_padded_sequence rejects zero lengths, so a
        # row whose pooled features are all zero is given length 1 instead of
        # crashing the batch.
        val_lens = (emb != 0).any(dim=2).sum(dim=1).clamp(min=1).cpu()
        emb = pack_padded_sequence(emb, val_lens, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(emb)
        # final hidden states of the two directions of the (single) LSTM layer
        emb = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        emb = self.bn(emb)
        # classification
        x = self.do(emb)
        x = self.fc(x)
        return x.squeeze(1)

#### Prepare data

In [None]:
# Load the raw test-set sequences (plain string: no f-string placeholders are needed)
df = pd.read_csv(data_path + "data-seq_raw-ts.csv")
display(df)

In [None]:
# Vocabulary: the 20 residue letters listed below map to 1..20 in that order;
# index 0 is reserved for padding.
valid_aa = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_int = dict(zip(valid_aa, range(1, len(valid_aa) + 1)))

print(aa_to_int)

In [None]:
# Strip every character that is not one of the residues in valid_aa.
invalid_residue_pattern = f"[^{valid_aa}]"
df['aa_seq'] = df['aa_seq'].str.replace(invalid_residue_pattern, "", regex=True)

# Alternative (not used): drop whole sequences that contain an invalid residue
# instead of stripping the offending characters, then reset the index.

print(df.shape)

In [None]:
# Keep at most the first max_len residues of every sequence.
seq = df['aa_seq'].str.slice(stop=max_len).tolist()

print(len(seq))
print(seq[:2])

In [None]:
# Integer encoding of a residue string
def int_encode_seq(seq, aa_to_int):
    """Return the list of integer codes obtained by looking up each item of ``seq`` in ``aa_to_int``.

    Raises KeyError when an item has no entry in ``aa_to_int``.
    """
    return list(map(aa_to_int.__getitem__, seq))

# Integer-encode every (truncated) sequence
encoded_seq = list(map(lambda residues: int_encode_seq(residues, aa_to_int), seq))

print(len(encoded_seq))
print(encoded_seq[:2])

In [None]:
# Set padding function
def pad_seq(seq, max_len):
    """Return ``seq`` as a list of exactly ``max_len`` items.

    Shorter sequences are right-padded with 0 (the padding index); longer ones
    are cut to their first ``max_len`` items so every result has the same
    length and can be stacked into a rectangular array. The input is not
    modified.
    """
    clipped = list(seq[:max_len])
    return clipped + [0] * (max_len - len(clipped))

# Right-pad every encoded sequence to max_len and stack the rows into one array
padded_rows = [pad_seq(row, max_len) for row in encoded_seq]
encoded_seq = np.array(padded_rows)

print(encoded_seq.shape)
print(encoded_seq[:2])

In [None]:
# Metadata / label columns side by side with the encoded residues
# (one integer-labelled column per sequence position).
encoded_df = pd.DataFrame(encoded_seq)
data = pd.concat([df[col_str], encoded_df], axis=1)

display(data)

In [16]:
# Every column that is not a metadata / label column holds an encoded position.
col_num = list(filter(lambda col: col not in col_str, data.columns))

In [None]:
# Build, per test set: a boolean row mask, the selected rows, and the organism names
loc_ts, data_ts, org_ts = {}, {}, {}
for ts_ver, ids in ts_data.items():
    # rows whose file id belongs to this test set
    mask = data['file_id'].isin(ids)
    subset = data[mask]
    loc_ts[ts_ver] = mask
    data_ts[ts_ver] = subset
    # first organism name found for each file id that has at least one row
    org_ts[ts_ver] = [
        names.iloc[0]
        for names in (subset.loc[subset['file_id'] == i, 'organism'] for i in ids)
        if len(names) > 0
    ]

    print("Test dataset(" + ts_ver + "):", data_ts[ts_ver].shape)
print("Test organism:", org_ts, len(org_ts))

In [None]:
# split info. & inputs & labels of the test datasets
info_ts = {}
y_ts = {}
test_loader = {}
# NOTE: the loop variable is deliberately not called `df`, so the raw DataFrame
# loaded earlier is not overwritten and earlier cells stay re-runnable.
for ts_ver, df_ts in data_ts.items():
    info_ts[ts_ver] = df_ts[col_str]
    X_ts = torch.tensor(df_ts[col_num].astype('long').values)
    y_ts[ts_ver] = torch.tensor(df_ts['ess'].astype('long').values)
    print("Splited test dataset(" + ts_ver + "):", X_ts.shape, y_ts[ts_ver].shape)
    # generate dataloader by the test datasets; order is kept (shuffle=False)
    # and the batch size comes from the options cell instead of a literal
    dataset_ts = TensorDataset(X_ts, y_ts[ts_ver])
    test_loader[ts_ver] = DataLoader(dataset_ts, batch_size=batch_size, shuffle=False)

In [None]:
# Combined test set: a row is selected when it belongs to at least one test set
loc_ts_all = [any(flags) for flags in zip(*loc_ts.values())]
data_ts_all = data.loc[loc_ts_all]
info_ts_all = data_ts_all[col_str]
X_ts_all = torch.tensor(data_ts_all[col_num].astype('long').values)
y_ts_all = torch.tensor(data_ts_all['ess'].astype('long').values)

print("Splited test dataset(all):", X_ts_all.shape, y_ts_all.shape)

In [20]:
# generate dataloader of total test dataset; order is kept (shuffle=False) and
# the batch size comes from the options cell instead of a literal
test_all_dataset = TensorDataset(X_ts_all, y_ts_all)
test_all_loader = DataLoader(test_all_dataset, batch_size=batch_size, shuffle=False)

#### Evaluate model

In [21]:
def _predict_logits(model, loader):
    """Run ``model`` over ``loader`` without gradients.

    Returns a pair of 1-D CPU tensors, both in loader order: the concatenated
    model outputs and the concatenated labels.
    """
    logits, labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            out = model(X_batch.to(device))
            # reshape(-1) keeps one dimension even when the last batch holds a
            # single sample; a bare .squeeze() would produce a 0-dim tensor
            # that torch.cat cannot concatenate
            logits.append(out.reshape(-1).cpu())
            labels.append(y_batch.cpu())
    return torch.cat(logits), torch.cat(labels)


# one-row metric frames; concatenated once after the loop instead of growing a
# DataFrame inside it
eval_rows = []

for ver in embed_ver:
    #### Evaluate model ####
    # set model name
    model_name = f"seq_gen-{ver}"
    print(f"\n===== Test model: {model_name} ====")

    vocab_size = len(aa_to_int) + 1  # +1 for the padding index 0
    if ver == 'cnn':
        model = ClassifierCNN(num_layer=layer_num, pool_len=25, vocab_size=vocab_size, max_len=max_len)
    elif ver == 'lstm':
        model = ClassifierLSTM(num_layer=layer_num, input_len=100, vocab_size=vocab_size)
    else:
        # any other name is evaluated as the CNN + LSTM variant
        model = ClassifierCNN_LSTM(pool_len=25, vocab_size=vocab_size)
    model = model.to(device)
    model.load_state_dict(torch.load(model_path + model_name + ".pt", map_location=device))

    model.eval()

    ## model evaluations by test dataset ##
    for ts_ver, ids in ts_data.items():
        all_preds, all_labels = _predict_logits(model, test_loader[ts_ver])

        # convert logits to confidences & classes
        prd_conf = torch.sigmoid(all_preds)
        prd_cls = (prd_conf >= 0.5).int()
        # performances by testset (labels come from the same loader pass as the
        # predictions, so the two are aligned by construction)
        perform = record_perform(
            emb_ver=ver,
            file_id="+".join(ids),
            organ="+".join(org_ts[ts_ver]),
            y_real=all_labels,
            y_conf=prd_conf,
            y_prd=prd_cls,
        )
        display(perform)
        eval_rows.append(perform)

    ## model evaluation on the total test dataset ##
    all_preds, all_labels = _predict_logits(model, test_all_loader)

    # convert logits to confidences & classes
    prd_conf = torch.sigmoid(all_preds)
    prd_cls = (prd_conf >= 0.5).int()

    # performances on total testset
    perform = record_perform(
        emb_ver=ver,
        file_id="+".join([i for ids in ts_data.values() for i in ids]),
        organ="+".join([org for orgs in org_ts.values() for org in orgs]),
        y_real=all_labels,
        y_conf=prd_conf,
        y_prd=prd_cls
    )
    display(perform)
    eval_rows.append(perform)

    # concatenate the protein info. & predicted confidences
    df_prd = pd.DataFrame({"conf": prd_conf.numpy()}, index=info_ts_all.index)
    df_prd = pd.concat([info_ts_all, df_prd], axis=1)

    # save the model prediction result
    df_prd.to_csv(result_path + f"prd-{model_name}.csv", index=False)

    gc.collect()

# save the model performance result
df_eval = pd.concat(eval_rows, ignore_index=True) if eval_rows else pd.DataFrame()
display("Model performance:", df_eval)
df_eval.to_csv("../result/eval-seq_gen.csv", index=False)


===== Test model: seq_gen-cnn-10 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data1,C018,Escherichia coli K-12 BW25113,223,292,3723,75,0.528392,0.914908,0.548585,0.43301,0.748322,0.980253,0.927273,0.896913,0.565699


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data2,C016,Escherichia coli K-12 MG1655,234,288,3735,56,0.564588,0.920241,0.576355,0.448276,0.806897,0.985228,0.928412,0.932575,0.635496


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data3,O046,synthetic bacterium JCVI-Syn3A,98,0,0,360,0.0,0.213974,0.352518,1.0,0.213974,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data4,C048,Bacteroides thetaiotaomicron VPI-5482,295,169,4331,30,0.739747,0.958756,0.747782,0.635776,0.907692,0.993121,0.962444,0.974694,0.826376


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,75,501,4860,38,0.264252,0.901535,0.217707,0.130208,0.663717,0.992242,0.906547,0.87588,0.113041


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,925,1250,16649,559,0.46627,0.906671,0.505603,0.425287,0.623315,0.967515,0.930164,0.853536,0.430934



===== Test model: seq_gen-cnn-25 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,data1,C018,Escherichia coli K-12 BW25113,227,289,3726,71,0.539022,0.916531,0.55774,0.439922,0.761745,0.981301,0.92802,0.909246,0.574567


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,data2,C016,Escherichia coli K-12 MG1655,244,296,3727,46,0.581009,0.920705,0.587952,0.451852,0.841379,0.987808,0.926423,0.945708,0.669261


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,data3,O046,synthetic bacterium JCVI-Syn3A,125,0,0,333,0.0,0.272926,0.428816,1.0,0.272926,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,data4,C048,Bacteroides thetaiotaomicron VPI-5482,288,169,4331,37,0.726358,0.957306,0.736573,0.630197,0.886154,0.991529,0.962444,0.96884,0.82414


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,68,509,4852,45,0.234675,0.898794,0.197101,0.117851,0.60177,0.990811,0.905055,0.867892,0.093961


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,25,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,952,1263,16636,532,0.477178,0.907393,0.514734,0.429797,0.641509,0.969012,0.929437,0.871021,0.451298



===== Test model: seq_gen-cnn-50 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,data1,C018,Escherichia coli K-12 BW25113,227,327,3688,71,0.515657,0.907721,0.532864,0.409747,0.761745,0.981112,0.918555,0.90479,0.555135


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,data2,C016,Escherichia coli K-12 MG1655,240,324,3699,50,0.554914,0.913285,0.562061,0.425532,0.827586,0.986663,0.919463,0.940904,0.642655


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,data3,O046,synthetic bacterium JCVI-Syn3A,102,0,0,356,0.0,0.222707,0.364286,1.0,0.222707,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,data4,C048,Bacteroides thetaiotaomicron VPI-5482,305,180,4320,20,0.748914,0.958549,0.753086,0.628866,0.938462,0.995392,0.96,0.977052,0.820639


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,88,506,4855,25,0.312863,0.902996,0.248939,0.148148,0.778761,0.994877,0.905615,0.886243,0.117103


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,50,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,962,1337,16562,522,0.471671,0.904091,0.508591,0.418443,0.648248,0.969445,0.925303,0.862505,0.434628



===== Test model: seq_gen-cnn-100 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,data1,C018,Escherichia coli K-12 BW25113,204,245,3770,94,0.517816,0.9214,0.546185,0.454343,0.684564,0.975673,0.938979,0.89692,0.506887


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,data2,C016,Escherichia coli K-12 MG1655,218,232,3791,72,0.568591,0.929515,0.589189,0.484444,0.751724,0.981362,0.942332,0.926781,0.557369


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,data3,O046,synthetic bacterium JCVI-Syn3A,58,0,0,400,0.0,0.126638,0.224806,1.0,0.126638,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,data4,C048,Bacteroides thetaiotaomicron VPI-5482,295,105,4395,30,0.803878,0.972021,0.813793,0.7375,0.907692,0.99322,0.976667,0.976516,0.831361


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,74,482,4879,39,0.265918,0.904823,0.221226,0.133094,0.654867,0.99207,0.910091,0.885228,0.163339


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,100,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,849,1064,16835,635,0.457041,0.912346,0.499853,0.443806,0.572102,0.963652,0.940555,0.839362,0.400301



===== Test model: seq_gen-lstm-10 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,data1,C018,Escherichia coli K-12 BW25113,197,2633,1382,101,0.002821,0.366102,0.125959,0.069611,0.661074,0.931895,0.344209,0.525226,0.080101


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,data2,C016,Escherichia coli K-12 MG1655,194,2627,1396,96,0.008408,0.368653,0.124719,0.06877,0.668966,0.935657,0.347005,0.527608,0.079511


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,data3,O046,synthetic bacterium JCVI-Syn3A,374,0,0,84,0.0,0.816594,0.899038,1.0,0.816594,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,data4,C048,Bacteroides thetaiotaomicron VPI-5482,259,3196,1304,66,0.048193,0.323938,0.137037,0.074964,0.796923,0.951825,0.289778,0.555492,0.079302


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,73,3409,1952,40,0.002993,0.369931,0.040612,0.020965,0.646018,0.97992,0.364111,0.4982,0.020956


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,10,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1097,11865,6034,387,0.043122,0.3679,0.151876,0.084632,0.739218,0.939729,0.337114,0.571692,0.099453



===== Test model: seq_gen-lstm-25 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,data1,C018,Escherichia coli K-12 BW25113,157,1970,2045,141,0.018356,0.51055,0.129485,0.073813,0.526846,0.935499,0.50934,0.519933,0.084618


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,data2,C016,Escherichia coli K-12 MG1655,157,1959,2064,133,0.027267,0.514955,0.130507,0.074197,0.541379,0.939463,0.51305,0.527233,0.080308


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,data3,O046,synthetic bacterium JCVI-Syn3A,392,0,0,66,0.0,0.855895,0.922353,1.0,0.855895,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,data4,C048,Bacteroides thetaiotaomicron VPI-5482,231,2958,1542,94,0.028292,0.367461,0.131474,0.072437,0.710769,0.942543,0.342667,0.538607,0.074129


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,54,2528,2833,59,0.001801,0.527402,0.040074,0.020914,0.477876,0.979599,0.528446,0.505546,0.021613


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,25,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,991,9415,8484,493,0.075604,0.48883,0.166695,0.095234,0.66779,0.945082,0.473993,0.603399,0.118817



===== Test model: seq_gen-lstm-50 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,data1,C018,Escherichia coli K-12 BW25113,122,898,3117,176,0.110853,0.750985,0.185129,0.119608,0.409396,0.946553,0.776339,0.627133,0.136484


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,data2,C016,Escherichia coli K-12 MG1655,124,914,3109,166,0.117395,0.749594,0.186747,0.119461,0.427586,0.949313,0.772806,0.636865,0.132374


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,data3,O046,synthetic bacterium JCVI-Syn3A,142,0,0,316,0.0,0.310044,0.473333,1.0,0.310044,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,data4,C048,Bacteroides thetaiotaomicron VPI-5482,123,1071,3429,202,0.081581,0.736166,0.161949,0.103015,0.378462,0.944368,0.762,0.6069,0.09882


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,30,1083,4278,83,0.022424,0.786993,0.04894,0.026954,0.265487,0.980968,0.797985,0.537801,0.022787


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,50,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,541,3966,13933,943,0.089994,0.746737,0.180604,0.120036,0.364555,0.936609,0.778423,0.619592,0.118844



===== Test model: seq_gen-lstm-100 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,data1,C018,Escherichia coli K-12 BW25113,210,366,3649,88,0.457426,0.894737,0.480549,0.364583,0.704698,0.976452,0.908842,0.878034,0.50258


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,data2,C016,Escherichia coli K-12 MG1655,220,347,3676,70,0.498316,0.903316,0.513419,0.388007,0.758621,0.981313,0.913746,0.915515,0.569964


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,data3,O046,synthetic bacterium JCVI-Syn3A,111,0,0,347,0.0,0.242358,0.390158,1.0,0.242358,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,data4,C048,Bacteroides thetaiotaomicron VPI-5482,285,281,4219,40,0.634404,0.933472,0.639731,0.503534,0.876923,0.990608,0.937556,0.955122,0.695311


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,72,488,4873,41,0.256245,0.903361,0.213967,0.128571,0.637168,0.991656,0.908972,0.875495,0.112316


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,lstm,100,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,898,1482,16417,586,0.423175,0.893309,0.464803,0.377311,0.605121,0.965535,0.917202,0.834473,0.408831



===== Test model: seq_gen-clstm-10 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,data1,C018,Escherichia coli K-12 BW25113,236,273,3742,62,0.56909,0.922328,0.584882,0.463654,0.791946,0.983701,0.932005,0.919659,0.560732


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,data2,C016,Escherichia coli K-12 MG1655,248,269,3754,42,0.6078,0.927892,0.614622,0.479691,0.855172,0.988936,0.933134,0.953134,0.631968


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,data3,O046,synthetic bacterium JCVI-Syn3A,121,0,0,337,0.0,0.264192,0.417962,1.0,0.264192,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,data4,C048,Bacteroides thetaiotaomicron VPI-5482,299,171,4329,26,0.745544,0.959171,0.752201,0.63617,0.92,0.99403,0.962,0.969027,0.833304


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,69,504,4857,44,0.23994,0.89989,0.201166,0.120419,0.610619,0.991022,0.905988,0.877295,0.1014


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,10,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,973,1217,16682,511,0.493589,0.91085,0.529668,0.444292,0.65566,0.970279,0.932007,0.862092,0.434951



===== Test model: seq_gen-clstm-25 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,data1,C018,Escherichia coli K-12 BW25113,232,233,3782,66,0.589163,0.930675,0.608126,0.498925,0.778523,0.982848,0.941968,0.908562,0.637758


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,data2,C016,Escherichia coli K-12 MG1655,242,227,3796,48,0.625904,0.936239,0.637681,0.515991,0.834483,0.987513,0.943574,0.95022,0.730457


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,data3,O046,synthetic bacterium JCVI-Syn3A,108,0,0,350,0.0,0.235808,0.381625,1.0,0.235808,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,data4,C048,Bacteroides thetaiotaomicron VPI-5482,282,118,4382,43,0.764892,0.966632,0.777931,0.705,0.867692,0.990282,0.973778,0.975841,0.828197


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,82,488,4873,31,0.295443,0.905188,0.240117,0.14386,0.725664,0.993679,0.908972,0.895403,0.114213


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,25,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,946,1066,16833,538,0.503808,0.917247,0.54119,0.470179,0.637466,0.969029,0.940444,0.872655,0.484341



===== Test model: seq_gen-clstm-50 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,data1,C018,Escherichia coli K-12 BW25113,246,353,3662,52,0.540913,0.906098,0.548495,0.410684,0.825503,0.985999,0.91208,0.909697,0.532151


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,data2,C016,Escherichia coli K-12 MG1655,257,349,3674,33,0.57613,0.911431,0.573661,0.424092,0.886207,0.991098,0.913249,0.945072,0.593452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,data3,O046,synthetic bacterium JCVI-Syn3A,153,0,0,305,0.0,0.334061,0.500818,1.0,0.334061,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,data4,C048,Bacteroides thetaiotaomicron VPI-5482,305,152,4348,20,0.774364,0.964352,0.780051,0.667396,0.938462,0.995421,0.966222,0.976284,0.862248


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,87,531,4830,26,0.301413,0.898246,0.23803,0.140777,0.769912,0.994646,0.900951,0.87982,0.111007


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,50,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1048,1385,16514,436,0.504664,0.906052,0.535103,0.430744,0.706199,0.974277,0.922621,0.877395,0.433061



===== Test model: seq_gen-clstm-100 ====


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,data1,C018,Escherichia coli K-12 BW25113,249,332,3683,49,0.559266,0.911662,0.566553,0.428571,0.83557,0.98687,0.91731,0.913577,0.557198


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,data2,C016,Escherichia coli K-12 MG1655,263,330,3693,27,0.599874,0.917227,0.595696,0.443508,0.906897,0.992742,0.917972,0.950882,0.571837


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,data3,O046,synthetic bacterium JCVI-Syn3A,113,0,0,345,0.0,0.246725,0.395797,1.0,0.246725,0.0,0.0,,


  df_eval = pd.concat([df_eval, perform], ignore_index=True)


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,data4,C048,Bacteroides thetaiotaomicron VPI-5482,299,178,4322,26,0.739343,0.95772,0.745636,0.626834,0.92,0.99402,0.960444,0.978639,0.776172


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,84,529,4832,29,0.290682,0.898064,0.231405,0.137031,0.743363,0.994034,0.901324,0.887805,0.117733


Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm,100,test_all,C018+C016+O046+C048+C050,Escherichia coli K-12 BW25113+Escherichia coli...,1008,1369,16530,476,0.488608,0.904813,0.522145,0.424064,0.679245,0.97201,0.923515,0.8691,0.425139


'Model performance:'

Unnamed: 0,embed,pool,testset,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,cnn,10,data1,C018,Escherichia coli K-12 BW25113,223,292,3723,75,0.528392,0.914908,0.548585,0.433010,0.748322,0.980253,0.927273,0.896913,0.565699
1,cnn,10,data2,C016,Escherichia coli K-12 MG1655,234,288,3735,56,0.564588,0.920241,0.576355,0.448276,0.806897,0.985228,0.928412,0.932575,0.635496
2,cnn,10,data3,O046,synthetic bacterium JCVI-Syn3A,98,0,0,360,0.000000,0.213974,0.352518,1.000000,0.213974,0.000000,0.000000,,
3,cnn,10,data4,C048,Bacteroides thetaiotaomicron VPI-5482,295,169,4331,30,0.739747,0.958756,0.747782,0.635776,0.907692,0.993121,0.962444,0.974694,0.826376
4,cnn,10,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,75,501,4860,38,0.264252,0.901535,0.217707,0.130208,0.663717,0.992242,0.906547,0.875880,0.113041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,clstm,100,data2,C016,Escherichia coli K-12 MG1655,263,330,3693,27,0.599874,0.917227,0.595696,0.443508,0.906897,0.992742,0.917972,0.950882,0.571837
68,clstm,100,data3,O046,synthetic bacterium JCVI-Syn3A,113,0,0,345,0.000000,0.246725,0.395797,1.000000,0.246725,0.000000,0.000000,,
69,clstm,100,data4,C048,Bacteroides thetaiotaomicron VPI-5482,299,178,4322,26,0.739343,0.957720,0.745636,0.626834,0.920000,0.994020,0.960444,0.978639,0.776172
70,clstm,100,data5,C050,Salmonella enterica subsp. enterica serovar Ty...,84,529,4832,29,0.290682,0.898064,0.231405,0.137031,0.743363,0.994034,0.901324,0.887805,0.117733
