#### Setting up the environment

In [13]:
import os, itertools, gc
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score,\
f1_score, precision_score, recall_score, roc_auc_score, average_precision_score

In [14]:
# Notebook options: identifiers used to locate the per-model prediction files
# (`prd-indiv/{embed}-{test}.csv`) that are read later in the notebook.
embed_ver = [
    "clstm",
    "t5",
]
test_ver = [
    "C018",
    "C039",
    "C035",
    "C033",
    "C061",
]

# Input / output locations. Both keep their trailing slash because file names
# are appended to them by plain string concatenation.
result_path = "../results/"
save_path = result_path + "prd-conf/"
os.makedirs(save_path, exist_ok=True)

# Key columns: used as merge keys and excluded when averaging the scores.
col_str = ["file_id", "organism", "locus_tag", "ess"]

In [15]:
# Define function to record performance result
def record_perform(comb_ver, file_id, organ, y_real, y_conf, y_prd):
    """Build a one-row DataFrame of binary-classification metrics.

    Parameters
    ----------
    comb_ver : str
        Label of the evaluated model combination; stored in column ``comb``.
    file_id : str
        Identifier of the evaluated dataset; stored in column ``file``.
        The value ``"O046"`` disables the two AUC metrics.
    organ : str
        Stored verbatim in column ``organism``.
    y_real : array-like
        Ground-truth labels. The ``1 - y`` inversions below assume a 0/1
        encoding.
    y_conf : array-like
        Continuous scores handed to ``roc_auc_score`` and
        ``average_precision_score``.
    y_prd : array-like
        Predicted class labels (0/1), compared against ``y_real``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``comb``, ``file``, ``organism``,
        ``tp``, ``fp``, ``tn``, ``fn``, ``mcc``, ``acc``, ``f1``, ``prc``,
        ``rec``, ``npv``, ``tnr``, ``auc-roc`` and ``auc-pr``. The two AUC
        columns hold ``None`` when they are not computed.
    """
    # Accept lists as well as arrays: the `1 - y` inversions used for
    # NPV / TNR need element-wise arithmetic.
    y_real = np.asarray(y_real)
    y_prd = np.asarray(y_prd)

    # "O046" is explicitly excluded from the ranking metrics (the reason is not
    # visible in this notebook). The same fallback is applied whenever y_real
    # holds a single class, because ROC AUC needs both classes to be present.
    if file_id != "O046" and np.unique(y_real).size > 1:
        auc_roc = [roc_auc_score(y_real, y_conf)]
        auc_pr = [average_precision_score(y_real, y_conf)]
    else:
        auc_roc = None
        auc_pr = None

    # Pin the label order so the matrix is always 2x2; without `labels`, inputs
    # containing only one class yield a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_real, y_prd, labels=[0, 1]).ravel()

    result = pd.DataFrame({
        "comb": [comb_ver],
        "file": [file_id],
        "organism": [organ],
        "tp": [tp],
        "fp": [fp],
        "tn": [tn],
        "fn": [fn],
        "mcc": [matthews_corrcoef(y_real, y_prd)],
        "acc": [accuracy_score(y_real, y_prd)],
        "f1": [f1_score(y_real, y_prd)],
        "prc": [precision_score(y_real, y_prd)],
        "rec": [recall_score(y_real, y_prd)],
        # NPV / TNR are precision / recall of the negative class, obtained by
        # flipping the 0/1 labels.
        "npv": [precision_score(1 - y_real, 1 - y_prd)],
        "tnr": [recall_score(1 - y_real, 1 - y_prd)],
        "auc-roc": auc_roc,
        "auc-pr": auc_pr
    })

    return result

#### Evaluate model

In [16]:
# One single-row frame per evaluation; concatenated once at the end instead of
# re-copying a growing DataFrame on every iteration.
eval_rows = []

# Flat accumulators over every evaluated frame (kept under their original names
# in case later cells read them).
total_conf = []
total_label = []
total_cls = []

# Pooled labels / mean confidences / predicted classes per model combination,
# so that each "total" row is computed from exactly one combination and is
# labelled with that combination's name.
pooled = {}

for ts_ver in test_ver:
    # (model name, prediction table) for every individual model on this test set
    dfs = [(e_ver, pd.read_csv(result_path + f"prd-indiv/{e_ver}-{ts_ver}.csv")) for e_ver in embed_ver]

    # every combination of at least two models
    for r in range(2, len(dfs) + 1):
        for comb in itertools.combinations(dfs, r):
            comb_ver = "_".join(name for name, _ in comb)
            print(f"\n>>>> {comb_ver} <<<<")

            # merge dataset: the first model keeps its column names, each
            # further model's clashing columns get a `_{model}` suffix
            data = comb[0][1]
            for name, frame in comb[1:]:
                data = pd.merge(data, frame, on=col_str, suffixes=("", f"_{name}"))

            display("Raw data:", data)

            # calculate mean of confidences (every non-key column is averaged)
            col_num = [col for col in data.columns if col not in col_str]
            data['conf_mean'] = data[col_num].mean(axis=1)

            # save the model prediction result
            data.to_csv(f"{save_path}{comb_ver}-{ts_ver}.csv", index=False)

            ## evaluations by test dataset ##
            y_real = data['ess'].to_numpy()
            y_conf = data['conf_mean'].to_numpy()
            y_prd = (y_conf >= 0.5).astype(int)

            # gather the results
            total_conf.extend(y_conf.tolist())
            total_label.extend(y_real.tolist())
            total_cls.extend(y_prd.tolist())

            acc = pooled.setdefault(comb_ver, {"label": [], "conf": [], "cls": []})
            acc["label"].extend(y_real.tolist())
            acc["conf"].extend(y_conf.tolist())
            acc["cls"].extend(y_prd.tolist())

            # performances by testset
            eval_rows.append(record_perform(
                comb_ver=comb_ver,
                file_id=ts_ver,
                organ=ts_ver,
                y_real=y_real,
                y_conf=y_conf,
                y_prd=y_prd
            ))
            print(f"- Test in {ts_ver} was done.")

            gc.collect()

# performances on total testset: one row per combination, appended after all
# per-testset rows
for comb_name, acc in pooled.items():
    eval_rows.append(record_perform(
        comb_ver=comb_name,
        file_id="total",
        organ="all",
        y_real=np.array(acc["label"]),
        y_conf=np.array(acc["conf"]),
        y_prd=np.array(acc["cls"])
    ))
print("- Test in total testset was done.")

# pd.concat rejects an empty list, so fall back to an empty frame when nothing
# was evaluated (e.g. fewer than two models configured)
df_eval = pd.concat(eval_rows, ignore_index=True) if eval_rows else pd.DataFrame()

# save the model performance result
display("Model performance:", df_eval)
df_eval.to_csv(f"{result_path}eval-conf_avg-strain.csv", index=False)


>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,conf,conf_t5
0,C018,Escherichia coli K-12 BW25113,BW25113_0001,0,0.154492,0.037812
1,C018,Escherichia coli K-12 BW25113,BW25113_0002,0,0.200179,0.126633
2,C018,Escherichia coli K-12 BW25113,BW25113_0003,0,0.057514,0.317407
3,C018,Escherichia coli K-12 BW25113,BW25113_0004,0,0.244505,0.214612
4,C018,Escherichia coli K-12 BW25113,BW25113_0005,0,0.121918,0.155861
...,...,...,...,...,...,...
4308,C018,Escherichia coli K-12 BW25113,BW25113_4702,0,0.458123,0.008008
4309,C018,Escherichia coli K-12 BW25113,BW25113_4703,0,0.043490,0.196018
4310,C018,Escherichia coli K-12 BW25113,BW25113_4705,0,0.056452,0.151947
4311,C018,Escherichia coli K-12 BW25113,BW25113_4706,0,0.040557,0.440166


- Test in C018 was done.

>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,conf,conf_t5
0,C039,Pseudomonas aeruginosa MPAO1,PA0001,1,0.288246,0.967049
1,C039,Pseudomonas aeruginosa MPAO1,PA0002,1,0.468010,0.969956
2,C039,Pseudomonas aeruginosa MPAO1,PA0003,0,0.054503,0.641700
3,C039,Pseudomonas aeruginosa MPAO1,PA0004,1,0.980971,0.950914
4,C039,Pseudomonas aeruginosa MPAO1,PA0005,0,0.246532,0.851149
...,...,...,...,...,...,...
5565,C039,Pseudomonas aeruginosa MPAO1,PA5566,0,0.053826,0.258951
5566,C039,Pseudomonas aeruginosa MPAO1,PA5567,0,0.261649,0.757103
5567,C039,Pseudomonas aeruginosa MPAO1,PA5568,1,0.054366,0.940956
5568,C039,Pseudomonas aeruginosa MPAO1,PA5569,1,0.054773,0.812802


- Test in C039 was done.

>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,conf,conf_t5
0,C035,Caulobacter crescentus,CCNA_00001,0,0.020292,0.325633
1,C035,Caulobacter crescentus,CCNA_00002,0,0.047080,0.817476
2,C035,Caulobacter crescentus,CCNA_00003,1,0.321953,0.902172
3,C035,Caulobacter crescentus,CCNA_00004,1,0.932568,0.945359
4,C035,Caulobacter crescentus,CCNA_00005,1,0.077731,0.902546
...,...,...,...,...,...,...
3881,C035,Caulobacter crescentus,CCNA_03995,0,0.024828,0.909956
3882,C035,Caulobacter crescentus,CCNA_03996,0,0.022914,0.216197
3883,C035,Caulobacter crescentus,CCNA_03997,0,0.070339,0.021555
3884,C035,Caulobacter crescentus,CCNA_03998,0,0.033459,0.274795


- Test in C035 was done.

>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,conf,conf_t5
0,C033,Salmonella enterica subsp. enterica serovar Ty...,-,0,0.004617,0.558050
1,C033,Salmonella enterica subsp. enterica serovar Ty...,t0001,0,0.009603,0.000401
2,C033,Salmonella enterica subsp. enterica serovar Ty...,t0002,0,0.011417,0.070956
3,C033,Salmonella enterica subsp. enterica serovar Ty...,t0003,0,0.031848,0.069462
4,C033,Salmonella enterica subsp. enterica serovar Ty...,t0004,0,0.022728,0.093716
...,...,...,...,...,...,...
4318,C033,Salmonella enterica subsp. enterica serovar Ty...,t4635,0,0.026657,0.232923
4319,C033,Salmonella enterica subsp. enterica serovar Ty...,t4636,0,0.005716,0.184417
4320,C033,Salmonella enterica subsp. enterica serovar Ty...,t4637,0,0.975730,0.190661
4321,C033,Salmonella enterica subsp. enterica serovar Ty...,t4638,0,0.020986,0.279236


- Test in C033 was done.

>>>> clstm_t5 <<<<


'Raw data:'

Unnamed: 0,file_id,organism,locus_tag,ess,conf,conf_t5
0,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_0001,1,0.057509,0.904253
1,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_0002,1,0.883836,0.956628
2,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_0003,0,0.210097,0.264684
3,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_0004,0,0.270899,0.777131
4,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_0005,0,0.062051,0.050002
...,...,...,...,...,...,...
3322,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_3333,0,0.059030,0.094285
3323,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_3334,0,0.063304,0.357780
3324,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_3335,0,0.057365,0.361783
3325,C061,Brevundimonas subvibrioides ATCC 15264,Bresu_3336,0,0.057278,0.197121


- Test in C061 was done.
- Test in total testset was done.


'Model performance:'

Unnamed: 0,comb,file,organism,tp,fp,tn,fn,mcc,acc,f1,prc,rec,npv,tnr,auc-roc,auc-pr
0,clstm_t5,C018,C018,257,175,3840,41,0.691721,0.949919,0.70411,0.594907,0.862416,0.989436,0.956413,0.946533,0.730878
1,clstm_t5,C039,C039,264,213,5006,87,0.617688,0.94614,0.637681,0.553459,0.752137,0.982918,0.959188,0.966462,0.718407
2,clstm_t5,C035,C035,304,163,3243,176,0.59244,0.912764,0.642027,0.650964,0.633333,0.948523,0.952143,0.933324,0.738322
3,clstm_t5,C033,C033,319,229,3740,35,0.695095,0.938931,0.707317,0.582117,0.90113,0.990728,0.942303,0.971676,0.889318
4,clstm_t5,C061,C061,300,178,2735,114,0.624414,0.912233,0.672646,0.627615,0.724638,0.959986,0.938895,0.944138,0.74662
5,clstm_t5,total,all,1444,958,18564,453,0.641205,0.934124,0.671784,0.601166,0.761202,0.976179,0.950927,0.952455,0.762103
