#### Set up the environment

In [2]:
import itertools, gc
import pandas as pd
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score,\
f1_score, precision_score, recall_score, roc_auc_score, average_precision_score

In [None]:
# Options
# save_path is not referenced by any cell visible in this notebook.
save_path = "../result/"
# Directory the per-embedding prediction tables (cls-<embedding>.csv) are read
# from; the ensemble outputs are also written underneath it.
result_path = "../result/prd-indiv_class/"
# Embedding variants whose individual predictions are combined.
embed_ver = [
    "clstm",
    "esm2",
    "bert",
    "t5",
]

In [None]:
# Non-prediction columns shared by every per-embedding table; they are the
# merge keys when tables are combined and are excluded from the confidence mean.
col_str = [
    'file_id',
    'organism',
    'locus_tag',
    'ess',
]
# Not used by any cell visible in this notebook.
layer_num, unit_decrease = 3, 2

In [4]:
# Test datasets: each key names one test set, each value lists the file_id
# values that belong to it.
ts_data = {
    # Escherichia coli K-12 BW25113
    "data1": ["C018"],
    # Escherichia coli K-12 MG1655
    "data2": ["C016"],
    # synthetic bacterium JCVI-Syn3A
    "data3": ["O046"],
    # Bacteroides thetaiotaomicron VPI-5482
    "data4": ["C048"],
    # Salmonella enterica subsp. enterica serovar Typhimurium str. 14028S
    "data5": ["C050"],
}

In [5]:
# Define function to record performance result
def record_perform(comb_ver, file_id, organ, y_real, y_conf, y_prd):
    """Build a one-row DataFrame of binary-classification metrics.

    Parameters
    ----------
    comb_ver : str
        Label of the evaluated model combination (stored in column "comb").
    file_id : str
        Identifier of the evaluated test set (stored in column "file").
        The AUC metrics are skipped when this equals "O046"; the reason is
        not visible in this notebook.
    organ : str
        Organism label (stored in column "organism").
    y_real : array-like
        True class labels; expected to be 0/1, since the metrics rely on
        scikit-learn's default positive label 1.
    y_conf : array-like
        Predicted scores/confidences used for the ranking metrics.
    y_prd : array-like
        Predicted class labels (0/1).

    Returns
    -------
    pandas.DataFrame
        A single row with the confusion-matrix counts, MCC, accuracy, F1,
        precision, recall, NPV, TNR, ROC-AUC and PR-AUC. The two AUC columns
        hold None when they were skipped.
    """
    # The AUC metrics need both classes among the true labels (roc_auc_score
    # raises otherwise), so they are skipped for single-class truth as well
    # as for the explicitly excluded "O046" set.
    has_both_classes = pd.Series(y_real).nunique() > 1
    if file_id != "O046" and has_both_classes:
        auc_roc = [roc_auc_score(y_real, y_conf)]
        auc_pr = [average_precision_score(y_real, y_conf)]
    else:
        auc_roc = [None]
        auc_pr = [None]

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
    # the inputs; without it ravel() would yield a single value and the
    # four-way unpacking would fail.
    tn, fp, fn, tp = confusion_matrix(y_real, y_prd, labels=[0, 1]).ravel()

    result = pd.DataFrame({
        "comb": [comb_ver],
        "file": [file_id],
        "organism": [organ],
        "tp": [tp],
        "fp": [fp],
        "tn": [tn],
        "fn": [fn],
        "mcc": [matthews_corrcoef(y_real, y_prd)],
        "acc": [accuracy_score(y_real, y_prd)],
        "f1": [f1_score(y_real, y_prd)],
        "prc": [precision_score(y_real, y_prd)],
        "rec": [recall_score(y_real, y_prd)],
        # NPV / TNR are precision / recall of the negative class; pos_label=0
        # computes them without arithmetic on the inputs, so plain lists work.
        "npv": [precision_score(y_real, y_prd, pos_label=0)],
        "tnr": [recall_score(y_real, y_prd, pos_label=0)],
        "auc-roc": auc_roc,
        "auc-pr": auc_pr
    })

    return result

In [None]:
# Load each embedding's individual prediction table, paired with the
# embedding name so that combinations can be labelled later.
dfs = [
    (name, pd.read_csv(result_path + f"cls-{name}.csv"))
    for name in embed_ver
]

#### Evaluate model

In [None]:
# Evaluate every combination of two or more embedding models. The ensemble
# confidence is the plain mean of the combined models' prediction columns.

# One-row metric frames are collected here and concatenated once at the end
# (growing a DataFrame with concat inside the loop is quadratic).
eval_records = []

# every file_id that belongs to at least one test dataset
ts_ids_all = [i for ids in ts_data.values() for i in ids]

for r in range(2, len(dfs) + 1):
    for comb in itertools.combinations(dfs, r):
        comb_ver = "_".join(name for name, _ in comb)
        print(f"\n>>>> {comb_ver} <<<<")

        # merge dataset: the first table keeps its column names, every further
        # table gets its embedding name as suffix on clashing columns
        data = comb[0][1]
        for name, df in comb[1:]:
            data = pd.merge(data, df, on=col_str, suffixes=("", f"_{name}"))

        display("Raw data:", data)

        # calculate mean of confidences
        # NOTE(review): every column outside col_str is averaged, so this
        # assumes the per-model CSVs hold only confidence columns besides
        # the key columns — TODO confirm.
        col_num = [col for col in data.columns if col not in col_str]
        data['conf_mean'] = data[col_num].mean(axis=1)

        # get test datasets
        data_ts = {}
        org_ts = {}
        for ts_ver, ids in ts_data.items():
            # get test samples
            data_ts[ts_ver] = data[data['file_id'].isin(ids)]
            # get test organism list (first organism name per file id present)
            org = []
            for i in ids:
                organ = data_ts[ts_ver].loc[data_ts[ts_ver]['file_id'] == i, 'organism'].to_list()
                if organ:
                    org.append(organ[0])
            org_ts[ts_ver] = org

            print("Test dataset(" + ts_ver + "):", data_ts[ts_ver].shape)
        print("Test organism:", org_ts, len(org_ts))

        # rows belonging to any test dataset (union of the per-dataset masks)
        loc_ts_all = data['file_id'].isin(ts_ids_all)

        ## evaluations by test dataset ##
        for ts_ver, ids in ts_data.items():
            prd_conf = data_ts[ts_ver]['conf_mean']
            prd_cls = (prd_conf >= 0.5).astype(int)
            # performances by testset
            eval_records.append(record_perform(
                comb_ver=comb_ver,
                file_id="+".join(ids),
                organ="+".join(org_ts[ts_ver]),
                y_real=data_ts[ts_ver]['ess'],
                y_conf=prd_conf,
                y_prd=prd_cls,
            ))
            print(f"- Test in {ts_ver} was done.")

        gc.collect()

        # evaluation for total test dataset
        prd_conf = data.loc[loc_ts_all, 'conf_mean']
        prd_cls = (prd_conf >= 0.5).astype(int)

        # performances on total testset
        # (record_perform has no `test_ver` parameter; passing one raised a
        # TypeError, so the row is identified by file_id="total" only.)
        eval_records.append(record_perform(
            comb_ver=comb_ver,
            file_id="total",
            organ="all",
            y_real=data.loc[loc_ts_all, 'ess'],
            y_conf=prd_conf,
            y_prd=prd_cls,
        ))
        print("- Test in total testset was done.")

        # protein info. & predicted confidences of all test samples
        df_prd = data.loc[loc_ts_all, col_str + ['conf_mean']]

        # save the model prediction result
        # NOTE(review): this writes below result_path while save_path is
        # defined but unused — confirm the intended output directory.
        df_prd.to_csv(f"{result_path}prd-simple_ensem/{comb_ver}.csv", index=False)

        print(f"- Prediction by {comb_ver} was done.")

# save the model performance result
df_eval = pd.concat(eval_records, ignore_index=True) if eval_records else pd.DataFrame()
display("Model performance:", df_eval)
df_eval.to_csv(f"{result_path}eval-simple_ensem.csv", index=False)