## scripts to evaluate matching results

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import sys
sys.path.append("./")
import metrics
import utils

## Note: mf needs to be rerun because its matching direction is incorrect

In [24]:
# Evaluate every benchmark batch (b1..b5): build or load the matching of each
# method, score it against the cell annotations, and collect the integration
# metrics that each method wrote to its own metrics.csv.
out_dir = "/tonsil_v2/match/bench_out/"
temp_dir = '/tonsil_v2/match/bench_input/'

# Flat accumulators: each batch contributes five consecutive values, always in
# the order given by `method_order`.
method_order = ["mf", "sr", "lg", "hm", "bsc"]
ann_listlv1 = []
slt_f1 = []
ari_f1 = []


def _knn_matching_from_embeddings(method_dir):
    """Build a matching from the embeddings stored under ``method_dir``.

    Reads ``full_embed_x0.csv`` and ``full_embed_y0.csv`` from ``method_dir``
    (which must end with a path separator), computes
    ``utils.cdist_correlation(y, x)`` and feeds the result to
    ``metrics.get_knn_matching``.

    Returns
    -------
    list
        ``[full_match, np.arange(n_rows_of_y), scores]`` -- the same
        three-element layout that is used for the matching loaded from
        ``full_idx.csv``.
    """
    embed_x = pd.read_csv(method_dir + "full_embed_x0.csv")
    embed_y = pd.read_csv(method_dir + "full_embed_y0.csv")
    dist = utils.cdist_correlation(embed_y.to_numpy(), embed_x.to_numpy())
    full_match, scores = metrics.get_knn_matching(dist)
    return [full_match, np.arange(embed_y.shape[0]), scores]


for i in range(5):
    batch = 'b' + str(i + 1) + '/'
    root_dir = out_dir + batch

    # liger, harmony, bsc and sr only provide embeddings, so their matchings
    # are derived from the embedding distances.
    lgmatch = _knn_matching_from_embeddings(root_dir + "lg/")
    hmmatch = _knn_matching_from_embeddings(root_dir + "hm/")
    bscmatch = _knn_matching_from_embeddings(root_dir + "bsc/")
    srmatch = _knn_matching_from_embeddings(root_dir + "sr/")

    # mf ships a precomputed matching (idx1, idx2, score).
    mf = pd.read_csv(root_dir + "mf/full_idx.csv")
    mfmatch = [mf['idx1'].tolist(), mf['idx2'].tolist(), mf['score'].tolist()]

    # mf embeddings are loaded as in the original cell; they are not used
    # further in this cell.
    mfx = pd.read_csv(root_dir + "mf/full_embed_x0.csv")
    mfy = pd.read_csv(root_dir + "mf/full_embed_y0.csv")

    # read meta info
    meta_rna = pd.read_csv(temp_dir + batch + 'meta_rna.csv')
    meta_pro = pd.read_csv(temp_dir + batch + 'meta_pro.csv')

    # For liger only: rows listed in d1_id.csv / d2_id.csv are removed from the
    # metadata ("liger missing cells"). The ids are shifted by -1 before being
    # used as row labels (presumably 1-based ids -- confirm).
    # The original cell guarded this with `if i >= 0`, which is always true for
    # range(5); its unreachable `else` branch has been removed.
    lg_id1 = pd.read_csv(root_dir + "lg/d1_id.csv")
    lg_id2 = pd.read_csv(root_dir + "lg/d2_id.csv")
    meta_rna_lg = meta_rna.drop(index=lg_id1['id'] - 1)
    meta_pro_lg = meta_pro.drop(index=lg_id2['id'] - 1)

    annotation_rna = meta_rna['cluster.info'].to_numpy()
    annotation_pro = meta_pro['cluster.term'].to_numpy()

    annotation_rna_lg = meta_rna_lg['cluster.info'].to_numpy()
    annotation_pro_lg = meta_pro_lg['cluster.term'].to_numpy()

    # `order` is forwarded unchanged to metrics.get_matching_acc; its meaning
    # is defined by that function.
    order = (2, 1)
    matchings = {"mf": mfmatch, "sr": srmatch, "lg": lgmatch,
                 "hm": hmmatch, "bsc": bscmatch}

    batch_acc = []
    for method in method_order:
        # liger is scored against the filtered annotations, all other methods
        # against the full ones.
        if method == "lg":
            labels_rna, labels_pro = annotation_rna_lg, annotation_pro_lg
        else:
            labels_rna, labels_pro = annotation_rna, annotation_pro
        batch_acc.append(
            metrics.get_matching_acc(
                matching=matchings[method],
                labels1=labels_rna,
                labels2=labels_pro,
                order=order
            )
        )
    # Extend only after every method was scored, so a failure part-way through
    # a batch does not leave a partial batch in the accumulator.
    ann_listlv1.extend(batch_acc)

    # also need to read the integration metrics (first row of each metrics.csv)
    method_metrics = {
        method: pd.read_csv(root_dir + method + "/metrics.csv")
        for method in method_order
    }
    slt_f1.extend([method_metrics[method].loc[0, 'slt_f1'] for method in method_order])
    ari_f1.extend([method_metrics[method].loc[0, 'ari_f1'] for method in method_order])

In [25]:
# Display the collected matching accuracies: five values per batch, appended
# in the order mf, sr, lg, hm, bsc.
ann_listlv1

[0.9258816497310222,
 0.3673,
 0.5742240890755742,
 0.5001,
 0.6146,
 0.9220155968806238,
 0.4103,
 0.5704523484116137,
 0.5758,
 0.6377,
 0.9244151169766047,
 0.39503333333333335,
 0.5663232989965663,
 0.5323,
 0.6324666666666666,
 0.9241338112305855,
 0.30243333333333333,
 0.5668188939631321,
 0.5654666666666667,
 0.6189333333333333,
 0.9202159568086383,
 0.3751,
 0.5746524884162806,
 0.5418,
 0.5888]

In [26]:
# construct batch
b = np.array(["b1","b2","b3","b4","b5"])
binfo = np.repeat(b, [5,5,5,5,5], axis=0)
m = ["mf","sr","lg","hm","bsc"]
minfo = m *5

data = {'method':minfo, 'batch':binfo,'slt_f1': slt_f1, 'ari_f1':ari_f1,
       'ann1':ann_listlv1}

matching_result = pd.DataFrame(data)

In [27]:
# Display the assembled table (columns: method, batch, slt_f1, ari_f1, ann1).
matching_result

Unnamed: 0,method,batch,slt_f1,ari_f1,ann1
0,mf,b1,0.535768,0.601693,0.925882
1,sr,b1,0.400665,0.359556,0.3673
2,lg,b1,0.532232,0.59016,0.574224
3,hm,b1,0.522549,0.578734,0.5001
4,bsc,b1,0.517877,0.590628,0.6146
5,mf,b2,0.537428,0.60241,0.922016
6,sr,b2,0.406595,0.521881,0.4103
7,lg,b2,0.533307,0.594977,0.570452
8,hm,b2,0.522317,0.575098,0.5758
9,bsc,b2,0.519019,0.595497,0.6377


In [28]:
# Persist the results table as CSV (row index included, pandas default).
# An earlier run wrote to batch_metrics_result.csv in the same directory; this
# run writes the V2 file.
result_csv = "/tonsil_v2/match/bench_out/batch_metrics_resultV2.csv"
matching_result.to_csv(result_csv)