# Make a big dataframe containing performance measures for all algorithms on all datasets

* Adobo and sccatch give labels that don't always exactly match the truth values
    * e

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from os import listdir

In [12]:
# Dataset identifiers; each one is interpolated into the predictions file name.
datasets = [
    'cb',
    'dg',
    'jam',
    'li_crc',
    'llc',
    'peng',
    'tm',
    'vg',
]

# Column names in the predictions table, one per annotation method to score.
methods = [
    'cibersort',
    'gsea',
    'gsva',
    'metaneighbor',
    'ora',
    'adobo',
    'sccatch',
    'SVM',
    'SVMrej',
    'RF',
    'LDA',
    'LDArej',
    'NMC',
    'kNN9',
    'ACTINN',
    'scVI',
    'Cell_BLAST',
    'SingleCellNet',
    'LAmbDA',
    'scPred',
    'CaSTLe',
    'CHETAH',
    'scID',
    'scmapcell',
    'scmapcluster',
    'singleR',
]
# For every dataset: read its predictions table, score each method's column
# against the `truth` column, write the scores to a tab-separated report and
# show them in the notebook.
perf_columns = ['method',
                'class',
                'precision',
                'recall',
                'f1_score',
                'support',
                'specificity']
for dataset in datasets:
    preds = pd.read_csv(f'./predictions/{dataset}_predictions.tsv', sep='\t')
    # Rows are gathered in a plain list and converted to a DataFrame once per
    # dataset: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    perfrows = []

    for method in methods:
        # Generate classification scores for the current method.
        # zero_division=0 reports 0.0 for classes that were never predicted,
        # the same value the default setting yields, but without a warning
        # for every such class.
        report = classification_report(y_true=preds.truth,
                                       y_pred=preds[method],
                                       output_dict=True,
                                       zero_division=0)
        # Also generate a confusion matrix (rows: truth, columns: predictions)
        cm = confusion_matrix(y_true=preds.truth,
                              y_pred=preds[method])

        # Compute one-vs-rest true/false positives/negatives per class from
        # the confusion matrix, then specificity = TN / (TN + FP).
        TP = np.diag(cm)
        FP = cm.sum(axis=0) - TP
        FN = cm.sum(axis=1) - TP
        TN = cm.sum() - (FP + FN + TP)
        class_spec = TN / (TN + FP)

        # We don't use accuracy, so drop it before numbering the entries.
        # NOTE(review): the positional match below relies on the report
        # listing the per-class entries first, in the same label order as the
        # confusion matrix - confirm against the sklearn version in use.
        scored = [(key, val) for key, val in report.items() if key != 'accuracy']

        # Turn all of the different scores into one row per report entry.
        for i, (key, val) in enumerate(scored):
            # Entries past the last class (the averaged rows) have no
            # specificity of their own and get a 0 placeholder.
            specificity = class_spec[i] if i < len(class_spec) else 0
            perfrows.append([method,
                             key,
                             val['precision'],
                             val['recall'],
                             val['f1-score'],
                             val['support'],
                             specificity])

    perfdf = pd.DataFrame(perfrows, columns=perf_columns)
    # Save the dataframe of scores to disk
    perfdf.to_csv(f'./performance/seurat/{dataset}_classification_report.tsv',sep='\t', index=False)

    print(dataset)
    print('---------------------------------------------------------------')
    display(perfdf)
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


cb
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,BCELL,0.197917,0.463415,0.277372,82,0.618812
1,cibersort,MYELOID,0.000000,0.000000,0.000000,32,0.834802
2,cibersort,STROMAL,0.193182,0.809524,0.311927,21,0.847312
3,cibersort,TCELL,0.244275,0.941176,0.387879,34,0.780973
4,cibersort,TUMOR,0.000000,0.000000,0.000000,317,1.000000
...,...,...,...,...,...,...,...
187,singleR,STROMAL,0.833333,0.476190,0.606061,21,0.995699
188,singleR,TCELL,0.888889,0.941176,0.914286,34,0.991150
189,singleR,TUMOR,0.956386,0.968454,0.962382,317,0.917160
190,singleR,macro avg,0.875501,0.823658,0.839622,486,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


dg
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,ASTOCYTE,1.000000,0.886364,0.939759,88,1.000000
1,cibersort,IMMUNE,0.608968,1.000000,0.756967,1847,0.319173
2,cibersort,NEOPLASTIC,0.000000,0.000000,0.000000,1091,1.000000
3,cibersort,NEURON,0.000000,0.000000,0.000000,21,1.000000
4,cibersort,OLIGODENDROCYTE,1.000000,0.952941,0.975904,85,1.000000
...,...,...,...,...,...,...,...
239,singleR,OLIGODENDROCYTE,0.988095,0.976471,0.982249,85,0.999715
240,singleR,OPC,0.997462,0.967980,0.982500,406,0.999686
241,singleR,VASCULAR,0.943396,0.980392,0.961538,51,0.999152
242,singleR,macro avg,0.959460,0.972332,0.965199,3589,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


jam
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,B.CELL,0.000000,0.000000,0.000000,818,1.000000
1,cibersort,CAF,0.064702,0.849057,0.120240,106,0.807914
2,cibersort,ENDO.,0.967033,0.846154,0.902564,104,0.999557
3,cibersort,MACROPHAGE,0.948357,0.961905,0.955083,420,0.996594
4,cibersort,MAL,0.961929,0.187810,0.314262,2018,0.996914
...,...,...,...,...,...,...,...
291,singleR,T.CD4,0.623377,0.785047,0.694933,856,0.932592
292,singleR,T.CD8,0.916667,0.825469,0.868681,1759,0.974219
293,singleR,T.CELL,0.455197,0.359773,0.401899,706,0.950753
294,singleR,macro avg,0.798679,0.812094,0.795594,6879,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


li_crc
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,BCELL,0.000000,0.000000,0.000000,17,1.000000
1,cibersort,EPITHELIAL,0.821752,1.000000,0.902156,272,0.321839
2,cibersort,FIBROBLAST,0.000000,0.000000,0.000000,17,1.000000
3,cibersort,MACROPHAGE,0.000000,0.000000,0.000000,19,1.000000
4,cibersort,TCELL,0.928571,0.764706,0.838710,34,0.993846
...,...,...,...,...,...,...,...
188,singleR,FIBROBLAST,1.000000,0.941176,0.969697,17,1.000000
189,singleR,MACROPHAGE,0.944444,0.894737,0.918919,19,0.997059
190,singleR,TCELL,0.969697,0.941176,0.955224,34,0.996923
191,singleR,macro avg,0.962426,0.896594,0.926227,359,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


llc
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,ALVEOLAR_CELL,0.000000,0.000000,0.000000,1335,1.000000
1,cibersort,BASAL_CELLS,0.000000,0.000000,0.000000,68,1.000000
2,cibersort,B_CELLS,0.995234,0.616482,0.761355,4065,0.999748
3,cibersort,CANCER_CELLS,0.640554,0.454876,0.531979,7424,0.957273
4,cibersort,DENDRITIC_CELLS,0.000000,0.000000,0.000000,586,1.000000
...,...,...,...,...,...,...,...
499,singleR,NATURAL_KILLER_CELLS,0.468906,0.970132,0.632229,1741,0.961766
500,singleR,SECRETORY_CLUB_CELLS,0.660099,0.992593,0.792899,135,0.998664
501,singleR,T_CELLS,0.988696,0.827422,0.900898,23149,0.992350
502,singleR,macro avg,0.723759,0.872972,0.767928,51775,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


peng
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,ACINAR_CELL,0.991127,0.981395,0.986237,1935,0.999694
1,cibersort,B_CELL,0.918640,0.982836,0.949654,2447,0.996133
2,cibersort,DUCTAL_CELL_TYPE_1,0.939923,0.991761,0.965146,10317,0.986148
3,cibersort,DUCTAL_CELL_TYPE_2,0.989949,0.983650,0.986790,11315,0.997555
4,cibersort,ENDOCRINE_CELL,0.979592,0.724280,0.832808,729,0.999806
...,...,...,...,...,...,...,...
318,singleR,MACROPHAGE,0.958257,0.980601,0.969300,5361,0.995610
319,singleR,STELLATE_CELL,0.990596,0.909429,0.948279,5907,0.999012
320,singleR,T_CELL,0.968214,0.848907,0.904644,3660,0.998107
321,singleR,macro avg,0.948548,0.948254,0.947350,57530,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


tm
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,BCELL,0.439965,0.980469,0.607381,512,0.821757
1,cibersort,CAF,0.108949,1.000000,0.196491,56,0.886662
2,cibersort,ENDOTHELIAL,0.953846,1.000000,0.976378,62,0.999257
3,cibersort,MACROPHAGE,0.415225,1.000000,0.586797,120,0.957506
4,cibersort,MELANOMA,0.000000,0.000000,0.000000,1252,1.000000
...,...,...,...,...,...,...,...
239,singleR,MELANOMA,0.988105,0.995208,0.991643,1252,0.994728
240,singleR,NK,0.750000,1.000000,0.857143,51,0.995798
241,singleR,TCELL,0.999504,0.985323,0.992363,2044,0.999513
242,singleR,macro avg,0.944444,0.994707,0.966777,4097,0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


vg
---------------------------------------------------------------


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity
0,cibersort,GMP,0.460554,0.481605,0.470845,897,0.975946
1,cibersort,GMP_LIKE,0.553365,0.267587,0.360736,3042,0.965222
2,cibersort,HSC_PROG,0.607071,0.709982,0.654506,3386,0.916105
3,cibersort,HSC_PROG_LIKE,0.925905,0.255873,0.400945,6300,0.991748
4,cibersort,MYELOID,0.000000,0.000000,0.000000,3806,1.000000
...,...,...,...,...,...,...,...
213,singleR,HSC_PROG_LIKE,0.920434,0.714286,0.804361,6300,0.975117
214,singleR,MYELOID,0.693795,0.810825,0.747759,3806,0.924863
215,singleR,MYELOID_LIKE,0.964897,0.488450,0.648577,4502,0.995410
216,singleR,macro avg,0.711109,0.754587,0.684510,21933,0.000000


# Compile all scoresheets into one big dataframe

In [14]:
# Combine the per-dataset classification reports into a single table, drop
# the averaged rows, give the datasets readable names, then save and show it.
datasets = ['dg', 'jam', 'li_crc','llc', 'peng', 'tm','vg']
dataset_rename = {'cb':'Breast',
                  'dg': 'Glioblastoma',
                  'jam': 'Melanoma',
                  'li_crc': 'Colorectal',
                  'llc': 'Lung',
                  'peng': 'Pancreatic',
                  'tm':'Metastatic Melanoma',
                  'vg':'AML',
                 }

# Directory holding the reports to compile. Point this at
# "performance/seurat" to compile the reports written straight from scoring.
report_dir = "performance/seurat/categories"

bigdf = pd.read_csv(f"{report_dir}/cb_classification_report.tsv", sep='\t')
bigdf['dataset']='cb'
print(bigdf.shape)

# Read the remaining datasets and stack everything in one concat call;
# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration.
frames = [bigdf]
for dataset in datasets:
    df = pd.read_csv(f"{report_dir}/{dataset}_classification_report.tsv", sep='\t')
    df['dataset']=dataset
    frames.append(df)
bigdf = pd.concat(frames, ignore_index=True)

# Remove the averaged scores since we use the bootstrapped averages instead
is_avg = bigdf['class'].isin(['macro avg', 'weighted avg'])
avgdf = bigdf[is_avg]
# Take an explicit copy so the rename below writes to its own frame rather
# than to a slice of the concatenated one.
bigdf = bigdf[~is_avg].copy()

print(bigdf.shape)
# Replace the short dataset ids with their display names.
bigdf['dataset'] = bigdf['dataset'].map(dataset_rename)
bigdf.to_csv("performance/seurat/bigdf.tsv", sep="\t", index=False)
display(bigdf)

(192, 10)
(1798, 10)


Unnamed: 0,method,class,precision,recall,f1_score,support,specificity,Cell Labels,category,dataset
0,cibersort,BCELL,0.197917,0.463415,0.277372,82.0,0.618812,BCELL,Immune,Breast
1,cibersort,MYELOID,0.000000,0.000000,0.000000,32.0,0.834802,MYELOID,Immune,Breast
2,cibersort,STROMAL,0.193182,0.809524,0.311927,21.0,0.847312,STROMAL,Stromal,Breast
3,cibersort,TCELL,0.244275,0.941176,0.387879,34.0,0.780973,TCELL,Immune,Breast
4,cibersort,TUMOR,0.000000,0.000000,0.000000,317.0,1.000000,TUMOR,Tumour,Breast
...,...,...,...,...,...,...,...,...,...,...
2207,singleR,GMP_LIKE,0.600583,0.744905,0.665004,3042.0,0.920227,GMP_LIKE,Tumour,AML
2208,singleR,HSC_PROG,0.815497,0.823686,0.819571,3386.0,0.965978,HSC_PROG,Stem/Progenitor,AML
2209,singleR,HSC_PROG_LIKE,0.920434,0.714286,0.804361,6300.0,0.975117,HSC_PROG_LIKE,Tumour,AML
2210,singleR,MYELOID,0.693795,0.810825,0.747759,3806.0,0.924863,MYELOID,Immune,AML
