In [1]:
import scanpy as sc, numpy as np, pandas as pd
import os, re, glob2, anndata as ad
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
from upsetplot import plot, from_contents, from_memberships

In [2]:
# Load the two AnnData objects that are compared against vireo below.
# `ad.read` is a deprecated alias; `ad.read_h5ad` is the explicit .h5ad reader.
adata1 = ad.read_h5ad("10sets_SS_vireoSet.h5ad")
adata2 = ad.read_h5ad("10sets_cr_vireoSet.h5ad")

In [3]:
demux_info_SS = pd.read_csv("Demux_info_10Sets_SS_Vireo.tsv", sep='\t')
demux_info_cr = pd.read_csv("Demux_info_10Sets_cr_Vireo.tsv", sep='\t')

In [4]:
demux_info_SS.head(2)

Unnamed: 0,Set,Rep,Concurrence,CS_MS,CS_HD,MS_HD,CS_vireo,MS_vireo,HD_vireo,CS_MS_HD,...,Not_vireo_CS_MS,Not_vireo_CS_HD,Not_vireo_MS_HD,Not_vireo_No_concurrence,No_gt_No_concurrence,No_gt_All,No_gt_CS_MS,No_gt_CS_HD,No_gt_MS_HD,Total_cells
0,NPSAD-20201112-A,1,4824,177,194,630,101,149,534,5127,...,0,0,0,0,542,6169,331,140,0,19136
1,NPSAD-20201112-A,2,4820,415,217,568,87,395,363,5193,...,0,0,0,0,560,5995,770,116,0,20037


In [5]:
Counter(adata1.obs['SubID_cs'])

Counter({'M1176': 7538,
         'H898': 6756,
         'Doublet': 31454,
         'H653': 7419,
         'H1357': 3911,
         'M45480': 5304,
         'M76138': 232,
         'Negative': 787,
         'M44145': 5278,
         'M67780': 6174,
         'M3114': 4932,
         'M48326': 3854,
         'M30944': 5216,
         'M12571': 5819,
         'M34836': 4199,
         'M29736': 3191,
         'M12792': 2607,
         'M1746': 1939,
         'M8926': 3226,
         'M580893': 559,
         'M30150': 5099,
         'M55245': 6024,
         'M24257': 6411,
         'M32215': 4795,
         'M72578': 4485,
         'H1563': 1541,
         'M84511': 9274,
         'M9161': 3359,
         'M11589': 5681,
         'M88010': 3180,
         'R31291350': 2898,
         'M26992': 3903})

In [6]:
Counter(adata1.obs['SubID_vireo'])

Counter({'M1176': 10101,
         'Negative': 15316,
         'Doublet': 9960,
         'donor4': 13005,
         'M76138': 4947,
         'donor3': 14557,
         'M45480': 1979,
         'donor5': 21654,
         'M44145': 5477,
         'M67780': 6396,
         'M48326': 3896,
         'M34836': 4782,
         'M8926': 3857,
         'M29736': 2215,
         'M30150': 5427,
         'M55245': 6222,
         'M24257': 7324,
         'M32215': 2585,
         'M84511': 11043,
         'M9161': 3514,
         'M11589': 5955,
         'M88010': 2737,
         'M26992': 4096})

In [7]:
hashmap_info = pd.read_csv("Hash_map_gtID_SubID.csv")

In [8]:
adata1.obs['set']

AAACCCAAGACCATAA-NPSAD-20201112-A1-cDNA    NPSAD-20201112-A
AAACCCAAGACTACCT-NPSAD-20201112-A1-cDNA    NPSAD-20201112-A
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA    NPSAD-20201112-A
AAACCCAAGCCTCATA-NPSAD-20201112-A1-cDNA    NPSAD-20201112-A
AAACCCAAGCTAGATA-NPSAD-20201112-A1-cDNA    NPSAD-20201112-A
                                                 ...       
TTTGTTGGTTCATCTT-NPSAD-20201030-C2-cDNA    NPSAD-20201030-C
TTTGTTGGTTTACGAC-NPSAD-20201030-C2-cDNA    NPSAD-20201030-C
TTTGTTGTCAGTGCGC-NPSAD-20201030-C2-cDNA    NPSAD-20201030-C
TTTGTTGTCCGATGCG-NPSAD-20201030-C2-cDNA    NPSAD-20201030-C
TTTGTTGTCGCACTCT-NPSAD-20201030-C2-cDNA    NPSAD-20201030-C
Name: set, Length: 167045, dtype: category
Categories (5, object): ['NPSAD-20201030-C', 'NPSAD-20201112-A', 'NPSAD-20201113-C', 'NPSAD-20201215-C', 'NPSAD-20210217-C']

In [9]:
hashmap_info.head(3)

Unnamed: 0,SubNum,SNP_report:Genotyping_Sample_ID,Samp_ID
0,M7718,MSSM_59,NPSAD-20210303-A
1,M79990,MSSM_54,NPSAD-20201221-A
2,M80700,MSSM_167,


In [10]:
def _classify_pair(test_label, vireo_label, gt_avail):
    """Return the comparison category for one cell.

    test_label  -- label assigned by the hashing-based method under test
    vireo_label -- label assigned by vireo for the same cell
    gt_avail    -- collection of SubNum values listed in the hash map for the cell's set
    """
    unclassified = ('Doublet', 'Negative')

    # The test method picked a subject that has genotype info for this set.
    if test_label in gt_avail:
        if vireo_label == test_label:
            return 'known_same_class'            # both methods agree
        if vireo_label.startswith('donor'):
            return 'known_class_to_unknown'      # vireo put it in a donor without GT info
        if vireo_label in gt_avail:
            return 'known_class_to_known_other'  # vireo chose a different genotyped subject
        return 'known_class_vireo_unclass'       # anything else (e.g. 'Doublet' / 'Negative')

    # The test method left the cell unclassified.
    if test_label in unclassified:
        if vireo_label in unclassified:
            return f"{test_label}_to_{vireo_label}"
        target = 'unknown' if vireo_label.startswith('donor') else 'known_class'
        return f"{test_label}_to_{target}"

    # The test method picked a subject without genotype info for this set.
    if test_label != 'Not Present':
        if vireo_label.startswith('donor'):
            return 'no_gt_but_class'
        if vireo_label in unclassified:
            return 'no_gt_unclass'
        return 'no_gt_into_known'

    # Remaining case: the test label is 'Not Present'.
    if vireo_label in gt_avail:
        return 'Not_Present_to_known'
    if vireo_label in unclassified:
        return 'Not_Present_to_unclass'
    return 'Not_Present_to_unknown'  # vireo-only donors (donor1, donor2, ...)


def unclass_test_vir(samp_l, test_id, v_id, hm_info):
    """Categorise, cell by cell, how a hashing-based assignment compares with vireo.

    Parameters
    ----------
    samp_l : sequence of set identifiers, one per cell (matched against hm_info['Samp_ID']).
    test_id : sequence of labels from the method under test, one per cell.
    v_id : sequence of vireo labels, one per cell.
    hm_info : DataFrame with columns 'Samp_ID' and 'SubNum'; the SubNum values listed
        for a set are treated as the subjects with genotype info for that set.

    Returns
    -------
    list of str, one category per cell, in input order.

    Raises
    ------
    ValueError if the three per-cell inputs differ in length.
    """
    # Materialise the inputs as plain lists so cells are always matched by position.
    # Integer indexing on a Series (`s[i]`) is label-based for integer indexes and a
    # deprecated positional fallback for string indexes (e.g. barcodes).
    sets = list(samp_l)
    tests = list(test_id)
    vireo = list(v_id)
    if not (len(sets) == len(tests) == len(vireo)):
        raise ValueError(
            "samp_l, test_id and v_id must have the same length "
            f"(got {len(sets)}, {len(tests)}, {len(vireo)})"
        )

    # Build the set -> genotyped subjects lookup once instead of re-filtering
    # hm_info for every cell. Rows without a Samp_ID can never match a set.
    gt_by_set = {}
    known_rows = hm_info.dropna(subset=['Samp_ID'])
    for set_id, sub_id in zip(known_rows['Samp_ID'], known_rows['SubNum']):
        gt_by_set.setdefault(set_id, set()).add(sub_id)

    no_gt = frozenset()
    return [
        _classify_pair(test_label, vireo_label, gt_by_set.get(set_id, no_gt))
        for set_id, test_label, vireo_label in zip(sets, tests, vireo)
    ]
    

In [11]:
#del adata1.obs['H_vs_V'], adata2.obs['H_vs_V'], adata1.obs['M_vs_V'], adata2.obs['M_vs_V'], adata1.obs['S_vs_V'], adata2.obs['S_vs_V']

In [12]:
# For each hashing-based assignment column, store its per-cell comparison
# against the vireo assignment in a new adata1.obs column.
for vs_col, hash_col in (('H_vs_V', 'SubID_hd'), ('M_vs_V', 'SubID_ms'), ('S_vs_V', 'SubID_cs')):
    adata1.obs[vs_col] = unclass_test_vir(
        adata1.obs['set'], adata1.obs[hash_col], adata1.obs['SubID_vireo'], hashmap_info
    )

In [13]:
adata1.obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACCCAAGACCATAA-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGACTACCT-NPSAD-20201112-A1-cDNA,no_gt_unclass,H898,Negative
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAAGCCTCATA-NPSAD-20201112-A1-cDNA,no_gt_but_class,H653,donor4
AAACCCAAGCTAGATA-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGGAGAATG-NPSAD-20201112-A1-cDNA,no_gt_into_known,H1357,M76138
AAACCCAAGGAGATAG-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGGGAGTTC-NPSAD-20201112-A1-cDNA,no_gt_but_class,H898,donor3
AAACCCAAGGTCGTCC-NPSAD-20201112-A1-cDNA,no_gt_into_known,H1357,M76138
AAACCCAAGTTGCCCG-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176


In [14]:
adata1[adata1.obs['SubID_hd'] == 'Doublet'].obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCACACACGCCA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAGTCCTCATC-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAGTGCGGCTT-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor3
AAACCCATCTTAGCCC-NPSAD-20201112-A1-cDNA,Doublet_to_known_class,Doublet,M1176
AAACGAAAGGATGTTA-NPSAD-20201112-A1-cDNA,Doublet_to_Negative,Doublet,Negative
AAACGAAAGGTGCAGT-NPSAD-20201112-A1-cDNA,Doublet_to_known_class,Doublet,M1176
AAACGAAAGTTGGCGA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACGCTAGTAGAATC-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor3
AAACGCTCAAAGGGTC-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor3


In [15]:
adata1[adata1.obs['SubID_hd'] == 'Negative'].obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACGAATCATTGGTG-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor4
AAAGAACCATCTTCGC-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AAAGAACTCCTCTTTC-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M76138
AAAGGATGTCCTCATC-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AAAGGGCGTCGCGTTG-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M1176
AAAGTGACACGACGCT-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M76138
AAAGTGATCCTCACGT-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M76138
AAATGGACACTCCGAG-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AACAACCGTGAATTAG-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor5
AACAAGAGTTTCGACA-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M1176


In [16]:
# For each hashing-based assignment column, store its per-cell comparison
# against the vireo assignment in a new adata2.obs column.
for vs_col, hash_col in (('H_vs_V', 'SubID_hd'), ('M_vs_V', 'SubID_ms'), ('S_vs_V', 'SubID_cs')):
    adata2.obs[vs_col] = unclass_test_vir(
        adata2.obs['set'], adata2.obs[hash_col], adata2.obs['SubID_vireo'], hashmap_info
    )

In [17]:
adata2.obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACCCAAGACCATAA-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGACTACCT-NPSAD-20201112-A1-cDNA,no_gt_unclass,H898,Negative
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAAGCCTCATA-NPSAD-20201112-A1-cDNA,no_gt_into_known,H653,M76138
AAACCCAAGCTAGATA-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGGACTAAT-NPSAD-20201112-A1-cDNA,known_class_vireo_unclass,M45480,Negative
AAACCCAAGGAGAATG-NPSAD-20201112-A1-cDNA,no_gt_but_class,H1357,donor3
AAACCCAAGGAGATAG-NPSAD-20201112-A1-cDNA,known_same_class,M1176,M1176
AAACCCAAGGGAGTTC-NPSAD-20201112-A1-cDNA,no_gt_but_class,H898,donor5
AAACCCAAGGTCGTCC-NPSAD-20201112-A1-cDNA,no_gt_but_class,H1357,donor3


In [18]:
adata2[adata2.obs['SubID_hd'] == 'Doublet'].obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCACACACGCCA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAGTCCTCATC-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCAGTGCGGCTT-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACCCATCTTAGCCC-NPSAD-20201112-A1-cDNA,Doublet_to_known_class,Doublet,M1176
AAACGAAAGGATGTTA-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor4
AAACGAAAGGTGCAGT-NPSAD-20201112-A1-cDNA,Doublet_to_known_class,Doublet,M1176
AAACGAAAGTTGGCGA-NPSAD-20201112-A1-cDNA,Doublet_to_Doublet,Doublet,Doublet
AAACGCTAGTAGAATC-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor5
AAACGCTCAAAGGGTC-NPSAD-20201112-A1-cDNA,Doublet_to_unknown,Doublet,donor5


In [19]:
adata2[adata2.obs['SubID_hd'] == 'Negative'].obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACGAATCATTGGTG-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M76138
AAAGAACCATCTTCGC-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor5
AAAGAACTCCTCTTTC-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AAAGGATGTCCTCATC-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor5
AAAGGGCGTCGCGTTG-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M1176
AAAGTGACACGACGCT-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AAAGTGATCCTCACGT-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor3
AAATGGACACTCCGAG-NPSAD-20201112-A1-cDNA,Negative_to_unknown,Negative,donor5
AACAACCGTGAATTAG-NPSAD-20201112-A1-cDNA,Negative_to_Doublet,Negative,Doublet
AACAAGAGTTTCGACA-NPSAD-20201112-A1-cDNA,Negative_to_known_class,Negative,M1176


In [20]:
adata2[adata2.obs['SubID_hd'] == 'Not Present'].obs[['H_vs_V', 'SubID_hd', 'SubID_vireo']].head(10)

Unnamed: 0,H_vs_V,SubID_hd,SubID_vireo
AAACCCACATGAGTAA-NPSAD-20201030-C1-cDNA,Not_Present_to_unclass,Not Present,Negative
AATTCCTAGATCACCT-NPSAD-20201030-C1-cDNA,Not_Present_to_known,Not Present,M26992
ACATCCCAGAACTGAT-NPSAD-20201030-C1-cDNA,Not_Present_to_known,Not Present,M88010
AGAACCTGTGTTGCCG-NPSAD-20201030-C1-cDNA,Not_Present_to_unclass,Not Present,Negative
AGATGCTTCAAGCCAT-NPSAD-20201030-C1-cDNA,Not_Present_to_known,Not Present,M88010
AGCATCATCTACTGCC-NPSAD-20201030-C1-cDNA,Not_Present_to_known,Not Present,M88010
AGGACGAAGGATGAGA-NPSAD-20201030-C1-cDNA,Not_Present_to_unclass,Not Present,Negative
AGGCCACCACAGAAGC-NPSAD-20201030-C1-cDNA,Not_Present_to_unclass,Not Present,Negative
AGTAACCTCACCTACC-NPSAD-20201030-C1-cDNA,Not_Present_to_known,Not Present,M88010
ATCACGATCCGACATA-NPSAD-20201030-C1-cDNA,Not_Present_to_unclass,Not Present,Negative


In [21]:
Counter(adata2.obs['SubID_vireo'])

Counter({'M1176': 10105,
         'Negative': 15156,
         'Doublet': 10288,
         'M76138': 5294,
         'donor3': 11066,
         'donor5': 23587,
         'donor4': 14161,
         'M45480': 2015,
         'M44145': 5474,
         'M67780': 6400,
         'M48326': 3946,
         'M34836': 4776,
         'M29736': 2364,
         'M8926': 3854,
         'M30150': 5431,
         'M55245': 6251,
         'M24257': 7299,
         'M32215': 2703,
         'M84511': 10915,
         'M9161': 3482,
         'M11589': 5922,
         'M88010': 2724,
         'M26992': 4034})

In [22]:
list(Counter(adata1.obs['H_vs_V']).keys())

['known_same_class',
 'no_gt_unclass',
 'Doublet_to_Doublet',
 'no_gt_but_class',
 'no_gt_into_known',
 'known_class_vireo_unclass',
 'Doublet_to_unknown',
 'Doublet_to_known_class',
 'Doublet_to_Negative',
 'Negative_to_unknown',
 'Negative_to_known_class',
 'known_class_to_unknown',
 'Negative_to_Negative',
 'known_class_to_known_other',
 'Not_Present_to_unknown',
 'Not_Present_to_unclass',
 'Not_Present_to_known',
 'Negative_to_Doublet']

In [23]:
Counter(adata2.obs['SubID_ms'])

Counter({'M1176': 8413,
         'H898': 7065,
         'Doublet': 29697,
         'H653': 6271,
         'M45480': 3597,
         'H1357': 4082,
         'Negative': 7166,
         'M76138': 93,
         'M12571': 7861,
         'M44145': 5165,
         'M67780': 6052,
         'M3114': 6431,
         'M48326': 3883,
         'M30944': 3173,
         'M34836': 4220,
         'M29736': 2361,
         'M12792': 2274,
         'M1746': 1646,
         'M8926': 3393,
         'M580893': 560,
         'M30150': 5022,
         'M55245': 5972,
         'M24257': 6374,
         'M32215': 4543,
         'M72578': 4506,
         'H1563': 1522,
         'M84511': 9093,
         'M9161': 3138,
         'M11589': 4773,
         'Not Present': 134,
         'M88010': 2631,
         'R31291350': 2487,
         'M26992': 3649})

In [24]:
adata1[(adata1.obs['set'] == 'NPSAD-20201112-A') & (adata1.obs['rep'] == '1') & (adata1.obs['H_vs_V'] == 'known_same_class')].shape[0]

4547

In [29]:
# Summarise, per set / replicate / demultiplexer, how the hashing-based calls in
# adata1 compare with vireo, and write the table to a TSV file.

# (comparison column in adata1.obs, label written to the 'Demux_prog' column)
COMPARISONS = [('H_vs_V', 'HTOdemux'), ('S_vs_V', 'Calico_solo'), ('M_vs_V', 'MULTIseq')]

# Output column -> detailed comparison categories summed into it.
# NOTE(review): 'no_gt_but_class' and the 'Not_Present_*' categories are not
# counted in any column - confirm that this is intended.
SUMMARY_GROUPS = {
    'Same_classification': ['known_same_class'],
    'Lost_cells_to_vireo': ['known_class_vireo_unclass', 'no_gt_unclass'],
    'Same_unclassified': ['Doublet_to_Doublet', 'Doublet_to_Negative',
                          'Negative_to_Negative', 'Negative_to_Doublet'],
    'Misc_class': ['known_class_to_known_other', 'known_class_to_unknown', 'no_gt_into_known'],
    'Gained_cells_from_vireo': ['Doublet_to_unknown', 'Doublet_to_known_class',
                                'Negative_to_unknown', 'Negative_to_known_class'],
}

demux_info = []
for sample in demux_info_SS['Set'].unique():
    for rep in ('1', '2'):
        in_rep = (adata1.obs['set'] == sample) & (adata1.obs['rep'] == rep)
        for vs_col, prog in COMPARISONS:
            # Count directly on .obs; no AnnData view is needed just to tally labels.
            category_counts = Counter(adata1.obs.loc[in_rep, vs_col])
            # A Counter returns 0 for categories absent from this subset.
            demux_info.append(
                [sample, prog, rep]
                + [sum(category_counts[cat] for cat in cats) for cats in SUMMARY_GROUPS.values()]
            )

demux_df = pd.DataFrame(demux_info, columns=['Set', 'Demux_prog', 'Rep'] + list(SUMMARY_GROUPS))
demux_df.to_csv("10Sets_SS_Vireo_Final_analysis.tsv", sep="\t", index=False)

In [30]:
# Summarise, per set / replicate / demultiplexer, how the hashing-based calls in
# adata2 compare with vireo, and write the table to a TSV file.

# (comparison column in adata2.obs, label written to the 'Demux_prog' column)
COMPARISONS = [('H_vs_V', 'HTOdemux'), ('S_vs_V', 'Calico_solo'), ('M_vs_V', 'MULTIseq')]

# Output column -> detailed comparison categories summed into it.
# NOTE(review): 'no_gt_but_class' and the 'Not_Present_*' categories are not
# counted in any column - confirm that this is intended.
SUMMARY_GROUPS = {
    'Same_classification': ['known_same_class'],
    'Lost_cells_to_vireo': ['known_class_vireo_unclass', 'no_gt_unclass'],
    'Same_unclassified': ['Doublet_to_Doublet', 'Doublet_to_Negative',
                          'Negative_to_Negative', 'Negative_to_Doublet'],
    'Misc_class': ['known_class_to_known_other', 'known_class_to_unknown', 'no_gt_into_known'],
    'Gained_cells_from_vireo': ['Doublet_to_unknown', 'Doublet_to_known_class',
                                'Negative_to_unknown', 'Negative_to_known_class'],
}

demux_info = []
for sample in demux_info_cr['Set'].unique():
    for rep in ('1', '2'):
        in_rep = (adata2.obs['set'] == sample) & (adata2.obs['rep'] == rep)
        for vs_col, prog in COMPARISONS:
            # Count directly on .obs; no AnnData view is needed just to tally labels.
            category_counts = Counter(adata2.obs.loc[in_rep, vs_col])
            # A Counter returns 0 for categories absent from this subset.
            demux_info.append(
                [sample, prog, rep]
                + [sum(category_counts[cat] for cat in cats) for cats in SUMMARY_GROUPS.values()]
            )

demux_df = pd.DataFrame(demux_info, columns=['Set', 'Demux_prog', 'Rep'] + list(SUMMARY_GROUPS))
demux_df.to_csv("10Sets_cr_Vireo_Final_analysis.tsv", sep="\t", index=False)

In [25]:
Counter(adata1[(adata1.obs['set'] == 'NPSAD-20201112-A') & (adata1.obs['rep'] == '2')].obs['H_vs_V'])

Counter({'no_gt_but_class': 5001,
         'Doublet_to_Doublet': 701,
         'known_class_vireo_unclass': 993,
         'no_gt_unclass': 1268,
         'Doublet_to_known_class': 1298,
         'known_same_class': 4348,
         'Doublet_to_unknown': 1141,
         'Negative_to_Negative': 369,
         'no_gt_into_known': 1927,
         'Negative_to_known_class': 1044,
         'Negative_to_unknown': 1114,
         'Doublet_to_Negative': 436,
         'known_class_to_unknown': 172,
         'known_class_to_known_other': 108,
         'Not_Present_to_unknown': 13,
         'Negative_to_Doublet': 59,
         'Not_Present_to_unclass': 43,
         'Not_Present_to_known': 2})

In [26]:
Counter(adata2[(adata2.obs['set'] == 'NPSAD-20201113-C') & (adata2.obs['rep'] == '1')].obs['H_vs_V'])

Counter({'no_gt_but_class': 8277,
         'known_same_class': 6805,
         'Doublet_to_unknown': 1365,
         'known_class_vireo_unclass': 858,
         'Doublet_to_known_class': 955,
         'Doublet_to_Negative': 332,
         'Negative_to_known_class': 387,
         'Negative_to_Doublet': 11,
         'Doublet_to_Doublet': 1389,
         'Negative_to_Negative': 204,
         'no_gt_unclass': 569,
         'Negative_to_unknown': 516,
         'known_class_to_known_other': 18,
         'known_class_to_unknown': 40,
         'no_gt_into_known': 22})

In [27]:
adata1.write('10sets_SS_vireoSet_after_analysis.h5ad')
adata2.write('10sets_cr_vireoSet_after_analysis.h5ad')

... storing 'H_vs_V' as categorical
... storing 'M_vs_V' as categorical
... storing 'S_vs_V' as categorical
... storing 'H_vs_V' as categorical
... storing 'M_vs_V' as categorical
... storing 'S_vs_V' as categorical


In [46]:
# Collect the four per-cell assignment columns for replicate '1' of the first
# set listed in demux_info_SS (the [:1] slice limits the loop to that one set).
for sample in demux_info_SS['Set'].unique()[:1]:
    rep1_cells = (adata1.obs['set'] == sample) & (adata1.obs['rep'] == '1')
    assignment_cols = ['SubID_cs', 'SubID_ms', 'SubID_hd', 'SubID_vireo']
    df_to_compare_1 = adata1[rep1_cells].obs[assignment_cols]

In [47]:
df_to_compare_1

Unnamed: 0,SubID_cs,SubID_ms,SubID_hd,SubID_vireo
AAACCCAAGACCATAA-NPSAD-20201112-A1-cDNA,M1176,M1176,M1176,M1176
AAACCCAAGACTACCT-NPSAD-20201112-A1-cDNA,H898,H898,H898,Negative
AAACCCAAGCACCGAA-NPSAD-20201112-A1-cDNA,Doublet,Doublet,Doublet,Doublet
AAACCCAAGCCTCATA-NPSAD-20201112-A1-cDNA,H653,H653,H653,donor4
AAACCCAAGCTAGATA-NPSAD-20201112-A1-cDNA,M1176,M1176,M1176,M1176
...,...,...,...,...
TTTGTTGTCATACGAC-NPSAD-20201112-A1-cDNA,H898,H898,H898,donor3
TTTGTTGTCCATTGGA-NPSAD-20201112-A1-cDNA,H898,H898,H898,donor3
TTTGTTGTCTCTAGGA-NPSAD-20201112-A1-cDNA,Doublet,Doublet,M76138,donor3
TTTGTTGTCTTAGGAC-NPSAD-20201112-A1-cDNA,H653,H653,H653,donor4


In [48]:
Counter(df_to_compare_1['SubID_cs']).keys()

dict_keys(['M1176', 'H898', 'Doublet', 'H653', 'H1357', 'M45480', 'M76138', 'Negative'])

In [49]:
Counter(df_to_compare_1['SubID_ms']).keys()

dict_keys(['M1176', 'H898', 'Doublet', 'H653', 'H1357', 'M45480', 'Negative', 'Not Present', 'M76138'])

In [50]:
Counter(df_to_compare_1['SubID_hd']).keys()

dict_keys(['M1176', 'H898', 'Doublet', 'H653', 'H1357', 'M45480', 'Negative', 'M76138', 'Not Present'])

In [51]:
Counter(df_to_compare_1['SubID_vireo']).keys()

dict_keys(['M1176', 'Negative', 'Doublet', 'donor4', 'M76138', 'donor3', 'M45480', 'donor5'])

In [67]:
Counter(df_to_compare_1['SubID_hd'])

Counter({'M1176': 4007,
         'H898': 3164,
         'Doublet': 3556,
         'H653': 2933,
         'H1357': 2237,
         'M45480': 1627,
         'Negative': 1467,
         'M76138': 105,
         'Not Present': 40})

In [68]:
Counter(df_to_compare_1['SubID_ms'])

Counter({'M1176': 4165,
         'H898': 3452,
         'Doublet': 3697,
         'H653': 3075,
         'H1357': 2099,
         'M45480': 1621,
         'Negative': 950,
         'Not Present': 40,
         'M76138': 37})

In [43]:
df_to_compare_1.iloc[:,1].eq(df_to_compare_1.iloc[:, 2], axis=0).all()

False

In [59]:
Counter(df_to_compare_1['SubID_hd'] == "Not Present")

Counter({False: 19096, True: 40})

In [66]:
df_to_compare_1.loc[(df_to_compare_1['SubID_ms'] == "Not Present"), 'SubID_hd']

AAGCCATTCGCATTAG-NPSAD-20201112-A1-cDNA    Not Present
AATAGAGAGAACGTGC-NPSAD-20201112-A1-cDNA    Not Present
AGACCATGTCATGCAT-NPSAD-20201112-A1-cDNA    Not Present
AGGTTGTCATCTAACG-NPSAD-20201112-A1-cDNA    Not Present
AGTCTCCTCTTGGATG-NPSAD-20201112-A1-cDNA    Not Present
AGTGCCGTCTTTCCAA-NPSAD-20201112-A1-cDNA    Not Present
ATACCGATCTCTGCTG-NPSAD-20201112-A1-cDNA    Not Present
ATAGAGAAGTCATAGA-NPSAD-20201112-A1-cDNA    Not Present
ATCGATGTCCATTTCA-NPSAD-20201112-A1-cDNA    Not Present
ATTGGGTCAACGAGGT-NPSAD-20201112-A1-cDNA    Not Present
CAAAGAAAGAAGCGCT-NPSAD-20201112-A1-cDNA    Not Present
CAAGCTAGTCGAACAG-NPSAD-20201112-A1-cDNA    Not Present
CACAGGCGTTCCAAAC-NPSAD-20201112-A1-cDNA    Not Present
CACCGTTTCGATTCCC-NPSAD-20201112-A1-cDNA    Not Present
CACGGGTAGCCATGCC-NPSAD-20201112-A1-cDNA    Not Present
CGTAAGTGTTTCGTTT-NPSAD-20201112-A1-cDNA    Not Present
CTATCTAAGCCGTAAG-NPSAD-20201112-A1-cDNA    Not Present
CTTGAGACATTCTCTA-NPSAD-20201112-A1-cDNA    Not Present
GAAGAATGTG

In [63]:
df_to_compare_1.loc[(df_to_compare_1['SubID_hd'] == "Not Present") | (df_to_compare_1['SubID_ms'] == "Not Present"), "SubID_vireo"].shape

(40,)

In [64]:
df_to_compare_1.loc[(df_to_compare_1['SubID_hd'] == "Not Present") & (df_to_compare_1['SubID_ms'] == "Not Present"), "SubID_vireo"].shape

(40,)

In [70]:
# Print the number of cells in adata1 for every set / replicate combination.
for sample in demux_info_SS['Set'].unique():
    in_set = adata1.obs['set'] == sample
    for rep in ('1', '2'):
        print(sample + rep)
        print(adata1[in_set & (adata1.obs['rep'] == rep)].shape[0])

NPSAD-20201112-A1
19136
NPSAD-20201112-A2
20037
NPSAD-20201113-C1
21643
NPSAD-20201113-C2
20280
NPSAD-20210217-C1
8998
NPSAD-20210217-C2
8236
NPSAD-20201215-C1
18834
NPSAD-20201215-C2
14503
NPSAD-20201030-C1
16695
NPSAD-20201030-C2
18683


In [71]:
# Print the number of cells in adata2 for every set / replicate combination.
for sample in demux_info_cr['Set'].unique():
    in_set = adata2.obs['set'] == sample
    for rep in ('1', '2'):
        print(sample + rep)
        print(adata2[in_set & (adata2.obs['rep'] == rep)].shape[0])

NPSAD-20201112-A1
19407
NPSAD-20201112-A2
20339
NPSAD-20201113-C1
21748
NPSAD-20201113-C2
20376
NPSAD-20210217-C1
9016
NPSAD-20210217-C2
8262
NPSAD-20201215-C1
18825
NPSAD-20201215-C2
14530
NPSAD-20201030-C1
16454
NPSAD-20201030-C2
18290


In [75]:
# For cells whose SubID_ms label is 'Not Present', print the S_vs_V category
# counts in adata1 for every set / replicate combination.
for sample in demux_info_SS['Set'].unique():
    in_set = adata1.obs['set'] == sample
    for rep in ('1', '2'):
        in_rep = in_set & (adata1.obs['rep'] == rep)
        print(sample + rep)
        print(Counter(adata1[in_rep & (adata1.obs['SubID_ms'] == 'Not Present')].obs['S_vs_V']))

NPSAD-20201112-A1
Counter({'no_gt_unclass': 12, 'no_gt_but_class': 9, 'known_class_vireo_unclass': 8, 'known_class_to_unknown': 5, 'Doublet_to_Negative': 2, 'Doublet_to_Doublet': 1, 'no_gt_into_known': 1, 'Negative_to_known_class': 1, 'known_class_to_known_other': 1})
NPSAD-20201112-A2
Counter({'known_class_vireo_unclass': 23, 'no_gt_unclass': 18, 'no_gt_but_class': 7, 'Doublet_to_unknown': 3, 'known_class_to_unknown': 3, 'known_same_class': 1, 'no_gt_into_known': 1, 'Doublet_to_Negative': 1, 'Doublet_to_Doublet': 1})
NPSAD-20201113-C1
Counter({'no_gt_but_class': 25, 'no_gt_unclass': 20, 'known_class_vireo_unclass': 11, 'known_same_class': 10, 'Doublet_to_unknown': 6, 'Doublet_to_Negative': 5, 'Doublet_to_Doublet': 3, 'Negative_to_unknown': 3, 'Doublet_to_known_class': 2, 'known_class_to_unknown': 2})
NPSAD-20201113-C2
Counter({'no_gt_unclass': 48, 'no_gt_but_class': 14, 'known_class_vireo_unclass': 14, 'Doublet_to_Negative': 6, 'Negative_to_Negative': 5, 'Doublet_to_known_class': 4, '

In [76]:
# For cells whose SubID_ms is 'Not Present', tally their S_vs_V category
# per pool (set + replicate) in adata2.
for sample in demux_info_cr['Set'].unique():
    ms_absent = (adata2.obs['set'] == sample) & (adata2.obs['SubID_ms'] == 'Not Present')
    for rep in ('1', '2'):
        print(sample + rep)
        print(Counter(adata2[ms_absent & (adata2.obs['rep'] == rep)].obs['S_vs_V']))

NPSAD-20201112-A1
Counter()
NPSAD-20201112-A2
Counter()
NPSAD-20201113-C1
Counter()
NPSAD-20201113-C2
Counter()
NPSAD-20210217-C1
Counter()
NPSAD-20210217-C2
Counter()
NPSAD-20201215-C1
Counter()
NPSAD-20201215-C2
Counter()
NPSAD-20201030-C1
Counter({'known_class_vireo_unclass': 13, 'known_same_class': 10, 'Doublet_to_Negative': 3, 'no_gt_but_class': 3, 'Doublet_to_known_class': 2, 'no_gt_unclass': 2})
NPSAD-20201030-C2
Counter({'known_class_vireo_unclass': 43, 'Negative_to_Negative': 19, 'known_same_class': 13, 'no_gt_unclass': 11, 'no_gt_but_class': 4, 'Doublet_to_known_class': 4, 'Doublet_to_Negative': 4, 'Negative_to_unknown': 1, 'Doublet_to_Doublet': 1, 'Doublet_to_unknown': 1})


In [79]:
# SubID_ms call counts for rep 1 of set NPSAD-20201112-A in adata2.
pool_mask = (adata2.obs['rep'] == '1') & (adata2.obs['set'] == 'NPSAD-20201112-A')
Counter(adata2[pool_mask].obs['SubID_ms'])

Counter({'M1176': 4178,
         'H898': 3508,
         'Doublet': 3742,
         'H653': 3110,
         'M45480': 1766,
         'H1357': 2105,
         'Negative': 958,
         'M76138': 40})

In [83]:
# SubID_hd call counts for rep 1 of set NPSAD-20201112-A in adata2.
pool_mask = (adata2.obs['rep'] == '1') & (adata2.obs['set'] == 'NPSAD-20201112-A')
Counter(adata2[pool_mask].obs['SubID_hd'])

Counter({'M1176': 4021,
         'H898': 3215,
         'Doublet': 3604,
         'H653': 2964,
         'M45480': 1767,
         'H1357': 2245,
         'Negative': 1480,
         'M76138': 111})

In [84]:
# Number of cells per pool (set + replicate) in adata1 whose SubID_ms is 'Not Present'.
for sample in demux_info_SS['Set'].unique():
    ms_absent = (adata1.obs['set'] == sample) & (adata1.obs['SubID_ms'] == 'Not Present')
    for rep in ('1', '2'):
        print(sample + rep)
        print(adata1[ms_absent & (adata1.obs['rep'] == rep)].n_obs)

NPSAD-20201112-A1
40
NPSAD-20201112-A2
58
NPSAD-20201113-C1
87
NPSAD-20201113-C2
104
NPSAD-20210217-C1
4
NPSAD-20210217-C2
6
NPSAD-20201215-C1
4
NPSAD-20201215-C2
1
NPSAD-20201030-C1
177
NPSAD-20201030-C2
365


In [85]:
# Number of cells per pool (set + replicate) in adata2 whose SubID_ms is 'Not Present'.
for sample in demux_info_cr['Set'].unique():
    ms_absent = (adata2.obs['set'] == sample) & (adata2.obs['SubID_ms'] == 'Not Present')
    for rep in ('1', '2'):
        print(sample + rep)
        print(adata2[ms_absent & (adata2.obs['rep'] == rep)].n_obs)

NPSAD-20201112-A1
0
NPSAD-20201112-A2
0
NPSAD-20201113-C1
0
NPSAD-20201113-C2
0
NPSAD-20210217-C1
0
NPSAD-20210217-C2
0
NPSAD-20201215-C1
0
NPSAD-20201215-C2
0
NPSAD-20201030-C1
33
NPSAD-20201030-C2
101


In [87]:
# Rep 1 of a single pool in adata1, restricted to cells that have a SubID_ms
# call, keeping the CS call, the MS call and the S_vs_V category for spot checks.
# The set is pinned explicitly: this cell used to read `sample`, the loop
# variable left behind by whichever loop cell ran last, so its result depended
# on execution order.
# NOTE(review): 'NPSAD-20201030-C' is the last set of the preceding loop and
# reproduces the 885 CS/MS mismatches recorded below; confirm it is the pool intended.
sample = 'NPSAD-20201030-C'
check_df = adata1[
    (adata1.obs['set'] == sample)
    & (adata1.obs['rep'] == '1')
    & (adata1.obs['SubID_ms'] != 'Not Present')
].obs[['SubID_cs', 'SubID_ms', 'S_vs_V']]

In [94]:
# S_vs_V categories of the rows where the first two columns of check_df differ.
Counter(check_df.loc[check_df.iloc[:, 0].ne(check_df.iloc[:, 1]), 'S_vs_V'])

Counter({'known_same_class': 495,
         'known_class_vireo_unclass': 117,
         'Doublet_to_known_class': 45,
         'no_gt_into_known': 41,
         'Doublet_to_Doublet': 14,
         'no_gt_but_class': 23,
         'known_class_to_known_other': 103,
         'known_class_to_unknown': 7,
         'no_gt_unclass': 21,
         'Doublet_to_Negative': 17,
         'Doublet_to_unknown': 2})

In [103]:
# Keys produced when counting row-wise equality of the first two columns of check_df.
Counter(check_df.iloc[:, 0] == check_df.iloc[:, 1]).keys()

dict_keys([True, False])

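(CS+MS)+diff(CS+MS)Vireo: per pool, the number of cells where the CS and MS calls agree, the number where they differ, and how the differing cells split by their S_vs_V category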

In [107]:
# Per pool in adata1 (cells with a SubID_ms call only), print four numbers:
#   cells where SubID_cs == SubID_ms, cells where they differ, and, among the
#   differing cells, the S_vs_V counts for the Doublet/Negative "_to_known_class"
#   or "_to_unknown" labels and for the Doublet/Negative "_to_Negative" or
#   "_to_Doublet" labels.
for sample in demux_info_SS['Set'].unique():
    for rep in ('1', '2'):
        print(sample + rep)
        pool_mask = (
            (adata1.obs['set'] == sample)
            & (adata1.obs['rep'] == rep)
            & (adata1.obs['SubID_ms'] != 'Not Present')
        )
        check_df = adata1[pool_mask].obs[['SubID_cs', 'SubID_ms', 'S_vs_V']]
        calls_agree = check_df.iloc[:, 0].eq(check_df.iloc[:, 1])
        counter_cs_ms = Counter(calls_agree)
        check_dict = Counter(check_df.loc[~calls_agree, 'S_vs_V'])

        # Counter returns 0 for labels that do not occur in this pool.
        to_known_or_unknown = sum(
            check_dict[label]
            for label in ('Doublet_to_known_class', 'Negative_to_known_class',
                          'Doublet_to_unknown', 'Negative_to_unknown')
        )
        to_doublet_or_negative = sum(
            check_dict[label]
            for label in ('Doublet_to_Negative', 'Doublet_to_Doublet',
                          'Negative_to_Negative', 'Negative_to_Doublet')
        )
        print(counter_cs_ms[True], counter_cs_ms[False], to_known_or_unknown, to_doublet_or_negative)

NPSAD-20201112-A1
16726 2370 485 99
NPSAD-20201112-A2
17531 2448 238 54
NPSAD-20201113-C1
17786 3770 1716 189
NPSAD-20201113-C2
16815 3361 1575 155
NPSAD-20210217-C1
7935 1059 72 23
NPSAD-20210217-C2
7238 992 176 12
NPSAD-20201215-C1
18395 435 48 11
NPSAD-20201215-C2
14337 165 4 3
NPSAD-20201030-C1
15633 885 47 31
NPSAD-20201030-C2
17025 1293 12 3


In [106]:
# Per pool in adata2 (cells with a SubID_ms call only), print four numbers:
#   cells where SubID_cs == SubID_ms, cells where they differ, and, among the
#   differing cells, the S_vs_V counts for the Doublet/Negative "_to_known_class"
#   or "_to_unknown" labels and for the Doublet/Negative "_to_Negative" or
#   "_to_Doublet" labels.
for sample in demux_info_cr['Set'].unique():
    for rep in ('1', '2'):
        print(sample + rep)
        pool_mask = (
            (adata2.obs['set'] == sample)
            & (adata2.obs['rep'] == rep)
            & (adata2.obs['SubID_ms'] != 'Not Present')
        )
        check_df = adata2[pool_mask].obs[['SubID_cs', 'SubID_ms', 'S_vs_V']]
        calls_agree = check_df.iloc[:, 0].eq(check_df.iloc[:, 1])
        counter_cs_ms = Counter(calls_agree)
        check_dict = Counter(check_df.loc[~calls_agree, 'S_vs_V'])

        # Counter returns 0 for labels that do not occur in this pool.
        to_known_or_unknown = sum(
            check_dict[label]
            for label in ('Doublet_to_known_class', 'Negative_to_known_class',
                          'Doublet_to_unknown', 'Negative_to_unknown')
        )
        to_doublet_or_negative = sum(
            check_dict[label]
            for label in ('Doublet_to_Negative', 'Doublet_to_Doublet',
                          'Negative_to_Negative', 'Negative_to_Doublet')
        )
        print(counter_cs_ms[True], counter_cs_ms[False], to_known_or_unknown, to_doublet_or_negative)

NPSAD-20201112-A1
17016 2391 490 100
NPSAD-20201112-A2
17865 2474 238 60
NPSAD-20201113-C1
17941 3807 1713 190
NPSAD-20201113-C2
16973 3403 1585 159
NPSAD-20210217-C1
7962 1054 71 24
NPSAD-20210217-C2
7261 1001 182 11
NPSAD-20201215-C1
18422 403 42 10
NPSAD-20201215-C2
14365 165 4 2
NPSAD-20201030-C1
15551 870 54 29
NPSAD-20201030-C2
16991 1198 10 2


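(CS+HD)+diff(CS+HD)Vireo: per pool, the number of cells where the CS and HD calls agree, the number where they differ, and how the differing cells split by their S_vs_V category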

In [108]:
# Per pool in adata1 (cells with a SubID_hd call only), print four numbers:
#   cells where SubID_cs == SubID_hd, cells where they differ, and, among the
#   differing cells, the S_vs_V counts for the Doublet/Negative "_to_known_class"
#   or "_to_unknown" labels and for the Doublet/Negative "_to_Negative" or
#   "_to_Doublet" labels.
for sample in demux_info_SS['Set'].unique():
    for rep in ('1', '2'):
        print(sample + rep)
        pool_mask = (
            (adata1.obs['set'] == sample)
            & (adata1.obs['rep'] == rep)
            & (adata1.obs['SubID_hd'] != 'Not Present')
        )
        check_df = adata1[pool_mask].obs[['SubID_cs', 'SubID_hd', 'S_vs_V']]
        calls_agree = check_df.iloc[:, 0].eq(check_df.iloc[:, 1])
        # Variable name is unchanged from the CS/MS cells; here it counts CS vs HD agreement.
        counter_cs_ms = Counter(calls_agree)
        check_dict = Counter(check_df.loc[~calls_agree, 'S_vs_V'])

        # Counter returns 0 for labels that do not occur in this pool.
        to_known_or_unknown = sum(
            check_dict[label]
            for label in ('Doublet_to_known_class', 'Negative_to_known_class',
                          'Doublet_to_unknown', 'Negative_to_unknown')
        )
        to_doublet_or_negative = sum(
            check_dict[label]
            for label in ('Doublet_to_Negative', 'Doublet_to_Doublet',
                          'Negative_to_Negative', 'Negative_to_Doublet')
        )
        print(counter_cs_ms[True], counter_cs_ms[False], to_known_or_unknown, to_doublet_or_negative)

NPSAD-20201112-A1
16487 2609 461 128
NPSAD-20201112-A2
16386 3593 534 195
NPSAD-20201113-C1
16996 4560 2023 273
NPSAD-20201113-C2
16327 3849 1770 205
NPSAD-20210217-C1
8277 717 58 8
NPSAD-20210217-C2
7310 920 135 11
NPSAD-20201215-C1
18235 595 44 24
NPSAD-20201215-C2
14046 456 31 16
NPSAD-20201030-C1
15236 1282 132 82
NPSAD-20201030-C2
15881 2437 235 130


In [109]:
# Per pool in adata2 (cells with a SubID_hd call only), print four numbers:
#   cells where SubID_cs == SubID_hd, cells where they differ, and, among the
#   differing cells, the S_vs_V counts for the Doublet/Negative "_to_known_class"
#   or "_to_unknown" labels and for the Doublet/Negative "_to_Negative" or
#   "_to_Doublet" labels.
for sample in demux_info_cr['Set'].unique():
    for rep in ('1', '2'):
        print(sample + rep)
        pool_mask = (
            (adata2.obs['set'] == sample)
            & (adata2.obs['rep'] == rep)
            & (adata2.obs['SubID_hd'] != 'Not Present')
        )
        check_df = adata2[pool_mask].obs[['SubID_cs', 'SubID_hd', 'S_vs_V']]
        calls_agree = check_df.iloc[:, 0].eq(check_df.iloc[:, 1])
        # Variable name is unchanged from the CS/MS cells; here it counts CS vs HD agreement.
        counter_cs_ms = Counter(calls_agree)
        check_dict = Counter(check_df.loc[~calls_agree, 'S_vs_V'])

        # Counter returns 0 for labels that do not occur in this pool.
        to_known_or_unknown = sum(
            check_dict[label]
            for label in ('Doublet_to_known_class', 'Negative_to_known_class',
                          'Doublet_to_unknown', 'Negative_to_unknown')
        )
        to_doublet_or_negative = sum(
            check_dict[label]
            for label in ('Doublet_to_Negative', 'Doublet_to_Doublet',
                          'Negative_to_Negative', 'Negative_to_Doublet')
        )
        print(counter_cs_ms[True], counter_cs_ms[False], to_known_or_unknown, to_doublet_or_negative)

NPSAD-20201112-A1
16774 2633 469 123
NPSAD-20201112-A2
16703 3636 525 220
NPSAD-20201113-C1
17128 4620 2025 277
NPSAD-20201113-C2
16476 3900 1783 210
NPSAD-20210217-C1
8304 712 58 8
NPSAD-20210217-C2
7336 926 139 11
NPSAD-20201215-C1
18266 559 39 21
NPSAD-20201215-C2
14072 458 30 17
NPSAD-20201030-C1
15157 1264 135 84
NPSAD-20201030-C2
15873 2316 228 133


Performance of each demultiplexing method individually

In [125]:
# Per-pool summary of each demultiplexing call column in adata1.
# For every set, replicate and method, one row is collected with:
#   total cells in the pool,
#   cells assigned to an ID listed in hashmap_info for that set,
#   cells assigned to any other ID (neither listed, 'Doublet' nor 'Negative'),
#   cells called 'Doublet' or 'Negative'.
# The rows are written to 10Sets_indiv_demux_perf_SS.tsv.
temp_l = []
for sample in demux_info_SS['Set'].unique():
    # IDs recorded for this set in hashmap_info
    # (presumably the donors with genotypes available; confirm against hashmap_info).
    gt_avail = hashmap_info.loc[hashmap_info['Samp_ID'] == sample, 'SubNum'].tolist()
    for rep in ('1', '2'):
        # Subset the pool once; it is reused for the total and for all four methods.
        pool = adata1[(adata1.obs['set'] == sample) & (adata1.obs['rep'] == rep)]
        tot_cells = pool.n_obs
        for demux_method in ['SubID_cs', 'SubID_ms', 'SubID_hd', 'SubID_vireo']:
            check_dict = Counter(pool.obs[demux_method])
            known_gt_cells = 0
            unknown_gt_cells = 0
            unclass_cells = 0
            for k, v in check_dict.items():
                if k in gt_avail:
                    known_gt_cells += v
                elif k in ('Doublet', 'Negative'):
                    unclass_cells += v
                else:
                    unknown_gt_cells += v
            temp_l.append([sample, rep, demux_method, tot_cells, known_gt_cells, unknown_gt_cells, unclass_cells])

# Raw strings keep the header text exactly as before (backslashes included)
# without relying on invalid escape sequences such as '\('.
# NOTE(review): the backslashes end up in the TSV header; confirm whether they are wanted.
temp_df = pd.DataFrame(
    temp_l,
    columns=['Set', 'Rep', 'Demux_method', 'Total_cells',
             r'Cells assigned\(known_gt\)', r'Cells assigned\(unknown_gt\)',
             r'cells unclass\(Doubs+Negs\)'],
)
temp_df.to_csv('10Sets_indiv_demux_perf_SS.tsv', sep='\t', index=False)

NPSAD-20201112-A1:	SubID_cs
NPSAD-20201112-A1:	SubID_ms
NPSAD-20201112-A1:	SubID_hd
NPSAD-20201112-A1:	SubID_vireo
NPSAD-20201112-A2:	SubID_cs
NPSAD-20201112-A2:	SubID_ms
NPSAD-20201112-A2:	SubID_hd
NPSAD-20201112-A2:	SubID_vireo
NPSAD-20201113-C1:	SubID_cs
NPSAD-20201113-C1:	SubID_ms
NPSAD-20201113-C1:	SubID_hd
NPSAD-20201113-C1:	SubID_vireo
NPSAD-20201113-C2:	SubID_cs
NPSAD-20201113-C2:	SubID_ms
NPSAD-20201113-C2:	SubID_hd
NPSAD-20201113-C2:	SubID_vireo
NPSAD-20210217-C1:	SubID_cs
NPSAD-20210217-C1:	SubID_ms
NPSAD-20210217-C1:	SubID_hd
NPSAD-20210217-C1:	SubID_vireo
NPSAD-20210217-C2:	SubID_cs
NPSAD-20210217-C2:	SubID_ms
NPSAD-20210217-C2:	SubID_hd
NPSAD-20210217-C2:	SubID_vireo
NPSAD-20201215-C1:	SubID_cs
NPSAD-20201215-C1:	SubID_ms
NPSAD-20201215-C1:	SubID_hd
NPSAD-20201215-C1:	SubID_vireo
NPSAD-20201215-C2:	SubID_cs
NPSAD-20201215-C2:	SubID_ms
NPSAD-20201215-C2:	SubID_hd
NPSAD-20201215-C2:	SubID_vireo
NPSAD-20201030-C1:	SubID_cs
NPSAD-20201030-C1:	SubID_ms
NPSAD-20201030-C1:	SubID

In [126]:
# Per-pool summary of each demultiplexing call column in adata2.
# For every set, replicate and method, one row is collected with:
#   total cells in the pool,
#   cells assigned to an ID listed in hashmap_info for that set,
#   cells assigned to any other ID (neither listed, 'Doublet' nor 'Negative'),
#   cells called 'Doublet' or 'Negative'.
# The rows are written to 10Sets_indiv_demux_perf_cr.tsv.
temp_l = []
for sample in demux_info_cr['Set'].unique():
    # IDs recorded for this set in hashmap_info
    # (presumably the donors with genotypes available; confirm against hashmap_info).
    gt_avail = hashmap_info.loc[hashmap_info['Samp_ID'] == sample, 'SubNum'].tolist()
    for rep in ('1', '2'):
        # Subset the pool once; it is reused for the total and for all four methods.
        pool = adata2[(adata2.obs['set'] == sample) & (adata2.obs['rep'] == rep)]
        tot_cells = pool.n_obs
        for demux_method in ['SubID_cs', 'SubID_ms', 'SubID_hd', 'SubID_vireo']:
            check_dict = Counter(pool.obs[demux_method])
            known_gt_cells = 0
            unknown_gt_cells = 0
            unclass_cells = 0
            for k, v in check_dict.items():
                if k in gt_avail:
                    known_gt_cells += v
                elif k in ('Doublet', 'Negative'):
                    unclass_cells += v
                else:
                    unknown_gt_cells += v
            temp_l.append([sample, rep, demux_method, tot_cells, known_gt_cells, unknown_gt_cells, unclass_cells])

# Raw strings keep the header text exactly as before (backslashes included)
# without relying on invalid escape sequences such as '\('.
# NOTE(review): the backslashes end up in the TSV header; confirm whether they are wanted.
temp_df = pd.DataFrame(
    temp_l,
    columns=['Set', 'Rep', 'Demux_method', 'Total_cells',
             r'Cells assigned\(known_gt\)', r'Cells assigned\(unknown_gt\)',
             r'cells unclass\(Doubs+Negs\)'],
)
temp_df.to_csv('10Sets_indiv_demux_perf_cr.tsv', sep='\t', index=False)

NPSAD-20201112-A1:	SubID_cs
NPSAD-20201112-A1:	SubID_ms
NPSAD-20201112-A1:	SubID_hd
NPSAD-20201112-A1:	SubID_vireo
NPSAD-20201112-A2:	SubID_cs
NPSAD-20201112-A2:	SubID_ms
NPSAD-20201112-A2:	SubID_hd
NPSAD-20201112-A2:	SubID_vireo
NPSAD-20201113-C1:	SubID_cs
NPSAD-20201113-C1:	SubID_ms
NPSAD-20201113-C1:	SubID_hd
NPSAD-20201113-C1:	SubID_vireo
NPSAD-20201113-C2:	SubID_cs
NPSAD-20201113-C2:	SubID_ms
NPSAD-20201113-C2:	SubID_hd
NPSAD-20201113-C2:	SubID_vireo
NPSAD-20210217-C1:	SubID_cs
NPSAD-20210217-C1:	SubID_ms
NPSAD-20210217-C1:	SubID_hd
NPSAD-20210217-C1:	SubID_vireo
NPSAD-20210217-C2:	SubID_cs
NPSAD-20210217-C2:	SubID_ms
NPSAD-20210217-C2:	SubID_hd
NPSAD-20210217-C2:	SubID_vireo
NPSAD-20201215-C1:	SubID_cs
NPSAD-20201215-C1:	SubID_ms
NPSAD-20201215-C1:	SubID_hd
NPSAD-20201215-C1:	SubID_vireo
NPSAD-20201215-C2:	SubID_cs
NPSAD-20201215-C2:	SubID_ms
NPSAD-20201215-C2:	SubID_hd
NPSAD-20201215-C2:	SubID_vireo
NPSAD-20201030-C1:	SubID_cs
NPSAD-20201030-C1:	SubID_ms
NPSAD-20201030-C1:	SubID