# **Computing Average Expression per Seurat and Paper Cluster**

* https://www.nature.com/articles/nmeth.2365

[Chung Breast](#cb)

[Darmanis Glio](#dg)

[Jerby Arnon Mel](#jam)

[Li CRC](#crc)

In [1]:
import pandas as pd
import numpy as np
datadir = '~/Documents/school/research/scrnaseq_data'

from collections import Counter
from sklearn.preprocessing import MinMaxScaler

def get_avg_expr(cell_frame, cluster_frame, method):
    """Average expression per cluster plus a majority-vote label per cluster.

    Parameters
    ----------
    cell_frame : pandas.DataFrame
        Expression table. The first column is copied to the result as
        ``genes``; every other column is looked up by cell id.
    cluster_frame : pandas.DataFrame
        One row per cell with the columns ``cell``, ``truth`` and ``method``.
    method : str
        Name of the column in ``cluster_frame`` that holds the cluster ids.
        Ids ``1 .. max`` are visited; rows with any other id (for example 0)
        are ignored.

    Returns
    -------
    avg_expr : pandas.DataFrame
        ``genes`` followed by one ``clust<k>`` column per retained cluster,
        each holding the row-wise mean over that cluster's cells.
    gold_std : pandas.DataFrame
        Columns ``cluster`` and ``label``; the label is the upper-cased most
        frequent ``truth`` value among the cluster's cells.

    Notes
    -----
    Clusters with no assigned cells, or whose average is NaN for every row
    (e.g. none of their cells exist in ``cell_frame``), are dropped and the
    remaining clusters are renumbered consecutively from ``clust1``.
    Per-cluster diagnostics are printed to stdout.
    """
    cluster_ids = cluster_frame[method].dropna()
    highest_id = int(cluster_ids.max()) if len(cluster_ids) else 0

    # (original name, label, per-row mean) for every cluster that has members
    per_cluster = []

    for i in range(1, highest_id + 1):
        name = f'clust{i}'
        members = cluster_frame[cluster_frame[method] == i]

        if members.empty:
            # A gap in the id sequence: nothing to label or average.
            print(f'{name} has no cells assigned, skipping')
            continue

        # use the most frequent label in each cluster as the true label
        c = Counter(members['truth'])
        label = str(c.most_common(1)[0][0]).upper()
        print(f'\nCells in {name}:\n{c}\n')

        cells = members['cell']
        sdiff = set(cells).difference(cell_frame.columns)
        if sdiff:
            print(f'{name} is missing cells from the input data, probably filtered')
            print(len(sdiff))
            print(sdiff)
            cluster_data = cell_frame.loc[:, cell_frame.columns.intersection(cells)]
        else:
            cluster_data = cell_frame.loc[:, cells]

        #TODO might need to scale the data (e.g. MinMaxScaler)
        #TODO might need to check for and remove NAs
        if cluster_data.shape[1] == 0:
            # No cell of this cluster is present: make the all-NaN result explicit.
            cluster_avg = pd.Series(float('nan'), index=cell_frame.index)
        else:
            cluster_avg = cluster_data.mean(axis=1)

        per_cluster.append((name, label, cluster_avg))

    # Remove clusters that were filtered out due to either being bulk-seq or unknown
    survivors = []
    for name, label, cluster_avg in per_cluster:
        if cluster_avg.isnull().values.all():
            print(f'{name} has NaN values')
        else:
            survivors.append((label, cluster_avg))

    # Assemble the outputs once, numbering the retained clusters consecutively.
    avg_expr = pd.DataFrame({'genes': cell_frame.iloc[:, 0]})
    records = []
    for new_id, (label, cluster_avg) in enumerate(survivors, start=1):
        new_name = f'clust{new_id}'
        avg_expr[new_name] = cluster_avg
        records.append({'cluster': new_name, 'label': label})
    gold_std = pd.DataFrame(records, columns=['cluster', 'label'])

    return avg_expr, gold_std

<a id='cb'></a>
## **Chung Breast**

In [17]:
# Load the Chung breast count table (tab-separated, path ends in .xz) and the
# per-cell cluster assignments, then preview both.
data = pd.read_csv(f'{datadir}/chung_breast/filtered/counts.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/chung_breast/chung_breast_clusters.csv')
display(data.head())
display(data_meta.head())

Unnamed: 0,gene,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,A1BG,25,10,3,0,142,31,0,0,20,...,4,11,114,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,12,0,0,0,0,0,...,0,744,0,0,0,0,0,0,0,0
2,A1CF,0,0,0,0,0,0,321,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A2M,0,0,1,0,0,3,0,0,0,...,14326,15851,603,74245,32537,9410,119823,6285,32378,50387
4,A2M-AS1,0,0,0,0,0,0,0,0,0,...,748,393,0,5299,1633,421,2584,202,2434,2214


Unnamed: 0,cell,seurat,truth,paper
0,BC01_02,10,Tumor,4
1,BC01_03,10,Tumor,4
2,BC01_04,10,Tumor,4
3,BC01_05,10,Tumor,4
4,BC01_06,10,Tumor,4


In [5]:
# An empty method name is not a column of data_meta and raises KeyError on a
# fresh run. The recorded output of this cell (six single-label clusters,
# clust4 == Tumor) matches the `paper` column of the cluster file.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'paper')
display(avg_expr)
display(gold_std)
#TODO see if i can compress this and still run javier's code
#avg_expr.to_csv('./chung_breast/seurat_counts_E_xy_matrix.tsv', sep='\t', index=False)
#gold_std.to_csv('./chung_breast/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'Immune': 2})

clust1 is missing cells from the input data, probably filtered
{'BC06_Pooled', 'BC09_Pooled'}

Cells in clust2:
Counter({'Tcell': 54})


Cells in clust3:
Counter({'Myeloid': 38})


Cells in clust4:
Counter({'Tumor': 326})

clust4 is missing cells from the input data, probably filtered
{'BC02_Pooled', 'BC01_Pooled', 'BC03_Pooled', 'BC05_Pooled', 'BC03LN_Pooled', 'BC10_Pooled', 'BC11_Pooled', 'BC04_Pooled', 'BC08_Pooled'}

Cells in clust5:
Counter({'Stromal': 23})


Cells in clust6:
Counter({'Bcell': 83})

clust1 has NaN values


Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5
0,A1BG,11.259259,11.447368,19.864353,26.043478,12.759036
1,A1BG-AS1,0.111111,0.105263,9.722397,0.086957,4.289157
2,A1CF,0.055556,3.157895,5.943218,0.434783,0.638554
3,A2M,70.740741,1876.105263,1259.716088,768.000000,0.674699
4,A2M-AS1,0.129630,50.947368,56.315457,18.434783,0.048193
...,...,...,...,...,...,...
38553,ZYG11A,15.722222,2.421053,124.766562,3.260870,30.120482
38554,ZYG11B,246.000000,116.500000,146.031546,214.739130,102.240964
38555,ZYX,346.407407,333.815789,231.602524,294.260870,147.216867
38556,ZZEF1,276.388889,254.421053,124.261830,165.695652,237.120482


Unnamed: 0,cluster,label
1,clust1,TCELL
2,clust2,MYELOID
3,clust3,TUMOR
4,clust4,STROMAL
5,clust5,BCELL


In [18]:
# Load the Chung breast `fqn` table (presumably a normalised version of the
# counts — confirm) together with the same cluster assignments as above.
data = pd.read_csv(f'{datadir}/chung_breast/filtered/fqn.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/chung_breast/chung_breast_clusters.csv')
display(data.head())
display(data_meta.head())

Unnamed: 0,C0,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,A1BG,2.775797,1.094859,1.536536,0.0,5.776871,3.515868,0.0,0.0,3.160243,...,4.158362,3.079034,6.509272,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1BG-AS1,0.0,0.0,0.0,2.296493,0.0,0.0,0.0,0.0,0.0,...,0.0,8.589806,0.0,0.0,0.002799,0.748128,0.0,0.0,0.0,0.013939
2,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,7.554574,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005592,1.665764,0.0,0.0,0.0,0.00838
3,A2M,0.0,0.0,0.733038,0.0,0.0,1.013939,0.0,0.0,0.0,...,13.946047,13.081743,9.149042,17.074537,15.844292,13.653033,17.074537,12.645862,14.708106,16.047717
4,A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.675334,7.523669,0.0,13.095681,10.727023,8.986218,11.615723,6.603749,10.675625,11.683558


Unnamed: 0,cell,seurat,truth,paper
0,BC01_02,10,Tumor,4
1,BC01_03,10,Tumor,4
2,BC01_04,10,Tumor,4
3,BC01_05,10,Tumor,4
4,BC01_06,10,Tumor,4


In [20]:
# Average the fqn values per Seurat cluster and write both results as
# tab-separated files (no index; the gold standard also without a header).
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
display(avg_expr)
display(gold_std)
#TODO see if i can compress this and still run javier's code
avg_expr.to_csv('./chung_breast/seurat_fqn_E_xy_matrix.tsv', sep='\t', index=False)
gold_std.to_csv('./chung_breast/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'Tcell': 51, 'Bcell': 45, 'Myeloid': 11, 'Tumor': 3})


Cells in clust2:
Counter({'Tumor': 75})


Cells in clust3:
Counter({'Tumor': 44, 'Stromal': 18, 'Myeloid': 4, 'Tcell': 2})


Cells in clust4:
Counter({'Tumor': 50, 'Stromal': 1, 'Tcell': 1, 'Myeloid': 1})


Cells in clust5:
Counter({'Tumor': 52, 'Stromal': 1})


Cells in clust6:
Counter({'Tumor': 47})


Cells in clust7:
Counter({'Bcell': 38, 'Tumor': 1})


Cells in clust8:
Counter({'Tumor': 24, 'Stromal': 1})


Cells in clust9:
Counter({'Myeloid': 22, 'Tumor': 1, 'Stromal': 1})


Cells in clust10:
Counter({'Tumor': 20, 'Stromal': 1})



Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10
0,A1BG,1.521044,0.522252,2.466370,1.847591,1.233673,0.288798,0.544004,2.135447,0.804940,1.993655
1,A1BG-AS1,0.282706,0.621468,0.399832,0.314434,0.227499,0.076516,0.261947,1.381896,0.371051,0.969579
2,A1CF,0.404227,0.188735,0.509545,0.127317,0.478700,0.354380,0.111602,0.024512,0.415713,0.671768
3,A2M,1.373209,0.453172,4.765913,1.318681,2.028120,0.668312,0.530718,0.254359,9.416378,0.343547
4,A2M-AS1,0.279894,0.262929,2.697241,0.429142,0.309638,0.071196,0.139123,0.165153,5.871009,0.107702
...,...,...,...,...,...,...,...,...,...,...,...
38553,ZYG11A,1.256530,4.633337,2.444392,5.260406,0.787375,3.311452,2.230670,4.130863,0.695820,0.000000
38554,ZYG11B,5.298208,5.341063,4.696810,5.110921,5.461114,4.763328,4.318161,4.615147,5.283850,5.453133
38555,ZYX,3.738658,4.011764,4.437787,7.703578,2.612358,4.212710,4.427903,5.046454,7.287819,7.060861
38556,ZZEF1,5.054665,3.909542,2.816822,3.716406,3.543644,4.613953,5.799188,4.203876,4.559795,3.530057


Unnamed: 0,cluster,label
0,clust1,TCELL
1,clust2,TUMOR
2,clust3,TUMOR
3,clust4,TUMOR
4,clust5,TUMOR
5,clust6,TUMOR
6,clust7,BCELL
7,clust8,TUMOR
8,clust9,MYELOID
9,clust10,TUMOR


<a id='dg'></a>
## **Darmanis Glioblastoma**

In [2]:
# Load the Darmanis glioblastoma count table (from the `raw` folder, unlike the
# `filtered` folder used for the other datasets) and its cluster assignments.
data = pd.read_csv(f'{datadir}/darmanis_glioblastoma/raw/counts.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/darmanis_glioblastoma/darmanis_clusters.csv')

In [6]:
# Average the counts per Seurat cluster and persist the results.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
display(avg_expr)
display(gold_std)

avg_expr.to_csv('./darmanis_glioblastoma/seurat_counts_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): the fqn cell further down writes to this same gold-standard
# path, so whichever cell runs last determines the file's content.
gold_std.to_csv('./darmanis_glioblastoma/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'Astocyte': 88})


Cells in clust2:
Counter({'OPC': 406})


Cells in clust3:
Counter({'Neoplastic': 1091})


Cells in clust4:
Counter({'Neuron': 21})


Cells in clust5:
Counter({'Vascular': 51})


Cells in clust6:
Counter({'Oligodendrocyte': 85})


Cells in clust7:
Counter({'Immune cell': 1847})



Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7
0,1/2-SBSRNA4,11.125000,1.864532,3.194317,4.095238,0.588235,0.541176,0.433676
1,A1BG,0.193182,0.219212,9.384968,13.000000,3.450980,21.352941,4.749865
2,A1BG-AS1,0.000000,0.002463,1.240147,0.809524,0.215686,0.364706,0.361126
3,A1CF,0.000000,0.152709,0.396884,0.000000,0.000000,0.129412,0.002166
4,A2LD1,3.659091,1.800493,1.329973,0.000000,5.882353,0.000000,1.175961
...,...,...,...,...,...,...,...,...
23455,ZYG11B,12.875000,36.960591,21.728689,34.523810,15.019608,24.388235,6.567948
23456,ZYX,16.715909,46.135468,128.826764,25.142857,77.137255,13.647059,94.570655
23457,ZZEF1,25.454545,26.256158,17.452796,20.761905,3.647059,22.047059,23.659989
23458,ZZZ3,15.840909,24.261084,23.822181,26.000000,7.294118,63.200000,7.370872


Unnamed: 0,cluster,label
0,clust1,ASTOCYTE
1,clust2,OPC
2,clust3,NEOPLASTIC
3,clust4,NEURON
4,clust5,VASCULAR
5,clust6,OLIGODENDROCYTE
6,clust7,IMMUNE CELL


In [3]:
# Load the Darmanis `fqn` table and the same cluster assignments as for the counts.
data = pd.read_csv(f'{datadir}/darmanis_glioblastoma/raw/fqn.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/darmanis_glioblastoma/darmanis_clusters.csv')

In [5]:
# Average the fqn values per Seurat cluster and persist the results.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
display(avg_expr)
display(gold_std)

avg_expr.to_csv('./darmanis_glioblastoma/seurat_fqn_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): same gold-standard path as the counts cell above; this write
# replaces that file.
gold_std.to_csv('./darmanis_glioblastoma/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'Immune cell': 574})


Cells in clust2:
Counter({'Immune cell': 512, 'Neoplastic': 1})


Cells in clust3:
Counter({'Immune cell': 480, 'Neoplastic': 2})


Cells in clust4:
Counter({'OPC': 397})


Cells in clust5:
Counter({'Neoplastic': 386, 'Astocyte': 6, 'OPC': 1})


Cells in clust6:
Counter({'Neoplastic': 262})


Cells in clust7:
Counter({'Neoplastic': 239, 'OPC': 6, 'Astocyte': 2})


Cells in clust8:
Counter({'Immune cell': 173, 'Vascular': 1})


Cells in clust9:
Counter({'Neoplastic': 104, 'Neuron': 1})


Cells in clust10:
Counter({'Immune cell': 102})


Cells in clust11:
Counter({'Oligodendrocyte': 81})


Cells in clust12:
Counter({'Astocyte': 78})


Cells in clust13:
Counter({'Neoplastic': 58, 'OPC': 2})


Cells in clust14:
Counter({'Neoplastic': 39, 'Immune cell': 6, 'Oligodendrocyte': 4, 'Astocyte': 2, 'Vascular': 1})


Cells in clust15:
Counter({'Vascular': 49})


Cells in clust16:
Counter({'Neuron': 20})



Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10,clust11,clust12,clust13,clust14,clust15,clust16
0,1/2-SBSRNA4,0.094551,0.052505,0.021762,0.117196,0.139090,0.059301,0.312518,0.038825,0.179480,0.000000,0.212729,0.042691,0.000000,0.176590,0.198301,0.299197
1,A1BG,0.116795,0.647762,1.182275,0.025450,0.668011,0.948721,1.095264,0.780966,0.913298,0.557561,1.669770,0.059668,0.876163,0.002502,0.806698,0.920533
2,A1BG-AS1,0.049521,0.068071,0.025905,0.013216,0.168648,0.183332,0.049700,0.006994,0.151711,0.091450,0.056705,0.002697,0.091466,0.002495,0.069308,0.205741
3,A1CF,0.017390,0.012502,0.016366,0.022482,0.022611,0.012887,0.056755,0.006538,0.056304,0.002546,0.071362,0.002624,0.004912,0.002487,0.004738,0.008002
4,A2LD1,0.078930,0.194115,0.124956,0.055431,0.155752,0.097099,0.101139,0.005890,0.015643,0.086150,0.002426,0.190515,0.098046,0.002479,0.347334,0.007299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23455,ZYG11B,0.557992,0.685497,0.911877,1.405379,1.024520,0.817568,0.964453,0.407088,1.478279,0.773352,0.869944,0.786263,0.620685,0.137901,0.810192,2.107222
23456,ZYX,4.227528,4.970446,5.538162,2.190212,2.635364,6.911799,1.591430,3.675969,5.055442,5.631031,0.542784,0.731744,2.712218,0.140753,4.062914,1.563082
23457,ZZEF1,1.174485,1.503910,1.523051,0.924411,1.035667,0.667251,0.584437,0.824149,1.239819,1.972724,0.890612,1.098510,0.893934,0.145327,0.564298,1.288031
23458,ZZZ3,0.369511,0.365218,0.737066,0.915586,1.111705,1.162826,1.076030,0.337809,0.986630,0.465258,1.106914,1.140918,1.536220,0.000000,0.521019,1.286949


Unnamed: 0,cluster,label
0,clust1,IMMUNE CELL
1,clust2,IMMUNE CELL
2,clust3,IMMUNE CELL
3,clust4,OPC
4,clust5,NEOPLASTIC
5,clust6,NEOPLASTIC
6,clust7,NEOPLASTIC
7,clust8,IMMUNE CELL
8,clust9,NEOPLASTIC
9,clust10,IMMUNE CELL


<a id='jam'></a>
## **Jerby Arnon Melanoma**

In [6]:
# Load the Jerby-Arnon melanoma count table and its cluster assignments.
data = pd.read_csv(f'{datadir}/jerby_arnon_melanoma/filtered/counts.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/jerby_arnon_melanoma/jerby_arnon_melanoma_clusters.csv')

In [7]:
# Average the counts per Seurat cluster; nothing is written by this cell.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)

# NOTE(review): the disabled exports below use `paper_` file names although the
# call above clusters by 'seurat' — confirm the intended names before enabling.
#avg_expr.to_csv('./jerby_arnon_melanoma/paper_E_xy_matrix.tsv', sep='\t', index=False)
#gold_std.to_csv('./jerby_arnon_melanoma/paper_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'T.CD8': 834, 'T.CD4': 84, 'T cell': 64, 'NK': 38, 'Macrophage': 8, 'Mal': 2})

clust1 is missing cells from the input data, probably filtered
{'CY75_1F54_G12_S564_comb', 'CY89F54_F07_S259_comb', 'CY89F54_G11_S275_comb', 'Cy81_F54_CD45_G05_S269_comb', 'CY89F54_D07_S235_comb', 'CY75_1F54_H08_S572_comb', 'CY89F54_E01_S241_comb', 'CY89F54_C04_S220_comb', 'Cy81_F54_CD45_F04_S256_comb', 'Cy81_F54_CD45_H12_S288_comb', 'CY89F54_A08_S200_comb', 'CY75_1F54_F01_S541_comb', 'CY75_1F54_F12_S552_comb', 'CY89F54_C08_S224_comb', 'CY75_1F54_F04_S544_comb', 'CY75_1F54_F10_S550_comb', 'CY75_1F54_E09_S537_comb', 'CY89F54_F05_S257_comb', 'CY89F54_B08_S212_comb'}

Cells in clust2:
Counter({'T.CD4': 384, 'T.CD8': 304, 'T cell': 217, 'NK': 20, 'B cell': 4, 'Mal': 4, 'Macrophage': 1})

clust2 is missing cells from the input data, probably filtered
{'CY75_1F54_A12_S492_comb', 'CY75_1F54_E11_S539_comb', 'CY75_1F54_F06_S546_comb', 'CY75_1F54_G09_S561_comb', 'CY75_1F54_H09_S573_comb', '

Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10,clust11,clust12,clust13,clust14,clust15,clust16,clust17
0,C9orf152,0.007913,0.023359,0.491484,0.001610,0.209343,0.006061,0.047170,0.005076,0.009404,0.004219,0.004950,0.017045,0.006410,0.000000,0.040984,0.010989,0.014286
1,RPS11,159.012859,188.962180,383.077859,387.119163,348.752595,26.852525,393.525943,731.804569,259.968652,2149.451477,259.762376,721.426136,248.903846,254.910448,339.221311,225.703297,428.614286
2,ELMO2,42.088032,19.825362,11.624088,49.579710,57.572664,8.498990,49.575472,84.545685,27.115987,97.451477,52.178218,98.517045,56.724359,17.552239,24.696721,17.076923,24.000000
3,CREB3L1,0.011869,0.075640,0.010949,0.009662,0.008651,0.066667,0.002358,0.000000,0.000000,2.092827,0.000000,0.119318,0.000000,0.000000,0.000000,0.142857,32.057143
4,PNMA1,14.570722,14.672970,7.779805,17.850242,27.043253,9.668687,11.978774,45.299492,30.150470,83.413502,17.930693,22.619318,27.487179,8.597015,32.827869,20.593407,13.271429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23681,PIK3IP1,123.655786,169.957731,131.588808,452.755233,225.025952,8.915152,86.353774,14.984772,19.514107,8.936709,89.861386,23.511364,4.115385,3.343284,7.049180,47.934066,48.514286
23682,SNRPD2,30.600396,24.068966,45.489051,63.745572,73.762976,10.557576,52.646226,225.043147,45.746082,668.995781,92.683168,167.642045,103.102564,58.985075,80.204918,43.549451,83.771429
23683,SLC39A6,18.945598,8.480534,18.826034,11.581320,15.906574,23.701010,29.655660,73.467005,102.163009,56.573840,24.227723,58.642045,114.910256,51.865672,79.516393,3.109890,60.014286
23684,CTSC,129.087043,84.418242,48.555961,77.112721,228.605536,8.422222,1073.181604,81.025381,41.426332,297.502110,137.495050,244.539773,95.564103,51.820896,56.221311,107.285714,105.542857


Unnamed: 0,cluster,label
0,clust1,T.CD8
1,clust2,T.CD4
2,clust3,B CELL
3,clust4,T CELL
4,clust5,T.CD8
5,clust6,MAL
6,clust7,MACROPHAGE
7,clust8,MAL
8,clust9,MAL
9,clust10,MAL


In [35]:
# Load the Jerby-Arnon `fqn` table and harmonise cell ids before the lookup in
# get_avg_expr: column names and metadata cell ids are both upper-cased, and
# 'FNA' is rewritten to 'F54' in the column names (presumably to match the F54
# spelling used in the cluster file — confirm).
data = pd.read_csv(f'{datadir}/jerby_arnon_melanoma/filtered/fqn.tsv.xz', sep='\t')
data.columns = data.columns.str.upper()
data.columns = data.columns.str.replace('FNA','F54')
data_meta = pd.read_csv(f'{datadir}/jerby_arnon_melanoma/jerby_arnon_melanoma_clusters.csv')
data_meta.cell = data_meta.cell.str.upper()

In [36]:
# Average the fqn values per Seurat cluster and write both results as
# tab-separated files (no index; the gold standard also without a header).
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)

avg_expr.to_csv('./jerby_arnon_melanoma/seurat_fqn_E_xy_matrix.tsv', sep='\t', index=False)
gold_std.to_csv('./jerby_arnon_melanoma/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'T.CD8': 834, 'T.CD4': 84, 'T cell': 64, 'NK': 38, 'Macrophage': 8, 'Mal': 2})


Cells in clust2:
Counter({'T.CD4': 384, 'T.CD8': 304, 'T cell': 217, 'NK': 20, 'B cell': 4, 'Mal': 4, 'Macrophage': 1})


Cells in clust3:
Counter({'B cell': 779, 'T cell': 42, 'T.CD4': 5, 'Mal': 4, 'T.CD8': 2, 'Macrophage': 1})


Cells in clust4:
Counter({'T cell': 272, 'T.CD4': 241, 'T.CD8': 107, 'Mal': 7, 'CAF': 5, 'B cell': 3, 'NK': 2, 'Endo': 1})


Cells in clust5:
Counter({'T.CD8': 382, 'T.CD4': 100, 'T cell': 85, 'NK': 30, 'Endo': 1})


Cells in clust6:
Counter({'Mal': 484, 'Endo': 7, 'T.CD8': 2, 'CAF': 1, 'B cell': 1})


Cells in clust7:
Counter({'Macrophage': 404, 'Mal': 10, 'B cell': 6, 'T.CD4': 3, 'T cell': 2, 'Endo': 1})


Cells in clust8:
Counter({'Mal': 379, 'CAF': 9, 'Endo': 3, 'T cell': 1, 'Macrophage': 1, 'T.CD4': 1})


Cells in clust9:
Counter({'Mal': 330, 'T.CD4': 2, 'T.CD8': 2, 'T cell': 1, 'CAF': 1, 'Endo': 1, 'Macrophage': 1})


Cells in clust10:
Counter({'M

Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10,clust11,clust12,clust13,clust14,clust15,clust16,clust17
0,C9orf152,0.014644,0.021449,0.018687,0.005632,0.014047,0.008741,0.011991,0.014679,0.004273,0.009508,0.006001,0.012390,0.003123,0.000000,0.025776,0.008830,0.007353
1,RPS11,7.207495,7.938193,8.812948,8.016513,7.318849,5.788669,7.918238,7.462823,8.315908,9.105323,7.390002,8.470134,7.937702,8.528750,8.920667,7.586241,8.436257
2,ELMO2,1.865099,1.512645,0.756625,1.111613,1.489299,1.922498,2.193574,1.483687,1.966022,2.207032,1.916710,3.069241,2.799337,1.843360,2.732563,1.195471,1.680713
3,CREB3L1,0.106365,0.113071,0.096524,0.115886,0.114075,0.108515,0.083101,0.095771,0.059933,0.138164,0.080552,0.060005,0.028141,0.035636,0.042240,0.149839,2.511102
4,PNMA1,0.963255,0.868027,0.377147,0.628560,0.718090,2.434527,0.795835,0.947383,2.482696,1.505444,0.864397,0.895330,1.449149,0.813962,3.369642,1.435213,1.480459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23681,PIK3IP1,4.064595,5.834744,3.396586,5.542942,3.477079,1.838391,2.486690,0.275271,0.989684,0.199749,2.922683,0.854159,0.228536,0.593021,0.833824,2.607772,1.757386
23682,SNRPD2,3.250443,3.411004,3.388428,2.827432,2.945087,3.445710,3.391555,4.742840,4.959801,7.339970,4.889847,5.463621,5.928891,5.162217,6.166370,3.441281,4.785002
23683,SLC39A6,0.981412,0.727141,0.914100,0.497802,0.501020,4.536556,1.504336,1.814045,5.765983,1.497481,1.226062,2.205379,5.308243,3.505274,5.618572,0.390708,3.410771
23684,CTSC,6.488019,6.383980,5.886648,6.379421,6.714492,3.616199,8.239469,4.307384,4.615100,4.798501,5.797581,6.296024,5.128570,4.426330,5.411391,5.380368,5.601058


Unnamed: 0,cluster,label
0,clust1,T.CD8
1,clust2,T.CD4
2,clust3,B CELL
3,clust4,T CELL
4,clust5,T.CD8
5,clust6,MAL
6,clust7,MACROPHAGE
7,clust8,MAL
8,clust9,MAL
9,clust10,MAL


<a id='crc'></a>
## **Li CRC**

In [3]:
# Load the Li CRC count table and the cluster file that includes the inferCNV
# columns, then preview both.
data = pd.read_csv(f'{datadir}/li_crc/filtered/counts.tsv.xz', sep='\t')

data_meta = pd.read_csv(f'{datadir}/li_crc/crc_clusters_with_infercnv.csv')
#data_meta = data_meta.fillna('na')
display(data.head())
data_meta.head()

Unnamed: 0,gene,ensembl,RHC3546__Tcell__.C6E879,RHC3552__Epithelial__.2749FE,RHC3553__Epithelial__.2749FE,RHC3555__Bcell__.7DEA7B,RHC3556__Epithelial__.2749FE,RHC3557__Bcell__.7DEA7B,RHC3562__Bcell__.7DEA7B,RHC3563__Bcell__.7DEA7B,...,RHC5991__Fibroblast__.FB4E09,RHC5995__Tcell__.C6E879,RHC5999__Macrophage__.FFFF55,RHC6004__Tcell__.C6E879,RHC6010__Tcell__.C6E879,RHC6022__Tcell__.C6E879,RHC6030__Fibroblast__.FB4E09,RHC6033__Macrophage__.FFFF55,RHC6039__Tcell__.C6E879,RHC6041__Macrophage__.FFFF55
0,TSPAN6,ENSG00000000003.10,3.0,0.0,0.0,0.0,443.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TNMD,ENSG00000000005.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DPM1,ENSG00000000419.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SCYL3,ENSG00000000457.9,0.0,0.0,1.0,451.0,0.0,24.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C1orf112,ENSG00000000460.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,cell,ascend,cellranger,cidr,countClust,raceid,rca,SAFE,SAME_AIC,SAME_BIC,sc3,scran,simlr,paper_truth,tscan,old_paper,truth,infercnv,conicsmat,paper
0,RHC3546__Tcell__.C6E879,1,3,1,3,1,1,12,3,1,7,2,1,Tcell,1,6,Tcell,normal,Normal,6
1,RHC3552__Epithelial__.2749FE,2,1,2,4,1,0,3,2,2,6,1,3,Epithelial,2,4,Malignant,cancer,Normal,2
2,RHC3553__Epithelial__.2749FE,2,1,3,2,1,6,4,2,2,6,1,3,Epithelial,2,4,Malignant,cancer,Normal,2
3,RHC3555__Bcell__.7DEA7B,1,1,4,5,2,13,1,3,1,7,2,2,Bcell,2,8,Bcell,cancer,Normal,5
4,RHC3556__Epithelial__.2749FE,1,1,2,2,1,6,11,1,2,6,2,4,Epithelial,1,4,Malignant,cancer,Tumour,2


In [4]:
# Fold the Ensembl id into the gene name ("<gene>_<ensembl>") and drop the
# separate column so the first column is the only non-cell column left.
# Vectorised string concatenation replaces the row-wise apply, and the guard
# keeps the cell re-runnable once `ensembl` has already been removed.
if 'ensembl' in data.columns:
    data['gene'] = data['gene'].astype(str) + '_' + data['ensembl'].astype(str)
    data = data.drop('ensembl', axis=1)
data.head()

Unnamed: 0,gene,RHC3546__Tcell__.C6E879,RHC3552__Epithelial__.2749FE,RHC3553__Epithelial__.2749FE,RHC3555__Bcell__.7DEA7B,RHC3556__Epithelial__.2749FE,RHC3557__Bcell__.7DEA7B,RHC3562__Bcell__.7DEA7B,RHC3563__Bcell__.7DEA7B,RHC3565__Epithelial__.2749FE,...,RHC5991__Fibroblast__.FB4E09,RHC5995__Tcell__.C6E879,RHC5999__Macrophage__.FFFF55,RHC6004__Tcell__.C6E879,RHC6010__Tcell__.C6E879,RHC6022__Tcell__.C6E879,RHC6030__Fibroblast__.FB4E09,RHC6033__Macrophage__.FFFF55,RHC6039__Tcell__.C6E879,RHC6041__Macrophage__.FFFF55
0,TSPAN6_ENSG00000000003.10,3.0,0.0,0.0,0.0,443.0,11.0,0.0,0.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TNMD_ENSG00000000005.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DPM1_ENSG00000000419.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SCYL3_ENSG00000000457.9,0.0,0.0,1.0,451.0,0.0,24.0,0.0,0.0,813.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C1orf112_ENSG00000000460.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,787.999,...,0.0,0.0,0.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Average the counts per Seurat cluster and show both results.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)


Cells in clust1:
Counter({'Epithelial': 9})


Cells in clust2:
Counter({'Malignant': 263})


Cells in clust3:
Counter({'Macrophage': 19})


Cells in clust4:
Counter({'Fibroblast': 17})


Cells in clust5:
Counter({'Bcell': 17})


Cells in clust6:
Counter({'Tcell': 34})



Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6
0,TSPAN6_ENSG00000000003.10,20.777778,198.730038,0.000000,2.529412,0.647059,4.264706
1,TNMD_ENSG00000000005.5,0.000000,0.019011,0.000000,0.000000,0.000000,0.000000
2,DPM1_ENSG00000000419.8,2.333333,71.502015,91.947368,74.588235,3.588235,28.823912
3,SCYL3_ENSG00000000457.9,164.555556,27.357422,0.421053,2.941176,78.235294,40.205882
4,C1orf112_ENSG00000000460.12,0.000000,4.764259,0.000000,0.000000,3.000000,2.529412
...,...,...,...,...,...,...,...
57236,BX649553.4_ENSGR0000264819.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57237,RN7SL355P_ENSGR0000265350.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57238,MIR3690_ENSGR0000265658.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57239,AL732314.1_ENSGR0000266731.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Unnamed: 0,cluster,label
0,clust1,EPITHELIAL
1,clust2,MALIGNANT
2,clust3,MACROPHAGE
3,clust4,FIBROBLAST
4,clust5,BCELL
5,clust6,TCELL


In [6]:
# Persist the count-based averages and labels as tab-separated files.
avg_expr.to_csv('./li_crc/seurat_counts_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): the fqn export further down writes to this same gold-standard
# path, so whichever cell runs last determines the file's content.
gold_std.to_csv('./li_crc/seurat_gold_std.tsv', sep='\t', index=False, header=False)

In [40]:
# Load the Li CRC `fqn` table and its cluster assignments.
# NOTE(review): this reads crc_clusters.csv, whereas the counts cell reads
# crc_clusters_with_infercnv.csv — confirm the two files are meant to differ.
data = pd.read_csv(f'{datadir}/li_crc/filtered/fqn.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/li_crc/crc_clusters.csv')
data.head()

Unnamed: 0,gene,RHC3546__Tcell__.C6E879,RHC3552__Epithelial__.2749FE,RHC3553__Epithelial__.2749FE,RHC3555__Bcell__.7DEA7B,RHC3556__Epithelial__.2749FE,RHC3557__Bcell__.7DEA7B,RHC3562__Bcell__.7DEA7B,RHC3563__Bcell__.7DEA7B,RHC3565__Epithelial__.2749FE,...,RHC5991__Fibroblast__.FB4E09,RHC5995__Tcell__.C6E879,RHC5999__Macrophage__.FFFF55,RHC6004__Tcell__.C6E879,RHC6010__Tcell__.C6E879,RHC6022__Tcell__.C6E879,RHC6030__Fibroblast__.FB4E09,RHC6033__Macrophage__.FFFF55,RHC6039__Tcell__.C6E879,RHC6041__Macrophage__.FFFF55
0,TSPAN6_ENSG00000000003.10,2.92049,0.0,0.0,0.0,8.92611,4.176829,0.0,0.0,5.424207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TNMD_ENSG00000000005.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DPM1_ENSG00000000419.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SCYL3_ENSG00000000457.9,0.0,0.0,4.744431,9.086155,0.0,5.391073,0.0,0.0,9.684788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C1orf112_ENSG00000000460.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.594735,...,0.0,0.0,0.0,5.625984,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Average the fqn values per Seurat cluster and show both results; the
# 'paper' variant of the call is kept disabled below.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr, gold_std = get_avg_expr(data, data_meta, 'paper')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)


Cells in clust1:
Counter({'Epithelial': 92, 'Bcell': 9, 'Tcell': 7, 'Fibroblast': 4, 'Macrophage': 1})


Cells in clust2:
Counter({'Epithelial': 80, 'Bcell': 3})


Cells in clust3:
Counter({'Epithelial': 31, 'Fibroblast': 12})


Cells in clust4:
Counter({'Epithelial': 42})


Cells in clust5:
Counter({'Epithelial': 27, 'Bcell': 3, 'Macrophage': 1, 'Tcell': 1})


Cells in clust6:
Counter({'Tcell': 26, 'Bcell': 2})


Cells in clust7:
Counter({'Macrophage': 17, 'Fibroblast': 1})



Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7
0,TSPAN6_ENSG00000000003.10,1.799864,3.151303,2.617854,4.372271,1.891750,0.436312,0.000000
1,TNMD_ENSG00000000005.5,0.000000,0.023667,0.000000,0.000000,0.000000,0.000000,0.000000
2,DPM1_ENSG00000000419.8,1.105045,2.303175,1.710974,3.491304,0.428581,0.743298,1.691457
3,SCYL3_ENSG00000000457.9,1.165419,0.891507,1.142392,1.160872,2.000673,0.576151,0.000000
4,C1orf112_ENSG00000000460.12,0.098350,0.381758,0.099792,0.335368,0.329553,0.200928,0.000000
...,...,...,...,...,...,...,...,...
57236,BX649553.4_ENSGR0000264819.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57237,RN7SL355P_ENSGR0000265350.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57238,MIR3690_ENSGR0000265658.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57239,AL732314.1_ENSGR0000266731.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Unnamed: 0,cluster,label
0,clust1,EPITHELIAL
1,clust2,EPITHELIAL
2,clust3,EPITHELIAL
3,clust4,EPITHELIAL
4,clust5,EPITHELIAL
5,clust6,TCELL
6,clust7,MACROPHAGE


In [42]:
# Persist the fqn-based averages and labels as tab-separated files.
avg_expr.to_csv('./li_crc/seurat_fqn_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): same gold-standard path as the counts export above; this write
# replaces that file.
gold_std.to_csv('./li_crc/seurat_gold_std.tsv', sep='\t', index=False, header=False)

<a id='tm'></a>
## **Tirosh Melanoma**

In [2]:
# Load the Tirosh melanoma count table and its cluster assignments.
data = pd.read_csv(f'{datadir}/tirosh_melanoma/filtered/counts.tsv.xz', sep='\t')
data_meta = pd.read_csv(f'{datadir}/tirosh_melanoma/tirosh_melanoma_clusters.csv')

In [3]:
# Average the counts per Seurat cluster and write both results as
# tab-separated files (no index; the gold standard also without a header).
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)
avg_expr.to_csv('./tirosh_melanoma/seurat_counts_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): the fqn cell further down writes to this same gold-standard path.
gold_std.to_csv('./tirosh_melanoma/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'Melanoma': 1758})

clust1 is missing cells from the input data, probably filtered
{'Cy81_Bulk_CD45_C07_S127_comb', 'Cy81_FNA_CD45_C01_S217_comb', 'cy79-p5-CD45-neg-PDL1-neg-B04-S784-comb', 'cy81-Bulk-CD45-neg-G02-S170-comb', 'cy79-p3-CD45-neg-PD1-pos-AS-C3-R1-B11-S311-comb', 'cy53-1-CD45-neg-G02-S362-comb', 'CY89CORE11_E07_S151_comb', 'CY88_5_E01_S721_comb', 'cy79-p5-CD45-neg-PDL1-neg-F04-S832-comb', 'cy78-CD45-neg-3-B03-S687-comb', 'cy79-p5-CD45-neg-PDL1-pos-A05-S677-comb', 'Cy71_CD45_A04_S484_comb', 'Cy59_84', 'CY88_3_C02_S602_comb', 'CY88_5_F08_S740_comb', 'cy80_cd_90_neg_H04_S184_comb', 'Cy59_19', 'cy79-p3-CD45-neg-PD1-pos-AS-C3-R1-H12-S384-comb', 'cy79-p4-CD45-neg-PDL1-neg-C06-S1086-comb', 'CY88_3_B02_S590_comb', 'CY89FNAQ2_A11_S395_comb', 'CY65_NEG_B_AAGAGGCA_AGAGTAGA', 'Cy71_CD45_H12_S672_comb', 'cy79-p5-CD45-neg-PDL1-pos-A10-S682-comb', 'cy79-p5-CD45-neg-PDL1-pos-F04-S736-comb', 'Cy71_CD45_H03_S663_comb', 'cy79-p5-CD45-neg-PDL1-neg-B12-S792-comb', 'c

Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7
0,C9orf152,0.007649,0.019008,0.001975,0.007563,0.000000,0.000000,0.002538
1,RPS11,224.481719,285.373612,430.222783,223.724492,195.484385,210.916852,274.980558
2,ELMO2,2.852806,5.851607,1.678656,3.280825,1.010815,2.601344,4.639077
3,CREB3L1,0.013718,0.020493,0.000000,0.004437,0.107708,3.931328,0.000000
4,PNMA1,3.693170,3.715540,0.907353,0.737000,2.607292,2.581311,0.608173
...,...,...,...,...,...,...,...,...
23681,PIK3IP1,11.621116,53.649785,23.804359,8.910913,13.072846,5.305328,23.726904
23682,SNRPD2,39.215924,37.149619,32.278674,23.277048,26.461600,31.311738,20.722269
23683,SLC39A6,8.093677,2.686751,3.501586,1.170579,0.264431,4.148836,2.523808
23684,CTSC,9.367180,24.363489,7.914845,65.646040,11.312785,22.418377,37.836058


Unnamed: 0,cluster,label
0,clust1,MELANOMA
1,clust2,T CELL
2,clust3,B CELL
3,clust4,MACROPHAGE
4,clust5,ENDOTHELIAL
5,clust6,CAF
6,clust7,NK


In [37]:
# Load the Tirosh `fqn` table and harmonise cell ids with the cluster file:
# both sides are upper-cased, and '.' / '-' in the column names become '_'.
# regex=False makes the replacements literal on every pandas version; with
# regex matching, "." would match any character.
data = pd.read_csv(f'{datadir}/tirosh_melanoma/filtered/fqn.tsv.xz', sep='\t')
data.columns = data.columns.str.upper()
data.columns = data.columns.str.replace(".", "_", regex=False)
data.columns = data.columns.str.replace("-", "_", regex=False)
data_meta = pd.read_csv(f'{datadir}/tirosh_melanoma/tirosh_melanoma_clusters.csv')
data_meta.cell = data_meta.cell.str.upper()
print(data.shape)
display(data.head())
print(data_meta.shape)
display(data_meta.head())

(23686, 4556)


Unnamed: 0,UNNAMED: 0,CY72_CD45_H02_S758_COMB,CY58_1_CD45_B02_S974_COMB,CY71_CD45_D08_S524_COMB,CY81_FNA_CD45_B01_S301_COMB,CY80_II_CD45_B07_S883_COMB,CY81_BULK_CD45_B10_S118_COMB,CY72_CD45_D09_S717_COMB,CY74_CD45_A03_S387_COMB,CY71_CD45_B05_S497_COMB,...,CY75_1_CD45_CD8_7__S265_COMB,CY75_1_CD45_CD8_3__S127_COMB,CY75_1_CD45_CD8_1__S61_COMB,CY75_1_CD45_CD8_1__S12_COMB,CY75_1_CD45_CD8_1__S25_COMB,CY75_1_CD45_CD8_7__S223_COMB,CY75_1_CD45_CD8_1__S65_COMB,CY75_1_CD45_CD8_1__S93_COMB,CY75_1_CD45_CD8_1__S76_COMB,CY75_1_CD45_CD8_7__S274_COMB
0,C9orf152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,RPS11,8.960467,8.45073,8.778203,8.07253,8.398219,8.009361,7.988597,8.205928,8.504811,...,0.0,8.009361,5.255807,0.0,6.123362,4.945712,4.370679,7.192856,3.648094,3.173299
2,ELMO2,0.0,0.0,1.324017,0.002215,0.0,0.00095,0.0,0.0,0.0,...,0.0,0.0,2.651803,3.968258,0.0,0.0,4.890027,0.0,0.0,0.0
3,CREB3L1,0.0,0.0,0.0,0.002215,0.0,0.00095,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PNMA1,0.0,0.0,0.0,0.002215,0.0,0.00095,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(4262, 20)


Unnamed: 0.1,Unnamed: 0,cell,seurat_x,ascend,cellRanger,cidr,clue,countClust,RaceID,rca,SAFE,SAME_AIC,SAME_BIC,sc3,scran,seurat_y,simlr,truth,tscan,paper
0,0,CY72_CD45_H02_S758_COMB,3,1,4,1,11,1,1618,4,3,10,14,1,4,3,7,B Cell,1,3
1,1,CY58_1_CD45_B02_S974_COMB,2,1,2,2,1,4,2017,3,1,1,2,11,1,1,14,T Cell,4,2
2,2,CY71_CD45_D08_S524_COMB,6,2,2,3,6,2,15,9,8,12,3,29,2,5,3,Melanoma,2,1
3,3,CY81_FNA_CD45_B01_S301_COMB,7,2,9,3,5,2,3,1,8,8,13,26,2,8,10,Melanoma,2,1
4,4,CY80_II_CD45_B07_S883_COMB,12,2,1,3,6,2,4,1,8,13,13,28,2,10,11,Melanoma,2,1


In [39]:
# Average the fqn values per Seurat cluster and persist the results.
# NOTE(review): the data_meta preview printed by the loading cell lists
# `seurat_x` and `seurat_y` but no `seurat` column — confirm which column this
# call should use; as displayed, 'seurat' would not be found on a fresh run.
avg_expr, gold_std = get_avg_expr(data, data_meta, 'seurat')
#avg_expr = avg_expr.drop('clust9', axis=1)
display(avg_expr)
display(gold_std)
avg_expr.to_csv('./tirosh_melanoma/seurat_fqn_E_xy_matrix.tsv', sep='\t', index=False)
# NOTE(review): same gold-standard path as the counts cell above; this write
# replaces that file.
gold_std.to_csv('./tirosh_melanoma/seurat_gold_std.tsv', sep='\t', index=False, header=False)


Cells in clust1:
Counter({'T Cell': 647, 'Melanoma': 13, 'NK': 2})


Cells in clust2:
Counter({'T Cell': 612, 'NK': 35, 'Melanoma': 12})


Cells in clust3:
Counter({'B Cell': 477, 'Melanoma': 39, 'T Cell': 10, 'NK': 1})


Cells in clust4:
Counter({'Melanoma': 495, 'T Cell': 2, 'B Cell': 1})

clust4 is missing cells from the input data, probably filtered
25
{'CY79_P5_CD45_NEG_PDL1_POS_F04_S736_COMB', 'CY79_P4_CD45_NEG_PDL1_NEG_G08_S1136_COMB', 'CY79_P4_CD45_NEG_PDL1_POS_B08_S404_COMB', 'CY79_P1_CD45_NEG_PDL1_POS_AS_C1_R1_E09_S57_COMB', 'CY79_P1_CD45_NEG_PDL1_POS_AS_C1_R1_A09_S9_COMB', 'CY79_P4_CD45_NEG_PDL1_NEG_F05_S1121_COMB', 'CY79_P3_CD45_NEG_PDL1_NEG_E06_S246_COMB', 'CY79_P5_CD45_NEG_PDL1_POS_A10_S682_COMB', 'CY79_P5_CD45_NEG_PDL1_POS_B03_S687_COMB', 'CY79_P3_CD45_NEG_PDL1_NEG_G02_S266_COMB', 'CY79_P5_CD45_NEG_PDL1_NEG_B04_S784_COMB', 'CY79_P3_CD45_NEG_PDL1_NEG_F10_S262_COMB', 'CY79_P3_CD45_NEG_PDL1_NEG_G06_S270_COMB', 'CY79_P3_CD45_NEG_PD1_POS_AS_C3_R1_H12_S384_COMB', 'CY79_P3_CD4

Unnamed: 0,genes,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10,clust11,clust12,clust13,clust14,clust15,clust16
0,C9orf152,0.000000,0.006585,0.000000,0.000000,0.000000,0.006164,0.000000,0.004002,0.000000,0.062614,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,RPS11,8.043538,7.508610,8.446184,5.684244,7.793109,7.837679,7.447360,8.006008,7.534315,7.097360,5.452143,7.790134,7.423019,7.621801,7.587896,8.022255
2,ELMO2,0.691081,0.940772,0.373119,0.806326,0.727118,0.689677,0.916461,0.897817,0.819289,0.455898,1.589946,0.718194,0.627765,0.368461,0.478094,0.139422
3,CREB3L1,0.013911,0.007736,0.002623,0.006959,0.009561,0.002293,0.001844,0.001832,0.005748,0.002503,0.044243,0.001311,0.004455,0.040082,0.791831,0.001988
4,PNMA1,0.331826,0.457601,0.123116,1.271695,0.435868,1.127032,0.386618,1.308299,0.276031,0.304412,0.869749,0.373661,0.421944,0.558732,0.665716,0.260640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23681,PIK3IP1,4.424874,2.881089,2.321314,1.043088,3.499246,0.364023,0.221382,0.362601,1.279063,2.725578,2.807394,0.110991,1.803667,1.910376,0.918843,1.535753
23682,SNRPD2,3.226707,3.005602,3.222935,3.350762,3.400856,5.046979,5.024141,5.934351,3.234658,2.843125,2.309248,6.328818,5.053093,3.175820,4.051389,3.069968
23683,SLC39A6,0.342101,0.498359,0.570628,2.193003,0.447226,2.808872,1.376747,2.359694,0.492876,0.332272,0.402925,2.498652,0.530496,0.103747,0.981059,0.648489
23684,CTSC,2.924636,3.480223,2.680369,1.058801,3.536890,1.669476,1.706306,2.154734,4.994231,4.821545,3.237048,2.317124,3.277081,2.177966,3.204463,4.524900


Unnamed: 0,cluster,label
0,clust1,T CELL
1,clust2,T CELL
2,clust3,B CELL
3,clust4,MELANOMA
4,clust5,T CELL
5,clust6,MELANOMA
6,clust7,MELANOMA
7,clust8,MELANOMA
8,clust9,MACROPHAGE
9,clust10,T CELL
