# Importing modules and settings

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
from matplotlib.pyplot import rc_context

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

General settings of Scanpy

In [5]:
# Logging level for scanpy messages (3 is documented by scanpy as including hints).
sc.settings.verbosity = 3 
# Print the versions of scanpy and its main dependencies (see the output below).
sc.logging.print_header()
# Default appearance for every figure produced through scanpy in this notebook.
sc.settings.set_figure_params(dpi=80, facecolor='white')


scanpy==1.9.3 anndata==0.8.0 umap==0.5.2 numpy==1.23.5 scipy==1.9.3 pandas==1.4.3 scikit-learn==1.2.2 statsmodels==0.13.5 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5


In [6]:
# Continuous colormap blending from light grey to indigo; passed as color_map to the gene UMAP panels.
umap_cmap = sns.blend_palette(['xkcd:light grey', 'xkcd:indigo'], as_cmap = True)

# Declaring the input and output files

In [7]:
# Load the AnnData object of the size analysis from its h5ad file (path is relative to the notebook).
adata = sc.read_h5ad('./h5ad_files/smed_size_analysis_202306.h5ad')

In [8]:
adata

AnnData object with n_obs × n_vars = 28738 × 24000
    obs: 'Size', 'Library', 'Sample', 'n_counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4'
    var: 'gene_ids', 'feature_types', 'longest_isoform', 'gene_type', 'gene_JakkeGuo', 'gene_Jakke_ver1', 'gene_ddv6', 'jakkeguo_collapsed', 'jakke_ver1_collapsed', 'ddv6_collapsed', 'Preferred_name', 'Description', 'PFAMs', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden_1', 'dendrogram_leiden_2', 'dendrogram_leiden_3', 'dendrogram_leiden_4', 'hvg', 'leiden', 'leiden_1_colors', 'leiden_2_colors', 'leiden_3_colors', 'leiden_4_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups_logreg_leiden_1', 'rank_genes_groups_logreg_leiden_2', 'rank_genes_groups_logreg_leiden_3', 'rank_genes_groups_logreg_leiden_4', 'rank_genes_

In [9]:
# Names of every obs column that holds a Leiden clustering (column name contains 'leiden').
obs_columns = adata.obs.columns
is_leiden_column = obs_columns.str.contains('leiden')
leiden_names = obs_columns[is_leiden_column].to_list()

In [10]:
leiden_names

['leiden_1', 'leiden_2', 'leiden_3', 'leiden_4']

In [11]:
adata.var

Unnamed: 0,gene_ids,feature_types,longest_isoform,gene_type,gene_JakkeGuo,gene_Jakke_ver1,gene_ddv6,jakkeguo_collapsed,jakke_ver1_collapsed,ddv6_collapsed,...,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,highly_variable,means,dispersions,dispersions_norm,mean,std
h1SMcG0000001,h1SMcG0000001,Gene Expression,h1SMcT0000001.1,other,-,-,dd_Smed_v6_17026_0_4,SMEST050392001.1,SMESG000050392.1,"dd_Smed_v6_17026_0_4,dd_Smed_v6_17026_0_5",...,68,0.002499,99.764012,72.0,True,0.002016,0.187538,0.499244,0.001365,0.031118
h1SMcG0000009,h1SMcG0000009,Gene Expression,h1SMcT0000009.2,hconf,SMEST029652002.1,MSTRG.9914,dd_Smed_v6_5323_0_1,"SMEST029652002.1,SmMSTRG.9914.2,SmMSTRG.9914.3...",MSTRG.9914,dd_Smed_v6_5323_0_1,...,173,0.006108,99.399618,176.0,True,0.005309,0.118957,0.310118,0.003640,0.050272
h1SMcG0000010,h1SMcG0000010,Gene Expression,h1SMcT0000010.1,other,-,-,-,-,-,-,...,9,0.000312,99.968766,9.0,True,0.000254,0.123262,0.321988,0.000173,0.011020
h1SMcG0000011,h1SMcG0000011,Gene Expression,h1SMcT0000011.1,hconf,SMEST029655001.1,MSTRG.9916,dd_Smed_v6_6416_0_1,"SMEST029655001.1,SMEST029655002.1,SmMSTRG.9916...",MSTRG.9916,dd_Smed_v6_6416_0_1,...,1705,0.063092,94.082943,1818.0,True,0.053033,0.159237,0.421197,0.036676,0.158263
h1SMcG0000012,h1SMcG0000012,Gene Expression,h1SMcT0000012.1,hconf,SmMSTRG.9913.2,MSTRG.9913,dd_Smed_v6_20296_0_1,"SMEST029666001.1,SmMSTRG.9913.2",MSTRG.9913,dd_Smed_v6_20296_0_1,...,22,0.000798,99.923651,23.0,True,0.000683,0.161131,0.426421,0.000461,0.018268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h1SMnG0035270,h1SMnG0035270,Gene Expression,h1SMnT0035270.1,hconf,-,-,-,-,-,-,...,1,0.000035,99.996530,1.0,True,0.000042,0.191891,0.511248,0.000028,0.004682
h1SMnG0035333,h1SMnG0035333,Gene Expression,h1SMnT0035333.1,other,-,-,-,-,-,-,...,1,0.000035,99.996530,1.0,True,0.000047,0.309073,0.834401,0.000030,0.005071
h1SMnG0035418,h1SMnG0035418,Gene Expression,h1SMnT0035418.1,other,-,-,-,-,-,-,...,2,0.000069,99.993059,2.0,True,0.000071,0.090285,0.231048,0.000048,0.005916
h1SMnG0035546,h1SMnG0035546,Gene Expression,h1SMnT0035546.1,other,-,-,-,-,-,-,...,1,0.000035,99.996530,1.0,True,0.000044,0.226124,0.605653,0.000028,0.004793


In [12]:
adata.obs.columns


Index(['Size', 'Library', 'Sample', 'n_counts', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_1',
       'leiden_2', 'leiden_3', 'leiden_4'],
      dtype='object')

# Selecting one clustering layer from leiden_names

In [13]:
# Clustering layer (one of the obs columns listed in leiden_names) used for the marker tables and plots below.
clusteringlayer = 'leiden_3'

# Pandas dataframe with markers and excel file

In [14]:
list(adata.uns)

['dendrogram_leiden_1',
 'dendrogram_leiden_2',
 'dendrogram_leiden_3',
 'dendrogram_leiden_4',
 'hvg',
 'leiden',
 'leiden_1_colors',
 'leiden_2_colors',
 'leiden_3_colors',
 'leiden_4_colors',
 'log1p',
 'neighbors',
 'pca',
 'rank_genes_groups_logreg_leiden_1',
 'rank_genes_groups_logreg_leiden_2',
 'rank_genes_groups_logreg_leiden_3',
 'rank_genes_groups_logreg_leiden_4',
 'rank_genes_groups_wilcox_leiden_1',
 'rank_genes_groups_wilcox_leiden_2',
 'rank_genes_groups_wilcox_leiden_3',
 'rank_genes_groups_wilcox_leiden_4',
 'umap']

In [15]:
# Top 50 ranked gene names per cluster (one column per cluster) from the Wilcoxon ranking.
wilcox_key = 'rank_genes_groups_wilcox_'+clusteringlayer
wilcox_gene_names = adata.uns[wilcox_key]['names']
markers_w = pd.DataFrame(wilcox_gene_names).head(50)

In [16]:
markers_w

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,h1SMcG0019136,h1SMcG0008035,h1SMcG0014354,h1SMcG0000998,h1SMnG0035616,h1SMnG0035616,h1SMnG0035616,h1SMcG0017400,h1SMcG0017400,h1SMcG0002269,...,h1SMcG0017676,h1SMcG0014354,h1SMnG0027695,h1SMnG0018264,h1SMnG0035616,h1SMcG0006357,h1SMcG0004811,h1SMnG0035616,h1SMnG0027428,h1SMcG0008035
1,h1SMcG0020223,h1SMnG0035616,h1SMcG0022555,h1SMcG0014354,h1SMcG0008035,h1SMcG0005534,h1SMnG0035608,h1SMcG0003474,h1SMcG0016328,h1SMcG0015757,...,h1SMcG0017679,h1SMcG0000998,h1SMcG0013195,h1SMcG0019136,h1SMcG0008035,h1SMcG0006356,h1SMnG0035616,h1SMcG0008035,h1SMnG0035070,h1SMnG0035353
2,h1SMcG0015883,h1SMnG0035608,h1SMcG0015236,h1SMcG0009472,h1SMcG0010823,h1SMcG0005535,h1SMcG0010823,h1SMcG0009632,h1SMcG0016327,h1SMnG0009123,...,h1SMcG0017677,h1SMcG0022555,h1SMcG0005152,h1SMcG0018081,h1SMnG0024066,h1SMcG0012636,h1SMcG0015722,h1SMcG0010823,h1SMcG0000515,h1SMnG0035616
3,h1SMcG0005152,h1SMcG0019482,h1SMcG0016741,h1SMcG0022555,h1SMcG0007496,h1SMnG0035608,h1SMcG0009632,h1SMcG0015598,h1SMcG0003473,h1SMcG0009175,...,h1SMcG0017680,h1SMcG0009472,h1SMnG0007035,h1SMcG0002601,h1SMcG0003975,h1SMnG0024643,h1SMcG0011140,h1SMcG0009124,h1SMcG0005979,h1SMcG0013999
4,h1SMcG0009596,h1SMcG0013999,h1SMcG0001082,h1SMcG0006857,h1SMnG0035608,h1SMcG0010823,h1SMcG0010835,h1SMcG0008074,h1SMcG0020223,h1SMcG0014491,...,h1SMcG0017560,h1SMcG0007433,h1SMcG0005526,h1SMcG0009605,h1SMnG0024133,h1SMcG0019845,h1SMcG0006494,h1SMcG0010835,h1SMcG0007496,h1SMcG0021498
5,h1SMcG0019758,h1SMcG0010823,h1SMcG0018373,h1SMcG0007433,h1SMcG0010835,h1SMnG0035175,h1SMcG0009633,h1SMcG0009633,h1SMcG0011169,h1SMcG0009632,...,h1SMnG0023745,h1SMcG0019136,h1SMnG0013564,h1SMcG0013018,h1SMnG0035352,h1SMcG0012539,h1SMcG0004781,h1SMcG0013162,h1SMcG0016146,h1SMnG0017088
6,h1SMcG0020195,h1SMcG0007496,h1SMcG0000998,h1SMcG0001082,h1SMcG0013999,h1SMcG0005543,h1SMnG0035175,h1SMcG0008073,h1SMcG0013759,h1SMcG0006393,...,h1SMcG0017681,h1SMcG0001082,h1SMcG0019758,h1SMcG0002117,h1SMcG0008834,h1SMcG0014350,h1SMcG0012486,h1SMnG0031692,h1SMcG0022240,h1SMcG0021514
7,h1SMcG0003247,h1SMcG0010835,h1SMcG0018100,h1SMcG0016741,h1SMcG0019482,h1SMcG0005613,h1SMcG0015722,h1SMcG0007791,h1SMcG0001608,h1SMcG0018987,...,h1SMcG0017559,h1SMcG0018373,h1SMcG0019136,h1SMcG0015722,h1SMcG0002703,h1SMnG0019544,h1SMcG0003975,h1SMnG0035608,h1SMcG0008035,h1SMnG0035352
8,h1SMcG0000076,h1SMcG0002117,h1SMcG0014134,h1SMcG0018100,h1SMcG0002117,h1SMcG0005567,h1SMcG0012505,h1SMcG0019666,h1SMcG0016144,h1SMcG0009814,...,h1SMcG0013873,h1SMcG0006857,h1SMcG0011689,h1SMcG0002300,h1SMnG0006514,h1SMcG0008504,h1SMcG0019482,h1SMnG0035138,h1SMcG0013540,h1SMcG0015722
9,h1SMcG0009890,h1SMnG0035175,h1SMcG0006857,h1SMcG0018373,h1SMcG0015722,h1SMcG0004067,h1SMnG0035352,h1SMcG0016327,h1SMcG0015571,h1SMcG0018473,...,h1SMcG0012505,h1SMcG0016741,h1SMcG0007378,h1SMcG0009268,h1SMcG0020430,h1SMcG0021073,h1SMcG0009478,h1SMcG0013999,h1SMnG0026699,h1SMcG0022103


In [17]:
# Adjusted p-values ('pvals_adj') of the Wilcoxon ranking, first 50 rows per cluster.
# Same shape as markers_w; used below as a < 0.05 mask to filter the gene names.
markers_w_l = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]['pvals_adj']).head(50)

In [18]:
# Top 50 ranked gene names per cluster (one column per cluster) from the 'logreg' ranking stored in adata.uns.
markers_l = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+clusteringlayer]['names']).head(50)

In [19]:
markers_l

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,h1SMcG0015883,h1SMcG0008035,h1SMcG0020022,h1SMcG0000998,h1SMcG0008035,h1SMcG0005534,h1SMnG0035616,h1SMcG0017400,h1SMcG0017400,h1SMcG0002269,...,h1SMcG0017676,h1SMcG0000998,h1SMnG0027695,h1SMnG0018264,h1SMnG0024066,h1SMcG0006357,h1SMcG0004811,h1SMcG0009124,h1SMcG0008035,h1SMcG0008035
1,h1SMcG0020223,h1SMcG0005241,h1SMcG0016673,h1SMcG0006857,h1SMcG0020537,h1SMcG0005535,h1SMcG0012529,h1SMcG0008074,h1SMcG0003473,h1SMcG0015757,...,h1SMcG0017679,h1SMcG0014354,h1SMcG0013195,h1SMcG0019136,h1SMnG0024133,h1SMcG0006356,h1SMcG0011140,h1SMcG0008035,h1SMnG0027428,h1SMnG0017088
2,h1SMcG0015241,h1SMcG0004237,h1SMcG0015236,h1SMcG0014354,h1SMnG0035616,h1SMcG0005567,h1SMcG0009632,h1SMcG0003474,h1SMcG0013759,h1SMnG0009123,...,h1SMcG0017677,h1SMcG0019136,h1SMnG0007035,h1SMcG0002117,h1SMnG0006514,h1SMcG0012636,h1SMcG0021885,h1SMcG0008893,h1SMcG0000515,h1SMnG0035353
3,h1SMcG0019136,h1SMnG0020097,h1SMcG0000998,h1SMcG0014406,h1SMcG0009785,h1SMcG0005613,h1SMnG0035608,h1SMcG0015598,h1SMcG0016328,h1SMcG0014491,...,h1SMcG0017680,h1SMcG0006857,h1SMnG0013564,h1SMcG0002601,h1SMcG0008035,h1SMnG0024643,h1SMcG0004781,h1SMnG0031692,h1SMcG0005979,h1SMcG0013999
4,h1SMcG0004827,h1SMnG0012397,h1SMcG0017034,h1SMcG0008106,h1SMcG0001488,h1SMcG0005543,h1SMcG0017047,h1SMcG0019136,h1SMcG0015571,h1SMcG0006393,...,h1SMcG0008035,h1SMcG0022555,h1SMcG0005152,h1SMcG0009605,h1SMcG0002713,h1SMcG0019845,h1SMcG0006230,h1SMcG0019316,h1SMcG0005232,h1SMcG0021498
5,h1SMcG0009545,h1SMnG0021196,h1SMcG0014354,h1SMcG0009472,h1SMcG0013999,h1SMcG0005539,h1SMcG0000479,h1SMcG0014919,h1SMnG0009126,h1SMcG0007332,...,h1SMcG0013999,h1SMcG0017092,h1SMcG0001350,h1SMcG0011164,h1SMcG0002703,h1SMcG0008504,h1SMcG0008035,h1SMnG0026677,h1SMnG0035070,h1SMcG0013162
6,h1SMcG0016202,h1SMcG0009165,h1SMcG0003207,h1SMcG0001827,h1SMcG0004525,h1SMcG0005538,h1SMcG0000719,h1SMcG0003229,h1SMcG0005487,h1SMnG0032240,...,h1SMcG0017560,h1SMcG0002622,h1SMcG0011689,h1SMnG0024039,h1SMcG0007395,h1SMcG0012539,h1SMcG0006494,h1SMnG0035138,h1SMcG0004788,h1SMcG0021514
7,h1SMcG0004823,h1SMcG0004050,h1SMcG0010878,h1SMcG0017363,h1SMcG0013627,h1SMnG0035616,h1SMcG0012533,h1SMcG0019693,h1SMcG0007170,h1SMcG0009814,...,h1SMnG0023745,h1SMcG0007433,h1SMcG0005526,h1SMcG0005133,h1SMcG0016999,h1SMnG0019544,h1SMcG0012486,h1SMnG0017044,h1SMnG0026699,h1SMcG0022104
8,h1SMcG0016775,h1SMcG0013999,h1SMcG0022555,h1SMcG0019319,h1SMcG0013162,h1SMnG0000528,h1SMcG0021968,h1SMcG0000830,h1SMcG0016327,h1SMcG0009175,...,h1SMcG0021885,h1SMcG0003233,h1SMcG0012789,h1SMcG0015722,h1SMcG0003975,h1SMcG0014350,h1SMcG0005312,h1SMcG0008883,h1SMcG0008930,h1SMnG0019762
9,h1SMcG0017243,h1SMcG0007442,h1SMcG0017092,h1SMcG0009250,h1SMcG0009590,h1SMcG0008441,h1SMcG0022202,h1SMcG0010163,h1SMcG0005213,h1SMcG0015836,...,h1SMcG0007451,h1SMcG0018373,h1SMnG0014671,h1SMcG0018081,h1SMcG0008834,h1SMnG0035080,h1SMcG0017184,h1SMcG0005979,h1SMcG0016146,h1SMcG0011389


In [20]:
# Write one Excel sheet per cluster with the annotation of its significant Wilcoxon markers
# (adjusted p-value below 0.05 among the top 50 ranked genes).
wilcoxon_xlsx = './outputs/'+clusteringlayer+'_markers_wilcoxon.xlsx'
annotation_columns = ['gene_type','gene_ddv6', 'Preferred_name','Description']
with pd.ExcelWriter(wilcoxon_xlsx) as writer:
    for col in markers_w.columns:
        is_significant = markers_w_l[col] < 0.05
        significant_genes = markers_w.loc[is_significant, col].to_list()
        df = adata.raw.var.loc[significant_genes][annotation_columns]
        df.to_excel(writer, sheet_name='Cluster '+ col)

In [21]:
# Write one Excel sheet per cluster with the annotation of its top logreg markers.
# Each sheet must use the genes of its own cluster column: indexing markers_l with a
# fixed column ('50') would put the same gene list on every sheet.
with pd.ExcelWriter('./outputs/'+clusteringlayer+'_markers_logreg.xlsx') as writer:
    for col in markers_l.columns:
        df = adata.raw.var.loc[markers_l[col].to_list()][['gene_type','gene_ddv6', 'Preferred_name','Description']]
        df.to_excel(writer, sheet_name='Cluster '+ col)

# Plots markers

In [22]:
def get_plots (clusteringlayer, cluster, li_markers):
    """Draw a 3x3 UMAP overview of one cluster and up to eight of its marker genes.

    The top-left panel shows the UMAP coloured by ``clusteringlayer`` with
    ``groups = cluster``. The remaining eight panels, filled row by row, show
    the UMAP coloured by each marker gene using ``umap_cmap``.

    Reads the notebook-level ``adata`` and ``umap_cmap`` objects.

    Parameters
    ----------
    clusteringlayer : str
        Name of the ``adata.obs`` column holding the clustering.
    cluster : str
        Cluster label to display; it is concatenated into the panel title,
        so it must be a string.
    li_markers : list
        Gene names to plot. Only the first eight are used; when fewer are
        given, the remaining panels are drawn with ``color=None``. The list
        is not modified.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the nine panels. The caller is responsible for
        saving and closing it.
    """
    fig, axs = plt.subplots(3, 3, figsize = (15, 15))

    sc.pl.umap(adata, color= clusteringlayer, legend_loc = 'on data', groups = cluster, na_in_legend = False, size = 5, legend_fontsize = 7, title = clusteringlayer+' cluster '+cluster, show = False, ax = axs[0, 0])

    # Work on a padded copy so the caller's list is left untouched.
    genes = list(li_markers)[:8]
    genes += [None] * (8 - len(genes))

    # axs.ravel() is row-major, so skipping the first axis yields
    # (0, 1), (0, 2), (1, 0), ..., (2, 2) in that order.
    for ax, gene in zip(axs.ravel()[1:], genes):
        sc.pl.umap(adata, color= gene, title = gene, color_map = umap_cmap, show = False, ax = ax)

    return fig

In [23]:
# For every cluster, plot the genes that are both Wilcoxon markers and among the
# top 30 logreg markers, and save the figure as a PDF.

# The ranking results do not change between clusters, so look them up once.
wilcox_res = adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]
wilcox_names = pd.DataFrame(wilcox_res['names'])
logreg_top30 = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+clusteringlayer]['names']).head(30)

for i in adata.obs[clusteringlayer].cat.categories:
    lfc_s = pd.Series(wilcox_res['logfoldchanges'][i])
    # NOTE(review): this uses raw 'pvals', while the Excel export filters on 'pvals_adj' - confirm which is intended.
    pval_s = pd.Series(wilcox_res['pvals'][i])
    # Rank positions of genes with p < 0.05 and a positive log fold change.
    hits = pval_s.index[(pval_s < 0.05) & (lfc_s > 0)]
    # Keep every rank up to and including the last hit: head(n) returns rows 0..n-1,
    # hence the +1. A cluster without any hit gets an empty marker list instead of
    # raising on max() of an empty sequence.
    n_top = int(hits.max()) + 1 if len(hits) else 0
    wl = wilcox_names[i].head(n_top)
    lr = logreg_top30[i]
    li = wl[wl.isin(lr)].to_list()
    figure = get_plots(clusteringlayer, i, li)
    figure.savefig('./outputs/markers/'+ clusteringlayer+'_cluster_'+i+'.pdf',format = 'pdf')
    # Release the figure explicitly; one is created per cluster.
    figure.clf()
    plt.close(figure)

  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(


  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(
  cax = scatter(


# Extracting counts per cluster and size

The counts are extracted from the unnormalised dataset, i.e. the raw count matrix. The cluster information is taken from the adata object.

In [24]:
# Read the unnormalised 10x-format count matrix from its directory.
# cache=True lets scanpy reuse a cached .h5ad copy on later runs (see the log line below).
adata_u = sc.read_10x_mtx(
    "../common_matrices/sizes230505/Sizes_matrix/",
    var_names='gene_symbols',
    cache=True)

... reading from cache file cache\..-common_matrices-sizes230505-Sizes_matrix-matrix.h5ad


In [25]:
adata_u

AnnData object with n_obs × n_vars = 38576 × 41230
    var: 'gene_ids', 'feature_types'

We slice the adata unnormalised object to contain only the cells that are in the adata object

In [26]:
# Keep only the cells present in adata, in the same order, as an independent copy.
kept_cells = adata.obs.index
adata_u = adata_u[kept_cells].copy()

In [27]:
adata_u

AnnData object with n_obs × n_vars = 28738 × 41230
    var: 'gene_ids', 'feature_types'

We add the size information from the adata object to the adata_u object

In [28]:
# Copy the 'Size' annotation from adata; pandas aligns the values on the cell barcodes in the obs index.
adata_u.obs['Size'] = adata.obs['Size']

We input the information from the 4 clustering resolutions in the adata_u object

In [29]:
adata_u.obs

Unnamed: 0,Size
L23_1_CATCAAGTGAACAGGCGTCTGTCA,L
L23_1_AACGCTTAACACAGAAAAGACGGA,L
L23_1_AGTCACTAGAACAGGCCGGATTGC,M
L23_1_CCGTGAGAGCTAACGACGCATACA,M
L23_1_CGACTGGAATAGCGACTATCAGCA,M
...,...
L23_2_CCGAAGTACCTCTATCAACGCTTA,M
L23_2_CGAACTTAACCTCCAACCGTGAGA,M
L23_2_ACAGCAGATCTTCACAAGATGTAC,L
L23_2_AAACATCGATGCCTAACATACCAA,L


In [30]:
# Names of the Leiden clustering columns in adata.obs, kept as a plain list so that
# leiden_names has the same type wherever it is defined in this notebook.
leiden_names = adata.obs.columns[adata.obs.columns.str.contains('leiden')].to_list()

In [31]:
# Copy every Leiden clustering column from adata.obs into adata_u.obs.
for le in leiden_names:
    adata_u.obs[le] = adata.obs[le]

In [32]:
adata_u.obs

Unnamed: 0,Size,leiden_1,leiden_2,leiden_3,leiden_4
L23_1_CATCAAGTGAACAGGCGTCTGTCA,L,2,7,8,9
L23_1_AACGCTTAACACAGAAAAGACGGA,L,10,17,16,19
L23_1_AGTCACTAGAACAGGCCGGATTGC,M,1,0,1,0
L23_1_CCGTGAGAGCTAACGACGCATACA,M,3,27,26,26
L23_1_CGACTGGAATAGCGACTATCAGCA,M,3,15,18,65
...,...,...,...,...,...
L23_2_CCGAAGTACCTCTATCAACGCTTA,M,12,42,43,47
L23_2_CGAACTTAACCTCCAACCGTGAGA,M,1,0,1,0
L23_2_ACAGCAGATCTTCACAAGATGTAC,L,21,28,30,32
L23_2_AAACATCGATGCCTAACATACCAA,L,8,19,19,43


Finally, we obtain the total unnormalised counts per cell from the matrix and add them as an adata_u.obs column

In [33]:
# Total raw counts per cell: sum each row of X, flatten to 1-D and store as integers.
per_cell_totals = np.asarray(adata_u.X.sum(axis =1)).ravel()
counts_per_cell = pd.Series(per_cell_totals, index = adata_u.obs.index)
adata_u.obs['counts'] = counts_per_cell.astype('int')

In [34]:
adata_u.obs

Unnamed: 0,Size,leiden_1,leiden_2,leiden_3,leiden_4,counts
L23_1_CATCAAGTGAACAGGCGTCTGTCA,L,2,7,8,9,295
L23_1_AACGCTTAACACAGAAAAGACGGA,L,10,17,16,19,125
L23_1_AGTCACTAGAACAGGCCGGATTGC,M,1,0,1,0,480
L23_1_CCGTGAGAGCTAACGACGCATACA,M,3,27,26,26,668
L23_1_CGACTGGAATAGCGACTATCAGCA,M,3,15,18,65,267
...,...,...,...,...,...,...
L23_2_CCGAAGTACCTCTATCAACGCTTA,M,12,42,43,47,706
L23_2_CGAACTTAACCTCCAACCGTGAGA,M,1,0,1,0,724
L23_2_ACAGCAGATCTTCACAAGATGTAC,L,21,28,30,32,174
L23_2_AAACATCGATGCCTAACATACCAA,L,8,19,19,43,336


In [35]:
adata_u.obs

Unnamed: 0,Size,leiden_1,leiden_2,leiden_3,leiden_4,counts
L23_1_CATCAAGTGAACAGGCGTCTGTCA,L,2,7,8,9,295
L23_1_AACGCTTAACACAGAAAAGACGGA,L,10,17,16,19,125
L23_1_AGTCACTAGAACAGGCCGGATTGC,M,1,0,1,0,480
L23_1_CCGTGAGAGCTAACGACGCATACA,M,3,27,26,26,668
L23_1_CGACTGGAATAGCGACTATCAGCA,M,3,15,18,65,267
...,...,...,...,...,...,...
L23_2_CCGAAGTACCTCTATCAACGCTTA,M,12,42,43,47,706
L23_2_CGAACTTAACCTCCAACCGTGAGA,M,1,0,1,0,724
L23_2_ACAGCAGATCTTCACAAGATGTAC,L,21,28,30,32,174
L23_2_AAACATCGATGCCTAACATACCAA,L,8,19,19,43,336


In [36]:
# Export the per-cell table (Size, Leiden clusterings and counts) as a tab-separated file.
adata_u.obs.to_csv('./outputs/csv_counts/sizes_counts_20230605.tsv', sep="\t")