## Merge SS2 cells by mouse into separate bams for ReadZs runs

ReadZs is slow, requiring a lot of jobs, when run in SS2 mode with each cell as a separate file

To avoid this, I previously merged the reads from all cells into a single file, but in doing so I didn't account for the different mice the cells originated from.

Now I'll create one merged BAM per mouse, so that each mouse can be run through ReadZs as a separate run.

In [1]:
import pandas as pd
import numpy as np

In [77]:
# ---- cell_id -> mouse_id ----
# Sample-level metadata, reduced to the identifier, sex and donor columns.
cols = ['Unnamed: 0', 'cell_id', 'gender', 'external_donor_name']
meta_df = (
    pd.read_csv(
        '/oak/stanford/groups/horence/Roozbeh/single_cell_project/utility_files/meta_data/BICCN/SS2/sample_metadata.csv',
    )
    .loc[:, cols]
    .rename(columns={
        'Unnamed: 0': 'cell_id',            # header-less CSV column becomes the cell name
        'cell_id': 'cell_id_num',           # the file's own cell_id is kept under a new name
        'external_donor_name': 'mouse_id',
    })
    # Prefix the cell name with 'SS2_' (presumably so it matches the
    # annotation table's cell_id values used in the merge below).
    .assign(cell_id=lambda df: 'SS2_' + df['cell_id'])
)
display(meta_df.shape)

# ---- cell_id -> subclass_label ----
ann_df = pd.read_csv(
    '/oak/stanford/groups/horence/rob/readzs_fork/MOp_SS2_metadata.tsv',
    sep='\t',
)
display(ann_df.shape)

# Inner join: only cells present in both tables are kept.
meta_merge = pd.merge(meta_df, ann_df, on='cell_id', how='inner')
meta_merge.head()

(6300, 4)

(6288, 7)

Unnamed: 0,cell_id,cell_id_num,gender,mouse_id,cluster_id,cluster_label,subclass_label,class_label,cluster_color,size
0,SS2_LS-15395_S41_E1-50,543130581,F,260674,19,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136
1,SS2_LS-15395_S42_E1-50,543130589,F,260674,19,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136
2,SS2_LS-15395_S43_E1-50,543130597,F,260674,46,L6 NP Trh,L5/6 NP,Glutamatergic,#47867A,72
3,SS2_LS-15395_S44_E1-50,543130605,F,260674,45,L5 NP Slc17a8,L5/6 NP,Glutamatergic,#3C78BC,138
4,SS2_LS-15395_S46_E1-50,543130621,F,260674,38,L5 IT S100b_2,L5 IT,Glutamatergic,#2F8C4D,83


In [56]:
# Number of distinct mice among the annotated cells
# (45 in total, far more than I expected).
len(meta_merge['mouse_id'].dropna().unique())

45

In [76]:
# cell_id_num may repeat across rows, but the largest number of distinct
# subclass_labels seen for any single cell_id_num is 1, i.e. each one is
# always tied to the same subclass_label.
(
    meta_merge
    .groupby('cell_id_num')['subclass_label']
    .nunique()
    .max()
)

1

In [89]:
# Number of distinct ontologies (subclass_label values): 17.
len(meta_merge['subclass_label'].dropna().unique())

17

In [103]:
# Per-mouse summary: how many ontologies (subclass_labels) and how many
# distinct cells each mouse has. Some mice only have a single ontology.
# Sorted so the mice with the most ontologies, then the most cells, come first.
counts_df = (
    meta_merge
    .groupby('mouse_id')
    .agg(
        num_onts=pd.NamedAgg(column='subclass_label', aggfunc='nunique'),
        tot_cells=pd.NamedAgg(column='cell_id', aggfunc='nunique'),
    )
    .sort_values(by=['num_onts', 'tot_cells'], ascending=False)
)

counts_df

Unnamed: 0_level_0,num_onts,tot_cells
mouse_id,Unnamed: 1_level_1,Unnamed: 2_level_1
265462,12,207
304659,12,205
304661,10,180
260674,10,148
265460,8,182
361997,8,127
315373,7,417
315872,7,302
298354,7,178
315377,7,175


In [105]:
# How many mice have at least 8 distinct ontologies
(counts_df['num_onts'] >= 8).sum()

6

In [98]:
# Inspect every annotated cell belonging to a single mouse (308790)
meta_merge.loc[meta_merge['mouse_id'] == 308790]

Unnamed: 0,cell_id,cell_id_num,gender,mouse_id,cluster_id,cluster_label,subclass_label,class_label,cluster_color,size
4189,SS2_SM-DD443_S41_E1-50,609861138,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4190,SS2_SM-DD443_S42_E1-50,609861150,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4191,SS2_SM-DD443_S43_E1-50,609861162,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4192,SS2_SM-DD443_S44_E1-50,609861175,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4193,SS2_SM-DD443_S45_E1-50,609861187,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
...,...,...,...,...,...,...,...,...,...,...
4565,SS2_SM-DD44O_S12_E1-50,611381681,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4566,SS2_SM-DD44O_S13_E1-50,611381693,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624
4567,SS2_SM-DD44O_S14_E1-50,611381705,M,308790,35,L5 IT Tcap_1,L5 IT,Glutamatergic,#5DDB65,226
4568,SS2_SM-DD44O_S15_E1-50,611381717,M,308790,33,L5 IT Rspo1,L5 IT,Glutamatergic,#3CBC78,624


In [96]:
# Number of Pvalb cells contributed by each mouse, most first
meta_merge.loc[meta_merge['subclass_label'] == 'Pvalb', 'mouse_id'].value_counts()

388066    117
388065    117
388068    106
371346     59
387400     32
387397     29
387399     20
298355     17
314183     14
306957     10
298354      7
263283      7
304659      5
265462      2
260674      1
Name: mouse_id, dtype: int64

In [49]:
# Number of annotated cells per mouse, largest first.
# Uses meta_merge (metadata joined to the annotation table) rather than
# meta_df: the recorded counts total 6288, the size of the annotation table,
# whereas meta_df as loaded at the top of the notebook has 6300 rows.
meta_merge.groupby('mouse_id').size().sort_values(ascending=False)

mouse_id
315373    417
387400    317
387397    310
315872    302
387399    302
306906    212
265462    207
304659    205
292651    201
265460    182
304661    180
298215    178
298354    178
315377    175
319137    173
298355    159
328804    148
260674    148
263283    144
325714    142
325133    142
306909    141
319139    138
262258    127
361997    127
328802    124
306910    121
388066    117
388065    117
388068    106
262259    101
308790     91
357950     87
319138     69
371346     64
317499     62
342679     54
332663     53
260371     45
342680     40
308789     33
349305     24
314183     14
306957     10
316321      1
dtype: int64

In [40]:
# Distinct cells for every (mouse, ontology) pair.
# subclass_label only exists after joining with the annotation table, so this
# must group meta_merge; meta_df holds just cell_id, cell_id_num, gender and
# mouse_id and would raise a KeyError here.
cells_per_mouse_ont = (
    meta_merge
    .groupby(['mouse_id', 'subclass_label'])['cell_id']
    .nunique()
    .reset_index(name='num_cells')
)

cells_per_mouse_ont

Unnamed: 0,mouse_id,subclass_label,num_cells
0,260371,L2/3 IT,10
1,260371,L5 IT,12
2,260371,L5/6 NP,1
3,260371,L6 CT,10
4,260371,L6 IT,10
...,...,...,...
207,387400,Sst,91
208,387400,Vip,118
209,388065,Pvalb,117
210,388066,Pvalb,117


In [51]:
# Which ontologies does mouse 388066 contribute? (only Pvalb)
# Reads from meta_merge because subclass_label is not a column of meta_df.
meta_merge.loc[meta_merge['mouse_id'].eq(388066), 'subclass_label'].unique()

array(['Pvalb'], dtype=object)

In [17]:
# Column names of meta_df as a plain list.
# NOTE(review): the recorded output lists many more columns than the four
# kept by the loading cell, so it was presumably produced before meta_df
# was subset; confirm and re-run.
meta_df.columns.tolist()

['file_id',
 'ar_id',
 'exp_component_vendor_name',
 'batch',
 'batch_vendor_name',
 'tube',
 'tube_internal_name',
 'tube_contents_nm',
 'tube_contents_nm_from_vendor',
 'tube_avg_size_bp',
 'tube_input_fmol',
 'r1_index',
 'r2_index',
 'index_sequence_pair',
 'organism',
 'facs_container',
 'sample_name',
 'patched_cell_container',
 'cell_name',
 'cell_id',
 'sample_quantity_count',
 'sample_quantity_pg',
 'external_donor_name',
 'gender',
 'control',
 'cell_prep_type',
 'full_genotype',
 'facs_population_plan',
 'cre_line',
 'reporter',
 'injection_roi',
 'injection_materials',
 'roi',
 'patchseq_roi',
 'medical_conditions',
 'slice_min_pos',
 'slice_max_pos',
 'rna_amplification_set',
 'rna_amplification',
 'amp_date',
 'pcr_cycles',
 'percent_cdna_longer_than_400bp',
 'rna_amplification_pass_fail',
 'amplified_quantity_ng',
 'library_prep_set',
 'library_prep',
 'lib_date',
 'library_input_ng',
 'avg_size_bp',
 'quantification2_ng',
 'quantification_fmol',
 'quantification2_nm',
 