In [1]:
# This notebook gathers metadata for the MERFISH dataset. 
# In particular we want to get (x,y,z) co-ordinates, and celltype annotations for the MERFISH and Zhuang datasets. 
import anndata as ad
import pandas as pd
import numpy as np
import toml

In [2]:
# Project root paths come from the 'root' table of the TOML config file.
config_file = '/allen/programs/celltypes/workgroups/mousecelltypes/Rohan/code/abcex/config.toml'
paths = toml.load(config_file)['root']

# Open the log2 expression matrix in backed ('r') mode rather than reading it fully here.
expr_h5ad = paths['data'] + 'expression_matrices/MERFISH-C57BL6J-638850/20230830/C57BL6J-638850-log2.h5ad'
expr_data = ad.read_h5ad(expr_h5ad, backed='r')

In [3]:
def _read_table(relative_path):
    """Read one CSV file located under the configured data root (paths['data'])."""
    return pd.read_csv(paths['data'] + relative_path)


# Gene table, plain cell metadata, and the cluster-annotated view of the cell metadata.
genes_anno = _read_table('metadata/MERFISH-C57BL6J-638850/20230830/gene.csv')
cells_anno_1 = _read_table('metadata/MERFISH-C57BL6J-638850/20230830/cell_metadata.csv')
cells_anno_2 = _read_table('metadata/MERFISH-C57BL6J-638850/20230830/views/cell_metadata_with_cluster_annotation.csv')

# ccf_coords.csv was generated using 01_merfish_ccf_registration_tutorial.ipynb,
# adapted from abc_atlas_access/notebooks.
cells_anno_3 = _read_table('metadata/MERFISH-C57BL6J-638850/20230830/views/ccf_coords.csv')

In [4]:
# Show the column names of each loaded table, in load order.
for table in (genes_anno, cells_anno_1, cells_anno_2, cells_anno_3):
    print(table.columns)

Index(['gene_identifier', 'gene_symbol', 'transcript_identifier', 'name',
       'mapped_ncbi_identifier'],
      dtype='object')
Index(['cell_label', 'brain_section_label', 'cluster_alias',
       'average_correlation_score', 'feature_matrix_label', 'donor_label',
       'donor_genotype', 'donor_sex', 'x', 'y', 'z'],
      dtype='object')
Index(['cell_label', 'brain_section_label', 'cluster_alias',
       'average_correlation_score', 'feature_matrix_label', 'donor_label',
       'donor_genotype', 'donor_sex', 'x', 'y', 'z', 'neurotransmitter',
       'class', 'subclass', 'supertype', 'cluster', 'neurotransmitter_color',
       'class_color', 'subclass_color', 'supertype_color', 'cluster_color'],
      dtype='object')
Index(['cell_label', 'brain_section_label', 'cluster_alias',
       'average_correlation_score', 'feature_matrix_label', 'donor_label',
       'donor_genotype', 'donor_sex', 'x_section', 'y_section', 'z_section',
       'neurotransmitter', 'class', 'subclass', 'supertype'

In [5]:
# Restrict the expression matrix to the cells listed in the CCF-coordinates table,
# in that table's row order.
ccf_cell_labels = cells_anno_3['cell_label']
expr_data_ = expr_data[ccf_cell_labels, :]

In [6]:
# Shapes before subsetting ...
for obj in (expr_data, cells_anno_1, cells_anno_2):
    print(obj.shape)
print('---')
# ... and after restricting to the cells in the CCF-coordinates table.
for obj in (expr_data_, cells_anno_3):
    print(obj.shape)

(4334174, 550)
(3938808, 11)
(3938808, 21)
---
(3739961, 550)
(3739961, 38)


In [7]:
# The two cell-metadata tables must list the same cells in the same order.
assert (cells_anno_1['cell_label'] == cells_anno_2['cell_label']).all()

# The subset expression matrix must be row-aligned with the CCF-coordinates table.
obs_matches_ccf = expr_data_.obs.index == cells_anno_3['cell_label']
assert np.all(obs_matches_ccf)

# Number of cells per parcellation division.
division_counts = cells_anno_3['parcellation_division'].value_counts()
display(division_counts.to_frame())

Unnamed: 0_level_0,count
parcellation_division,Unnamed: 1_level_1
Isocortex,935742
STR,401346
CB,383127
HPF,304642
MB,281852
OLF,274354
MY,147562
P,136569
TH,133805
HY,132902


In [8]:
# The Zhuang dataset seems to have a separate annotation for ccf coordinates.
# Cell counts at the organ level, then at the structure level.
for level in ('parcellation_organ', 'parcellation_structure'):
    display(cells_anno_3[level].value_counts().to_frame())

Unnamed: 0_level_0,count
parcellation_organ,Unnamed: 1_level_1
brain,3671375
unassigned,68586


Unnamed: 0_level_0,count
parcellation_structure,Unnamed: 1_level_1
CP,234174
MOs,90859
MOp,83976
PIR,77916
MOB,77675
...,...
ISN,45
IV,36
ACVII,36
BA,25


In [9]:
# Parcellation columns, listed in the same order as in the original cell.
parc = [
    'parcellation_organ',
    'parcellation_category',
    'parcellation_division',
    'parcellation_structure',
    'parcellation_substructure',
]

# One row per distinct combination of parcellation labels present in the data.
df = cells_anno_3.loc[:, parc].drop_duplicates()
display(df)

Unnamed: 0,parcellation_organ,parcellation_category,parcellation_division,parcellation_structure,parcellation_substructure
0,brain,grey,HPF,DG,DG-po
1,brain,grey,P,TRN,TRN
2,brain,grey,P,P-unassigned,P-unassigned
3,brain,fiber tracts,cbf,arb,arb
7,brain,grey,P,PG,PG
...,...,...,...,...,...
743025,brain,grey,MB,IV,IV
910693,brain,grey,MB,IPN,IPN-unassigned
932528,brain,fiber tracts,cm,Xn,ts
958180,brain,grey,Isocortex,FRP,FRP6b


In [10]:
# Work on an independent (deep) copy so later edits to `meta` leave cells_anno_3 untouched.
meta = cells_anno_3.copy(deep=True)
print(meta.shape)

(3739961, 38)


In [11]:
# Re-derive the expression subset from `meta` so both share the same cells and order.
meta_cell_labels = meta['cell_label']
expr_data_ = expr_data[meta_cell_labels, :]
print(expr_data_.shape)

(3739961, 550)


In [12]:
# Gaps between consecutive distinct section z-positions.
# 200 micrometers is 0.2 mm.
z = np.sort(meta['z_section'].unique())
print(np.diff(z))

[0.2 0.4 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2
 0.2 0.2 0.4 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.4 0.2
 0.2 0.2 0.2 0.2 0.4 0.2 0.4 0.2 0.2 0.2 0.4 0.2 0.2 0.8 0.8 0.4]


In [13]:
# Making sure that all of this is from the same brain: strip everything after the
# first '.' in each section label and check that a single distinct prefix is left.
temp = (
    meta['brain_section_label']
    .drop_duplicates()
    .str.split('.')
    .str[0]
    .to_frame()
    .drop_duplicates()
)
assert len(temp) == 1, "multiple brains found"
display(temp)


Unnamed: 0,brain_section_label
0,C57BL6J-638850


In [14]:
# Optional restriction to a single section (currently disabled):
#meta = meta.loc[(meta['z_section']==3.6) & (meta['brain_section_label']=='C57BL6J-638850.19')]

# Rebuild the expression subset so it follows whatever rows `meta` currently holds.
selected_labels = meta['cell_label']
expr_data_ = expr_data[selected_labels, :]

In [15]:
# Shape of the current expression subset, shown as the cell's output.
expr_data_.shape

(3739961, 550)

In [16]:
# from scipy.spatial import KDTree
# import matplotlib.pyplot as plt
# import seaborn as sns
# custom_params = {"axes.spines.right": False, "axes.spines.top": False}
# sns.set_theme(style="ticks", font_scale=0.8, rc=custom_params)
# %config InlineBackend.figure_format='retina'

# kdt = KDTree(meta[['x_section','y_section','z_section']], leafsize=20)

In [17]:
# dist_mm, ind = kdt.query(meta[['x_section','y_section','z_section']].iloc[1000], 
#                          k=20)
# dist_um = dist_mm * 1e3
# display(meta.iloc[ind])

# f, ax = plt.subplots(1,1,figsize = (3,2))
# ax.plot(dist_um,'.')
# ax.set(xlabel = 'k-th neighbor', ylabel=r'distance ($\mu m$)', ylim = (0, 100))
# plt.show()

In [18]:
# Preview the first three rows of the metadata table.
display(meta.iloc[:3])

Unnamed: 0,cell_label,brain_section_label,cluster_alias,average_correlation_score,feature_matrix_label,donor_label,donor_genotype,donor_sex,x_section,y_section,...,parcellation_organ,parcellation_category,parcellation_division,parcellation_structure,parcellation_substructure,parcellation_organ_color,parcellation_category_color,parcellation_division_color,parcellation_structure_color,parcellation_substructure_color
0,1019171907102340387-1,C57BL6J-638850.37,1408,0.596276,C57BL6J-638850,C57BL6J-638850,wt/wt,M,7.226245,4.148963,...,brain,grey,HPF,DG,DG-po,#FFFFFF,#BFDAE3,#7ED04B,#7ED04B,#7ED04B
1,1104095349101460194-1,C57BL6J-638850.26,4218,0.64118,C57BL6J-638850,C57BL6J-638850,wt/wt,M,5.064889,7.309543,...,brain,grey,P,TRN,TRN,#FFFFFF,#BFDAE3,#FF9B88,#FFBA86,#FFBA86
2,1017092617101450577,C57BL6J-638850.25,4218,0.763531,C57BL6J-638850,C57BL6J-638850,wt/wt,M,5.792921,8.189973,...,brain,grey,P,P-unassigned,P-unassigned,#FFFFFF,#BFDAE3,#FF9B88,#FF9B88,#FF9B88


In [19]:
# Build the final in-memory AnnData (expression subset + cell metadata) and write it out.

# Rebind instead of mutating in place, so re-running this cell is harmless.
meta = meta.reset_index(drop=True)

# Load the backed subset into memory.
adata = expr_data_.to_memory()

# `adata.obs = meta` replaces the obs table row by row, so the metadata rows must
# list the same cells, in the same order, as the matrix rows. Check this explicitly
# here: a mismatch would otherwise silently attach metadata to the wrong cells.
assert adata.n_obs == len(meta), "expression matrix and metadata have different numbers of cells"
assert (adata.obs_names == meta['cell_label'].to_numpy()).all(), \
    "expression rows and metadata rows are not in the same cell order"

# NOTE(review): `meta` carries a default integer index, so the written obs index is
# positional; the cell labels remain available in the 'cell_label' column.
adata.obs = meta
adata.write_h5ad('../data/full_brain.h5ad')