# BDSO Correlation
Compares the source taxonomies of the BDS Ontology with the CxG integrated MOp dataset to evaluate their compatibility.

In [48]:
## Import dependencies

import os
import pandas as pd
import anndata as ad
import numpy as np
import urllib.request as request
import zipfile

## Read h5ad (ann_data)

Please manually download the h5ad file from https://cellxgene.cziscience.com/collections/ae1420fe-6630-46ed-8b3d-cc6056a66467

In [84]:
# Open the manually downloaded h5ad file with backed="r" (see the anndata
# documentation for the semantics of backed mode).
ann_data = ad.read_h5ad(
    "h5ad-downloads/an-integrated-transcriptomic-cellxgene.h5ad",
    backed="r",
)

In [89]:
# Print the (rows, columns) shape of the observation annotations table.
print(ann_data.obs.shape)
# The trailing expression is rendered as the cell output: first four rows.
ann_data.obs.head(n=4)

(406187, 29)


Unnamed: 0,BICCN_cluster_id,QC,BICCN_cluster_label,BICCN_subclass_label,BICCN_class_label,cluster_color,size,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,...,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
SM-D9D8O_S03_E1-50,48.0,1201,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,304659,cell,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DAIH5_S47_E1-50,48.0,3678,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,319137,cell,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DD44L_S43_E1-50,4.0,4495,Lamp5 Slc35d3,Lamp5,GABAergic,#FFA388,194.0,GABAergicLamp5,ILX:0770149,EFO:0008930,...,298355,cell,GABAergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-D9E5O_S40_E1-50,49.0,1499,L6 CT Grp_2,L6 CT,Glutamatergic,#358459,24.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,306909,cell,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage


## Read cell_to_cell_set_assignments (c2c_data)

Downloads, unzips and loads the cell-to-cell-set assignments data.

In [79]:
# Only fetch and unpack the archive when the CSV is not already present in the
# current working directory, so re-running this cell does not download again.
csv_in_cwd = os.path.join(os.getcwd(), "cell_to_cell_set_assignments_CCN202002013.csv")
if not os.path.exists(csv_in_cwd):
    # Download the zipped assignments from the AllenInstitute GitHub repository.
    request.urlretrieve(
        "https://github.com/AllenInstitute/MOp_taxonomies_ontology/blob/main/mouseMOp_CCN202002013/cell_to_cell_set_assignments_CCN202002013.zip?raw=true",
        "cell_to_cell_set_assignments_CCN202002013.zip",
    )

    # Extract the archive contents into the current working directory.
    with zipfile.ZipFile("cell_to_cell_set_assignments_CCN202002013.zip", mode="r") as archive:
        archive.extractall(path=os.getcwd())

# Load the extracted CSV into a DataFrame.
c2c_data = pd.read_csv("cell_to_cell_set_assignments_CCN202002013.csv")

# Print the shape, then render the first five rows as the cell output.
print(c2c_data.shape)
c2c_data.head(n=5)

(159738, 258)


Unnamed: 0,sample_name,CS202002013_81,CS202002013_1,CS202002013_2,CS202002013_3,CS202002013_4,CS202002013_5,CS202002013_6,CS202002013_7,CS202002013_8,...,CS202002013_248,CS202002013_249,CS202002013_250,CS202002013_251,CS202002013_252,CS202002013_253,CS202002013_254,CS202002013_255,CS202002013_256,CS202002013_257
0,pBICCNsMMrMOpRAiF003d190318_AAACCCAAGGCCTTGC,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,pBICCNsMMrMOpRAiF003d190318_AAACCCAAGGCTAAAT,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,pBICCNsMMrMOpRAiF003d190318_AAACCCATCTGAGCAT,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,pBICCNsMMrMOpRAiF003d190318_AAACGAAAGGGCGAGA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,pBICCNsMMrMOpRAiF003d190318_AAACGAACAGAGATTA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Report the differences between c2c_data and ann_data

List all c2c_data sample names that don't exist in ann_data.

In [91]:
# Build sets of identifiers on both sides so membership checks are cheap.
adata_keys = set(ann_data.obs.index.tolist())
cdata_keys = set(c2c_data["sample_name"].unique().tolist())

# Sample names found in c2c_data that have no matching ann_data obs index.
diff = [
    sample_name
    for sample_name in cdata_keys
    if sample_name not in adata_keys
]
diff

[]

It seems c2c_data is a subset of ann_data!

In [108]:
# Keep only the ann_data observation rows whose index is one of the c2c_data
# sample names collected in cdata_keys.
adata_subset = ann_data.obs.loc[
    ann_data.obs.index.isin(cdata_keys)
]
adata_subset.shape

(159738, 29)

## Report the differences between ann_data and c2c_data

Filter ann_data to keep only the matching samples. However, ann_data has 406187 rows while c2c_data has only 159738 rows, so first check whether all of ann_data is needed for the comparison.

In [93]:
# Distinct assay values among the rows that matched c2c_data.
adata_subset["assay"].unique()

['10x 3' v3']
Categories (3, object): ['Smart-seq', '10x 3' v2', '10x 3' v3']

In [94]:
# Distinct suspension_type values among the rows that matched c2c_data.
adata_subset["suspension_type"].unique()

['nucleus']
Categories (2, object): ['cell', 'nucleus']

It seems we are only using assay="10x 3' v3" and suspension_type="nucleus"!

In [103]:
# Restrict the ann_data observations to the single assay / suspension_type
# combination observed in the c2c_data-matched subset.
# NOTE: the original cell referenced `adata` and `target_assay`, which are not
# defined anywhere in this notebook and fail on a fresh kernel; the names
# actually defined are `ann_data` and `adata_assay`.
adata_assay = ann_data.obs[ann_data.obs["assay"] == "10x 3' v3"]
adata_filtered = adata_assay[adata_assay["suspension_type"] == "nucleus"]
adata_filtered.shape

(199904, 29)

List all filtered ann_data rows that don't exist in c2c_data.

In [109]:
# Identifiers of the filtered ann_data rows, as a set.
adata_filtered_keys = set(adata_filtered.index.tolist())

# Identifiers present in the filtered ann_data but absent from c2c_data.
diff2 = [
    obs_name
    for obs_name in adata_filtered_keys
    if obs_name not in cdata_keys
]

# Select the corresponding rows from the filtered table.
only_anndata = adata_filtered.loc[adata_filtered.index.isin(diff2)]

# Print the shape, then render the first five rows as the cell output.
print(only_anndata.shape)
only_anndata.head(n=5)

(40166, 29)


Unnamed: 0,BICCN_cluster_id,QC,BICCN_cluster_label,BICCN_subclass_label,BICCN_class_label,cluster_color,size,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,...,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
AAACCCAAGCTCTTCC-1L8TX_181211_01_A02,59.0,1.0,L6b Shisa6,L6b,Glutamatergic,#2B9880,247.0,GlutamatergicL6b,ILX:0770163,EFO:0009922,...,427311,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage
AAACCCAAGTCGAATA-1L8TX_181211_01_A02,54.0,2.0,L6 CT Brinp3,L6 CT,Glutamatergic,#338C5E,3970.0,GlutamatergicL6 CT,ILX:0770162,EFO:0009922,...,427311,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage
AAACCCACAACCCTAA-1L8TX_181211_01_A02,11.0,3.0,Vip Chat,Vip,GABAergic,#FF00FF,519.0,GABAergicVip,ILX:0770151,EFO:0009922,...,427311,nucleus,GABAergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage
AAACCCAGTATAGCTC-1L8TX_181211_01_A02,34.0,4.0,L5 IT Rspo1_3,L5 IT,Glutamatergic,#3CBC45,1838.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,...,427311,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage
AAACCCAGTCAGACTT-1L8TX_181211_01_A02,30.0,5.0,L2/3 IT_2,L2/3 IT,Glutamatergic,#7C8169,2105.0,GlutamatergicL2/3 IT,ILX:0770156,EFO:0009922,...,427311,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage
