## Template 23/08/22


We need to load the following libraries. Make sure that the anndata version you have installed matches the one used by [cellxgene-schema](https://github.com/chanzuckerberg/single-cell-curation)

In [6]:
import pandas as pd
import scanpy as sc 
import anndata as ad
import numpy as np
from scipy.sparse import csr_matrix

## Read data

Read the annotated and raw h5ad, or h5 file

In [8]:
adata = sc.read("an-integrated-transcriptomic-cellxgene.h5ad")

In [10]:
adata.obs

Unnamed: 0,BICCN_cluster_id,QC,BICCN_cluster_label,BICCN_subclass_label,BICCN_class_label,cluster_color,size,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,...,is_primary_data,organism_ontology_term_id,cell_type,assay,disease,organism,sex,tissue,ethnicity,development_stage
SM-D9D8O_S03_E1-50,48.0,1201,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DAIH5_S47_E1-50,48.0,3678,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DD44L_S43_E1-50,4.0,4495,Lamp5 Slc35d3,Lamp5,GABAergic,#FFA388,194.0,GABAergicLamp5,ILX:0770149,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-D9E5O_S40_E1-50,49.0,1499,L6 CT Grp_2,L6 CT,Glutamatergic,#358459,24.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
LS-15533_S14_E12-50,37.0,391,L5 IT S100b_1,L5 IT,Glutamatergic,#00CF1F,251.0,GlutamatergicL5 IT,ILX:0770157,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SM-GE66G_S146_E1-50,37.0,6098,L6 CT,L6 CT,Glutamatergic,#197A6E,1288.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-GE64N_S067_E1-50,22.0,1714,Pvalb Il1rapl2,Pvalb,GABAergic,#BC2C41,163.0,GABAergicPvalb,ILX:0770154,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-GE66N_S162_E1-50,37.0,4802,L6 CT,L6 CT,Glutamatergic,#197A6E,1288.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
SM-GE66A_S184_E1-50,18.0,5295,Sst Pvalb Th,Sst,GABAergic,#FF2F7E,25.0,GABAergicSst,ILX:0770152,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage


In [16]:
adata.obs.keys()

Index(['BICCN_cluster_id', 'QC', 'BICCN_cluster_label', 'BICCN_subclass_label',
       'BICCN_class_label', 'cluster_color', 'size', 'temp_class_label',
       'BICCN_ontology_term_id', 'assay_ontology_term_id', 'sex_original',
       'disease_ontology_term_id', 'tissue_ontology_term_id', 'BICCN_project',
       'cell_type_ontology_term_id', 'ethnicity_ontology_term_id',
       'development_stage_ontology_term_id', 'sex_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'cell_type', 'assay',
       'disease', 'organism', 'sex', 'tissue', 'ethnicity',
       'development_stage'],
      dtype='object')

In [33]:
adata.obs['BICCN_cluster_id']

SM-D9D8O_S03_E1-50     48.0
SM-DAIH5_S47_E1-50     48.0
SM-DD44L_S43_E1-50      4.0
SM-D9E5O_S40_E1-50     49.0
LS-15533_S14_E12-50    37.0
                       ... 
SM-GE66G_S146_E1-50    37.0
SM-GE64N_S067_E1-50    22.0
SM-GE66N_S162_E1-50    37.0
SM-GE66A_S184_E1-50    18.0
SM-GE66O_S139_E1-50    22.0
Name: BICCN_cluster_id, Length: 406187, dtype: float64

In [18]:
adata.X

<406187x22356 sparse matrix of type '<class 'numpy.float32'>'
	with 1782527167 stored elements in Compressed Sparse Row format>

In [19]:
adata.var

Unnamed: 0_level_0,Unnamed: 0,feature_biotype,feature_is_filtered,feature_name,feature_reference
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000029422,7173.0,gene,False,Rsrc2,NCBITaxon:10090
ENSMUSG00000049036,18484.0,gene,False,Tmem121,NCBITaxon:10090
ENSMUSG00000029577,7022.0,gene,False,Ube3b,NCBITaxon:10090
ENSMUSG00000040746,15950.0,gene,False,Rnf167,NCBITaxon:10090
ENSMUSG00000020590,17963.0,gene,False,Snx13,NCBITaxon:10090
...,...,...,...,...,...
ENSMUSG00000102368,792.0,gene,False,4930590L20Rik,NCBITaxon:10090
ENSMUSG00000021033,18266.0,gene,False,Gstz1,NCBITaxon:10090
ENSMUSG00000030657,10469.0,gene,False,Xylt1,NCBITaxon:10090
ENSMUSG00000037924,1043.0,gene,False,Olfr16,NCBITaxon:10090


In [20]:
adata.obs

Unnamed: 0,BICCN_cluster_id,QC,BICCN_cluster_label,BICCN_subclass_label,BICCN_class_label,cluster_color,size,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,...,is_primary_data,organism_ontology_term_id,cell_type,assay,disease,organism,sex,tissue,ethnicity,development_stage
SM-D9D8O_S03_E1-50,48.0,1201,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DAIH5_S47_E1-50,48.0,3678,L6 CT Grp_1,L6 CT,Glutamatergic,#197A6E,781.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-DD44L_S43_E1-50,4.0,4495,Lamp5 Slc35d3,Lamp5,GABAergic,#FFA388,194.0,GABAergicLamp5,ILX:0770149,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-D9E5O_S40_E1-50,49.0,1499,L6 CT Grp_2,L6 CT,Glutamatergic,#358459,24.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
LS-15533_S14_E12-50,37.0,391,L5 IT S100b_1,L5 IT,Glutamatergic,#00CF1F,251.0,GlutamatergicL5 IT,ILX:0770157,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SM-GE66G_S146_E1-50,37.0,6098,L6 CT,L6 CT,Glutamatergic,#197A6E,1288.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-GE64N_S067_E1-50,22.0,1714,Pvalb Il1rapl2,Pvalb,GABAergic,#BC2C41,163.0,GABAergicPvalb,ILX:0770154,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,male,primary motor cortex,na,early adult stage
SM-GE66N_S162_E1-50,37.0,4802,L6 CT,L6 CT,Glutamatergic,#197A6E,1288.0,GlutamatergicL6 CT,ILX:0770162,EFO:0008930,...,True,NCBITaxon:10090,glutamatergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage
SM-GE66A_S184_E1-50,18.0,5295,Sst Pvalb Th,Sst,GABAergic,#FF2F7E,25.0,GABAergicSst,ILX:0770152,EFO:0008930,...,True,NCBITaxon:10090,GABAergic neuron,Smart-seq,normal,Mus musculus,female,primary motor cortex,na,early adult stage


In [21]:
a_obs = pd.DataFrame()

In [24]:
a_obs['author_cell_type'] = adata.obs['BICCN_cluster_label']
a_obs['author_subclass_type'] = adata.obs['BICCN_subclass_label']
a_obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id']

In [25]:
a_obs.to_csv("a_obs_layer.csv")

In [34]:
xls = ExcelFile('path_to_file.xls')
di = xls.parse(xls.sheet_names[0])
print di.to_dict()

SyntaxError: invalid syntax (3116784630.py, line 3)

In [None]:
di = {'CL:0009111': "CL:0000844", 'CL:0009112': "CL:0000845"}
adata.obs.map({"cell_type_ontology_term_id": di})

In [None]:
adata.obs['cell_type_ontology_term_id'][adata.obs['cell_type_ontology_term_id'] == 'CL:0009111'] = 'CL:0000844'

In [None]:
adata.write("To_Cellxgene_spacetime_200922.h5ad")

In [None]:
rdata = sc.read("Full_obj_raw_counts_nosoupx.h5ad")

Use the below cells if the given data is in the h5 file format

In [None]:
adata = sc.read_10x_h5("Full_ob_raw_counts_soupx.h5")

In [None]:
rdata = sc.read_10x_h5("Full_ob_raw_counts_nosoupx.h5")

## Prepare the hd5a layers need for CellxGene

### Prepare the obs layer

First we extract the obs layer from the h5ad file that has been provided to us. Then we extract out the obs layer as a csv file. 



In [None]:
a_obs = adata.obs
a_obs.to_csv("a_obs_layer.csv")

In [None]:
r_obs = rdata.obs
r_obs.to_csv("r_obs_layer.csv")

Download the excel spreadsheet from the INGEST data submission, and navigate to the sequence_file page.
Select the cell_suspension.biomaterial_core.biomaterial_id, cell_suspension.uuid, and library_preparation_protocol.protocol_core.protocol_id and filter for duplicates on those keys. This results in a unique set of cell_suspensions with their related data. 

See script here: https://github.com/ebi-ait/ingest-cellxgene-submitter#create-obs-layer-from-multiple-cell-suspension-uuids

The manual work here is  matching up unique HCA cell_suspensions and metadata with the  unique samples in the provided-h5ad file. We then generate a final obs_layer.csv which combines HCA metadata with the provided metadata from the contributor in matching rows. There is an opportunity for future automation here for scripts to perform the matching and to provide cell-type ontology terms. 

For a list of fields which should be in the final obs_layer.csv see (https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/2.0.0/schema.md#obs-cell-metadata)
- assay_ontology_term_id
- cell_type_ontology_term_id
- development_stage_ontology_term_id
- disease_ontology_term_id
- ethnicity_ontology_term_id
- is_primary_data
- organism_ontology_term_id
- sex_ontology_term_id
- tissue_ontology_term_id
- donor_id

There is also some manual work here to select cell-type ontology terms from the free text cell-type provided by the contributors. There is an opportunity for future automation here to get scripts / work with other experts in ontology to do this. 

Once we have the final combined obs_layer, then save it as "obs_layer.csv" and save it as a dobs2 object

In [None]:
dobs2 = pd.read_csv("obs_layer.csv", sep=",")

In [None]:
dobs2 = dobs2.set_index("barcodes", inplace = False)

Check here for redundant fields in the obs layer (e.g. tissue_label, donor_ext). Assay_ontology_term_id should be '10x v2 5' sequencing' as opposed to '10x TCR'.
Development_stage and tissue should be made specific (e.g. 35-year old human stage).
Unannotated cells use CL:0000003 (native cell).
Check that certain fields in the obs layer should not be continuous / numerical, but should be categorical or string. Use the cells below to fix this if necessary.

In [None]:
adata.obs = adata.obs.drop(['Unnamed: 18','author_tissue','tissue_label','donor_ext'], axis = 1)

In [None]:
adata.obs.dtypes

In [None]:
adata.obs = adata.obs.astype({'batch': 'category','cluster':'category'}, copy = False)

### Prepare the uns layer

In cases where the uns layer already exists, rather than creating the uns layer, we will add to it.
In this case, schema_version is 2.0.0 (although 3.0.0 is coming soon). Title is the name of the dataset. X_normalization is being deprecated in 3.0.0, but is the method of generating the X layer from the raw.X. 

In [None]:
uns = adata.uns
uns.keys

In [None]:
uns['schema_version']='2.0.0'
uns['title'] = "Total - Cells of the human intestinal tract mapped across space and time"
uns["X_normalization"] = "10x Genomics Visium spatial sequencing samples were aligned to the human transcriptome GRCh38-3.0.0 reference (consistently with single-cell RNA-seq samples) using 10x Genomics SpaceRanger v.1.2.1 and exonic reads were used to produce mRNA count matrices for each sample. 10x Genomics SpaceRanger was also used to align paired histology images with mRNA capture spot positions in the Visium slide. The paired image was used to determine the average number of nuclei per Visium location in the tissue and used as a hyperparameter in the spatial mapping of cell types."
uns

Else if the uns layer does not exist, then we will create it 

In [None]:
uns ={
    "schema_version": "2.0.0",
    "title": "Transcriptomic characterisation of haematopoietic stem and progenitor cells from human adult bone marrow, spleen and peripheral blood",
    "X_normalization": "The 10x count matrices from both donors (after multiplet filtering) were processed with the Seurat (version 2.3.4) R package",
}

### Check what's in the X, var and obs layers before editing them

Check that the obs_names from both annotated and raw matrices are the same

In [None]:
t = normdata.obs_names==rawdata.obs_names
np.count_nonzero(t)==normdata.X.shape[0]

Assuming both obs layers do not match, not even in length, we would need to select from the raw layer those samples that are in the annotated layer. If they do match then ignore this. 

keep_rindices is a measure for the raw layer obs samples which are also in the annotated layer. 

In [None]:
condition_rdata = rdata.obs_names.isin(adata.obs_names)
keep_rdata = rdata.obs_names[condition_rdata]
keep_rindices = np.where(condition_rdata == True)

### Prepare the var layer

In this case, we may need to perform different steps:
- Map the gene IDs to ENSEMBL IDs (using a specific GTF file)
- If any genes genes have been filtered out when the authors processed the raw matrix, they will not available in the annotated matrix. We need to add them to the processed matrix. We checked this by inspecting the matrices, but can be checked again by running the following cell

Check that both dataframes have the same number of rows. If they are different, the authors filtered out some genes from the PROCESSED, and we will need to add them in. 


In [None]:
dvar = pd.DataFrame(data=adata.var)
rvar = pd.DataFrame(data=rdata.var)
dvar.shape[0] == rvar.shape[0]

Some of the features have been filtered out of the processed matrices. We have to add the filtered-out genes at the end of the matrices. For that, we are gonna first fill in the *feature_is_filtered* column at the rvar dataframe. We can then create a new dataframe dropping all the non filtered gene, and add this dataframe with the filtered genes at the end of dvar

In [None]:
genes_add = [x for x in rdata.var.index.to_list() if x not in adata.var.index.to_list()]
all_genes = adata.var.index.to_list()
all_genes.extend(genes_add)
new_var = pd.DataFrame(index=all_genes)
new_var = pd.merge(new_var, rdata.var, left_index=True, right_index=True, how='left')
new_var['feature_is_filtered'] = False
new_var.loc[genes_add, 'feature_is_filtered'] = True

In [None]:
adata.var

In [None]:
rdata.var

In [None]:
new_var

In [None]:
rvar = rvar.set_index("gene_ids")

In [None]:
dvar = pd.read_csv('dvar.csv')

In [None]:
dvar.pop('Gene_symbols')

In [None]:
dvar = dvar.set_index("gene_ids")

### Prepare the X layer

In this case, we need to:
- Append to the processed X layer the counts for the genes that were filtered out. We will set them to 0.

#### Filter out the genes that did not map to ENSEMBL IDs

In [None]:
if rdata.shape[1] > adata.shape[1]:
    genes_add = [x for x in rdata.var.index.to_list() if x not in adata.var.index.to_list()]
    new_matrix = csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = rdata.shape)
    all_genes = adata.var.index.to_list()
    all_genes.extend(genes_add)
    new_var = pd.DataFrame(index=all_genes)
    new_var = pd.merge(new_var, rdata.var, left_index=True, right_index=True, how='left')
    new_var['feature_is_filtered'] = False
    new_var.loc[genes_add, 'feature_is_filtered'] = True
    new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=dvar, uns=adata.uns, obsm=adata.obsm)
    new_adata = new_adata[:,rdata.var['gene_ids'].to_list()]
else:
    print('whoops')

In [None]:
new_adata = new_adata[:,cxg_adata_raw.var.index.to_list()]


In [None]:
print(new_adata.X)

In [None]:
dExprs = normdata.X.toarray()
dExprs = pd.DataFrame(data=dExprs)
print(dExprs.shape)

In [None]:
rExprs = rawdata.X.toarray()

In [None]:
rExprs = rExprs.set_axis(rvar.index.to_list(), axis=1, inplace=False)

#### Append the genes filtered out by the authors when processing the X layer

In [None]:
genesToAdd = dvar.loc[dvar['feature_is_filtered'] == True]

In [None]:
dExprs = dExprs.reindex(columns=[*dExprs.columns.tolist(), *genesToAdd.index.to_list()], fill_value=0.0)

In [None]:
dExprs

In [None]:
dExprs.to_csv("dExprs_added_genes.csv")

In [None]:
dExprs = dExprs.set_axis(dvar.index.to_list(), axis=1, inplace=False)
print(dExprs.shape)

## Prepare the hd5a file


In [None]:
# Generate annotated h5ad matrix
# To speed up the matrices can also run: adata.X = csr_matrix(adata.X)
del dExprs


In [None]:
hdata = ad.AnnData(X=dExprs, var=dvar)
del dExprs_np
del dvar


In [None]:
hdata.uns = uns


In [None]:
hdata.obs = dobs2


In [None]:
hdata.obsm = normdata.obsm

In [None]:
hdata.obs

In [None]:
# Generate annotated h5ad matrix
# To speed up the matrices can also run: adata.X = csr_matrix(adata.X)
hdata = ad.AnnData(X=dExprs, obs=dobs2, var=dvar ,obsm=normdata.obsm, uns=uns)

Save the annotated matrix

In [None]:
new_adata.var.pop('mt')

In [None]:
new_adata.var = new_adata.var.astype({'mt': 'string'})


In [None]:
new_adata.var.dtypes

In [None]:
new_adata.obs['Age']

In [None]:
new_adata.write_h5ad("normalized_spacetime_matrix_edited.h5ad")

In [None]:
hdata = new_adata

Generate raw h5ad matrix, in this case raw.var is not the same as var, as some genes have been filtered out

In [None]:
# Generate raw h5ad matrix, raw.var is not always going to be the same as var, but it is in this case because the
# genes being investigated are identical and nothing was filtered out 
idata = ad.AnnData(X=cxg_adata_raw.X, var=rvar)

In [None]:
idata.write("raw_spacetime_matrix_edited.h5ad")

## Filter allowed ENSEMBL IDs
CellxGene only allows a set of genes (ENSEMBL IDs) to be included in their matrices. We need to filter those out.

In [None]:
# Read in the saved objects if you start from here this section
hdata = sc.read('normalized_spacetime_matrix_edited.h5ad')
idata = sc.read('raw_spacetime_matrix_edited.h5ad')

In [None]:
idata.var.drop("noVersionENS",axis=1,inplace=True)

In [None]:
hdata.var

In [None]:
# idata.var.set_index("noVersionENS", inplace=True)
idata.var = idata.var.drop('feature_is_filtered', axis = 'columns')

In [None]:
idata.var

In [None]:
hdata.var.index

In [None]:
#  Filter out allowed ENSEMBL IDs from annotated matrix
approved = pd.read_csv('/Users/wteh/Documents/Wrangling/Cellxgene/KnowledgeShare/allowed_genes.txt',dtype='str')['feature_id'].to_list()

var_to_keep = hdata.var.index.tolist()
var_to_keep
var_in_approved = hdata.var.index[hdata.var.index.isin(approved)].tolist()
var_to_keep = [e for e in var_to_keep if e in var_in_approved]
hdata = hdata[:, var_to_keep]

In [None]:
var_to_keep = idata.var.index.tolist()
var_to_keep
var_in_approved = idata.var.index[idata.var.index.isin(approved)].tolist()
var_to_keep = [e for e in var_to_keep if e in var_in_approved]
idata = idata[:, var_to_keep]

In [None]:
hdata.var.shape

## Add raw matrix to hdata


In [None]:
# Add raw X matrix and raw var to the h5ad matrix 
hdata.raw = idata

In [None]:
#if needed, transfer to sparse matrix format
if type(hdata.X) != "sparse.csr.csr_matrix":
    print('converting X to sparse')
    hdata.X = csr_matrix(hdata.X)
# if hdata.raw:
#     if type(hdata.raw.X) != "sparse.csr.csr_matrix":
#         print('converting raw.X to sparse')
#         raw_adata = ad.AnnData(hdata.raw.X, var=hdata.raw.var, obs=hdata.obs)
#         raw_adata.X = csr_matrix(raw_adata.X)
#         hdata.raw = raw_adata

In [None]:
hdata.write("To_Cellxgene_spacetime_230822.h5ad")

In [None]:
adata.obs.dtypes

## Validate before sending to CellxGene


Run the cellxgene-schema validate tool which validates if a h5ad object meets cellxgene schema. See details here: https://github.com/chanzuckerberg/single-cell-curation

In [None]:
hdata = sc.read('To_Cellxgene_spacetime_190822.h5ad')


In [None]:
hdata.obs = dobs2

In [None]:
hdata.obs

In [None]:
hdata.uns = uns

In [None]:
hdata.obs['disease_ontology_term_id']

In [None]:
hdata.obs['disease_ontology_term_id'][hdata.obs['disease_ontology_term_id'] == 'EFO:0000384'] = 'MONDO:0005011'

In [None]:
MONDO:0005011

In [None]:
rawdata.var

In [None]:
hdata.raw.var


In [None]:
normdata.var

In [None]:
hdata.var

In [None]:
hdata.obs = hdata.obs.drop({'Age','Gender','Region code','sample name'}, axis = 'columns')

In [None]:
hdata.var

In [None]:
hda

In [None]:
hdata.var['feature_biotype'] = 'gene'

In [None]:
hdata.raw.var['feature_biotype'] = 'gene'

In [None]:
hdata.obs = hdata.obs.astype({"disease_ontology_term_id":"category"})

In [None]:
hdata.obs.dtypes

In [None]:
hdata.var.pop('mt')

In [None]:
hdata.obs['is_primary_data']

In [None]:
hdata.obs['is_primary_data'] = hdata.obs['is_primary_data'].replace('no', False)


In [None]:
hdata.obs = hdata.obs.astype({"is_primary_data":"bool"})

In [None]:
hdata.obs['cell_type_ontology_term_id'][hdata.obs['cell_type_ontology_term_id'] == 'CL:0009111'] = 'CL:0000844'

In [None]:
hdata.obs['cell_type_ontology_term_id']

In [None]:
CL: 0009112, CL: 0009111, CL: 0009105

In [None]:
CL:0009112 - centroblast - CL:0000844 (germinal center B cell)
CL:0009111 - centrocyte - CL:0000844 (germinal center B cell)
CL:0009105 - T cell zone reticular cell - CL:0009102 (lymph node fibroblastic reticular cell)

In [None]:
rExprs = rExprs.set_axis(rvarOrIndex, axis=1, inplace=False)
rExprs = rExprs.loc[:,rKeepFeatIndex]
print(rExprs.shape)