# Putting together chunks for preprocessing
1. Use ```glob``` to find all the chunks
2. Run this pipeline for multidataset integration: https://chanzuckerberg.github.io/cellxgene-census/notebooks/analysis_demo/comp_bio_data_integration_scvi.html
3. Run PCA to get the ```X_pca``` embedding of the data (stored in ```.obsm```)

# Steps prior to generating the UMAP
1. Normalization
- The CZI data normalises to a sum of 1e1, but the integration procedure uses 1e4. Since raw counts are available in the data, normalising them with a target sum of 1e4 makes more sense.
2. log1p transformation
3. Scaling



In [1]:
import cellxgene_census
import pandas as pd
import scanpy as sc
import anndata
import numpy as np
import os
from IPython.display import display # easy viewing of dataframes
import glob
from pathlib import Path
import harmonypy
import matplotlib as plt
from scipy.sparse import csr_matrix
import scvi 

Global seed set to 0


In [None]:
# NOTE(review): this whole cell is disabled (commented out). Judging by the code below,
# it combined the per-chunk .h5ad files into a single AnnData object, added a string
# `batch` column copied from `dataset_id`, and saved the result to
# ../data/datasets/cellxgene_covid_pbmc.h5ad. The following cells read
# cellxgene_covid_pbmc_processed.h5ad instead; how that file was produced from this
# one is not shown here — confirm before re-enabling.

# chunk_datapath = Path("../data/cellxgene_data") # set datapath
# chunk_files = glob.glob(str(chunk_datapath) + "/*.h5ad")  # use glob to find all the .h5ad files

# # Read all files into a list
# adata_list = [sc.read_h5ad(f) for f in chunk_files] 
# adata = anndata.concat(adata_list, merge="same") # concatenate adata

# # Set new column batch based on dataset ID
# adata.obs["batch"] = adata.obs["dataset_id"].astype(str)  # Ensure it's a string

# adata.write("../data/datasets/cellxgene_covid_pbmc.h5ad")
# # Restart kernel at this point

In [None]:
# Check for batch induced clustering
covid_data_read = sc.read_h5ad("../data/datasets/cellxgene_covid_pbmc_processed.h5ad")
del covid_data_read.obsm["X_pca"]

  utils.warn_names_duplicates("obs")


In [3]:
# Display the AnnData summary (dimensions plus the obs / var / uns / varm / layers keys).
covid_data_read

AnnData object with n_obs × n_vars = 4090123 × 61891
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars', 'batch'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs'
    uns: 'pca'
    varm: 'PCs'
    layers: 'normalized'

In [14]:
# Tally how many cells carry each cell-type annotation.
covid_data_read.obs["cell_type"].value_counts()

cell_type
classical monocyte                            499181
B cell                                        489978
CD8-positive, alpha-beta T cell               469326
CD4-positive, alpha-beta T cell               365524
natural killer cell                           235321
                                               ...  
mast cell                                         59
group 2 innate lymphoid cell, human               57
myeloid lineage restricted progenitor cell        49
T-helper 2 cell                                   37
T-helper 17 cell                                  12
Name: count, Length: 97, dtype: int64

In [15]:
# Collect the obs column names and print them for reference.
obs_cols = list(covid_data_read.obs.columns)
print(obs_cols)

# For every categorical obs column, drop category levels that no longer occur.
# `pd.api.types.is_categorical_dtype` is deprecated in recent pandas; checking the
# column's dtype against `pd.CategoricalDtype` is the supported replacement.
for colname in obs_cols:
    if isinstance(covid_data_read.obs[colname].dtype, pd.CategoricalDtype):
        covid_data_read.obs[colname] = covid_data_read.obs[colname].cat.remove_unused_categories()

['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars', 'batch']


  if pd.api.types.is_categorical_dtype(covid_data_read.obs[colname]):


In [18]:
# Count the number of cells contributed by each source dataset.
covid_data_read.obs["dataset_id"].value_counts()

dataset_id
9dbab10c-118d-496b-966a-67f1763a6b7d    1234186
ebc2e1ff-c8f9-466a-acf4-9d291afaf8b3     724710
c7775e88-49bf-4ba2-a03b-93f00447c958     527286
01ad3cd7-3929-4654-84c0-6db05bd5fd59     425398
21d3e683-80a4-4d9b-bc89-ebb2df513dde     178805
2a498ace-872a-4935-984b-1afa70fd9886     151312
96a3f64b-0ee9-40d8-91e9-813ce38261c9     135892
30cd5311-6c09-46c9-94f1-71fe4b91813c      99152
c2a461b1-0c15-4047-9fcb-1f966fe55100      97499
bc2a7b3d-f04e-477e-96c9-9d5367d5425c      68011
fa8605cf-f27e-44af-ac2a-476bee4410d3      57552
5e717147-0f75-4de1-8bd2-6fda01b8d75f      45170
242c6e7f-9016-4048-af70-d631f5eea188      36906
de2c780c-1747-40bd-9ccf-9588ec186cee      31463
db0752b9-f20e-40b8-8997-992f3ae0bb2e      30126
456e8b9b-f872-488b-871d-94534090a865      28094
4c4cd77c-8fee-4836-9145-16562a8782fe      27214
055ca631-6ffb-40de-815e-b931e10718c0      25540
59b69042-47c2-47fd-ad03-d21beb99818f      23185
bc260987-8ee5-4b6e-8773-72805166b3f7      15983
ae5341b8-60fb-4fac-86db-86e49

In [None]:
# Persist the modified object. The target is the same file this object was read from
# earlier in the notebook, so the copy on disk is overwritten.
covid_data_read.write_h5ad(filename="../data/datasets/cellxgene_covid_pbmc_processed.h5ad")

In [17]:
# Load the processed dataset from disk under a new name for the subsampling steps.
covid_data_processed = sc.read_h5ad(filename="../data/datasets/cellxgene_covid_pbmc_processed.h5ad")

  utils.warn_names_duplicates("obs")


In [18]:
# Re-index obs by `soma_joinid` (the author notes the existing obs names are not
# unique because the data were concatenated).
# The guard keeps this cell re-runnable: `set_index` removes the column from obs, so
# a second run of an unconditional call would raise KeyError.
if "soma_joinid" in covid_data_processed.obs.columns:
    obs_by_joinid = covid_data_processed.obs.set_index("soma_joinid")
    # Cast the new index to strings before handing it to AnnData, which expects
    # string obs names.
    obs_by_joinid.index = obs_by_joinid.index.astype(str)
    # Assign through the `obs` setter instead of mutating the frame in place, so the
    # new index goes through AnnData's own checks.
    covid_data_processed.obs = obs_by_joinid

In [19]:
# Cast the observation names to strings and assign them back.
# NOTE(review): presumably needed because the index was just taken from `soma_joinid`,
# which looks like a numeric ID, while AnnData expects string obs names — confirm.
covid_data_processed.obs_names = covid_data_processed.obs_names.astype(str)

In [23]:
# Draw a 5% random subset of the cells with a fixed seed; `copy=True` returns a new
# object and leaves `covid_data_processed` untouched.
czi_covid_pbmc_5pct = sc.pp.subsample(
    covid_data_processed,
    fraction=0.05,
    random_state=42,
    copy=True,
)

In [26]:
# Draw a 2% random subset of the cells with its own fixed seed; `copy=True` returns a
# new object and leaves `covid_data_processed` untouched.
czi_covid_pbmc_2pct = sc.pp.subsample(
    covid_data_processed,
    fraction=0.02,
    random_state=21,
    copy=True,
)

In [None]:
# Observation names that appear in both the 2% and the 5% subsample.
names_2pct = czi_covid_pbmc_2pct.obs_names
names_5pct = czi_covid_pbmc_5pct.obs_names
common_obs = names_2pct.intersection(names_5pct)

# Report the size of the overlap (4084 when this cell was last run).
print(len(common_obs))

4084


In [31]:
# Save both subsamples to disk, 5% first and then 2%.
for subsample, out_path in (
    (czi_covid_pbmc_5pct, "../data/datasets/czi_covid_pbmc_5pct.h5ad"),
    (czi_covid_pbmc_2pct, "../data/datasets/czi_covid_pbmc_2pct.h5ad"),
):
    subsample.write_h5ad(out_path)