In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

import anndata
from tqdm import tqdm
from loguru import logger
from helpers.paths import RAW_DATA_FP, DATA_SPLIT_FPS,PREPROC_DATA_FP


In [2]:
# Load the raw single-cell atlas (.h5ad) into an AnnData object.
# NOTE(review): this is an absolute, machine-specific path; RAW_DATA_FP is
# imported from helpers.paths but unused here -- presumably it was meant to
# replace this literal. Confirm before swapping it in.
raw_h5ad_path = '/Users/simon/GitHub/02-712-project/data/raw/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection.h5ad'
adata = sc.read(raw_h5ad_path)

In [3]:
# Display the AnnData summary (dimensions plus obs/var/uns/obsm/varm/layers keys)
# right after loading, as a baseline before any preprocessing.
adata

AnnData object with n_obs × n_vars = 44721 × 26361
    obs: 'Admission', 'ClusterID', 'DPS', 'DTF', 'Donor_full', 'HLA1', 'IFN1', 'Sex', 'Status', 'Ventilated', 'cell_type_coarse', 'cell_type_fine', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'percent_mt', 'percent_rpl', 'percent_rps', 'percent_rrna', 'seurat_clusters', 'singler'
    var: 'Selected', 'sct_detection_rate', 'sct_gmean', 'sct_residual_mean', 'sct_residual_variance', 'sct_variable', 'sct_variance'
    uns: 'assay', 'authors', 'disease', 'organism', 'preprint', 'short_name', 'tissue'
    obsm: 'X_pca', 'X_umap'
    varm: 'pca_feature_loadings'
    layers: 'matrix', 'norm_data', 'scale_data'

In [4]:
# Write the freshly loaded object back out to the intermediate-data folder.
# NOTE(review): the file name ("test_out") suggests a scratch round-trip check;
# the path is absolute and machine-specific -- confirm whether this cell is
# still needed or should use a path constant from helpers.paths.
roundtrip_h5ad_path = '/Users/simon/GitHub/02-712-project/data/intermediate/test_out.h5ad'
adata.write(roundtrip_h5ad_path)

In [6]:

# Announce, then compute per-cell / per-gene QC metrics. With inplace=True the
# results are stored on `adata` itself (new obs/var columns such as
# 'total_counts' and 'n_genes_by_counts') instead of being returned.
logger.info('Computing QC metrics for ingested data...')
sc.pp.calculate_qc_metrics(
    adata,
    inplace=True,
)

2020-11-21 10:31:45.255 | INFO     | __main__:<module>:1 - Computing QC metrics for ingested data...


In [7]:
# Display the AnnData summary again to confirm which obs/var columns the QC
# step added.
adata

AnnData object with n_obs × n_vars = 44721 × 26361
    obs: 'Admission', 'ClusterID', 'DPS', 'DTF', 'Donor_full', 'HLA1', 'IFN1', 'Sex', 'Status', 'Ventilated', 'cell_type_coarse', 'cell_type_fine', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'percent_mt', 'percent_rpl', 'percent_rps', 'percent_rrna', 'seurat_clusters', 'singler', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'Selected', 'sct_detection_rate', 'sct_gmean', 'sct_residual_mean', 'sct_residual_variance', 'sct_variable', 'sct_variance', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'assay', 'authors', 'disease', 'organism', 'preprint', 'short_name', 'tissue'
    obsm: 'X_pca', 'X_umap'
    varm: 'pca_feature_loadings'
    layers: 'matrix', 'norm_data', 'scale_data'

In [8]:
# Library-size normalisation: pass the per-cell target total explicitly so the
# chosen value is visible and easy to tune.
per_cell_target_sum = 1e4
sc.pp.normalize_total(adata, target_sum=per_cell_target_sum)

In [9]:
# Apply scanpy's log1p transform to `adata`. The return value is not captured,
# so this cell relies on scanpy modifying `adata` in place; re-running the cell
# would apply the transform a second time.
sc.pp.log1p(adata)

In [10]:

# Flag highly variable genes using mean/dispersion cut-offs. The thresholds are
# collected in one place so they can be inspected or tuned together.
hvg_thresholds = {
    'min_mean': 0.0125,
    'max_mean': 3,
    'min_disp': 0.5,
}
sc.pp.highly_variable_genes(adata, **hvg_thresholds)
# The actual filtering happens in the next cell; this only reports progress.
logger.info('Identified highly variate genes. Filtering out...')


2020-11-21 10:32:30.658 | INFO     | __main__:<module>:2 - Identified highly variate genes. Filtering out...


In [11]:

# Keep only the genes flagged in `adata.var.highly_variable` (populated by the
# preceding sc.pp.highly_variable_genes call).
#
# Slicing an AnnData returns a *view* onto the parent object. The following
# cells modify `adata` in place (regress_out, scale), which on a view forces an
# implicit copy and emits an ImplicitModificationWarning. Taking an explicit
# .copy() here makes `adata` an independent object so those in-place steps act
# on real data, not on a view.
adata = adata[:, adata.var.highly_variable].copy()
# NOTE: the regression itself runs in a later cell; this line only announces it.
logger.info('Regressing out effects of percentage of mitochondrial genes expressed')

2020-11-21 10:32:33.682 | INFO     | __main__:<module>:2 - Regressing out effects of percentage of mitochondrial genes expressed


In [14]:
# Regress out the listed per-cell covariates (both are columns of adata.obs:
# 'total_counts' comes from the QC step, 'percent_mt' ships with the dataset).
covariates_to_regress = ['total_counts', 'percent_mt']
sc.pp.regress_out(adata, covariates_to_regress)

# NOTE(review): the log text says values are "excluded", but scanpy documents
# `max_value` as clipping values to that bound after scaling -- confirm the
# wording matches the intent.
logger.info('Scaling the data to unit variance, excluding values exceeding standard dev of 10.')
sc.pp.scale(adata, max_value=10)

2020-11-21 10:36:48.110 | INFO     | __main__:<module>:2 - Scaling the data to unit variance, excluding values exceeding standard dev of 10.
