# KeapKO, Nrf2KO, WT days 5 and 8
Python analysis using the scanpy 1.9 Docker image: cr.gitlab.uzh.ch/elena.duerst/docker-images/scanpy1p9_bioc:0.10

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
# import scanorama
import scipy
import pathlib
import anndata as anndata
import scvelo as scv
import matplotlib as mpl
#import helpers
#import scanpy_cluster_proportions

In [None]:
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
import anndata2ri
import logging

In [None]:
# Configure the Python <-> R bridge used later by `normalize_scran` (which
# runs an `%R` line magic).
# Only log messages of level ERROR and above from rpy2's callback logger.
rcb.logger.setLevel(logging.ERROR) # Ignore R warning messages
# Activate the rpy2 pandas converter and the anndata2ri converter.
ro.pandas2ri.activate()
anndata2ri.activate()
# Load the rpy2 IPython extension (IPython-only syntax; not valid plain Python).
%load_ext rpy2.ipython

In [None]:
# Matplotlib overrides applied globally: disable LaTeX text rendering and set
# the SVG font type to 'none'.
new_rc_params = {}
new_rc_params["text.usetex"] = False
new_rc_params["svg.fonttype"] = "none"
mpl.rcParams.update(new_rc_params)

In [None]:
# --- scanpy global settings ---
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=100,
    dpi_save=300,
    figsize=(4, 4),
    facecolor='white',
    frameon=False,
)

# Directory scanpy uses for figures it saves.
results_path = '../results/preprocessing'
sc.settings.figdir = results_path

# --- plot styling constants ---
aspect_ratio = 1
umap_point_size, umap_transparency = 30, 0.3
umap_continuous_point_size, umap_continuous_transparency = 30, 0.7

# When False, the QC violin cells pass save=None instead of a file name.
save_figure = False

In [None]:
# Locations of the raw input matrices and of the intermediate outputs.
data_path = pathlib.Path('../data/raw')
intermediate_data_path = pathlib.Path('../data/intermediate/')
# Create the output directory (and any missing parents) if it does not exist.
intermediate_data_path.mkdir(exist_ok=True, parents=True)
results_file = intermediate_data_path / 'preprocessed.h5ad'

# Data import

# TODO: use GEO link as backup link

In [None]:
# Load every 10x filtered feature-barcode matrix found in `data_path` into its
# own AnnData object. `adatas` maps sample name -> AnnData, where the sample
# name is the file name without the matrix suffix.
matrix_suffix = '_filtered_feature_bc_matrix.h5'
# Sort the matches: glob() gives no ordering guarantee, and a fixed order keeps
# the printed output and all per-sample plots reproducible between runs.
sample_files = sorted(data_path.glob(f'*{matrix_suffix}'))
if not sample_files:
    # Fail early; otherwise every later per-sample loop silently does nothing.
    raise FileNotFoundError(f"No '*{matrix_suffix}' files found in {data_path}")
adatas = {}
for sam in sample_files:
    adata = sc.read_10x_h5(sam)
    # Every matched name ends with the suffix, so cut it off the end only.
    sample_name = sam.name[: -len(matrix_suffix)]
    adata.obs['sample'] = sample_name
    print(sample_name)
    print(adata.shape)
    adata.var_names_make_unique()
    adatas[sample_name] = adata

# QC

In [None]:
 # Flag mitochondrial ('mt-' prefix) and ribosomal-protein genes in `.var`, then
 # compute the per-cell QC metrics for both gene sets in a single pass
 # (adds e.g. `pct_counts_mt` and `pct_counts_Rp` to `.obs`).
 for name, adata in adatas.items():
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    # Restrict to Rps*/Rpl* symbols: a bare 'Rp' prefix also matches unrelated
    # genes such as Rpa1, Rpe or Rptor and inflates the ribosomal fraction.
    # str.match anchors the pattern at the start of each name.
    adata.var['Rp'] = adata.var_names.str.match(r'Rp[sl]')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'Rp'], percent_top=None, log1p=False, inplace=True)

In [None]:
 # Per-sample QC overview before filtering: print the sample name and matrix
 # shape, then draw violin plots of the four QC columns. A file name is only
 # passed to scanpy when `save_figure` is set.
 for name, ad in adatas.items():
    print(name)
    print(ad.shape)
    violin_file = f"{name}.png" if save_figure else None
    sc.pl.violin(
        ad,
        ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp'],
        jitter=0.4,
        multi_panel=True,
        save=violin_file,
    )

In [None]:
# Pairwise QC scatter plots per sample. Each entry is (x, y, colour), all of
# them per-cell QC columns.
qc_scatter_panels = (
    ('total_counts', 'pct_counts_mt', 'n_genes_by_counts'),
    ('total_counts', 'n_genes_by_counts', 'pct_counts_mt'),
    ('pct_counts_mt', 'pct_counts_Rp', 'n_genes_by_counts'),
)
for name, adata in adatas.items():
    print(name)
    for x_key, y_key, color_key in qc_scatter_panels:
        sc.pl.scatter(adata, x=x_key, y=y_key, color=color_key)

In [None]:
# Bounds on the number of detected genes per cell. They are drawn as red lines
# here and reused by the cell-filtering step.
min_numof_genes = 200
max_numof_genes = 10000#7000
for name, adata in adatas.items():
    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    fig, axs = plt.subplots(1, 4, figsize=(12, 3))
    fig.suptitle(f"Covariates for filtering: {name}")

    # Panels 0/1: total counts, full range and restricted to < 40000.
    sns.histplot(total_counts, kde=False, ax=axs[0])
    sns.histplot(total_counts[total_counts < 40000], kde=False, bins=40, ax=axs[1])

    # Panel 2: detected genes with both thresholds marked.
    plot = sns.histplot(n_genes, kde=False, bins=60, ax=axs[2])
    for threshold in (min_numof_genes, max_numof_genes):
        plot.axvline(x = threshold, color = 'red')

    # Panel 3: zoom on the low end of the detected-genes distribution.
    sns.histplot(n_genes[n_genes < min_numof_genes + 1000], kde=False, bins=60, ax=axs[3])
    # NOTE(review): this goes through pyplot's current axes rather than an
    # explicit axs[...] handle -- presumably the last panel; confirm.
    plt.axvline(x = min_numof_genes, color = 'red')

# Filtering

In [None]:
# Keep only cells whose `pct_counts_mt` value is below `max_pct_mt`.
max_pct_mt = 5
for name, adata in adatas.items():
    # Subsetting an AnnData returns a view onto the unfiltered object. Store a
    # real copy so that later in-place steps do not trigger implicit
    # view-to-copy conversions and the unfiltered matrix can be released.
    adatas[name] = adata[adata.obs['pct_counts_mt'] < max_pct_mt, :].copy()

In [None]:
 # Drop cells with fewer than `min_numof_genes` or more than `max_numof_genes`
 # detected genes, then filter genes with `min_cells = 1`, and report the
 # resulting matrix shape.
 for adata in adatas.values():
    for cell_bound in ({'min_genes': min_numof_genes}, {'max_genes': max_numof_genes}):
        sc.pp.filter_cells(adata, **cell_bound)
    sc.pp.filter_genes(adata, min_cells=1)
    print(adata.shape)

# QC after filtering

In [None]:
# Same violin overview as before filtering; saved under "<sample>_filtered.png"
# only when `save_figure` is set.
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
for name, adata in adatas.items():
    filtered_violin_file = f"{name}_filtered.png" if save_figure else None
    sc.pl.violin(
        adata,
        qc_columns,
        jitter=0.4,
        multi_panel=True,
        save=filtered_violin_file,
    )

# Normalization, transformation and PCA

# Normalization

In [None]:
# Preserve the current contents of X in a 'counts' layer before any
# normalisation is applied.
for name, adata in adatas.items():
    # Copy instead of storing a reference: otherwise the layer and X share one
    # matrix and any in-place operation on X would also alter 'counts'.
    adata.layers['counts'] = adata.X.copy()

## Shifted logarithm

In [None]:
# Shifted-logarithm normalisation: total-count scaling computed without
# touching X (inplace=False), followed by log1p on a copy; the result is kept
# in the 'log1p_norm' layer.
for name, adata in adatas.items():
    scales_counts = sc.pp.normalize_total(adata, inplace=False, target_sum=None)
    scaled_matrix = scales_counts["X"]
    adata.layers["log1p_norm"] = sc.pp.log1p(scaled_matrix, copy=True)

## scran

In [None]:
def normalize_scran(adata):
    # Preliminary clustering for differentiated normalisation
    adata_pp = adata.copy()
    sc.pp.normalize_total(adata_pp)
    sc.pp.log1p(adata_pp)
    sc.pp.pca(adata_pp, n_comps=15)
    sc.pp.neighbors(adata_pp)
    sc.tl.leiden(adata_pp, key_added="groups")
    input_groups = adata_pp.obs["groups"]
    data_mat = adata_pp.X.T
    # convert to CSC if possible. See https://github.com/MarioniLab/scran/issues/70
    if scipy.sparse.issparse(data_mat):
        if data_mat.nnz > 2**31 - 1:
            data_mat = data_mat.tocoo()
    else:
        data_mat = data_mat.tocsc()
    
    %R -i data_mat,input_groups -o size_factors library(scran);library(BiocParallel); size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters = input_groups, min.mean = 0.1, BPPARAM = MulticoreParam()))
    
    adata.obs["size_factors"] = size_factors
    scran = adata.X / adata.obs["size_factors"].values[:, None]
    adata.layers["scran_normalization"] = scipy.sparse.csr_matrix(sc.pp.log1p(scran))
    return adata

In [None]:
# Run the scran normalisation on every sample and store the returned object
# back under the same key.
for name in list(adatas):
    adatas[name] = normalize_scran(adatas[name])

In [None]:
# Flag the top 5000 highly variable genes per sample, computed on the
# 'scran_normalization' layer; the annotation is written into each object.
for name, adata in adatas.items():
    sc.pp.highly_variable_genes(
        adata,
        layer='scran_normalization',
        n_top_genes=5000,
        inplace=True,
    )

# Pearson-residual normalisation of X, followed by a finiteness check.
# NOTE(review): this snippet has no `In [...]` marker in the export, so it may
# be a deliberately disabled cell -- confirm before running it. If it is run,
# X is replaced by the residuals.
# Iterate over the AnnData objects themselves; looping over the dict directly
# yields only the sample-name strings.
for name, adata in adatas.items():
    sc.experimental.pp.normalize_pearson_residuals(adata)
    # Inline check: the `helpers` module is not imported in this notebook.
    values = adata.X.data if scipy.sparse.issparse(adata.X) else np.asarray(adata.X)
    if not np.isfinite(values).all():
        raise ValueError(f"Non-finite values in X after Pearson residuals: {name}")

In [None]:
# Make the scran-normalised layer the active matrix (X) of every sample.
for name in adatas:
    adata = adatas[name]
    adata.X = adata.layers['scran_normalization']

In [None]:
# PCA per sample, restricted to the genes flagged as highly variable, using the
# 'arpack' SVD solver.
for name, adata in adatas.items():
    sc.pp.pca(
        adata,
        use_highly_variable=True,
        svd_solver='arpack',
    )

In [None]:
# PCA overview per sample, coloured by the QC covariates, for the component
# pairs (1,2) and (2,3).
pca_color_keys = ['n_genes_by_counts','total_counts','pct_counts_mt', 'pct_counts_Rp']
pca_component_pairs = ['1,2', '2,3']
for name, adata in adatas.items():
    print(name)
    sc.pl.pca_overview(
        adata,
        color=pca_color_keys,
        components=pca_component_pairs,
        frameon=True,
    )

# Dimensionality reduction and clustering

In [None]:
# Per sample: neighbour graph (15 neighbours, 15 PCs), Leiden clustering at
# resolution 0.2, UMAP embedding, and a UMAP plot coloured by cluster with the
# sample name as title.
neighbor_settings = {'n_neighbors': 15, 'n_pcs': 15}
for name, adata in adatas.items():
    sc.pp.neighbors(adata, **neighbor_settings)
    sc.tl.leiden(adata, resolution=0.2)
    sc.tl.umap(adata)
    sc.pl.umap(adata, title=name, color='leiden')

In [None]:
# Genes shown on the per-sample UMAPs (only those present in a sample are used).
gene_list = [
    "Cxcl9",
    "Cxcl10",
    "Cd40",
    "Ccl5",
    "Adgre1",
    "S100a8",
    "Cd3e",
    "Csf1r",
    "Csf3r",
    "Cd19",
    "Gzmb",
    "Itgax",
    "Itgam",
    "Twist1",
    "Tead1",
]

In [None]:
# UMAP per sample coloured by each gene of `gene_list` that is present in that
# sample, using the 'scran_normalization' layer for expression values.
for name, adata in adatas.items():
    print(name)
    # Build an ordered list rather than a set: set iteration order for strings
    # varies between interpreter runs, which shuffled the panels. This keeps
    # the panels in `gene_list` order.
    present_genes = [gene for gene in gene_list if gene in adata.var_names]
    sc.pl.umap(adata, color=present_genes, layer='scran_normalization')

In [None]:
# One row of seven UMAP panels per sample: two QC covariates, the Leiden
# clusters and four genes.
overview_keys = ['n_genes_by_counts', 'total_counts', 'leiden','Ptprc', 'Cd3e', 'Cd19', 'S100a8']
for name, adata in adatas.items():
    print(name)
    sc.pl.umap(
        adata,
        color=overview_keys,
        ncols=7,
        save=False,
    ) #  layer = 'counts',

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method and plot the result
# for every sample (independent y-axes per panel).
for adata in adatas.values():
    sc.tl.rank_genes_groups(adata, method='wilcoxon', groupby='leiden')
    sc.pl.rank_genes_groups(adata, sharey=False)

 # UMAP per sample coloured by doublet score and Leiden cluster.
 # NOTE(review): this snippet has no `In [...]` marker in the export and no
 # visible cell computes `doublet_score`, so it may be a disabled cell --
 # confirm. Columns missing from `.obs` are skipped instead of raising.
 # Iterates the dict items (the undefined `samples` list is not needed) and
 # uses the existing 'log1p_norm' layer; no 'log_norm' layer is created here.
 for sam, adata in adatas.items():
    print(sam)
    obs_keys = [key for key in ('doublet_score', 'leiden') if key in adata.obs.columns]
    sc.pl.umap(adata, color=obs_keys, layer='log1p_norm', wspace=0.5, ncols=4, save=False)
    

# Manual annotation

In [None]:
# Manual annotation table: sample name -> {Leiden cluster label -> cell type}.
# Cluster labels are strings.
cluster2celltype = {
    "1xCD40": {
        "0": "Macrophages",
        "1": "Macrophages",
        "2": "Neutrophils",
        "3": "T-cells",
        "4": "Tumor cells",
    },
    "3xCD40": {
        "0": "Macrophages",
        "1": "Neutrophils",
        "2": "T-cells/Neutrophils?",
        "3": "T-cells",
        "4": "Macrophages",
        "5": "Macrophages",
        "6": "Macrophages",
        "7": "T-cells & unknown",
        "8": "Macrophages",  # possibly hybrid
        "9": "T-cells",
        "10": "Tumor cells",
    },
    "ctrl": {
        "0": "Macrophages",
        "1": "Macrophages",
        "2": "T-cells",
        "3": "Neutrophils",
        "4": "unknown",
        "5": "Tumor cells",
        "6": "Macrophages",  # also contains some T-cells
        "7": "Macrophages",
    },
}

In [None]:
# Translate each cell's Leiden cluster into a cell type using the sample's
# entry in `cluster2celltype`, store it in obs["cell_type"], and plot it.
for name, adata in adatas.items():
    celltype_by_cluster = cluster2celltype[name]
    leiden_labels = adata.obs.leiden
    adata.obs["cell_type"] = leiden_labels.map(celltype_by_cluster)
    sc.pl.umap(adata, color='cell_type')

# Save file

In [None]:
# Write one 'preprocessed_<sample>.h5ad' file per sample into the
# intermediate data directory.
output_dir = pathlib.Path(intermediate_data_path)
for name, adata in adatas.items():
    preprocessed_file = output_dir / f'preprocessed_{name}.h5ad'
    adata.write(preprocessed_file)

In [None]:
# Write the cells annotated as 'Macrophages' of every sample to
# 'macrophages_<sample>.h5ad' in the intermediate data directory.
for name, adata in adatas.items():
    is_macrophage = adata.obs.cell_type == 'Macrophages'
    ad_mac = adata[is_macrophage]
    macrophage_file = pathlib.Path(intermediate_data_path) / f'macrophages_{name}.h5ad'
    ad_mac.write(macrophage_file)