# Human pancreatic cancer data analysis
Python analysis using scanpy.


## Setup

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import scanorama
import scipy
import os
import anndata as anndata
import pathlib
#import scanpy_cluster_proportions

In [None]:
import matplotlib as mpl

# Matplotlib overrides applied on top of the defaults: no LaTeX text rendering,
# and the SVG font type set to 'none'.
new_rc_params = {
    'text.usetex': False,
    "svg.fonttype": 'none',
}
for rc_key, rc_value in new_rc_params.items():
    mpl.rcParams[rc_key] = rc_value

In [None]:
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
import logging

In [None]:
# Raise the rpy2 callback logger to ERROR so that R warnings are not printed.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) # Ignore R warning messages
# Activate the pandas and AnnData conversion layers (used when objects are
# passed to / returned from the `%%R` cells).
pandas2ri.activate()
anndata2ri.activate()

In [None]:
# Load the rpy2 IPython extension; it provides the `%%R` cell magic.
%load_ext rpy2.ipython

In [None]:
%%R
# R-side packages for this notebook.
# scran: presumably the source of calculateSumFactors (size-factor estimation).
library(scran)
# NOTE(review): no Seurat function is called in the visible cells; confirm this is still needed.
library(Seurat)

In [None]:
# --- Output folder and scanpy configuration ---------------------------------
path_to_results = '../results/preprocessing'

# Verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    figsize=(4, 4),
    dpi_save=300,
    frameon=False,
)
# Figures saved through scanpy's `save=` argument go to the results folder.
sc.settings.figdir = path_to_results

# --- Shared plotting constants ----------------------------------------------
# Categorical UMAPs use small, faint points; continuous ones use larger,
# more opaque points.
umap_point_size, umap_transparency = 10, 0.3
umap_continuous_point_size, umap_continuous_transparency = 50, 0.7
aspect_ratio = 1
# When True, the plotting cells also write their figures to disk.
save_figure = False

# Data import
Read the 10x Genomics count matrices for each sample.

In [None]:
# Where the raw 10x folders live and where the final AnnData is written.
data_path = pathlib.Path('../data/raw/GSE155698')
results_file = pathlib.Path('../data/intermediate/GSE155698/preprocessed.h5ad')

# Tissue prefix and sample identifiers; ids 1-10 are plain numbers, followed
# by 11A, 11B, 12 and 13.
tissue = 'PDAC'
samples = [str(sample_number) for sample_number in range(1, 11)] + ['11A', '11B', '12', '13']
print(samples)

In [None]:
# Read the filtered 10x matrix of every sample (genes indexed by symbol) and
# tag each cell with the sample it came from.
adatas = []
for sam in samples:
    matrix_dir = data_path / f"{tissue}_TISSUE_{sam}" / 'filtered_feature_bc_matrix'
    adata = sc.read_10x_mtx(matrix_dir, var_names='gene_symbols', cache=True)
    adata.obs['sample'] = sam
    print(sam, adata.shape)
    adatas.append(adata)

# QC

In [None]:
# Flag mitochondrial and ribosomal-protein genes in every sample and compute
# the per-cell QC metrics (n_genes_by_counts, total_counts, pct_counts_mt,
# pct_counts_Rp) that the plots and filters below rely on.
for adata in adatas:
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    # Ribosomal protein genes are named RPS*/RPL*. A bare 'RP' prefix also
    # matches unrelated symbols (e.g. RPA1, RPE, RP11-... clone-based names)
    # and would inflate pct_counts_Rp. str.match anchors at the string start.
    adata.var['Rp'] = adata.var_names.str.match(r'RP[SL]')
    # A single call produces the shared metrics once plus one set of
    # per-category columns for each entry of qc_vars.
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'Rp'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of the four QC metrics for every sample, before filtering.
qc_metric_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
for adata, name in zip(adatas, samples):
    print(name)
    violin_file = f"{name}.png" if save_figure else None
    sc.pl.violin(adata, qc_metric_keys, jitter=0.4, multi_panel=True, save=violin_file)

In [None]:
# Cell-level QC thresholds shared by the histogram and filtering cells.
min_numof_genes = 250        # lower bound on detected genes per cell
max_numof_genes = 12_000     # upper bound on detected genes per cell
max_numof_counts = 100_000   # upper bound on total counts per cell

In [None]:
# For every sample, draw four histograms of the filtering covariates:
# total counts (all cells, then only cells below 40000 counts) and detected
# genes (all cells, then only the low end), with the gene thresholds in red.
for adata, name in zip(adatas, samples):
    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    fig, axs = plt.subplots(1, 4, figsize=(12, 3))
    fig.suptitle(f"Covariates for filtering: {name}")

    sns.distplot(total_counts, kde=False, ax=axs[0])
    sns.distplot(total_counts[total_counts < 40000], kde=False, bins=40, ax=axs[1])

    plot = sns.distplot(n_genes, kde=False, bins=60, ax=axs[2])
    for gene_threshold in (min_numof_genes, max_numof_genes):
        plot.axvline(x=gene_threshold, color='red')

    low_gene_cells = n_genes[n_genes < min_numof_genes + 1000]
    sns.distplot(low_gene_cells, kde=False, bins=60, ax=axs[3])
    # Drawn on pyplot's current axes (presumably the last subplot created above).
    plt.axvline(x=min_numof_genes, color='red')

In [None]:
# Keep only cells whose mitochondrial count percentage is below the threshold.
max_pct_mt = 40
for idx, adata in enumerate(adatas):
    # Slicing an AnnData returns a view onto the unfiltered parent. Copying
    # materialises the subset, so the parent can be released and the in-place
    # filters applied later operate on a real object rather than a view.
    adatas[idx] = adata[adata.obs['pct_counts_mt'] < max_pct_mt, :].copy()

In [None]:
# Apply the cell thresholds one after another (minimum genes, maximum genes,
# maximum counts), then drop genes detected in fewer than two cells, and
# report the remaining shape of each sample.
cell_filters = (
    {'min_genes': min_numof_genes},
    {'max_genes': max_numof_genes},
    {'max_counts': max_numof_counts},
)
for adata in adatas:
    for cell_filter in cell_filters:
        sc.pp.filter_cells(adata, **cell_filter)
    sc.pp.filter_genes(adata, min_cells=2)
    print(adata.shape)

In [None]:
# Same QC violin plots as before, now on the filtered samples.
filtered_metric_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
for adata, name in zip(adatas, samples):
    print(name)
    filtered_violin_file = f"{name}_filtered.png" if save_figure else None
    sc.pl.violin(adata, filtered_metric_keys, jitter=0.4, multi_panel=True, save=filtered_violin_file)

# Concatenate all samples

In [None]:
# Merge the per-sample objects into a single AnnData; join="outer" keeps the
# union of the genes present in the individual samples.
adata = anndata.concat(adatas, join = "outer")

In [None]:
# The per-sample list is not used again in the cells below; drop the reference.
del adatas

In [None]:
# NOTE(review): cell barcodes presumably repeat across samples; this
# de-duplicates the observation names after concatenation.
adata.obs_names_make_unique()

# Normalize and logarithmize the data

Fast total-count normalization is not used here, because it is inferior to the scran-based normalization performed below:
`sc.pp.normalize_total(adata, target_sum=1e4)`

In [None]:
#adata.write('../data/intermediate/not_normalized.h5ad')

In [None]:
#adata = sc.read('../data/intermediate/not_normalized.h5ad')

In [None]:
# Coarse pre-clustering for scran: work on a throw-away copy so that the raw
# counts in `adata` stay untouched. Only the resulting cluster labels
# (obs['groups']) are needed afterwards.
adata_pp = adata.copy()
# normalize_total replaces the deprecated normalize_per_cell. Unlike the old
# function it never removes cells, so adata_pp is guaranteed to keep the same
# cells, in the same order, as adata; the labels are matched by position.
sc.pp.normalize_total(adata_pp, target_sum=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Low resolution: a few large clusters are enough for this step.
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.2)

In [None]:
#Preprocess variables for scran normalization
# Cluster labels from the pre-clustering; passed to the R cell as `clusters`.
input_groups = adata_pp.obs['groups']
# Transposed expression matrix handed to R.
# NOTE(review): presumably because scran expects genes x cells; confirm.
data_mat = adata.X.T

In [None]:
# Interactive check of the matrix type that is handed to R.
type(adata.X.T)

In [None]:
%%R -i data_mat -i input_groups -o size_factors

# Estimate size factors from the transposed count matrix, using the
# pre-computed cluster labels; `-o size_factors` exports the result to Python.
size_factors = calculateSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

np.savetxt("../results/size_factors.csv", size_factors, delimiter=",")

In [None]:
#Delete adata_pp
# In the visible cells it is only used to derive the cluster labels.
del adata_pp

In [None]:
#Keep the count data in a counts layer
# Stored as a copy, so later changes to adata.X do not alter the saved counts.
adata.layers["counts"] = adata.X.copy()

size_factors = pd.read_csv("../results/size_factors.csv")

In [None]:
# Store the size factors as a per-cell column.
# NOTE(review): this expects one value per cell, in the order of adata.obs
# (as exported by the `%%R ... -o size_factors` cell). If size_factors is ever
# re-read from CSV with pd.read_csv, pass header=None; otherwise the first
# value is consumed as the header. Confirm which object is bound here.
adata.obs['size_factors'] = size_factors

In [None]:
# Divide every cell (row) of X by its size factor.
# `sparse /= ndarray` is not an in-place operation in SciPy: it falls back to
# a dense division and allocates a full cells x genes array. For sparse X the
# stored values are scaled directly instead, which keeps the matrix sparse.
if scipy.sparse.issparse(adata.X):
    cell_size_factors = np.asarray(adata.obs['size_factors'].values, dtype=np.float64)
    # astype returns a new float64 matrix, so other references to the counts
    # are not modified.
    scaled_counts = adata.X.tocsr().astype(np.float64)
    # np.diff(indptr) is the number of stored entries per row, so np.repeat
    # lines each stored value up with the size factor of its own cell.
    scaled_counts.data /= np.repeat(cell_size_factors, np.diff(scaled_counts.indptr))
    adata.X = scaled_counts
else:
    # Dense matrices keep the original element-wise division.
    adata.X /= adata.obs['size_factors'].values[:, None]

In [None]:
# Make sure the normalised matrix is stored as a SciPy CSR sparse matrix.
adata.X = scipy.sparse.csr_matrix(adata.X)

In [None]:
# Log-transform the normalised values in place (log1p).
sc.pp.log1p(adata)

# Identify highly variable genes

In [None]:
#adata = sc.read('../data/intermediate/normalized_logp1.h5ad')

In [None]:
# Flag highly variable genes with scanpy's default settings and report how
# many were selected.
# (Settings tried previously and left disabled: flavor='seurat_v3', n_top_genes=4000.)
sc.pp.highly_variable_genes(adata)
n_highly_variable = np.sum(adata.var['highly_variable'])
print('\n', 'Number of highly variable genes: {:d}'.format(n_highly_variable))

In [None]:
# Diagnostic plot of the highly-variable-gene selection computed above.
sc.pl.highly_variable_genes(adata)

In [None]:
#adata = adata[:, adata.var.highly_variable]

sc.pp.combat(adata, key='sample')

# Principal component analysis

In [None]:
# PCA with 50 components, restricted to the genes flagged as highly variable.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
# Variance explained per component, on a log scale.
sc.pl.pca_variance_ratio(adata, log=True, save = '_log.svg' if save_figure else None)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, save = '.svg' if save_figure else None)

In [None]:
# PCA overview coloured by sample for component pairs 1/2, 2/3 and 3/4.
sc.pl.pca_overview(adata, color = 'sample', components = ['1,2', '2,3', '3,4'], frameon = True)

In [None]:
# Checkpoint: write the normalised object including the PCA results.
adata.write(os.path.join('../data/intermediate', 'normalized_pca.h5ad'))

# Computing the neighborhood graph and embedding in UMAP

In [None]:
# Neighbourhood graph with 15 neighbours, computed on the first 15 PCs.
sc.pp.neighbors(adata,
                n_neighbors=15,
                n_pcs=15
               )

In [None]:
# UMAP embedding computed from the neighbourhood graph.
sc.tl.umap(adata)

In [None]:
# UMAPs coloured by the QC covariates and by sample, three panels per row.
sc.pl.umap(adata, color= ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp','sample'], wspace = 0.5, ncols = 3)

## Expression of marker genes

In [None]:
# Marker genes plotted on the UMAP in the next cell.
gene_list = 'PTPRC HMOX1 HSPA5 VEGFA MARCO CD74 ARG1 SPP1'.split()

In [None]:
# One UMAP per marker gene, each on its own figure with a fixed aspect ratio.
for gene in gene_list:
    fig, ax1 = plt.subplots(1)
    ax1.set_aspect(aspect_ratio)
    marker_file = '{}.png'.format(gene) if save_figure else None
    sc.pl.umap(
        adata,
        color=gene,
        size=umap_continuous_point_size,
        alpha=umap_continuous_transparency,
        ax=ax1,
        save=marker_file,
    )

In [None]:
# Estimate the cell density on the UMAP separately for each sample, then draw
# (and optionally save) one density map per sample.
density_key = 'umap_density_condition'
sc.tl.embedding_density(adata, basis='umap', groupby='sample', key_added=density_key)
for sam in adata.obs['sample'].cat.categories:
    fig = sc.pl.embedding_density(
        adata,
        basis='umap',
        group=sam,
        key=density_key,
        return_fig=True,
        frameon=False,
    )
    plt.gca().set_aspect(aspect_ratio)
    fig.canvas.draw()
    if not save_figure:
        continue
    density_file = os.path.join(path_to_results, 'density_{}.png'.format(sam))
    plt.savefig(density_file)

### Clustering the neighborhood graph

In [None]:
# Leiden clustering at resolution 0.5; the plotting and ranking cells below
# read the labels from the 'leiden' key.
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
# UMAP coloured by Leiden cluster, with the legend in the right margin.
fig, ax1 = plt.subplots(1)
ax1.set_aspect(aspect_ratio)
leiden_file = 'leiden.png' if save_figure else None
sc.pl.umap(
    adata,
    color=['leiden'],
    legend_loc='right margin',
    wspace=0.5,
    size=umap_point_size,
    alpha=umap_transparency,
    ax=ax1,
    save=leiden_file,
)

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method, then plot the top 25
# genes of each cluster (independent y-axes per panel).
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

# Save file

In [None]:
#adata.write('../data/intermediate/TC_only_clustered_d10_no_filtering.h5ad')
# Create the output folder first, so the final write cannot fail on a missing
# directory after the whole pipeline has already run.
pathlib.Path(results_file).parent.mkdir(parents=True, exist_ok=True)
adata.write(results_file)