## In PMI cm03
### Environment
```bash
source activate scanpy_1.9.1
ipython --profile=ak1
```

In [None]:
from anndata import AnnData
import anndata
from scipy import sparse, io
import scipy
import pandas as pd
import scipy.io
import os
import scanpy as sc
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.colors
matplotlib.use('TkAgg')
import numpy as np
import seaborn as sns
import math
import scanpy.external as sce
import scrublet as scr
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
# Session-wide plotting and logging defaults used by the rest of this notebook.
sns.set(font="Arial", font_scale=1, style='ticks')  # seaborn theme: Arial font, 'ticks' style
sc.settings.verbosity = 3  # scanpy logging verbosity level
plt.rcParams['figure.figsize'] = (6,6)  # default matplotlib figure size
# IPython line magics: these two lines only work inside an IPython session.
%autoindent
%matplotlib

# Custom colormap interpolating through the three listed hex colours, in order.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["#104e8b", "#ffdab9", "#8b0a50"])

# Load the 10x-style matrix directories of every sample (gene-level and
# isoform-level), then save each loaded matrix as an .h5ad file in the
# current working directory.

# AK1
ak1 = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/AK1/genes_seurat')
ak1_iso = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/AK1/isoforms_seurat')

# iPSC
ipsc = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/iPSC/genes_seurat')
ipsc_iso = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/iPSC/isoforms_seurat')

# NPC
npc = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/NPC/genes_seurat')
npc_iso = sc.read_10x_mtx('/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/NPC/isoforms_seurat')

# Written in the same order as before: gene-level files first, then isoform-level.
for loaded_matrix, h5ad_name in (
    (ak1, "AK1_genes.h5ad"),
    (ipsc, "iPSC_genes.h5ad"),
    (npc, "NPC_genes.h5ad"),
    (ak1_iso, "AK1_isoforms.h5ad"),
    (ipsc_iso, "iPSC_isoforms.h5ad"),
    (npc_iso, "NPC_isoforms.h5ad"),
):
    loaded_matrix.write(filename=h5ad_name)


# Knee plot for the unfiltered AK1 matrix: barcodes ranked by total UMI count.
knee = np.sort(np.asarray(ak1.X.sum(axis=1)).ravel())[::-1] # UMI count for each cell (axis=1), descending
cell_set = np.arange(len(knee)) # 0-based rank of every barcode
cutoff = 200
# Number of barcodes whose UMI count exceeds the cutoff. Counting the matches
# (instead of taking the last matching 0-based rank) reports the true number of
# passing barcodes and does not raise IndexError when no barcode passes.
num_cells = int(np.count_nonzero(knee > cutoff))

fig, ax = plt.subplots(figsize=(10, 7))

# x = barcode rank, y = UMI count, so the curve agrees with the axis labels and
# with the two guide lines drawn below.
ax.loglog(cell_set, knee, lw=5, color="g")
ax.axvline(x=num_cells, lw=3, color="k") # vertical line at the number of passing barcodes
ax.axhline(y=cutoff, lw=3, color="k") # horizontal line at the UMI cutoff
ax.set_xlabel("Set of Barcodes")
ax.set_ylabel("UMI counts")
ax.grid(True, which = "both")

print(f"{num_cells:,.0f} cells passed the {cutoff} UMI threshold")
# Recorded run (2022-10-22): "9,364 cells passed the 200 UMI threshold".
# NOTE(review): that figure was printed by the earlier rank-based calculation,
# which is expected to be one lower than the count computed above.

##### First, execute *_Run_DropletUtils.R*

## Preprocessing on filtered CBC-UMI matrix

#### Open 10x h5 files

In [None]:
# Read the filtered gene-level 10x HDF5 matrix of each sample.
ak1 = sc.read_10x_h5("/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/ALL_H5files/AK1_genes.h5")
ipsc = sc.read_10x_h5("/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/ALL_H5files/iPSC_genes.h5")
npc = sc.read_10x_h5("/data/Projects/phenomata/01.Projects/13.AK1_PacBio/02.RNA/ALL_H5files/NPC_genes.h5")

# De-duplicate repeated variable (gene) names in place, sample by sample.
for loaded_sample in (ak1, ipsc, npc):
    loaded_sample.var_names_make_unique()

#### Doublet detection (using scrublet version 0.2.3) and removal for each sample

In [None]:
# Score doublets in every sample separately with scrublet (through scanpy's
# external API). The filtering below reads the resulting
# .obs['predicted_doublet'] column.
for sample in [ak1, ipsc, npc]:
    sce.pp.scrublet(sample, adata_sim=None, sim_doublet_ratio=2.0, expected_doublet_rate=0.08, stdev_doublet_rate=0.02, synthetic_doublet_umi_subsampling=1.0, knn_dist_metric='euclidean', n_prin_comps=30, verbose=True)

# Filtering: keep only the cells that were not called doublets.
# `.copy()` turns each subset into a standalone AnnData object, so the .obs
# columns written in the next loop are not written onto a view of the
# unfiltered object.
ak1 = ak1[~ak1.obs['predicted_doublet'].to_numpy(dtype=bool), :].copy()
ipsc = ipsc[~ipsc.obs['predicted_doublet'].to_numpy(dtype=bool), :].copy()
npc = npc[~npc.obs['predicted_doublet'].to_numpy(dtype=bool), :].copy()

# Per-cell QC metrics. The row sums are flattened to 1-D before being stored,
# because summing a sparse matrix along an axis yields an (n_cells, 1) matrix.
for sample in [ak1, ipsc, npc]:
    sample.obs['n_counts'] = np.asarray(sample.X.sum(axis=1)).ravel()         # total UMI count per cell
    sample.obs['n_genes'] = np.asarray((sample.X > 0).sum(axis=1)).ravel()    # number of detected genes per cell

# Within every sample, drop the cells at or below the 20% quantile of UMI counts.
q20 = ak1.obs['n_counts'].quantile(q=0.20, interpolation='linear') # 20% quantile value
ak1 = ak1[ak1.obs['n_counts'] > q20, :]

q20 = ipsc.obs['n_counts'].quantile(q=0.20, interpolation='linear') # 20% quantile value
ipsc = ipsc[ipsc.obs['n_counts'] > q20, :]

q20 = npc.obs['n_counts'].quantile(q=0.20, interpolation='linear') # 20% quantile value
npc = npc[npc.obs['n_counts'] > q20, :]


#### Batch integration

In [None]:
# Merge the three filtered samples into a single AnnData object.
# join='outer' keeps the union of the genes of the three samples; every cell is
# labelled in .obs['batch'] with one of the batch_categories, and that label is
# joined to the barcode with '-' so that the cell names stay unique.
integrated = AnnData.concatenate(ak1, ipsc, npc, join='outer', batch_categories = ['AK1', 'iPSC', 'NPC'], index_unique = '-')

# Checked distribution of UMI counts and gene counts (kept for reference, not executed)
#integrated.obs['n_counts'] = integrated.X.sum(1)
#integrated.obs['n_genes'] = (integrated.X > 0).sum(1)
#sns.set(font="Arial", font_scale=1.5, style='ticks')
#sc.pl.violin(integrated, ['n_counts', 'n_genes'], groupby='batch', size=2, log=True, cut=0, inner='quartile', ylabel=['UMI counts', 'Gene counts'], rotation=0.1)
#sns.despine()
#sns.set(font="Arial", font_scale=1, style='ticks') # Back to original settings

In [None]:
# Drop genes detected in fewer than 5 cells.
sc.pp.filter_genes(integrated, min_cells=5) # 'n_cells' added in integrated.var
# Keep an independent copy of the matrix as it is at this point (before any
# normalisation performed later in this file) in the 'counts' layer.
integrated.layers["counts"] = integrated.X.copy()
# Snapshot the current state in .raw; note that .raw is assigned again later in
# this file, after the log-transformation.
integrated.raw = integrated

# Set up the Python <-> R bridge used for scran normalisation further below:
# activate the pandas and AnnData converters, load the rpy2 IPython extension,
# then load the R packages inside an R cell.
# NOTE(review): `%%R` is a cell magic; in this flattened export it appears in
# the middle of a cell, so the lines that follow it (up to the blank lines) are
# R code and presumably formed their own notebook cell — confirm when re-running.
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
%%R
library(scran)
library(dplyr)



%config InlineBackend.figure_format = 'retina'

# Estimate scran size factors.
# A throw-away copy of the data is coarsely clustered (normalise each cell to
# 1e6 total counts -> log1p -> PCA -> neighbours -> Leiden); the cluster labels
# are then handed to scran's computeSumFactors in R together with the matrix
# taken from `integrated.X`.
adata_pp = integrated.copy()
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp) # works on anndata.X
sc.tl.pca(adata_pp, n_comps=15) ## Open question: would increasing n_comps here help the size-factor estimation?
sc.pp.neighbors(adata_pp)
sc.tl.leiden(adata_pp, key_added='groups', resolution=0.5)
input_groups = adata_pp.obs['groups']
# Transposed before being passed to R — presumably so that rows are genes and
# columns are cells, as SingleCellExperiment expects; confirm.
data_mat = integrated.X.T
# R cell: takes data_mat and input_groups as inputs (-i) and returns size_factors (-o).
%%R -i data_mat -i input_groups -o size_factors
size_factors = BiocGenerics::sizeFactors(computeSumFactors(SingleCellExperiment::SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))



# The temporary objects are no longer needed once the size factors are back in Python.
del adata_pp
del data_mat

# One size factor per cell, stored next to the other per-cell annotations.
integrated.obs['size_factors'] = size_factors

# Checked Size factor distribution before filtering cell by 'UMI counts'
#d = sns.displot(data=integrated.obs, x='size_factors', hue='batch')
#sns.move_legend(d, "upper right", bbox_to_anchor=(0.7, 0.7))

# Scran normalisation: divide every cell (row of X) by its size factor.
# NOTE(review): when X is a scipy sparse matrix, dividing by a dense column is
# expected to produce a dense matrix — confirm that the memory use is acceptable.
integrated.X /= integrated.obs['size_factors'].values[:, None]
# Store an independent copy in the layer. Assigning `integrated.X` itself would
# only store a reference to the same buffer, and the log1p call below may
# transform X in place, which would silently log-transform the 'scran' layer too.
integrated.layers['scran'] = integrated.X.copy() # For CellPhoneDB or CellChat maybe?
sc.pp.log1p(integrated) # works on anndata.X
integrated.X = scipy.sparse.csr_matrix(integrated.X)
# Copied for the same reason: later in-place changes to X must not leak into the layer.
integrated.layers['scran_log1p'] = integrated.X.copy()
integrated.raw = integrated ## ==> the log-transformed matrix is what ends up in .raw

In [None]:
# Flag highly variable genes (adds the 'highly_variable' column to .var, which
# is counted on the next line).
sc.pp.highly_variable_genes(integrated)
integrated.var['highly_variable'].value_counts() # 2,313 ==> 2021-08-10

# Scale X gene-wise and clip the scaled values at 10.
sc.pp.scale(integrated, max_value=10) # tabula muris senis default (2021-08-10) # mean and std on adata.var
#sc.pp.scale(test3, zero_center=True, max_value=10, copy=False, layer=None, obsm=None)

# Read the cell-cycle gene list, one gene symbol per line. The file is opened
# in a `with` block so that the handle is closed, and blank lines are skipped
# (indexing the first character of an empty line used to raise IndexError).
with open("/data/Projects/phenomata/01.Projects/11.Vascular_Aging/Database/regev_lab_cell_cycle_genes.txt") as gene_file:
    cell_cycle_genes = [line.strip() for line in gene_file if line.strip()]
# The first 43 entries are used as the S-phase genes, the remainder as the G2/M genes.
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
# Subset of the list that is present in the data set. The phase-specific lists
# above are passed on unfiltered, exactly as before, so the scoring call
# receives the same inputs.
cell_cycle_genes = [x for x in cell_cycle_genes if x in integrated.var_names]
sc.tl.score_genes_cell_cycle(integrated, s_genes=s_genes, g2m_genes=g2m_genes)
# Used 'raw' attribute of adata (use_raw = True if .raw is present)
# So, log-transformed scran-normalized counts are put into score_genes_cell_cycle function

# Stacked bar chart of the cell-cycle phase composition of every batch
# (row-normalised proportions; margins=True adds a pooled row as well).
df = integrated.obs[['batch', 'phase']]
phase_palette = {'S': '#689aff', 'G2M': '#fdbf6f', 'G1': '#b15928'}
phase_fractions = pd.crosstab(df['batch'], df['phase'], normalize='index', margins=True)
ax = phase_fractions.plot.bar(stacked=True, rot=45, color=phase_palette)
ax.set_xlabel("")
ax.set_ylabel('Proportion of Cell Cycle Phase')
# Legend placed outside the axes, to the right of the bars.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 0.25), frameon=False)
plt.tight_layout()
sns.despine()

# PCA restricted to the highly variable genes, keeping 100 components.
sc.tl.pca(integrated, n_comps=100, use_highly_variable=True, svd_solver='arpack')

# Variance explained per component, then PC1 vs PC2 coloured by batch and by UMI count.
sc.pl.pca_variance_ratio(integrated, n_pcs=100, log=False)
sc.pl.pca(integrated, color=['batch', 'n_counts'], legend_loc='on data', size=8, add_outline=False, color_map='CMRmap', components=['1,2'], title=['', 'UMI count'])
sns.despine()

# Batch-balanced neighbour graph (BBKNN) on the first 20 PCs with 5 neighbours
# taken from each batch, followed by a 2-D UMAP computed from that graph.
#sce.pp.bbknn default ==> n_pcs=50, neighbors_within_batch=3, trim=None, annoy_n_trees=10,
sce.pp.bbknn(integrated, batch_key='batch', n_pcs=20, neighbors_within_batch=5, trim=None)
sc.tl.umap(integrated, min_dist=0.5, spread=1.0, n_components=2, alpha=1.0, gamma=1.0, init_pos='spectral', method='umap')
#integrated.uns['batch_colors'] = ['#689aff', '#fdbf6f', '#b15928']
sc.pl.umap(integrated, color=['batch'], add_outline=False, legend_loc='on data', size=20, title='')
sns.despine()

# One UMAP panel per batch, stacked vertically; each panel highlights a single
# batch and carries that batch's name as its title.
fig, axes = plt.subplots(3,1,figsize=(4.5,15.5))
for batch_name, panel in zip(integrated.obs['batch'].cat.categories, axes):
    sc.pl.umap(integrated, color=['batch'], add_outline=False, legend_loc=None, groups=batch_name, title=batch_name, size=20, ax=panel)
    sns.despine(ax=panel)


In [None]:
# Differential expression between batches: Wilcoxon test with
# Benjamini-Hochberg correction, run on .raw (use_raw=True). Results are stored
# in integrated.uns['DEG_bw_batches_wilcoxon'].
sc.tl.rank_genes_groups(integrated, 'batch', method='wilcoxon', corr_method='benjamini-hochberg', use_raw=True, pts=True, key_added='DEG_bw_batches_wilcoxon') # key_added=''

# Top 15 genes of every batch.
sc.pl.rank_genes_groups(integrated, n_genes=15, sharey=False, key='DEG_bw_batches_wilcoxon')

# Heatmap of the top genes with log fold change >= 1. use_raw=False means the
# values are taken from X (scaled earlier in this file) rather than from .raw.
ax_dict = sc.pl.rank_genes_groups_heatmap(integrated, n_genes=15, groupby='batch', key='DEG_bw_batches_wilcoxon', groups=['AK1', 'iPSC', 'NPC'], show_gene_labels=True, min_logfoldchange=1, dendrogram=False, cmap='viridis', use_raw=False, swap_axes=True, show=False, var_group_rotation=90)
# Gene labels are set in italics.
ax_dict['heatmap_ax'].set_yticklabels(labels=ax_dict['heatmap_ax'].get_yticklabels(), fontstyle='italic')


# NOTE(review): `test3` is not defined anywhere in the visible part of this
# file, and the symbols below use a different capitalisation from the gene
# names plotted elsewhere in this notebook (e.g. 'CD19'). This cell looks
# carried over from another analysis — confirm before running.
markers = ["Pecam1", "Cdh5", "Nos3", "Acta2", "Cnn1", "Tagln", "Rgs5", "Kcnj8", "Col1a1", "Col5a1", "Dpt", "Cd19", "Ighm", "Cd14", "Cd68", "Cd3d"] # Cd3g is absent
sc.pl.stacked_violin(test3, markers, groupby='batch')

# NOTE(review): this reads uns['rank_genes_groups'], whereas the test run
# earlier in this file stores its result under 'DEG_bw_batches_wilcoxon' on
# `integrated` — confirm which result is meant.
result = test3.uns['rank_genes_groups']
groups = result['names'].dtype.names
# One column per (group, statistic) pair, named '<group>_<statistic>'.
deg_wilcoxon = pd.DataFrame({group + '_' + key: result[key][group] for group in groups for key in ['names', 'logfoldchanges', 'scores', 'pvals_adj']})
# mode='w' overwrites any existing file at this path.
# NOTE(review): the path points into a different project directory — confirm.
deg_wilcoxon.to_csv("/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/20210916_scanpy_deg.csv", mode='w')


In [None]:
# Marker-gene UMAPs, one figure per cell type. Every panel is coloured by the
# expression values held in .raw (use_raw=True) and titled with the gene symbol
# in italics.

# AK1 (B-lymphocyte) specific gene
genes = ['CD19', 'MS4A1'] # MS4A1 (CD20)
fig, axes = plt.subplots(1, 2, figsize=(10, 5), constrained_layout=True)
for gene, panel in zip(genes, axes):
    sc.pl.umap(integrated, color=gene, add_outline=False, legend_loc='right margin', size=40, color_map='viridis', use_raw=True, ax=panel)
    panel.set_title(gene, style='italic')
    sns.despine(ax=panel)

# iPSC-specific gene
genes = ['POU5F1', 'SOX2', 'NANOG'] # JARID2
fig, axes = plt.subplots(1, 3, figsize=(15, 10), constrained_layout=True)
for gene, panel in zip(genes, axes):
    sc.pl.umap(integrated, color=gene, add_outline=False, legend_loc='right margin', size=40, color_map='viridis', use_raw=True, ax=panel)
    panel.set_title(gene, style='italic')
    sns.despine(ax=panel)


# NPC-specific gene
genes = ['NCAM1', 'KMT2D']
fig, axes = plt.subplots(1, 2, figsize=(10, 5), constrained_layout=True)
for gene, panel in zip(genes, axes):
    sc.pl.umap(integrated, color=gene, add_outline=False, legend_loc='right margin', size=40, color_map='viridis', use_raw=True, ax=panel)
    panel.set_title(gene, style='italic')
    sns.despine(ax=panel)

In [None]:

# NOTE(review): `test3` is not defined in the visible part of this file, and
# the batch names used below ('m01', 'm10', 'm20') differ from the
# batch_categories given to `integrated` ('AK1', 'iPSC', 'NPC'). This cell
# looks carried over from another analysis — confirm before running.
# Leiden clustering at two resolutions, stored under separate .obs keys.
sc.tl.leiden(test3, resolution=0.5, key_added='leiden_r05') #### 0 ~ 13 ==> 2021-09-28
sc.tl.leiden(test3, resolution=1.0, key_added='leiden_r10')
sc.pl.umap(test3, color=['batch', 'leiden_r05', 'leiden_r10'], add_outline=False, legend_loc='right margin', size=20)

# Three side-by-side UMAP panels, each highlighting one batch.
fig, axes = plt.subplots(1,3)
sc.pl.umap(test3, color=['batch'], add_outline=False, legend_loc='right margin', size=20, groups=['m01'], title='1 month', ax=axes[0])
sc.pl.umap(test3, color=['batch'], add_outline=False, legend_loc='right margin', size=20, groups=['m10'], title='10 months', ax=axes[1])
sc.pl.umap(test3, color=['batch'], add_outline=False, legend_loc='right margin', size=20, groups=['m20'], title='20 months', ax=axes[2])


#### Diffusion pseudotime (Experimental)

In [None]:
# Diffusion map of the integrated data, coloured by batch.
sc.tl.diffmap(integrated)
sc.pl.diffmap(integrated, color=['batch'], add_outline=False, legend_loc='right margin', size=70, color_map='CMRmap')


# NOTE(review): `test3` and `test3_endo` are not defined in the visible part of
# this file; everything below looks carried over from another analysis —
# confirm before running.
sc.tl.draw_graph(test3, layout='fa', init_pos=None, neighbors_key=None) ## init_pos can be one of the embeddings in .obsm, such as pca, umap or paga.
sc.pl.draw_graph(test3, color=['batch', 'PECAM1', 'CDH5', 'phase'], add_outline=True, legend_loc='right margin', size=10, color_map='CMRmap')

# Choose the root cell for DPT: among the cells of cluster '0', take the one
# with the largest value in column 1 of the diffusion map.
start_cell = np.isin(test3_endo.obs['endo_leiden_r05'], '0') # boolean numpy array ==> array([False, False, False, ..., False, False, False])
#max_start_id = np.argmin(test3_endo.obsm['X_diffmap'][start_cell,1]) # 262
max_start_id = np.argmax(test3_endo.obsm['X_diffmap'][start_cell,1])
# Map the position within the cluster-'0' subset back to an index into all cells.
root_id = np.arange(len(start_cell))[start_cell][max_start_id] # 341
test3_endo.uns['iroot'] = root_id

sc.tl.dpt(test3_endo, n_branchings=1, n_dcs=10) # With n_branchings=0 (recommended by the Scanpy developers) dpt_groups is not created.
#computing Diffusion Pseudotime using n_dcs=10
sc.pl.dpt_groups_pseudotime(test3_endo) # Check the pseudotime trajectory here.

lin = ('2', '0', '3', '1') # Arranged to match the DPT pseudotime group ordering
test3_endo.obs['dpt_groups'] = test3_endo.obs['dpt_groups'].cat.reorder_categories(list(lin), ordered=True)
sc.pl.dpt_groups_pseudotime(test3_endo) # Plot again, now following the new ordering
sc.pl.dpt_timeseries(test3_endo[:, test3_endo.var.highly_variable])



#### Force-Directed Graph (Experimental)

In [None]:
# Force-directed layout ('fa') of the integrated data, then a plot coloured by batch.
sc.tl.draw_graph(integrated, layout='fa', init_pos=None, neighbors_key=None) ## init_pos can be one of the embeddings in .obsm, such as pca, umap or paga.
sc.pl.draw_graph(integrated, color=['batch'], add_outline=True, legend_loc='on data', size=40, color_map='CMRmap')
