# GRN analysis for Whole cell populations

## Preprocessing

In [None]:
# _pySCENIC_EC_new2.py
# 2022-05-12

## pySCENIC's AUC matrix retrieval
# Environment (pySCENIC)
# ipython --profile=pyscenic

import scanpy as sc
import loompy as lp
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import json
import zlib
import base64
from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.rss import regulon_specificity_scores
from pyscenic.cli.utils import load_signatures
#from pyscenic.plotting import plot_rss # 이거 그대로  function 쓰면 오류 생김 따라서 def() 로 함.
from pyscenic.binarization import binarize
from adjustText import adjust_text

# Notebook-wide plotting defaults.
sns.set_theme(style='ticks', font="Arial", font_scale=1)
sc.settings.verbosity = 3
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'figure.dpi': 100,
    # Hide the top and right axes spines on every figure.
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Three-stop continuous colormap (blue -> peach -> magenta).
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ["#104e8b", "#ffdab9", "#8b0a50"],
)

# Discrete palettes. The age groups reuse the batch colours in the same order.
batch_palette = ['#689aff', '#fdbf6f', '#b15928']
age_palette = dict(zip(('m01', 'm10', 'm20'), batch_palette))

# Colour per annotated (coarse) cell type.
celltype_colormap = {
    'Endothelial cells': '#393b79',
    'Vascular smooth muscle cells': '#9c9ede',
    'Fibroblasts': '#b5cf6b',
    'B cells': '#e7ba52',
    'M\u03A6': '#ad494a',
    'T cells': '#7b4173',
    'Neuronal cells': '#de9ed6',
}

# Colour per detailed cell type (sub-cluster label).
detailed_celltype_palette = {
    'EC1': '#393b79',
    'EC2': '#5254a3',
    'vSMC1': '#9c9ede',
    'vSMC2': '#637939',
    'vSMC3': '#b5cf6b',
    'vSMC4': '#cedb9c',
    'vSMC5': '#bd9e39',
    'FB1': '#e7ba52',
    'FB2': '#843c39',
    'FB3': '#ad494a',
    'Bc': '#e7969c',
    'MΦ': '#7b4173',
    'Tc': '#ce6dbd',
    'Neu': '#de9ed6',
}


##### 바로 아래 cell은 whole cell에 맞게 재정리해야 함 (written on 2023-08-02)

In [None]:
# pySCENIC input file generation
test3_vsmc = sc.read_h5ad("/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/test3_vsmc.h5ad")
# Put the 'counts' layer back into .X so everything below works on that layer.
test3_vsmc.X = test3_vsmc.layers['counts']

# The loom file is written with genes as rows (row_attrs) and cells as columns
# (col_attrs), so the matrix is transposed once and reused.
genes_by_cells = test3_vsmc.X.transpose()

row_attrs = {"Gene": np.array(test3_vsmc.var.index)}
col_attrs = {
    "CellID": np.array(test3_vsmc.obs.index),
    # Number of genes with a non-zero value in each cell.
    "nGene": np.array(np.sum(genes_by_cells > 0, axis=0)).flatten(),
    # Sum of all values in each cell.
    "nUMI": np.array(np.sum(genes_by_cells, axis=0)).flatten(),
}

lp.create('test3_vsmc.loom', genes_by_cells, row_attrs, col_attrs)

# GRN inference using the GRNBoost2 algorithm (from the Command-Line Interface (CLI) version of pySCENIC)
'''
database_dir='/data/Projects/phenomata/01.Projects/11.Vascular_Aging/Database/pySCENIC'
pyscenic grn \
test3_vsmc.loom \
${database_dir}/mm_mgi_tfs.txt \
-o test3_vsmc_adj.csv \
--num_workers 15
'''

# Regulon prediction (cisTarget)
'''
database_dir='/data/Projects/phenomata/01.Projects/11.Vascular_Aging/Database/pySCENIC'
pyscenic ctx \
test3_vsmc_adj.csv \
${database_dir}/mm10__refseq-r80__10kb_up_and_down_tss.mc9nr.feather \
${database_dir}/mm10__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather \
--annotations_fname ${database_dir}/motifs-v9-nr.mgi-m0.001-o0.0.tbl \
--expression_mtx_fname test3_vsmc.loom \
--output test3_vsmc_reg.csv \
--mask_dropouts \
--num_workers 15
'''

# Cellular enrichment using AUCell
# Number of genes with a non-zero value in each cell.
nGenesDetectedPerCell = np.sum(test3_vsmc.X > 0, axis=1)

# Quantiles of that distribution, indexed by the quantile level itself.
quantile_levels = [0.01, 0.05, 0.10, 0.50, 1]
percentiles = pd.Series(
    np.quantile(nGenesDetectedPerCell, quantile_levels),
    index=np.array(quantile_levels),
)

fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150, constrained_layout=True)
sns.histplot(data=nGenesDetectedPerCell, legend=False)

# Mark each quantile with a red vertical line and a "<genes> (<level>%)" label
# anchored at the current upper y-limit.
for level, n_genes in zip(percentiles.index.values, percentiles):
    ax.axvline(x=n_genes, ymin=0, ymax=0.98, color='red')
    ax.text(
        x=n_genes,
        y=ax.get_ylim()[1],
        s=f'{int(n_genes)} ({level*100}%)',
        color='red',
        rotation=20,
        size='x-small',
        rotation_mode='anchor',
    )

sns.despine(ax=ax)
ax.set(xlabel='# of Genes', ylabel='# of Cells')

'''
pyscenic aucell \
test3_vsmc.loom \
test3_vsmc.csv \
--output test3_vsmc_pyscenic_output.loom \
--num_workers 15 \
--auc_threshold 0.05

# AUCell (Area under the curve) == Regulon activity score (Suo et al., Cell Reports 2018)
하나의 cell의 gene들은 expression따라 내림차순으로 (high expression to low expression) rank를 만들고, 
auc threshold (e.g. 5% of genes)안에 속하는 유전자들 중 each regulon에 속하는 유전자가 있으면 1씩을 더한다.
그리고 만들어진 graph에서 [0, x at auc_threshold] 에서의 area under the curve (AUC)가 AUCell이 된다.

# Regulon specificity score (RSS)과는 다르다
RSS는 각 그룹별 (e.g. cell type별) 특정 regulon의 enrichment를 볼 수 있는 수치로,
Suo et al.의 "Quantifying cell-type specificity score"부분을 참조하면 된다.
간단하게 설명하자면, 각 regulon 별 RAS에 대한 vector를 만든다. P at regulon R (P^R, 지수가 아니다)= (p1, p2, p3,... pn) where n is the total number of cells. P^R은 합이 1이 되도록 normalize한다.
그리고 P at specific group (P^C, e.g. celltype) vector를 만든다: (p1, p2, p3,... pn) 이 때 vector를 pk가 특정 celltype에 속하면 1 속하지 않으면 0으로 두는 식으로 구성하고, 위와 마찬가지로 합이 1이 되도록 normalize한다.
그 다음, 두 확률분포의 차이를 정량화하는 기법 중 하나인 Jensen-Shannon Divergence (JSD)를 사용하여,
JSD(P^R, P^C)값을 구한다. JSD 값은 '차이'를 나타내므로,
RSS(R, C) = 1 - sqrt(JSD(P^R, P^C)) 로 정의한다.
'''


In [None]:
# Open the pySCENIC output loom file. The attribute listings in the comments
# below were recorded from an interactive session.
# NOTE(review): this path starts with /mnt/data while the other paths in this
# notebook start with /data — confirm both resolve to the same location.
lf = lp.connect(
    "/mnt/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/pySCENIC/ALL/test3_pyscenic_output.loom",
    mode='r+', validate=False)
lf.ca.keys()
# ['CellID', 'RegulonsAUC', 'nGene', 'nUMI']
lf.ra.keys()
# ['Gene', 'Regulons']
lf.attrs.keys()
# ['CreationDate', 'LOOM_SPEC_VERSION', 'MetaData']

# AUC table built from the 'RegulonsAUC' column attribute, indexed by CellID.
auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
# Insert an underscore before every "(" in the regulon column names.
# The parenthesis is matched literally (regex=False). The former pattern '\('
# was an invalid escape in a non-raw string and, with pandas >= 2.0 (where
# str.replace defaults to regex=False), it looked for a literal backslash and
# therefore renamed nothing.
auc_mtx.columns = auc_mtx.columns.str.replace('(', '_(', regex=False)

test3 = sc.read_h5ad("/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/test3.h5ad")
# Expose the 'batch' annotation under the name 'Age' as well.
test3.obs['Age'] = test3.obs['batch']

# Regulon signatures; presumably the `pyscenic ctx` output for the whole-cell
# data set — TODO confirm.
sig = load_signatures('/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/pySCENIC/ALL/test3_reg.csv')
'''
*.pyscenic_output.loom 만들 때, signature에 대해서는 아래 사항을 이행하지 않아서, Regulon_(+)에서 underbar가 sig에는 없음.
따로 만들어줘야 함.
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
rt = meta['regulonThresholds']
for i,x in enumerate(rt):
    tmp = x.get('regulon').replace("(","_(")
    x.update( {'regulon': tmp} )

아래 rename_columns() 함수를 통해 해 준다.
'''

test3 = add_scenic_metadata(test3, auc_mtx, sig)  # AUCell scores are added to test3 (the original note said test3_endo, a leftover name).

def rename_columns(colname):
    """Return *colname* with ``(+)`` written as ``_(+)`` for regulon columns.

    Only labels that start with ``"Regulon"`` are touched; every other label
    is returned unchanged. The function is idempotent: a label that already
    carries ``_(+)`` is returned as is, so re-running the rename step does not
    pile up underscores (the previous version produced ``__(+)`` on a second
    pass).

    Args:
        colname: Column label (string) to normalise.

    Returns:
        The normalised column label.
    """
    if not colname.startswith("Regulon"):
        return colname
    # Strip an existing underscore first so that the insertion below yields
    # exactly one underscore no matter which form came in.
    return colname.replace("_(+)", "(+)").replace("(+)", "_(+)")

# Apply rename_columns to every column label of test3.var, modifying it in place.
test3.var.rename(columns=rename_columns, inplace=True)

# Label of each cluster in the 'leiden_r05' column -> detailed cell-type name.
leiden_to_detailed_celltype_dict = {'0': 'vSMC1',
                                    '1': 'vSMC2',
                                    '2': 'vSMC3',
                                    '3': 'FB1',
                                    '4': 'vSMC4',
                                    '5': 'EC1',
                                    '6': 'FB2',
                                    '7': 'EC2',
                                    '8': 'vSMC5',
                                    '9': 'FB3',
                                    '10': 'Bc',
                                    '11': 'M\u03A6',
                                    '12': 'Tc',
                                    '13': 'Neu'}
# The lambda (rather than passing the dict directly) makes an unmapped cluster
# label raise KeyError instead of silently becoming NaN.
test3.obs['detailed_celltype'] = test3.obs['leiden_r05'].map(lambda x: leiden_to_detailed_celltype_dict[x]).astype('category')
# Fixed display order of the detailed cell types, stored as an ordered categorical.
detailed_celltype_order = ('EC1', 'EC2', 'vSMC1', 'vSMC2', 'vSMC3', 'vSMC4', 'vSMC5', 'FB1', 'FB2', 'FB3', 'Bc', 'M\u03A6', 'Tc', 'Neu')
test3.obs['detailed_celltype'] = test3.obs['detailed_celltype'].cat.reorder_categories(list(detailed_celltype_order), ordered=True)

cellAnnot_all = test3.obs[['Age', 'celltype', 'detailed_celltype']]
# pySCENIC's RSS pairs AUC rows with labels by position, not by index label
# (see pyscenic.rss), so the AUC rows are put in the same cell order as the
# annotation first. This is a no-op when the orders already agree and raises
# KeyError if a cell of test3 is missing from the AUC matrix.
rss_subclusters = regulon_specificity_scores(auc_mtx.loc[cellAnnot_all.index], cellAnnot_all['detailed_celltype'])

rss_subclusters.to_csv('RSS_detailed_celltype_ALL.tab', sep='\t')

test3.write(filename="/data/Projects/phenomata/01.Projects/11.Vascular_Aging/03.Scanpy/pySCENIC/ALL/test3_pyscenic.h5ad")
'''
test3은 celltype .obs에 Neuronal cells가 없음. 따라서 불러올 때 설정해줘야 함
'''

# More on the regulon specificity score (RSS)
# RSS에 대한 설명은 위의 AUCell (=RAS), RSS에 대한 설명 참조


# Sorted, de-duplicated detailed cell-type labels. The earlier version read
# cellAnnot['Subpopulations of vSMC'], but neither that variable nor that
# column exists in this whole-cell analysis; the annotation table defined
# above is cellAnnot_all.
# NOTE(review): 'detailed_celltype' is assumed to be the intended grouping —
# switch to 'celltype' if the coarse labels were meant.
cats = sorted(set(cellAnnot_all['detailed_celltype']))


In [None]:
# Cells whose 'celltype' is one of these labels are left out of the heatmap.
excluded_celltypes = ['B cells', 'MΦ', 'T cells']
# Subset the AnnData once and reuse it (the subset used to be rebuilt six times).
test3_kept = test3[~test3.obs['celltype'].isin(excluded_celltypes)]

# Per-cell annotations used for the row colour bars.
data_annot = test3_kept.obs[['Age', 'detailed_celltype', 'Annotated Cell Types']].copy()
# Select columns that start with "Regulon"
data = test3_kept.obs.loc[:, test3_kept.obs.columns.str.startswith('Regulon')].copy()
# Column-wise Z-score using the population standard deviation (ddof=0).
# A column with zero variance becomes NaN, exactly as in the per-column loop
# this replaces.
data_Zscore = (data - data.mean()) / data.std(ddof=0)

'''
detailed_celltype_palette = dict(zip(data_annot['detailed_celltype'].unique(), sns.color_palette('husl', data_annot['detailed_celltype'].nunique())))
age_palette = dict(zip(data_annot['Age'].unique(), sns.color_palette('husl', data_annot['Age'].nunique())))
annotated_cell_types_palette = dict(zip(data_annot['Annotated Cell Types'].unique(), sns.color_palette('husl', data_annot['Annotated Cell Types'].nunique())))

# Map the categorical columns to colors (map doesn't apply to categorical object type with order, so change it to an ordinary object type)
data_annot['detailed_celltype'] = data_annot['detailed_celltype'].astype('object')
data_annot['Age'] = data_annot['Age'].astype('object')
data_annot['Annotated Cell Types'] = data_annot['Annotated Cell Types'].astype('object')

detailed_celltype_colors = data_annot['detailed_celltype'].map(detailed_celltype_palette)
age_colors = data_annot['Age'].map(age_palette)
annotated_cell_types_colors = data_annot['Annotated Cell Types'].map(annotated_cell_types_palette)

Then add "row_colors=pd.concat([detailed_celltype_colors, age_colors, annotated_cell_types_colors], axis=1)" argument to sns.clustermap below
'''

# Convert the annotation columns to plain object dtype before mapping them to
# colours (per the original note, map does not apply to ordered categoricals).
for annot_col in ('detailed_celltype', 'Age', 'Annotated Cell Types'):
    data_annot[annot_col] = data_annot[annot_col].astype('object')

# One colour Series per annotation, looked up in the matching palette.
detailed_celltype_colors = data_annot['detailed_celltype'].map(detailed_celltype_palette)
age_colors = data_annot['Age'].map(age_palette)
annotated_cell_types_colors = data_annot['Annotated Cell Types'].map(celltype_colormap)
row_colors = pd.concat(
    [detailed_celltype_colors, age_colors, annotated_cell_types_colors],
    axis=1,
)

# Clustered heatmap of the Z-scored regulon table: rows are clustered with
# Ward linkage, regulon names are shown on x, row labels are hidden.
g = sns.clustermap(
    data_Zscore,
    method='ward',
    cmap='CMRmap',
    row_cluster=True,
    row_colors=row_colors,
    xticklabels=True,
    yticklabels=False,
)
heatmap_ax = g.ax_heatmap
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=8)

# Render the figure.
plt.show()


