# PROJ

## Init Script

In [None]:
# Standard library
import os
from os import path

# Third-party
from IPython.display import display, Markdown
import numpy as np
import pandas as pd
import decoupler as dc
import scanpy as sc

# Project-local workflow helpers
from standard_workflows import analysis_baseclass as baseclasses
from standard_workflows import analysis_loops as al
from standard_workflows import decoupler_utility as dcu
from standard_workflows import nfcore_utility as nfu
from standard_workflows import diffexpr_utility as deu
from standard_workflows import preprocessing_utility as preu

# Notebook-wide display and plotting settings
pd.set_option('display.max_rows', 500)
sc.set_figure_params(dpi=100, color_map='viridis_r')
sc.settings.verbosity = 1
sc.logging.print_header()

## Single Cell

In [None]:
# Build the dataset class for the single-cell dataset from the base analysis,
# preprocessing and decoupler classes.
sc_template = baseclasses.Analysis.new_dataset(
    baseclasses.Baseanalysis,
    preu.Preprocessing,
    dcu.Decoupler,
)

# The analysis holds a single dataset entry and reads its parameters from
# the analysis folder two levels up.
sc_analysis = baseclasses.Analysis(
    datasets=[('01', 'scRNA', 'mouse', sc_template)],
    params_path=path.abspath("./../../analysis/"),
)

# Make the analysis available under the generic name and to analysis_loops.
analysis = al.analysis = sc_analysis
#analysis.print_info()
analysis.save_paths()

# Work on the first (and only) dataset from here on; show its data.
ds = analysis.datasets[0]
ds.data

In [None]:
ds.data
import yaml

# Resolve the cell-type marker file from the dataset's configured paths.
marker_dir = ds.get_paths()['datapath']
marker_file = ds.get_paths()['celltype_markers']
markerspath = os.path.join(marker_dir, marker_file)

# safe_load parses plain YAML only and does not construct arbitrary objects.
with open(markerspath, 'r') as marker_stream:
    markers = yaml.safe_load(marker_stream)
markers

### Read mm10 h5 sample files

In [None]:
# Read the raw 10x h5 files only when the dataset holds no data yet.
if len(ds.data) <= 0:
    samples = {}
    dirpath = path.join(ds.paths['datapath'], ds.paths['rawpath'])
    with os.scandir(dirpath) as sample_dirs:
        for sample_dir in sample_dirs:
            # Skip hidden entries and stray files: only sub-directories are
            # sample folders, and scanning a plain file would raise.
            if sample_dir.name.startswith('.') or not sample_dir.is_dir():
                continue
            sampledirname = sample_dir.name
            sampledirpath = os.path.join(dirpath, sampledirname)
            # Distinct names for the inner scan so the outer loop variables
            # are not shadowed.
            with os.scandir(sampledirpath) as sample_files:
                for sample_file in sample_files:
                    is_h5 = (
                        sample_file.is_file()
                        and sample_file.name.endswith('h5')
                        and not sample_file.name.startswith('.')
                    )
                    if not is_h5:
                        continue
                    # NOTE: if a sample folder holds several h5 files, the
                    # one scanned last replaces the earlier ones.
                    adata = sc.read_10x_h5(path.join(sampledirpath, sample_file.name))
                    adata.var_names_make_unique()
                    samples[sampledirname] = adata

    # Fail with a clear message instead of an IndexError further down.
    if not samples:
        raise FileNotFoundError(f'No 10x h5 sample files found below {dirpath!r}')

    samples_to_concat = list(samples.values())
    # inner join; the sample folder names become the 'sampleID' categories
    ds.data = samples_to_concat[0].concatenate(
        samples_to_concat[1:],
        batch_key='sampleID',
        batch_categories=list(samples.keys()),
    )

### Update mm10 to mm39
mm10 was used for the single-cell data, but the newer release mm39 was used for the bulk data. Therefore, we update the mm10 Ensembl IDs and gene names.


In [None]:
# Run the update only while at least one gene is still annotated as mm10.
if (ds.data.var['genome'] == 'mm10').any():
    # To check which EnsemblIDs were updated from mm10 to mm39
    ids_csv = os.path.join(ds.get_paths()['datapath'], 'ensembl_ids.csv')
    ds.data.var['gene_ids'].to_frame().to_csv(ids_csv)
    for file_numb in range(5):
        mapper_csv = os.path.join(
            ds.get_paths()['datapath'],
            f'Results-Mus_musculus_Tools_IDMapper_{file_numb}.csv',
        )
        res = pd.read_csv(mapper_csv)
        # Keep only the rows where the ID mapper returned a different ID.
        res = res.loc[res['Requested ID'] != res['Matched ID(s)']]
        print(res[['Requested ID', 'Matched ID(s)']])

    # Rename: old Ensembl ID -> new Ensembl ID, applied in this order.
    # NOTE(review): two old IDs map to the same new ID, so 'gene_ids' (and the
    # index set from it below) will contain that ID twice; confirm intended.
    old_to_new_ids = {
        'ENSMUSG00000079169': 'ENSMUSG00000027157',
        'ENSMUSG00000085431': 'ENSMUSG00000054510',
        'ENSMUSG00000085147': 'ENSMUSG00000054510',
        'ENSMUSG00000095316': 'ENSMUSG00000087358',
    }
    for old_id, new_id in old_to_new_ids.items():
        old_gene_name = ds.data.var[ds.data.var['gene_ids'] == old_id].index
        ds.data.var.loc[old_gene_name, 'gene_ids'] = new_id

    from pyensembl import EnsemblRelease
    release = ds.analysis_params['preprocessing']['reference_genome_release']
    ensembldata = EnsemblRelease(release, species=ds.organism)

    # Index the genes by Ensembl ID, let the dataset set the gene names from
    # the Ensembl release, then mark the annotation as updated and save.
    ds.data.var.index = ds.data.var['gene_ids']
    ds.set_gene_names(ensembldata)
    ds.data.var['genome'] = 'mm10_updatedto_release109'
    ds.save_data(ds.paths["datafilepath_tmp"])

# To add the metadata, recreate the analysis object.

### Preprocess
Set the gene names and the raw attribute, run the preprocessing, and save the data.

In [None]:
# Store the 'isCond_sample' column as a categorical.
ds.data.obs['isCond_sample'] = ds.data.obs['isCond_sample'].astype('category')

# Keep the current state of the data in `.raw` before preprocessing is run.
ds.data.raw = ds.data
# NOTE(review): presumably `input_type='raw'` tells the project preprocessing
# that the input is unnormalised; confirm in preprocessing_utility.
ds.preprocess(input_type='raw')
    

In [None]:
# Toggle: True runs the filter step, False reads a stored pickle instead.
new = True
if not new:
    # NOTE(review): `ds_filt` is not referenced again in the visible notebook;
    # confirm whether it should replace `ds.data`.
    ds_filt = ds.read_data(os.path.join(ds.paths["datapath"], 'filtered_prev2.pickle'))
else:
    ds.filter(
        prev=ds.analysis_params['preprocessing']['basicFilt']['large_n'][0],
        newpcaplots=False,
        skipviolins=False,
    )

In [None]:
# Plot the variance ratio of the first 40 principal components on a log scale.
sc.pl.pca_variance_ratio(ds.data, n_pcs=40, log=True)

In [None]:
#!pip install harmonypy
import scanpy.external as sce

pca_key = 'X_pca'
batch_key = 'sampleID'

# Harmony integration over the batch column, starting from the PCA embedding;
# the adjusted embedding is stored under '<batch_key>_<pca_key>'.
sce.pp.harmony_integrate(
    ds.data,
    key=batch_key,
    basis=pca_key,
    adjusted_basis='_'.join((batch_key, pca_key)),
)


### UMAPS

In [None]:
# Send the following figures to an 'umaps' subfolder of the figure directory.
# The guard keeps the cell safe to re-run: without it every execution would
# nest one level deeper (umaps/umaps/...).
if os.path.basename(os.path.normpath(str(sc.settings.figdir))) != 'umaps':
    sc.settings.figdir = os.path.join(sc.settings.figdir, 'umaps')
# exist_ok avoids the separate existence check.
os.makedirs(sc.settings.figdir, exist_ok=True)

In [None]:
# Build the neighbour graph on the Harmony-adjusted embedding written by the
# integration cell (stored under '<batch_key>_<pca_key>'). Without `use_rep`
# the neighbours are computed on the uncorrected 'X_pca' and the integration
# has no effect on the UMAP or the clustering.
harmony_rep = f'{batch_key}_{pca_key}'
sc.pp.neighbors(ds.data, n_neighbors=15, n_pcs=30, use_rep=harmony_rep)
sc.tl.umap(ds.data, random_state=0)
resolutions = [0.6]

# One Leiden clustering and one saved UMAP per resolution.
for res in resolutions:
    leiden_key = f'leiden_res{res}'
    sc.tl.leiden(ds.data, resolution=res, key_added=leiden_key)
    sc.pl.umap(ds.data, color=['sampleID', leiden_key], legend_loc='on data', save=leiden_key)
# The first resolution is the clustering used by the rest of the notebook.
leiden_name = f'leiden_res{resolutions[0]}'

ds.data.obs['cluster'] = ds.data.obs[leiden_name]
# Round-trip through str so the categories are plain strings.
ds.data.obs['isCond_sample'] = ds.data.obs['isCond_sample'].astype('str').astype('category')
# Combined label '<cluster>_<isCond_sample>'.
ds.data.obs['cluster_isCond_sample'] = ds.data.obs['cluster'].astype('str') + '_' + ds.data.obs['isCond_sample'].astype('str')
# Register the combined label once, even if this cell is executed repeatedly.
if 'cluster_isCond_sample' not in ds.analysis_params['diffExpr']['conditions']:
    ds.analysis_params['diffExpr']['conditions'] += ['cluster_isCond_sample']

In [None]:
# UMAP coloured by the combined cluster/condition label and by the Leiden clustering.
sc.pl.umap(ds.data, color = ['cluster_isCond_sample', leiden_name])
# Project helper called with split_by='isCond_sample' and the legend in the right margin.
# NOTE(review): presumably draws one UMAP per 'isCond_sample' value; confirm in analysis_loops.
al.split_umap(ds.data, color = 'cluster_isCond_sample', split_by='isCond_sample',legend_loc = "right margin")

In [None]:
use_raw = True
gene_symbols = 'gene_name'

# One saved UMAP per sample, with only that sample's cells highlighted.
for group in set(ds.data.obs['sampleID']):
    sc.pl.umap(
        ds.data,
        color=['sampleID'],
        groups=group,
        save=f'{group}',
    )

# Overview: sample, condition and Leiden clustering side by side.
sc.pl.umap(ds.data, color=['sampleID', 'isCond_sample', leiden_name])


In [None]:
# Send the following figures to a 'markers' subfolder of the figure directory.
# The guard keeps the cell safe to re-run: without it every execution would
# nest one level deeper (markers/markers/...).
# NOTE(review): the figure directory was pointed at the 'umaps' subfolder
# earlier, so this folder ends up below it; confirm that is intended.
if os.path.basename(os.path.normpath(str(sc.settings.figdir))) != 'markers':
    sc.settings.figdir = os.path.join(sc.settings.figdir, 'markers')
# exist_ok avoids the separate existence check.
os.makedirs(sc.settings.figdir, exist_ok=True)

In [None]:
# Rank genes per Leiden cluster with a Wilcoxon test on the 'log' layer.
sc.tl.rank_genes_groups(
    ds.data,
    leiden_name,
    method='wilcoxon',
    layer='log',
    use_raw=False,
)

# Export the ranking for all groups as one table.
ranked_genes = sc.get.rank_genes_groups_df(ds.data, None)
ranked_csv = os.path.join(ds.get_paths()['result_pca_path'], 'ranked_genes.csv')
ranked_genes.to_csv(ranked_csv)

# Dot plot of the top 5 ranked genes per cluster.
sc.pl.rank_genes_groups_dotplot(
    ds.data,
    n_genes=5,
    groupby=leiden_name,
    save='rankedgenes_dotplot.pdf',
)

In [None]:
# First entry of each of the first 10 rows of the ranking table. The value
# does not depend on the cluster, so it is computed once before the loop.
# NOTE(review): `mynames` is not used by the plots below.
mynames = [x[0] for x in ds.data.uns['rank_genes_groups']['names'][:10]]

# Iterate over the distinct clusters. Looping over the column itself would
# run once per cell and redraw the same plots for every cell.
for clust in ds.data.obs['cluster'].unique():
    sc.pl.rank_genes_groups_violin(ds.data, groups=clust, n_genes=10, save=f'violin_cluster{clust}.pdf')
    # NOTE(review): only the file name depends on the cluster here; the plot
    # always shows the full `markers` set grouped by the Leiden clustering.
    sc.pl.stacked_violin(ds.data, markers, groupby=leiden_name, save=f'violinstacked_cluster{clust}.pdf')

In [None]:
# Marker genes per cell-type label, used for the marker plots.
markers = dict(
    fibs=['Col1a1', 'Pdgfra'],
    EC=['Kdr', 'Pecam1', 'Fabp4'],
    Momac=['Cd68', 'Itgam'],
    DC=['Cd209a', 'Itgam', 'H2-Ab1', 'Cd74', 'Itgae'],
    BLC=['Cd79a', 'Ms4a1'],
    TLC=['Cd3d', 'Cd3e', 'Lef1'],
    NK=['Klrk1', 'Klrb1b', 'Ccl5'],
    Gran=['S100a8', 'S100a9'],
    SMC=['Rgs5', 'Vtn', 'Kcnj8', 'Cspg4', 'Pdgfrb'],
)

In [None]:
# Please adjust
def label_cluster(row, column=None):
    """Return the cell-type label for the cluster stored in *row*.

    The cluster id is matched both by value and by its string form, so a
    label stored as the string ``'0'`` is recognised as well as the integer
    ``0``. An integer-only comparison never matches string labels.

    Args:
        row: Mapping-like row (e.g. one row of ``obs``) holding the cluster id.
        column: Key of the cluster id in *row*. Defaults to the notebook-level
            ``leiden_name``.

    Returns:
        The label for the cluster, or ``None`` when the cluster has no entry.
    """
    # Cluster id -> label; extend this mapping when annotating more clusters.
    cluster_labels = {0: 'Fib_act', 1: 'Momac'}

    key = leiden_name if column is None else column
    value = row[key]
    for cluster_id, label in cluster_labels.items():
        if value == cluster_id or str(value) == str(cluster_id):
            return label
    return None
# Preview of the label per cell (row-wise apply); the result is only displayed, not stored.
ds.data.obs.apply(label_cluster, axis=1)


In [None]:
# Dot plot of the marker genes per Leiden cluster, scaled per gene, with a
# dendrogram. Uses `markers` (the name `marker` was never defined) and the
# notebook's `leiden_name` instead of a hard-coded clustering key.
ax = sc.pl.dotplot(ds.data, markers, groupby=leiden_name, dendrogram=True, standard_scale='var')


## Pseudobulk

In [None]:
# Start from the data kept in `.raw` and expose its matrix as a 'counts' layer.
all_subs = ds.data.raw.to_adata()
all_subs.layers['counts'] = all_subs.X

# Sum the counts per sample and cluster. The grouping column follows
# `leiden_name`, so it stays in sync with the clustering chosen above
# instead of relying on a hard-coded resolution.
# min_cells/min_counts of 0 disable the size thresholds.
pdata = dc.get_pseudobulk(
    all_subs,
    sample_col='sampleID',
    groups_col=leiden_name,
    layer='counts',
    mode='sum',
    min_cells=0,
    min_counts=0,
)
pdata


In [None]:
import dill

# Serialise the pseudobulk object into the dataset's data folder.
# (The stray '.' after the dump call, a syntax error, has been removed.)
pseudobulk_path = os.path.join(ds.get_paths()['datapath'], '01_sc.pickle')
with open(pseudobulk_path, "wb") as dill_file:
    dill.dump(pdata, dill_file)
# Move this file into a new dataset folder under 'bulkRNA'. Run a normal bulk analysis on it.

## Decoupler  

For each dataset, the activities are estimated according to the given prior knowledge and decoupler parameters.

In [None]:
import copy

# Keep an independent deep copy of the whole dataset object.
saved_ds = copy.deepcopy(ds)

# Work on a deep copy of the data whose X is the 'log' layer, and make that
# copy the dataset's data.
data = copy.deepcopy(ds.data)
data.X = data.layers['log']
ds.data = data

In [None]:
# Run the project's activity step for this dataset.
# NOTE(review): presumably `new=True` forces recomputation instead of loading
# stored results; confirm in decoupler_utility.
ds.get_all_acts(new = True)
#al.plot_violin('isCond_sample')
#al.plot_violin('sampleID')
# Plotting and aggregation helpers from analysis_loops; they work on the
# analysis assigned to `al.analysis` earlier in the notebook.
al.plot_umap()
al.get_mean_acts()
# NOTE(review): `sns` is not referenced in this cell; confirm whether the
# import is still needed.
import seaborn as sns
al.plot_mean_acts()
