# PROJ scRNA Pseudobulk Analysis

## Init Script

In [None]:
from os import path
from IPython.display import display, Markdown 
import pandas as pd, scanpy as sc
pd.set_option('display.max_rows', 500)
from standard_workflows import analysis_baseclass as baseclasses
from standard_workflows import analysis_loops as al
from standard_workflows import decoupler_utility as dcu
from standard_workflows import nfcore_utility as nfu
from standard_workflows import diffexpr_utility as deu
from standard_workflows import preprocessing_utility as preu
# Global scanpy plotting defaults: 100 dpi figures, reversed viridis colour map.
sc.set_figure_params(color_map='viridis_r', dpi=100)
# Keep scanpy's log output at verbosity level 1.
sc.settings.verbosity = 1
# Print scanpy's logging header once at the start of the notebook.
sc.logging.print_header()

## Bulk-RNA
### Init Analysis

Two `analysis` objects are created for the analysis:   

- `sc_analysis`: single cell analysis with decoupler
- `bulk_analysis`: bulk analysis with nf-core, preprocessing, decoupler

We start with the bulk data: 
1. Create a dataset template that inherits the functionality from all needed modules. 
2. Use this template to initialise the `analysis` object. Different datasets can use different templates, but the loops provided in `al` always iterate over all datasets of their attached `analysis` object. This leads to an error if a loop is used for a module that is not available in all datasets. 
3. Rename `bulk_analysis` to `analysis` to keep the code standardized. 
4. Attach `analysis` to the `al` module. 
5. Save the analysis paths and parameters of the `analysis` and all `datasets` to yaml files.
6. Optionally print the path and parameter characteristics for all `datasets`.
7. As we only look at one dataset in this analysis, we give it a shorter name. To get a specific `dataset` from the list of `datasets` that an `analysis` holds, we can either use the index or the getter method with the name of the `dataset`. The latter is less error-prone. 
  
CAUTION: decoupler use_raw = True

In [None]:
# Build the dataset class dynamically from the base class plus the
# preprocessing, decoupler and differential-expression modules.
bulk_dataset_template = baseclasses.Analysis.new_dataset(
    baseclasses.Baseanalysis,
    preu.Preprocessing,
    dcu.Decoupler,
    deu.DiffExpr,
)

# Create the analysis with a single dataset entry that uses the template above.
# NOTE(review): the tuple fields look like (name, data type, organism, template)
# - confirm against baseclasses.Analysis.
bulk_analysis = baseclasses.Analysis(
    datasets=[('01_sc', 'bulkRNA', 'mouse', bulk_dataset_template)],
    params_path=path.abspath("./../../analysis/"),
)

# Use the generic name `analysis` and attach it to the loop module.
analysis = bulk_analysis
al.analysis = analysis
analysis.save_paths()
#analysis.print_info()

# Short handle for the only dataset, fetched by name rather than by index.
ds = analysis.get('01_sc')
ds.data

In [None]:
# Set to True to redo the preprocessing even when the temporary file exists.
new = False
tmp_datafile = ds.paths["datafilepath_tmp"]
if new or not path.exists(tmp_datafile):  # skip if h5ad file already exists
    # add interaction column
    #ds.data.obs['cluster_isCond_sample'] = ds.data.obs['cluster'].astype('str') + '_' + ds.data.obs['isCond_sample'].astype('str')

    # Keep only the columns of X (genes) whose column-wise sum is at least 10.
    df = pd.DataFrame(ds.data.X)
    gene_totals = df.sum(axis=0)
    genes_to_keep = df.columns[gene_totals >= 10]
    ds.data = ds.data[:, genes_to_keep]

    # Store the gene-filtered data as `.raw` before preprocessing runs.
    ds.data.raw = ds.data
    ds.preprocess()
    ds.plot_filter_expr(raw=True)  # y line is min total counts
    ds.save_data(tmp_datafile)



## PCAs

In [None]:
# Set to True to re-run the filter step.
new = False
if new:
    # Pass the first configured `large_n` value of the basic filter as `prev`.
    basic_filter_params = ds.analysis_params['preprocessing']['basicFilt']
    ds.filter(prev=basic_filter_params['large_n'][0])

**Quality Plots**

In [None]:
# Set to True to print the value counts of every obs column and draw the
# obs plots.
new = False
if new:
    obs = ds.data.obs
    for col in obs.columns:
        # Column name as a bold Markdown heading, followed by its value counts.
        display(Markdown(f'**{col}**'))
        print(obs[col].value_counts())
    ds.plot_obs()

**Number of Ensembl IDs:**

In [None]:
# Count the variable names that start with 'ENS'; as the last expression of
# the cell, the count is displayed as the cell output.
sum(ds.data.var_names.str.startswith('ENS') )

## Differential Gene Expression with DESeq2


In [None]:
# The decorator receives the configured DESeq2 design factors.
# NOTE(review): presumably `al.loop` calls the function once per design factor
# and supplies it as the first argument; the meaning of the `False` flag is
# not visible here - confirm in analysis_loops.
@al.loop(ds.analysis_params['diffExpr']['deseq2']['design_factors'], False)
def run_deseq_loop(design_factor, ds, filtering, new):
    """Print the design factor and call ``ds.run_deseq`` for it.

    ``filtering`` and ``new`` are passed through unchanged to
    ``ds.run_deseq``.
    """
    print(design_factor)
    ds.run_deseq(design_factor, filtering, new)

# Run DESeq2 on the dataset with the 'filtmin10' filtering and new=False.
run_deseq_loop(ds, filtering = 'filtmin10', new = False)

In [None]:
# Derive the contrasts for every key stored in ds.ddss.
for dds_key in ds.ddss.keys():
    ds.get_contrasts(dds_key, same_start=True, new=True)

## Decoupler

In [None]:
# Compute all decoupler activities for the dataset.
ds.get_all_acts()
# Plot the activities per dds.
# NOTE(review): the meaning of the two 25s is not visible here - confirm in
# decoupler_utility.
ds.plot_acts_perDds(25, 25)