# Integrate malignant cells from multiple patients and plot FOXA2+ vs FOXA2- cells


## Description

Batch correction across samples is performed with Harmony.
Each cell is assigned a binary FOXA2 expression status (expressed / not expressed).
The embedding density of each FOXA2 expression status is then plotted on the UMAP.


## Procedure

- Import libraries
- Load anndata object of malignant cells
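- Compute the Harmony integration, neighbors graph and UMAP, then plot the embedding density of the FOXA2 expression status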

In [None]:
import scanpy as sc
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import matplotlib.font_manager
from matplotlib import font_manager
from matplotlib.font_manager import fontManager, FontProperties

from common_utils import (
    setup_dirs,
    find_arial_font,
    add_gene_binary_status,
    mini_process,
)

# Set the font used for plots via the project helper from common_utils.
# NOTE(review): presumably this registers/selects Arial for matplotlib —
# confirm against the helper's implementation.
find_arial_font()

In [None]:
# Resolve the output locations for this analysis.
# NOTE(review): OUTDIR_HARMONY_INTEGRATION is not defined or imported in the
# visible cells — it must be provided before this cell runs.
outDir = OUTDIR_HARMONY_INTEGRATION
# setup_dirs returns three paths; judging by the names they are the figures,
# data and tables folders under outDir (confirm in common_utils).
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Figure defaults for saved scanpy plots (300 dpi, vector-friendly output),
# and direct everything scanpy saves into the figures folder.
sc.set_figure_params(dpi_save=300, vector_friendly=True)
sc.settings.figdir = figuresDir

In [None]:
# Path of the input AnnData file; kept in a variable because later cells
# re-read the same file for every preprocessing configuration.
# NOTE(review): ADATA_PATH_RNA_19_2K_HARMONY is not defined or imported in the
# visible cells — it must be provided before this cell runs.
adata_path = ADATA_PATH_RNA_19_2K_HARMONY
# Load the AnnData object (malignant cells, per the notebook description).
adata = sc.read_h5ad(adata_path)

In [None]:
# The gene for which we want to plot the expression
main_gene = 'FOXA2'
# List form of the same gene; the plotting loops below iterate over this list.
main_genes = [main_gene]

In [None]:
## Sanity check: the loaded object is expected to already carry the
## `<gene>_is_expressed` column in .obs; re-plot the density from it.
for gene in main_genes:
    status_col = f'{gene}_is_expressed_str'
    # String copy of the expression flag, used as the grouping column.
    adata.obs[status_col] = adata.obs[f'{gene}_is_expressed'].astype(str)
    # Density of cells on the UMAP, computed separately per status group.
    sc.tl.embedding_density(adata, basis='umap', groupby=status_col)
    sc.pl.embedding_density(
        adata,
        basis='umap',
        key=f'umap_density_{status_col}',
        save=f"{gene}_expr_umap_density.pdf",
    )


## Recompute Harmony with many iterations (or until convergence)

In [None]:
# Re-run Harmony with a high iteration cap so it can run until convergence,
# then rebuild the neighbors graph and the UMAP from the corrected embedding.
max_iter_harmony = 100
# .obs column that identifies the batches Harmony should align.
harmony_column = 'sample'
sc.external.pp.harmony_integrate(adata, key=harmony_column, max_iter_harmony=max_iter_harmony)
# Use the Harmony-corrected representation instead of the raw PCA.
sc.pp.neighbors(adata, use_rep='X_pca_harmony')
sc.tl.umap(adata)
# scanpy appends `save` verbatim to the default "umap" file stem, so the
# suffix starts with "_" (as the other UMAP plots in this notebook do) to
# avoid a fused name such as "umapre_harmony_...".
sc.pl.umap(adata, color=harmony_column, save=f'_re_harmony_max_iter_{max_iter_harmony}.pdf')

In [None]:
# Batch correction with harmony.
# Process the data once per configuration (with/without Harmony, with/without
# scaling prior to computing PCA), save a UMAP coloured by batch and write the
# processed object to disk so later cells can reload it.
for use_harmony in [False, True]:
    # Examine the effect of scaling prior to computing PCA
    for do_scale in [False, True]:
        conf_str = f"use_harmony_{use_harmony}_do_scale_{do_scale}"
        # Reload from disk so every configuration starts from the same input.
        adata = sc.read_h5ad(adata_path)
        adata = mini_process(adata, use_harmony=use_harmony, do_scale=do_scale)
        # scanpy appends `save` verbatim to the default "umap" file stem, so
        # the suffix starts with "_" to keep the file name readable.
        # NOTE(review): max_iter_harmony appears in the file name but is not
        # passed to mini_process — confirm the helper uses the same value.
        sc.pl.umap(
            adata,
            color=harmony_column,
            save=f'_re_harmony_max_iter_{max_iter_harmony}_{conf_str}.pdf',
        )
        adata.write(os.path.join(dataDir, f'adata_{conf_str}.h5ad'))

### Now plot the density embedding

In [None]:
# Plot the FOXA2 expression-status density for every saved configuration.
# Configurations are visited in the same order in which they were written:
# (use_harmony, do_scale) = (F, F), (F, T), (T, F), (T, T).
configurations = [(h, s) for h in (False, True) for s in (False, True)]
for use_harmony, do_scale in configurations:
    conf_str = f"use_harmony_{use_harmony}_do_scale_{do_scale}"
    # Reload the processed object written for this configuration.
    adata = sc.read_h5ad(os.path.join(dataDir, f'adata_{conf_str}.h5ad'))
    for gene in main_genes:
        status_col = f'{gene}_is_expressed_str'
        # Project helper; the next line reads the `<gene>_is_expressed`
        # column from .obs, which this call is expected to provide.
        adata = add_gene_binary_status(adata, gene, threshold=0, use_counts=True)
        # String copy of the flag, used as the grouping column.
        adata.obs[status_col] = adata.obs[f'{gene}_is_expressed'].astype(str)
        sc.tl.embedding_density(adata, basis='umap', groupby=status_col)
        sc.pl.embedding_density(
            adata,
            basis='umap',
            key=f'umap_density_{status_col}',
            save=f"{gene}_expr_umap_density_{conf_str}.pdf",
        )

In [None]:
# UMAPs of the current `adata` (the last configuration loaded by the previous
# cell), coloured by patient and by sample.
sc.pl.umap(adata, color='patient_id', save='_umap_patients_19.pdf', ncols=1, title='Patient ID')
sc.pl.umap(adata, color='sample', save='_umap_patients_19_sample.pdf', ncols=1, title='Sample')

# Violin plot of FOXA2 expression per sample.
# Copy the gene's values from X into .obs so they can be averaged per sample.
# X may be sparse or dense depending on the preprocessing, so densify only
# when needed (`.A` does not exist on dense arrays and is deprecated on SciPy
# sparse matrices).
expr = adata[:, main_gene].X
if hasattr(expr, 'toarray'):
    expr = expr.toarray()
# NOTE(review): the column name says "log"; whether X holds log-normalised or
# scaled values depends on the loaded configuration — confirm.
adata.obs[f'{main_gene}_expr_log'] = np.asarray(expr).ravel()
# Samples sorted from highest to lowest mean expression.
order = (
    adata.obs.groupby('sample')[f'{main_gene}_expr_log']
    .mean()
    .sort_values(ascending=False)
    .index.tolist()
)

# Draw on an explicit figure and save/close that same figure, rather than
# relying on matplotlib's implicit "current figure".
fig, ax = plt.subplots(figsize=(10, 5))
sc.pl.violin(adata, groupby='sample', keys=main_gene, order=order, rotation=90, ax=ax)
fig.savefig(os.path.join(figuresDir, f'{main_gene}_violin_per_patient.pdf'), bbox_inches='tight')
plt.close(fig)

# Plot density embedding of FOXA2 expression status.
# NOTE(review): this saves under the same file name as the first density
# check at the top of the notebook, so that earlier figure is overwritten.
for gene in main_genes:
    status_col = f'{gene}_is_expressed_str'
    # String copy of the flag, used as the grouping column.
    adata.obs[status_col] = adata.obs[f'{gene}_is_expressed'].astype(str)
    sc.tl.embedding_density(adata, basis='umap', groupby=status_col)
    sc.pl.embedding_density(
        adata,
        basis='umap',
        key=f'umap_density_{status_col}',
        save=f"{gene}_expr_umap_density.pdf",
    )

In [None]:

# Density of the FOXA2-negative group only (group 'False'), drawn in blue.
# The key must match the grouping column used when the density was computed:
# every sc.tl.embedding_density call in this notebook groups by
# `<gene>_is_expressed_str`, so the stored key carries the `_str` suffix.
sc.pl.embedding_density(
    adata,
    basis='umap',
    key=f'umap_density_{main_gene}_is_expressed_str',
    save=f"{main_gene}_expr_umap_density_with_harmony_false_blue.png",
    group='False',
    color_map='Blues',
)