# <font color = '#FF003D'> ***Synchronous colorectal cancer-liver metastasis project***</font>

**_______________________________________________________________________________________________________________________________________________________________________________________________________________**

# <font color = '#FF003D'> ***==== CODE 4: Healthy tissues ====***</font>

# Python library

In [None]:
import os
import math
import warnings
import datetime

warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import scipy
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from cycler import cycler
import openpyxl
import scvi

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import rpy2

In [None]:
from ipywidgets import IntProgress
from IPython.display import display
import time
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Output locations: data_folder receives the .h5ad objects saved by this notebook.
data_folder = '.../Analysis/Synchro/Data/'
result_folder = '.../Analysis/Synchro/'

# Silence Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Verbose scanpy logging and figure defaults (100 dpi on screen, 1000 dpi when saved, white background).
sc.settings.verbosity = 4
sc.set_figure_params(dpi = 100, dpi_save = 1000, facecolor = 'white')

# R library

In [None]:
# Print the rpy2 environment report (shell command run from the notebook).
! python -m rpy2.situation

In [None]:
# Enable the %R / %%R magics provided by rpy2.
%load_ext rpy2.ipython

In [None]:
%%R 
# Seurat and its companion packages
library(Seurat)
library(SeuratDisk)
library(SeuratData)
library(SeuratWrappers)
library(SeuratObject)

# Spreadsheet export and plotting
library(openxlsx)
library(ggplot2)
library(ggraph)
library(ggrepel)

# Data handling, Python bridge, plot composition
library(dplyr)
library(reticulate)
library(patchwork)

# Volcano plots
library(EnhancedVolcano)

# 1. **Healthy liver – GEO: GSE136103**

## 1.1. Create the anndata

In [None]:
# Folder holding one sub-directory (matrix / barcodes / features) per healthy-liver sample.
HEALTHY_LIVER_DIR = "/Users/paolomarzano/Documents/JupyterLab/Analysis/Sincroni/HealthyLiver"


def _load_healthy_liver_sample(folder, patient_id, sample_id, sex):
    """Build the AnnData object of one healthy-liver sample.

    Parameters
    ----------
    folder : str
        Sub-directory of ``HEALTHY_LIVER_DIR`` containing ``matrix.mtx.gz``,
        ``barcodes.tsv.gz`` and ``features.tsv.gz``.
    patient_id : str
        Value stored in ``obs['patient_id']``.
    sample_id : str
        Value stored in ``obs['sample_id']``.
    sex : str
        Value stored in ``obs['Sex']``.

    Returns
    -------
    tuple
        ``(adata, barcodes, features)``: the annotated matrix plus the two
        tables it was annotated from.
    """
    sample_dir = os.path.join(HEALTHY_LIVER_DIR, folder)
    counts = sc.read_mtx(os.path.join(sample_dir, "matrix.mtx.gz"))
    barcodes = pd.read_csv(os.path.join(sample_dir, "barcodes.tsv.gz"), header = None)
    features = pd.read_csv(os.path.join(sample_dir, "features.tsv.gz"), header = None, sep = '\t')

    # The matrix is transposed so that barcodes index obs and features index var.
    adata = counts.T
    adata.obs_names = barcodes[0]

    # Second features column -> gene labels (used as var names), first column -> gene ids.
    adata.var['gene_label'] = features[1].tolist()
    adata.var.index = adata.var['gene_label']
    adata.var['gene_ids'] = features[0].tolist()
    adata.var_names_make_unique()
    adata.var["feature_types"] = "Gene Expression"

    adata.obs['patient_id'] = patient_id
    adata.obs['sample_id'] = sample_id
    adata.obs['GEX'] = '3GEX'
    adata.obs['Sex'] = sex
    return adata, barcodes, features


# One call per sample; the *_bc / *_features tables are kept under their original names.
Healthy1, Healthy1_bc, Healthy1_features = _load_healthy_liver_sample("Healthy1", 'GSM4041150', 'Healthy 1', 'Male')
Healthy2, Healthy2_bc, Healthy2_features = _load_healthy_liver_sample("Healthy2", 'GSM4041153', 'Healthy 2', 'Male')
Healthy3, Healthy3_bc, Healthy3_features = _load_healthy_liver_sample("Healthy3", 'GSM4041155', 'Healthy 3', 'Male')
Healthy4, Healthy4_bc, Healthy4_features = _load_healthy_liver_sample("Healthy4", 'GSM4041158', 'Healthy 4', 'Female')
Healthy5, Healthy5_bc, Healthy5_features = _load_healthy_liver_sample("Healthy5", 'GSM4041160', 'Healthy 5', 'Male')

In [None]:
# Flag mitochondrial, ribosomal and hemoglobin genes in every liver sample and compute
# the per-cell / per-gene QC metrics (pct_counts_mt, pct_counts_ribo, ...) used by the filters below.
liver_samples = [Healthy1, Healthy2, Healthy3, Healthy4, Healthy5]
for sample in liver_samples:
    sample.var['mt'] = sample.var_names.str.startswith('MT-')              # mitochondrial genes
    sample.var['ribo'] = sample.var_names.str.startswith(("RPS","RPL"))      # ribosomal genes
    # "HB" followed by any character other than '(', 'P' or ')'
    sample.var['hb'] = sample.var_names.str.contains(("^HB[^(P)]"))        # hemoglobin genes
    sc.pp.calculate_qc_metrics(sample, qc_vars = ['mt','ribo','hb'], percent_top = None, log1p = False, inplace = True)

## 1.2. QC filters

### *Healthy 1*

In [None]:
MIN_GENES_H1 = 500
MAX_GENES_H1 = 3500
MIN_UMI_H1 = 1000
MAX_UMI_H1 = 17000
MIN_CELLS_H1 = 3
MT_PERCENTAGE_H1 = 10
RIBO_PERCENTAGE_H1 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(Healthy1.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H1, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(Healthy1.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H1, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(Healthy1.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H1), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H1), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(Healthy1.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H1), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H1), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(Healthy1.n_obs, Healthy1.n_vars)

In [None]:
sc.pp.filter_cells(Healthy1, min_genes = MIN_GENES_H1)
sc.pp.filter_cells(Healthy1, max_genes = MAX_GENES_H1)
sc.pp.filter_cells(Healthy1, min_counts = MIN_UMI_H1)
sc.pp.filter_cells(Healthy1, max_counts = MAX_UMI_H1)
sc.pp.filter_genes(Healthy1, min_cells = MIN_CELLS_H1)

print("Cell x Genes after filtering")
print(Healthy1.n_obs, Healthy1.n_vars)

In [None]:
Healthy1 = Healthy1[Healthy1.obs['pct_counts_mt'] < MT_PERCENTAGE_H1, :]
Healthy1 = Healthy1[Healthy1.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H1, :]
print("Remaining cells %d"%Healthy1.n_obs)

### *Healthy 2*

In [None]:
MIN_GENES_H2 = 350
MAX_GENES_H2 = 2100
MIN_UMI_H2 = 550
MAX_UMI_H2 = 6000
MIN_CELLS_H2 = 3
MT_PERCENTAGE_H2 = 10
RIBO_PERCENTAGE_H2 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(Healthy2.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H2, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(Healthy2.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H2, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(Healthy2.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H2), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H2), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(Healthy2.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H2), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H2), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(Healthy2.n_obs, Healthy2.n_vars)

In [None]:
sc.pp.filter_cells(Healthy2, min_genes = MIN_GENES_H2)
sc.pp.filter_cells(Healthy2, max_genes = MAX_GENES_H2)
sc.pp.filter_cells(Healthy2, min_counts = MIN_UMI_H2)
sc.pp.filter_cells(Healthy2, max_counts = MAX_UMI_H2)
sc.pp.filter_genes(Healthy2, min_cells = MIN_CELLS_H2)

print("Cell x Genes after filtering")
print(Healthy2.n_obs, Healthy2.n_vars)

In [None]:
Healthy2 = Healthy2[Healthy2.obs['pct_counts_mt'] < MT_PERCENTAGE_H2, :]
Healthy2 = Healthy2[Healthy2.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H2, :]
print("Remaining cells %d"%Healthy2.n_obs)

### *Healthy 3*

In [None]:
MIN_GENES_H3 = 700
MAX_GENES_H3 = 4000
MIN_UMI_H3 = 1800
MAX_UMI_H3 = 19500
MIN_CELLS_H3 = 3
MT_PERCENTAGE_H3 = 10
RIBO_PERCENTAGE_H3 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(Healthy3.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H3, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(Healthy3.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H3, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(Healthy3.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(Healthy3.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(Healthy3.n_obs, Healthy3.n_vars)

In [None]:
sc.pp.filter_cells(Healthy3, min_genes = MIN_GENES_H3)
sc.pp.filter_cells(Healthy3, max_genes = MAX_GENES_H3)
sc.pp.filter_cells(Healthy3, min_counts = MIN_UMI_H3)
sc.pp.filter_cells(Healthy3, max_counts = MAX_UMI_H3)
sc.pp.filter_genes(Healthy3, min_cells = MIN_CELLS_H3)

print("Cell x Genes after filtering")
print(Healthy3.n_obs, Healthy3.n_vars)

In [None]:
Healthy3 = Healthy3[Healthy3.obs['pct_counts_mt'] < MT_PERCENTAGE_H3, :]
Healthy3 = Healthy3[Healthy3.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H3, :]
print("Remaining cells %d"%Healthy3.n_obs)

### *Healthy 4*

In [None]:
MIN_GENES_H4 = 700
MAX_GENES_H4 = 3500
MIN_UMI_H4 = 1600
MAX_UMI_H4 = 17000
MIN_CELLS_H4 = 3
MT_PERCENTAGE_H4 = 10
RIBO_PERCENTAGE_H4 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(Healthy4.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H4, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(Healthy4.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H4, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(Healthy4.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H4), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H4), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(Healthy4.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H4), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H4), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(Healthy4.n_obs, Healthy4.n_vars)

In [None]:
sc.pp.filter_cells(Healthy4, min_genes = MIN_GENES_H4)
sc.pp.filter_cells(Healthy4, max_genes = MAX_GENES_H4)
sc.pp.filter_cells(Healthy4, min_counts = MIN_UMI_H4)
sc.pp.filter_cells(Healthy4, max_counts = MAX_UMI_H4)
sc.pp.filter_genes(Healthy4, min_cells = MIN_CELLS_H4)

print("Cell x Genes after filtering")
print(Healthy4.n_obs, Healthy4.n_vars)

In [None]:
Healthy4 = Healthy4[Healthy4.obs['pct_counts_mt'] < MT_PERCENTAGE_H4, :]
Healthy4 = Healthy4[Healthy4.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H4, :]
print("Remaining cells %d"%Healthy4.n_obs)

### *Healthy 5*

In [None]:
MIN_GENES_H5 = 500
MAX_GENES_H5 = 3000
MIN_UMI_H5 = 1000
MAX_UMI_H5 = 15000
MIN_CELLS_H5 = 3
MT_PERCENTAGE_H5 = 10
RIBO_PERCENTAGE_H5 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(Healthy5.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H5, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(Healthy5.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H5, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(Healthy5.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H5), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H5), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(Healthy5.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H5), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H5), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(Healthy5.n_obs, Healthy5.n_vars)

In [None]:
sc.pp.filter_cells(Healthy5, min_genes = MIN_GENES_H5)
sc.pp.filter_cells(Healthy5, max_genes = MAX_GENES_H5)
sc.pp.filter_cells(Healthy5, min_counts = MIN_UMI_H5)
sc.pp.filter_cells(Healthy5, max_counts = MAX_UMI_H5)
sc.pp.filter_genes(Healthy5, min_cells = MIN_CELLS_H5)

print("Cell x Genes after filtering")
print(Healthy5.n_obs, Healthy5.n_vars)

In [None]:
Healthy5 = Healthy5[Healthy5.obs['pct_counts_mt'] < MT_PERCENTAGE_H5, :]
Healthy5 = Healthy5[Healthy5.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H5, :]
print("Remaining cells %d"%Healthy5.n_obs)

## 1.3. adata_concatenated

In [None]:
# Merge the five filtered liver samples into a single AnnData object.
# join = 'outer' keeps the union of genes; values missing from a sample are filled with fill_value (0.0).
# batch_key / batch_categories: the batch label of each input is written to obs['patient_id'],
# using the accessions of batch_list in the order the samples are passed.
# NOTE(review): the per-sample var columns come back suffixed with "-<accession>"
# (see the column names removed in the next cell).
batch_list = ['GSM4041150', 'GSM4041153', 'GSM4041155', 'GSM4041158', 'GSM4041160']
HealthyLiver_GEO = Healthy1.concatenate(Healthy2, Healthy3, Healthy4, Healthy5,
                          join = 'outer',
                          batch_key = 'patient_id',
                          batch_categories = batch_list,
                          uns_merge = None,
                          index_unique = '-',
                          fill_value = 0.0)
HealthyLiver_GEO

In [None]:
# Shared gene annotation for the merged object.
# NOTE(review): gene_ids is filled with the var names (gene labels), not with the
# per-sample gene ids that are dropped just below — confirm this is intended.
HealthyLiver_GEO.var['gene_ids'] = HealthyLiver_GEO.var_names
HealthyLiver_GEO.var['feature_types'] = 'Gene Expression'
HealthyLiver_GEO.var['genome'] = 'GRCh38'

# Drop every per-sample var column left by the concatenation, i.e. those whose name ends
# with "-<accession>" for one of the accessions in batch_list (gene_label-GSM4041150, ...).
batch_suffixes = tuple("-" + accession for accession in batch_list)
per_sample_columns = [column for column in HealthyLiver_GEO.var.columns if column.endswith(batch_suffixes)]
HealthyLiver_GEO.var = HealthyLiver_GEO.var.drop(columns = per_sample_columns)

# Recompute the gene-family flags on the merged gene set.
HealthyLiver_GEO.var['mt'] = HealthyLiver_GEO.var_names.str.startswith('MT-')              # mitochondrial genes
HealthyLiver_GEO.var['ribo'] = HealthyLiver_GEO.var_names.str.startswith(("RPS","RPL"))      # ribosomal genes
HealthyLiver_GEO.var['hb'] = HealthyLiver_GEO.var_names.str.contains(("^HB[^(P)]"))        # hemoglobin genes

HealthyLiver_GEO.var

## 1.4. Doublets filtering

In [None]:
print("Cell x Genes BEFORE doublets removal")
print(HealthyLiver_GEO.n_obs, HealthyLiver_GEO.n_vars)

In [None]:
import scrublet as scr

# split per batch into new objects.
batches = list(HealthyLiver_GEO.obs['sample_id'].unique()) #HealthyLiver_GEO.obs['orig.ident'].tolist()#.cat.categories.tolist()
alldata = {}
for batch in batches:
    tmp = HealthyLiver_GEO[HealthyLiver_GEO.obs['sample_id'] == batch,]
    print(batch, ":", tmp.shape[0], " cells")
    scrub = scr.Scrublet(tmp.X)
    out = scrub.scrub_doublets(verbose=False, n_prin_comps = 20)
    alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")

In [None]:
scrub_pred = pd.concat(alldata.values())
HealthyLiver_GEO.obs['doublet_scores'] = scrub_pred['doublet_score'] 
HealthyLiver_GEO.obs['predicted_doublets'] = scrub_pred['predicted_doublets'] 

sum(HealthyLiver_GEO.obs['predicted_doublets'])

In [None]:
%matplotlib inline
HealthyLiver_GEO.obs['doublet_info'] = HealthyLiver_GEO.obs["predicted_doublets"].astype(str)
sc.pl.violin(HealthyLiver_GEO, 'n_genes_by_counts', jitter = 0.4, groupby = 'doublet_info', rotation=45)

In [None]:
# also revert back to the raw counts as the main matrix in HealthyLiver_GEO
HealthyLiver_GEO = HealthyLiver_GEO[HealthyLiver_GEO.obs['doublet_info'] == 'False',:]

print("Cell x Genes AFTER doublets removal")
print(HealthyLiver_GEO.n_obs, HealthyLiver_GEO.n_vars)

## 1.5. Save the object after QC filters

In [None]:
# Checkpoint: write the QC-filtered, doublet-free liver object to data_folder.
outfilename = os.path.join(data_folder, "HealthyLiver_GEO_afterQC.h5ad")
print("Saving h5ad data to file {}".format(outfilename))
HealthyLiver_GEO.write(outfilename)
print("Done!")

## 1.6. Normalization & HVG selection

In [None]:
# Keep the un-normalized counts in a layer: the HVG selection and scVI below read layer "raw_counts".
# This must run before the normalization, which overwrites .X.
HealthyLiver_GEO.layers['raw_counts'] = HealthyLiver_GEO.X.copy()

In [None]:
# Normalize every cell to a total of 10,000 counts, then log1p-transform .X.
sc.pp.normalize_total(HealthyLiver_GEO, target_sum = 1e4)
sc.pp.log1p(HealthyLiver_GEO)

In [None]:
# Store the log-normalized matrix both in .raw and as a named layer.
HealthyLiver_GEO.raw = HealthyLiver_GEO
HealthyLiver_GEO.layers["normalized_counts"] = HealthyLiver_GEO.X.copy()

In [None]:
# Flag the top 4000 highly variable genes (seurat_v3 flavor) from the raw-counts layer,
# using sample_id as batch key; subset = False keeps all genes in the object.
sc.pp.highly_variable_genes(HealthyLiver_GEO,
                            n_top_genes = 4000, flavor = "seurat_v3",
                            layer = "raw_counts", batch_key = 'sample_id',
                            subset = False)

## 1.7. scVI integration

In [None]:
# Training object: same cells, restricted to the highly variable genes.
adata_train = HealthyLiver_GEO[:, HealthyLiver_GEO.var.highly_variable].copy()
adata_train

In [None]:
# scVI model on the raw counts with patient_id as batch covariate
# (2 layers, 30 latent dimensions, negative-binomial likelihood), trained on CPU.
scvi.model.SCVI.setup_anndata(adata_train, layer = "raw_counts", batch_key = 'patient_id')
vae = scvi.model.SCVI(adata_train, n_layers = 2, n_latent = 30, latent_distribution = "normal", gene_likelihood = "nb")
vae.train(accelerator = "cpu")

In [None]:
# Copy the model outputs onto the full object; adata_train is a gene-only subset of
# HealthyLiver_GEO, so both hold the same cells in the same order.
HealthyLiver_GEO.obsm["X_scVI"] = vae.get_latent_representation()
HealthyLiver_GEO.obsm["denoised_RNA"] = vae.get_normalized_expression()

In [None]:
# Neighbor graph on the scVI latent space (stored under key "scVI"), Leiden clustering at
# resolution 0.5, PAGA on those clusters, and a UMAP initialized from PAGA.
# NOTE(review): sc.pl.paga is kept before sc.tl.umap because init_pos = 'paga' presumably
# relies on the layout computed when PAGA is plotted — confirm against the scanpy docs.
sc.pp.neighbors(HealthyLiver_GEO, n_neighbors = 15, use_rep = "X_scVI", key_added = "scVI")
sc.tl.leiden(HealthyLiver_GEO, resolution = 0.5, key_added = 'res_0.5', neighbors_key = "scVI")
sc.tl.paga(HealthyLiver_GEO, groups = 'res_0.5', neighbors_key = "scVI")
sc.pl.paga(HealthyLiver_GEO, frameon = True, edge_width_scale = 0.3)
sc.tl.umap(HealthyLiver_GEO, neighbors_key = "scVI", init_pos = 'paga')

## 1.8. Clustering

In [None]:
clustering_labels = []
for res in [0, 0.8, 1.0, 1.5, 2.0, 2.5]:
    clustering_labels.append("res_{}".format(res))
    if "res_{}".format(res) in HealthyLiver_GEO.obs:
        print("res_{}".format(res) + " already exists... going on with next resolution.")
        continue
    sc.tl.leiden(HealthyLiver_GEO, resolution = res, key_added = "res_{}".format(res), neighbors_key = "scVI")

In [None]:
clustering_labels = ['res_0.5', 'res_0.8', 'res_1.0', 'res_1.5', 'res_2.0', 'res_2.5']
sc.pl.umap(HealthyLiver_GEO, color = clustering_labels, legend_loc = 'on data', legend_fontsize = 10, legend_fontoutline = 2, legend_fontweight = 8, wspace = .5, ncols = 3, frameon = True)

## 1.9. Save final object

In [None]:
# Checkpoint: write the integrated and clustered liver object to data_folder.
outfilename = os.path.join(data_folder, "HealthyLiver_GEO_integration.h5ad")
print("Saving h5ad data to file {}".format(outfilename))
HealthyLiver_GEO.write(outfilename)
print("Done!")

# 2. **Healthy liver – Reclustering macrophages**

In [None]:
Macrophages_GEO = HealthyLiver_GEO[HealthyLiver_GEO.obs['res_0.8'].isin(['3', '7', '8'])]
Macrophages_GEO

In [None]:
obs = ['doublet_scores', 'predicted_doublets', 'doublet_info', 'res_0.5', 'res_0', 'res_0.8', 'res_1.0', 'res_1.5', 'res_2.0', 'res_2.5']
var = ['highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches']

for obs in obs:
    del Macrophages_GEO.obs[obs]

for var in var:
    del Macrophages_GEO.var[var]

del Macrophages_GEO.uns
del Macrophages_GEO.obsm
del Macrophages_GEO.varm
del Macrophages_GEO.obsp

Macrophages_GEO

## 2.1. HVG selection

In [None]:
# Flag the top 4000 highly variable genes of the macrophage subset (seurat_v3 flavor) from
# the raw-counts layer, using sample_id as batch key; subset = False keeps all genes.
sc.pp.highly_variable_genes(Macrophages_GEO,
                            n_top_genes = 4000, flavor = "seurat_v3",
                            layer = "raw_counts", batch_key = 'sample_id',
                            subset = False)

## 2.2. scVI integration

In [None]:
# Training object: same cells, restricted to the highly variable genes (overwrites the
# adata_train of the whole-liver analysis).
adata_train = Macrophages_GEO[:, Macrophages_GEO.var.highly_variable].copy()
adata_train

In [None]:
# scVI model on the raw counts with patient_id as batch covariate
# (2 layers, 30 latent dimensions, negative-binomial likelihood), trained on CPU.
scvi.model.SCVI.setup_anndata(adata_train, layer = "raw_counts", batch_key = 'patient_id')
vae = scvi.model.SCVI(adata_train, n_layers = 2, n_latent = 30, latent_distribution = "normal", gene_likelihood = "nb")
vae.train(accelerator = "cpu")

In [None]:
# Copy the model outputs onto the full macrophage object; adata_train is a gene-only
# subset of it, so both hold the same cells in the same order.
Macrophages_GEO.obsm["X_scVI"] = vae.get_latent_representation()
Macrophages_GEO.obsm["denoised_RNA"] = vae.get_normalized_expression()

In [None]:
# Neighbor graph on the scVI latent space (stored under key "scVI"), Leiden clustering at
# resolution 0.5, PAGA on those clusters, and a UMAP initialized from PAGA.
# NOTE(review): sc.pl.paga is kept before sc.tl.umap because init_pos = 'paga' presumably
# relies on the layout computed when PAGA is plotted — confirm against the scanpy docs.
sc.pp.neighbors(Macrophages_GEO, n_neighbors = 15, use_rep = "X_scVI", key_added = "scVI")
sc.tl.leiden(Macrophages_GEO, resolution = 0.5, key_added = 'res_0.5', neighbors_key = "scVI")
sc.tl.paga(Macrophages_GEO, groups = 'res_0.5', neighbors_key = "scVI")
sc.pl.paga(Macrophages_GEO, frameon = True, edge_width_scale = 0.3)
sc.tl.umap(Macrophages_GEO, neighbors_key = "scVI", init_pos = 'paga')

## 2.3. Clustering

In [None]:
clustering_labels = []
for res in [0, 0.8, 1.0, 1.5, 2.0, 2.5]:
    clustering_labels.append("res_{}".format(res))
    if "res_{}".format(res) in Macrophages_GEO.obs:
        print("res_{}".format(res) + " already exists... going on with next resolution.")
        continue
    sc.tl.leiden(Macrophages_GEO, resolution = res, key_added = "res_{}".format(res), neighbors_key = "scVI")

In [None]:
clustering_labels = ['res_0.5', 'res_0.8', 'res_1.0', 'res_1.5', 'res_2.0', 'res_2.5']
sc.pl.umap(Macrophages_GEO, color = clustering_labels, legend_loc = 'on data', legend_fontsize = 10, legend_fontoutline = 2, legend_fontweight = 8, wspace = .5, ncols = 3, frameon = True)

## 2.4. Save the final object

In [None]:
# Checkpoint: write the reclustered macrophage object to data_folder.
outfilename = os.path.join(data_folder, "HealthyLiver_GEO_RecMacrophages.h5ad")
print("Saving h5ad data to file {}".format(outfilename))
Macrophages_GEO.write(outfilename)
print("Done!")

In [None]:
# Reload the saved object, so the cells below can be run without repeating the steps above.
outfilename = os.path.join(data_folder, "HealthyLiver_GEO_RecMacrophages.h5ad")
Macrophages_GEO = sc.read_h5ad(outfilename)
Macrophages_GEO

## 2.5. UMAP

In [None]:
clustering_labels = ['res_0.5', 'res_0.8', 'res_1.0', 'res_1.5']
# Cluster names used in the figures: "HL" + the res_0.8 cluster label.
Macrophages_GEO.obs['Rename'] = "HL" + Macrophages_GEO.obs['res_0.8'].astype(str)

# Plot the object that carries the 'Rename' column (the original call referenced the
# undefined name Subset_liver_HL); the violin plot below reads Macrophages_GEO.uns['Rename_colors'].
sc.pl.umap(Macrophages_GEO, color = 'Rename', legend_loc = 'on data', legend_fontsize = 14, legend_fontoutline = 2,
           legend_fontweight = 1, title = 'Healthy Liver clusters', frameon = True,
           save = 'UMAP Reclustering Macrophages Healthy liver GEO.png')

## 2.6. ViolinPlot

In [None]:
# KC gene signature shown in the stacked violin plot.
KC = [
    'CD5L', 'VSIG4', 'MS4A4A', 'SLC16A9', 'TMEM132E', 'SLC1A3', 'CD163', 'FOLR2',
    'TIMD4', 'GFRA2', 'ADRB1', 'TMEM26', 'SLC40A1', 'VCAM1', 'SUCNR1', 'NDST3',
    'CETP', 'LYVE1', 'MARCO', 'SDC3', 'CXCL12', 'LILRB5', 'SCD', 'SELENBP1',
    'MMP19', 'LGMN', 'HMOX1', 'CTSL', 'C2', 'FABP3', 'TMEM37', 'RND3',
]

# Reuse the cluster colors stored in uns for the 'Rename' groups.
colors = list(Macrophages_GEO.uns['Rename_colors'])

# One violin row per 'Rename' cluster, expression scaled per gene.
sc.pl.stacked_violin(
    Macrophages_GEO, KC, 'Rename',
    standard_scale = 'var',
    swap_axes = False,
    dendrogram = False,
    scale = 'count',
    row_palette = colors,
    yticklabels = False,
    save = "Reclustering Macrophages GEO dataset_res 0.8_Signature KC.png",
)

## 2.7. Module score

In [None]:
# Gene set scored per cell and shown on the UMAP.
Cytoxic_Inflammatory = [
    'PRF1', 'GZMM', 'GZMA', 'KLRB1', 'GZMK', 'GZMH', 'IL32',
    'CCL5', 'GZMB', 'GNLY', 'IFNG', 'TNF', 'CST7', 'NKG7',
    'EOMES', 'EFHD2', 'F2R', 'SLAMF7', "IL1B", "IL2", "IL4",
]

# The score is written in place to obs['Cytoxic_Inflammatory_score'] (fixed random_state).
sc.tl.score_genes(
    Macrophages_GEO,
    Cytoxic_Inflammatory,
    ctrl_size = 50,
    gene_pool = None,
    n_bins = 25,
    score_name = 'Cytoxic_Inflammatory_score',
    random_state = 0,
    copy = False,
)

# Color scale clipped to [0, 1].
sc.pl.umap(
    Macrophages_GEO,
    color = 'Cytoxic_Inflammatory_score',
    vmin = 0,
    vmax = 1,
    size = 50,
    cmap = 'inferno',
    title = 'Cytotoxic-Inflammatory Score',
    save = 'Cytoxic_Inflammatory_Score macrophages Healthy Liver.png',
)

# 3. **Healthy blood – GEO: GSE260763**

## 3.1. Create the anndata

In [None]:
def _read_blood_donor(h5_path, donor_id):
    """Load one healthy-blood 10x ``.h5`` matrix and tag it with its donor.

    Parameters
    ----------
    h5_path : str
        Path of the ``filtered_feature_bc_matrix`` file to read.
    donor_id : str
        Label stored in ``obs['donor_id']``.

    Returns
    -------
    AnnData
        Object read with ``gex_only = False``, with unique variable names.
    """
    adata = sc.read_10x_h5(h5_path, gex_only = False)
    adata.var_names_make_unique()
    adata.obs['donor_id'] = donor_id
    return adata


# One object per donor (the names p01 ... p09 are used by the QC sections below).
p01 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_01_P1.h5', 'p01')
p12 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_012_P1.h5', 'p12')
p14 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_014_P1.h5', 'p14')
p03 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_03_P1.h5', 'p03')
p08 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_08_P1.h5', 'p08')
p09 = _read_blood_donor('.../HealthyBlood_raw/filtered_feature_bc_matrix_VCOV_09_P1.h5', 'p09')

# Flag mitochondrial, ribosomal and hemoglobin genes in every blood sample and compute
# the QC metrics used by the filters below.
blood_samples = [p01, p12, p14, p03, p08, p09]
for sample in blood_samples:
    sample.var['mt'] = sample.var_names.str.startswith('MT-')              # mitochondrial genes
    sample.var['ribo'] = sample.var_names.str.startswith(("RPS","RPL"))      # ribosomal genes
    # "HB" followed by any character other than '(', 'P' or ')'
    sample.var['hb'] = sample.var_names.str.contains(("^HB[^(P)]"))        # hemoglobin genes
    sc.pp.calculate_qc_metrics(sample, qc_vars = ['mt','ribo','hb'], percent_top = None, log1p = False, inplace = True)

## 3.2. QC filters

### *p01*

In [None]:
MIN_GENES_H1 = 500
MAX_GENES_H1 = 4000
MIN_UMI_H1 = 1400
MAX_UMI_H1 = 18000
MIN_CELLS_H1 = 3
MT_PERCENTAGE_H1 = 8
RIBO_PERCENTAGE_H1 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(p01.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H1, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p01.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H1, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(p01.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H1), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H1), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p01.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H1), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H1), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(p01.n_obs, p01.n_vars)

In [None]:
sc.pp.filter_cells(p01, min_genes = MIN_GENES_H1)
sc.pp.filter_cells(p01, max_genes = MAX_GENES_H1)
sc.pp.filter_cells(p01, min_counts = MIN_UMI_H1)
sc.pp.filter_cells(p01, max_counts = MAX_UMI_H1)
sc.pp.filter_genes(p01, min_cells = MIN_CELLS_H1)

print("Cell x Genes after filtering")
print(p01.n_obs, p01.n_vars)

In [None]:
p01 = p01[p01.obs['pct_counts_mt'] < MT_PERCENTAGE_H1, :]
p01 = p01[p01.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H1, :]
print("Remaining cells %d"%p01.n_obs)

### *p12*

In [None]:
MIN_GENES_H12 = 500
MAX_GENES_H12 = 4000
MIN_UMI_H12 = 1200
MAX_UMI_H12 = 15000
MIN_CELLS_H12 = 3
MT_PERCENTAGE_H12 = 8
RIBO_PERCENTAGE_H12 = 0.05

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
sns.kdeplot(p12.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H12, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p12.obs['pct_counts_ribo'], shade = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H12, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
sns.kdeplot(np.log10(p12.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H12), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H12), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p12.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H12), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H12), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
print("Cell x Genes before filtering")
print(p12.n_obs, p12.n_vars)

In [None]:
sc.pp.filter_cells(p12, min_genes = MIN_GENES_H12)
sc.pp.filter_cells(p12, max_genes = MAX_GENES_H12)
sc.pp.filter_cells(p12, min_counts = MIN_UMI_H12)
sc.pp.filter_cells(p12, max_counts = MAX_UMI_H12)
sc.pp.filter_genes(p12, min_cells = MIN_CELLS_H12)

print("Cell x Genes after filtering")
print(p12.n_obs, p12.n_vars)

In [None]:
p12 = p12[p12.obs['pct_counts_mt'] < MT_PERCENTAGE_H12, :]
p12 = p12[p12.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H12, :]
print("Remaining cells %d"%p01.n_obs)

### Plot before filtering

In [None]:
# QC thresholds for donor p12 (same values as the earlier p12 threshold cell),
# read by the plotting and filtering cells that follow.
MIN_GENES_H12 = 500         # min_genes for sc.pp.filter_cells
MAX_GENES_H12 = 4000        # max_genes for sc.pp.filter_cells
MIN_UMI_H12 = 1200          # min_counts for sc.pp.filter_cells
MAX_UMI_H12 = 15000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H12 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H12 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H12 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p12;
# the red lines mark the MT_PERCENTAGE_H12 / RIBO_PERCENTAGE_H12 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p12.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H12, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p12.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H12, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p12;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p12.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H12), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H12), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p12.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H12), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H12), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

### Filtering

In [None]:
# Report the current p12 matrix dimensions (cells, genes).
print("Cell x Genes before filtering", f"{p12.n_obs} {p12.n_vars}", sep = "\n")

In [None]:
# Apply the p12 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H12},
    {'max_genes': MAX_GENES_H12},
    {'min_counts': MIN_UMI_H12},
    {'max_counts': MAX_UMI_H12},
):
    sc.pp.filter_cells(p12, **cell_cutoff)
sc.pp.filter_genes(p12, min_cells = MIN_CELLS_H12)

print("Cell x Genes after filtering", f"{p12.n_obs} {p12.n_vars}", sep = "\n")

In [None]:
# Drop p12 cells with too high a mitochondrial or too low a ribosomal percentage.
p12 = p12[p12.obs['pct_counts_mt'] < MT_PERCENTAGE_H12, :]
p12 = p12[p12.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H12, :]
# Report the count for p12 itself, the object filtered in this cell.
print("Remaining cells %d" % p12.n_obs)

### *p14*

In [None]:
# QC thresholds for donor p14, read by the plotting and filtering cells that follow.
MIN_GENES_H14 = 500         # min_genes for sc.pp.filter_cells
MAX_GENES_H14 = 4000        # max_genes for sc.pp.filter_cells
MIN_UMI_H14 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H14 = 13000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H14 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H14 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H14 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p14;
# the red lines mark the MT_PERCENTAGE_H14 / RIBO_PERCENTAGE_H14 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p14.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H14, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p14.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H14, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p14;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p14.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H14), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H14), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p14.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H14), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H14), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
# Report the p14 matrix dimensions (cells, genes) ahead of the threshold filters.
print("Cell x Genes before filtering", f"{p14.n_obs} {p14.n_vars}", sep = "\n")

In [None]:
# Apply the p14 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H14},
    {'max_genes': MAX_GENES_H14},
    {'min_counts': MIN_UMI_H14},
    {'max_counts': MAX_UMI_H14},
):
    sc.pp.filter_cells(p14, **cell_cutoff)
sc.pp.filter_genes(p14, min_cells = MIN_CELLS_H14)

print("Cell x Genes after filtering", f"{p14.n_obs} {p14.n_vars}", sep = "\n")

In [None]:
# Drop p14 cells with too high a mitochondrial or too low a ribosomal percentage.
p14 = p14[p14.obs['pct_counts_mt'] < MT_PERCENTAGE_H14, :]
p14 = p14[p14.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H14, :]
# Report the count for p14 itself, the object filtered in this cell.
print("Remaining cells %d" % p14.n_obs)

### *p03*

In [None]:
# QC thresholds for donor p03, read by the plotting and filtering cells that follow.
MIN_GENES_H3 = 700         # min_genes for sc.pp.filter_cells
MAX_GENES_H3 = 4500        # max_genes for sc.pp.filter_cells
MIN_UMI_H3 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H3 = 18000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H3 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H3 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H3 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p03;
# the red lines mark the MT_PERCENTAGE_H3 / RIBO_PERCENTAGE_H3 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p03.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H3, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p03.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H3, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p03;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p03.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p03.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
# Report the p03 matrix dimensions (cells, genes) ahead of the threshold filters.
print("Cell x Genes before filtering", f"{p03.n_obs} {p03.n_vars}", sep = "\n")

In [None]:
# Apply the p03 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H3},
    {'max_genes': MAX_GENES_H3},
    {'min_counts': MIN_UMI_H3},
    {'max_counts': MAX_UMI_H3},
):
    sc.pp.filter_cells(p03, **cell_cutoff)
sc.pp.filter_genes(p03, min_cells = MIN_CELLS_H3)

print("Cell x Genes after filtering", f"{p03.n_obs} {p03.n_vars}", sep = "\n")

In [None]:
# Drop p03 cells with too high a mitochondrial or too low a ribosomal percentage.
p03 = p03[p03.obs['pct_counts_mt'] < MT_PERCENTAGE_H3, :]
p03 = p03[p03.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H3, :]
# Report the count for p03 itself, the object filtered in this cell.
print("Remaining cells %d" % p03.n_obs)

### Plot before filtering

In [None]:
# QC thresholds for donor p03 (same values as the earlier p03 threshold cell),
# read by the plotting and filtering cells that follow.
MIN_GENES_H3 = 700         # min_genes for sc.pp.filter_cells
MAX_GENES_H3 = 4500        # max_genes for sc.pp.filter_cells
MIN_UMI_H3 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H3 = 18000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H3 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H3 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H3 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p03;
# the red lines mark the MT_PERCENTAGE_H3 / RIBO_PERCENTAGE_H3 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p03.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H3, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p03.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H3, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p03;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p03.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p03.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H3), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H3), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

### Filtering

In [None]:
# Report the current p03 matrix dimensions (cells, genes).
print("Cell x Genes before filtering", f"{p03.n_obs} {p03.n_vars}", sep = "\n")

In [None]:
# Apply the p03 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H3},
    {'max_genes': MAX_GENES_H3},
    {'min_counts': MIN_UMI_H3},
    {'max_counts': MAX_UMI_H3},
):
    sc.pp.filter_cells(p03, **cell_cutoff)
sc.pp.filter_genes(p03, min_cells = MIN_CELLS_H3)

print("Cell x Genes after filtering", f"{p03.n_obs} {p03.n_vars}", sep = "\n")

In [None]:
# Drop p03 cells with too high a mitochondrial or too low a ribosomal percentage.
p03 = p03[p03.obs['pct_counts_mt'] < MT_PERCENTAGE_H3, :]
p03 = p03[p03.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H3, :]
# Report the count for p03 itself, the object filtered in this cell.
print("Remaining cells %d" % p03.n_obs)

### *p08*

In [None]:
# QC thresholds for donor p08, read by the plotting and filtering cells that follow.
MIN_GENES_H8 = 500         # min_genes for sc.pp.filter_cells
MAX_GENES_H8 = 4000        # max_genes for sc.pp.filter_cells
MIN_UMI_H8 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H8 = 18000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H8 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H8 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H8 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p08;
# the red lines mark the MT_PERCENTAGE_H8 / RIBO_PERCENTAGE_H8 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p08.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H8, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p08.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H8, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p08;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p08.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H8), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H8), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p08.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H8), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H8), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
# Report the p08 matrix dimensions (cells, genes) ahead of the threshold filters.
print("Cell x Genes before filtering", f"{p08.n_obs} {p08.n_vars}", sep = "\n")

In [None]:
# Apply the p08 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H8},
    {'max_genes': MAX_GENES_H8},
    {'min_counts': MIN_UMI_H8},
    {'max_counts': MAX_UMI_H8},
):
    sc.pp.filter_cells(p08, **cell_cutoff)
sc.pp.filter_genes(p08, min_cells = MIN_CELLS_H8)

print("Cell x Genes after filtering", f"{p08.n_obs} {p08.n_vars}", sep = "\n")

In [None]:
# Drop p08 cells with too high a mitochondrial or too low a ribosomal percentage.
p08 = p08[p08.obs['pct_counts_mt'] < MT_PERCENTAGE_H8, :]
p08 = p08[p08.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H8, :]
# Report the count for p08 itself, the object filtered in this cell.
print("Remaining cells %d" % p08.n_obs)

### Plot before filtering

In [None]:
# QC thresholds for donor p08 (same values as the earlier p08 threshold cell),
# read by the plotting and filtering cells that follow.
MIN_GENES_H8 = 500         # min_genes for sc.pp.filter_cells
MAX_GENES_H8 = 4000        # max_genes for sc.pp.filter_cells
MIN_UMI_H8 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H8 = 18000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H8 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H8 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H8 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p08;
# the red lines mark the MT_PERCENTAGE_H8 / RIBO_PERCENTAGE_H8 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p08.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H8, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p08.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H8, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p08;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p08.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H8), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H8), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p08.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H8), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H8), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

### Filtering

In [None]:
# Report the current p08 matrix dimensions (cells, genes).
print("Cell x Genes before filtering", f"{p08.n_obs} {p08.n_vars}", sep = "\n")

In [None]:
# Apply the p08 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H8},
    {'max_genes': MAX_GENES_H8},
    {'min_counts': MIN_UMI_H8},
    {'max_counts': MAX_UMI_H8},
):
    sc.pp.filter_cells(p08, **cell_cutoff)
sc.pp.filter_genes(p08, min_cells = MIN_CELLS_H8)

print("Cell x Genes after filtering", f"{p08.n_obs} {p08.n_vars}", sep = "\n")

In [None]:
# Drop p08 cells with too high a mitochondrial or too low a ribosomal percentage.
p08 = p08[p08.obs['pct_counts_mt'] < MT_PERCENTAGE_H8, :]
p08 = p08[p08.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H8, :]
# Report the count for p08 itself, the object filtered in this cell.
print("Remaining cells %d" % p08.n_obs)

### *p09*

In [None]:
# QC thresholds for donor p09, read by the plotting and filtering cells that follow.
MIN_GENES_H9 = 500         # min_genes for sc.pp.filter_cells
MAX_GENES_H9 = 4000        # max_genes for sc.pp.filter_cells
MIN_UMI_H9 = 1400          # min_counts for sc.pp.filter_cells
MAX_UMI_H9 = 18000         # max_counts for sc.pp.filter_cells
MIN_CELLS_H9 = 3           # min_cells for sc.pp.filter_genes
MT_PERCENTAGE_H9 = 8       # cells are kept only if obs['pct_counts_mt'] is below this
RIBO_PERCENTAGE_H9 = 0.05  # cells are kept only if obs['pct_counts_ribo'] is above this

In [None]:
# Density of the mitochondrial (left) and ribosomal (right) count percentages of p09;
# the red lines mark the MT_PERCENTAGE_H9 / RIBO_PERCENTAGE_H9 cut-offs.
plt.figure(figsize = (12, 4), tight_layout = True)

ax = plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(p09.obs['pct_counts_mt'], fill = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_H9, 0, 1, c = 'red')
ax.set_xlim([0, 20])
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(p09.obs['pct_counts_ribo'], fill = True, color = 'deepskyblue')
plt.axvline(RIBO_PERCENTAGE_H9, 0, 1, c = 'red')

plt.grid(False)
plt.show()

In [None]:
# log10 density of total UMI counts (left) and detected genes (right) for p09;
# red = lower threshold, dark red = upper threshold.
plt.figure(figsize = (12, 4), tight_layout = True)
plt.subplot(1, 2, 1)
# `fill` is the supported spelling of the deprecated `shade` keyword.
sns.kdeplot(np.log10(p09.obs['total_counts']), fill = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_H9), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_H9), 0, 1, c = 'darkred')
plt.grid(False)

plt.subplot(1, 2, 2)
sns.kdeplot(np.log10(p09.obs['n_genes_by_counts']), fill = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_H9), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_H9), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
# Report the p09 matrix dimensions (cells, genes) ahead of the threshold filters.
print("Cell x Genes before filtering", f"{p09.n_obs} {p09.n_vars}", sep = "\n")

In [None]:
# Apply the p09 cell-level thresholds one call at a time (same order as before),
# then drop genes detected in too few cells.
for cell_cutoff in (
    {'min_genes': MIN_GENES_H9},
    {'max_genes': MAX_GENES_H9},
    {'min_counts': MIN_UMI_H9},
    {'max_counts': MAX_UMI_H9},
):
    sc.pp.filter_cells(p09, **cell_cutoff)
sc.pp.filter_genes(p09, min_cells = MIN_CELLS_H9)

print("Cell x Genes after filtering", f"{p09.n_obs} {p09.n_vars}", sep = "\n")

In [None]:
# Drop p09 cells with too high a mitochondrial or too low a ribosomal percentage.
p09 = p09[p09.obs['pct_counts_mt'] < MT_PERCENTAGE_H9, :]
p09 = p09[p09.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_H9, :]
# Report the count for p09 itself, the object filtered in this cell.
print("Remaining cells %d" % p09.n_obs)

## 3.3. adata_concatenated

In [None]:
# Donor labels, listed in the same order as the objects handed to concatenate.
batch_list = ['p01', 'p12', 'p14', 'p03', 'p08', 'p09']
first_donor, *remaining_donors = (p01, p12, p14, p03, p08, p09)
HealthyBlood = first_donor.concatenate(
    *remaining_donors,
    batch_key = 'donor_id',
    batch_categories = batch_list,
    join = 'outer',
    fill_value = 0.0,
    index_unique = '-',
    uns_merge = None,
)
HealthyBlood

In [None]:
# Rebuild one shared set of gene annotations on the concatenated object.
HealthyBlood.var['gene_ids'] = HealthyBlood.var_names
HealthyBlood.var['feature_types'] = 'Gene Expression'
HealthyBlood.var['genome'] = 'GRCh38'

# Donor-suffixed gene columns (e.g. 'mt-p01') to remove; generated instead of typed out
# so every donor gets exactly the same set of fields.
per_donor_fields = ['gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts',
                    'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells']
donor_suffixes = ['p01', 'p03', 'p08', 'p09', 'p12', 'p14']
var = [f'{field}-{donor}' for donor in donor_suffixes for field in per_donor_fields]

# A separate loop variable keeps `var` bound to the list instead of its last element.
for var_column in var:
    del HealthyBlood.var[var_column]

HealthyBlood.var['mt'] = HealthyBlood.var_names.str.startswith('MT-')              # mitochondrial genes
HealthyBlood.var['ribo'] = HealthyBlood.var_names.str.startswith(("RPS","RPL"))      # ribosomal genes
HealthyBlood.var['hb'] = HealthyBlood.var_names.str.contains(("^HB[^(P)]"))        # hemoglobin genes

HealthyBlood.var

## 3.4. Doublets filtering

In [None]:
# Report the concatenated matrix dimensions (cells, genes) before doublet removal.
print("Cell x Genes BEFORE doublets removal", f"{HealthyBlood.n_obs} {HealthyBlood.n_vars}", sep = "\n")

In [None]:
import scrublet as scr

# Run Scrublet separately on each donor and keep a per-cell result table keyed by donor.
batches = list(HealthyBlood.obs['donor_id'].unique())
alldata = {}
for batch in batches:
    donor_mask = HealthyBlood.obs['donor_id'] == batch
    tmp = HealthyBlood[donor_mask,]
    print(batch, ":", tmp.shape[0], " cells")
    scrub = scr.Scrublet(tmp.X)
    out = scrub.scrub_doublets(verbose = False, n_prin_comps = 20)
    # out[0] holds the scores, out[1] the doublet calls.
    doublet_scores, doublet_calls = out[0], out[1]
    alldata[batch] = pd.DataFrame(
        {'doublet_score': doublet_scores, 'predicted_doublets': doublet_calls},
        index = tmp.obs.index,
    )
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")

In [None]:
# Stack the per-donor Scrublet tables; they are indexed by cell name, so the
# assignments below align on the obs index.
scrub_pred = pd.concat(list(alldata.values()))
for obs_key, scrub_column in (('doublet_scores', 'doublet_score'),
                              ('predicted_doublets', 'predicted_doublets')):
    HealthyBlood.obs[obs_key] = scrub_pred[scrub_column]

sum(HealthyBlood.obs['predicted_doublets'])

In [None]:
%matplotlib inline
HealthyBlood.obs['doublet_info'] = HealthyBlood.obs["predicted_doublets"].astype(str)
sc.pl.violin(HealthyBlood, 'n_genes_by_counts', jitter = 0.4, groupby = 'doublet_info', rotation=45)

In [None]:
# Keep only the cells that were not flagged as doublets. `.copy()` materialises the
# subset so that later cells modify a real AnnData instead of a view of the unfiltered object.
HealthyBlood = HealthyBlood[HealthyBlood.obs['doublet_info'] == 'False', :].copy()

print("Cell x Genes AFTER doublets removal")
print(HealthyBlood.n_obs, HealthyBlood.n_vars)

## 3.5. Save the object after QC filters

In [None]:
# Write the QC-filtered object to the data folder.
outfilename = os.path.join(data_folder, "HealthyBlood_afterQC.h5ad")
print(f"Saving h5ad data to file {outfilename}")
HealthyBlood.write(outfilename)
print("Done!")

## 3.6. Normalization & HVG selection

In [None]:
# Keep a copy of the current X as layers['raw_counts'] before X is normalised in the
# next cell; the HVG selection and scVI setup cells read this layer.
HealthyBlood.layers['raw_counts'] = HealthyBlood.X.copy()

In [None]:
# Normalise X to a total of 1e4 counts per cell, then apply log1p.
sc.pp.normalize_total(HealthyBlood, target_sum = 1e4)
sc.pp.log1p(HealthyBlood)

In [None]:
# Freeze the log-normalised object in .raw and keep the same matrix as a named layer.
HealthyBlood.raw = HealthyBlood
# The layer is taken from HealthyBlood itself, not from another dataset's matrix.
HealthyBlood.layers["normalized_counts"] = HealthyBlood.X.copy()

In [None]:
# Flag (without subsetting) the top 4000 variable genes, computed per donor on the raw-count layer.
sc.pp.highly_variable_genes(
    HealthyBlood,
    flavor = "seurat_v3",
    n_top_genes = 4000,
    batch_key = 'donor_id',
    layer = "raw_counts",
    subset = False,
)

## 3.7. scVI integration

In [None]:
# Independent copy restricted to the highly variable genes, used to train scVI.
hvg_mask = HealthyBlood.var['highly_variable']
adata_train = HealthyBlood[:, hvg_mask].copy()
adata_train

In [None]:
# Register the raw counts and the donor batch key with scVI, build the model and train on CPU.
scvi.model.SCVI.setup_anndata(adata_train, batch_key = 'donor_id', layer = "raw_counts")
scvi_model_kwargs = dict(n_layers = 2, n_latent = 30, latent_distribution = "normal", gene_likelihood = "nb")
vae = scvi.model.SCVI(adata_train, **scvi_model_kwargs)
vae.train(accelerator = "cpu")

In [None]:
# Store the scVI latent space and the model's normalised expression on the full object.
latent_embedding = vae.get_latent_representation()
denoised_expression = vae.get_normalized_expression()
HealthyBlood.obsm["X_scVI"] = latent_embedding
HealthyBlood.obsm["denoised_RNA"] = denoised_expression

In [None]:
# Neighbour graph on the scVI latent space, a first Leiden partition (res 0.5),
# then a UMAP initialised from the PAGA layout.
sc.pp.neighbors(HealthyBlood, use_rep = "X_scVI", n_neighbors = 15, key_added = "scVI")
sc.tl.leiden(HealthyBlood, key_added = 'res_0.5', resolution = 0.5, neighbors_key = "scVI")
sc.tl.paga(HealthyBlood, neighbors_key = "scVI", groups = 'res_0.5')
sc.pl.paga(HealthyBlood, edge_width_scale = 0.3, frameon = True)
sc.tl.umap(HealthyBlood, init_pos = 'paga', neighbors_key = "scVI")

## 3.8. Clustering

In [None]:
# Leiden clustering at several resolutions; a resolution whose column already exists is skipped.
# NOTE(review): the first resolution is 0 (column 'res_0'), not 0.5 — confirm this is intended.
clustering_labels = []
for res in [0, 0.8, 1.0, 1.5, 2.0, 2.5]:
    res_label = f"res_{res}"
    clustering_labels.append(res_label)
    if res_label not in HealthyBlood.obs:
        sc.tl.leiden(HealthyBlood, resolution = res, key_added = res_label, neighbors_key = "scVI")
    else:
        print(res_label + " already exists... going on with next resolution.")

In [None]:
# One UMAP panel per clustering resolution, three panels per row.
clustering_labels = [f'res_{value}' for value in ('0.5', '0.8', '1.0', '1.5', '2.0', '2.5')]
sc.pl.umap(
    HealthyBlood,
    color = clustering_labels,
    ncols = 3,
    wspace = .5,
    frameon = True,
    legend_loc = 'on data',
    legend_fontsize = 10,
    legend_fontoutline = 2,
    legend_fontweight = 8,
)

## 3.9. Save final object

In [None]:
# Write the integrated and clustered object to the data folder.
outfilename = os.path.join(data_folder, "HealthyBlood_integration.h5ad")
print(f"Saving h5ad data to file {outfilename}")
HealthyBlood.write(outfilename)
print("Done!")

# 4. **Healthy blood – Reclustering monocytes**

In [None]:
# Select the cells of res_0.8 clusters 1, 11, 12 and 13. `.copy()` yields an independent
# AnnData, because the next cell deletes columns and slots from this object.
Monocytes = HealthyBlood[HealthyBlood.obs['res_0.8'].isin(['1', '11', '12', '13'])].copy()
Monocytes

In [None]:
# Columns carried over from the full-dataset analysis that are dropped before re-clustering.
obs = ['doublet_scores', 'predicted_doublets', 'doublet_info', 'res_0.5', 'res_0', 'res_0.8', 'res_1.0', 'res_1.5', 'res_2.0', 'res_2.5']
var = ['highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches']

# Separate loop variables keep `obs` and `var` bound to the lists being iterated.
for obs_column in obs:
    del Monocytes.obs[obs_column]

for var_column in var:
    del Monocytes.var[var_column]

# Clear the unstructured, embedding and graph slots as well.
del Monocytes.uns
del Monocytes.obsm
del Monocytes.varm
del Monocytes.obsp

Monocytes

## 4.1. HVG selection

In [None]:
# Flag (without subsetting) the top 4000 variable genes, computed per donor on the raw-count layer.
sc.pp.highly_variable_genes(
    Monocytes,
    flavor = "seurat_v3",
    n_top_genes = 4000,
    batch_key = 'donor_id',
    layer = "raw_counts",
    subset = False,
)

## 4.2. scVI integration

In [None]:
# Independent copy restricted to the highly variable genes, used to train scVI.
hvg_mask = Monocytes.var['highly_variable']
adata_train = Monocytes[:, hvg_mask].copy()
adata_train

In [None]:
# Register the raw counts and the donor batch key with scVI, build the model and train on CPU.
scvi.model.SCVI.setup_anndata(adata_train, batch_key = 'donor_id', layer = "raw_counts")
scvi_model_kwargs = dict(n_layers = 2, n_latent = 30, latent_distribution = "normal", gene_likelihood = "nb")
vae = scvi.model.SCVI(adata_train, **scvi_model_kwargs)
vae.train(accelerator = "cpu")

In [None]:
# Store the scVI latent space and the model's normalised expression on the monocyte object.
latent_embedding = vae.get_latent_representation()
denoised_expression = vae.get_normalized_expression()
Monocytes.obsm["X_scVI"] = latent_embedding
Monocytes.obsm["denoised_RNA"] = denoised_expression

In [None]:
# Neighbour graph on the scVI latent space, a first Leiden partition (res 0.5),
# then a UMAP initialised from the PAGA layout.
sc.pp.neighbors(Monocytes, use_rep = "X_scVI", n_neighbors = 15, key_added = "scVI")
sc.tl.leiden(Monocytes, key_added = 'res_0.5', resolution = 0.5, neighbors_key = "scVI")
sc.tl.paga(Monocytes, neighbors_key = "scVI", groups = 'res_0.5')
sc.pl.paga(Monocytes, edge_width_scale = 0.3, frameon = True)
sc.tl.umap(Monocytes, init_pos = 'paga', neighbors_key = "scVI")

## 4.3. Clustering

In [None]:
# Leiden clustering at several resolutions; a resolution whose column already exists is skipped.
# NOTE(review): the first resolution is 0 (column 'res_0'), not 0.5 — confirm this is intended.
clustering_labels = []
for res in [0, 0.8, 1.0, 1.5, 2.0, 2.5]:
    res_label = f"res_{res}"
    clustering_labels.append(res_label)
    if res_label not in Monocytes.obs:
        sc.tl.leiden(Monocytes, resolution = res, key_added = res_label, neighbors_key = "scVI")
    else:
        print(res_label + " already exists... going on with next resolution.")

In [None]:
# One UMAP panel per clustering resolution, three panels per row.
clustering_labels = [f'res_{value}' for value in ('0.5', '0.8', '1.0', '1.5', '2.0', '2.5')]
sc.pl.umap(
    Monocytes,
    color = clustering_labels,
    ncols = 3,
    wspace = .5,
    frameon = True,
    legend_loc = 'on data',
    legend_fontsize = 10,
    legend_fontoutline = 2,
    legend_fontweight = 8,
)

## 4.4. Save the final object

In [None]:
# Write the re-clustered monocyte object to the data folder.
outfilename = os.path.join(data_folder, "HealthyBlood_RecMonocytes.h5ad")
print(f"Saving h5ad data to file {outfilename}")
Monocytes.write(outfilename)
print("Done!")

In [None]:
# Reload the saved monocyte object from disk.
monocytes_filename = "HealthyBlood_RecMonocytes.h5ad"
outfilename = os.path.join(data_folder, monocytes_filename)
Monocytes = sc.read_h5ad(outfilename)
Monocytes

## 4.5. UMAP

In [None]:
clustering_labels = ['res_0.5', 'res_0.8', 'res_1.0', 'res_1.5']

# The labels below are built from 'res_0.3'. That column is computed here on the stored
# scVI neighbour graph when the loaded object does not already carry it, so the cell
# no longer fails with a KeyError.
# NOTE(review): confirm that 0.3 is the resolution intended for the final annotation.
if 'res_0.3' not in Monocytes.obs:
    sc.tl.leiden(Monocytes, resolution = 0.3, key_added = 'res_0.3', neighbors_key = "scVI")

# Prefix every cluster id with "HB" to form the displayed cluster names.
Monocytes.obs['Rename'] = "HB" + Monocytes.obs['res_0.3'].astype(str)

sc.pl.umap(Monocytes, color = 'Rename', legend_loc = 'on data', legend_fontsize = 13, legend_fontoutline = 4, legend_fontweight = 8, wspace = .15, ncols = 6, title = '', frameon = True,
           save = 'UMAP Reclustering Monocytes Healthy Blood.png')

## 4.6. DotPlot

In [None]:
# Marker genes shown in the dot plot, in display order.
Genes = [
    'CD14', 'S100A8', 'S100A9', 'RBP7', 'FOSB',
    'HLA-DPB1', 'HLA-DPA1', 'HLA-DRB1', 'HLA-DRA',
    'PPBP', 'PF4', 'CAVIN2', 'TUBB1',
    'NFKB1', 'NRF1', 'TREM1', 'TLR4',
    'AHSP', 'ALAS2', 'CA1',
    'IL7R', 'GZMA', 'GZMB', 'PRF1', 'GNLY', 'NKG7', 'CTSW', 'CD2',
    'FCGR3A', 'RHOC', 'CDKN1C', 'MS4A7',
]

# Dot plot per renamed cluster, scaled per gene, with an explicit cluster order.
cluster_order = ['HB0', 'HB1', 'HB4', 'HB5', 'HB3', 'HB2']
sc.pl.dotplot(
    Monocytes,
    var_names = Genes,
    groupby = 'Rename',
    categories_order = cluster_order,
    standard_scale = 'var',
    dendrogram = False,
    swap_axes = False,
    save = 'Reclustering Monocytes Healthy Blood annotation.png',
)

## 4.7. Module score

In [None]:
# Gene set used for the cytotoxic/inflammatory module score.
Cytoxic_Inflammatory = ['PRF1', 'GZMM', 'GZMA', 'KLRB1', 'GZMK', 'GZMH', 'IL32', 'CCL5', 'GZMB', 'GNLY', 'IFNG', 'TNF', 'CST7', 'NKG7', 'EOMES', 'EFHD2', 'F2R', 'SLAMF7', "IL1B", "IL2", "IL4"]
# Score and plot the healthy-blood monocyte object analysed in this section, and save the
# figure under a blood-specific name so it does not share a filename with a liver macrophage plot.
# NOTE(review): this cell previously targeted Macrophages_GEO — confirm Monocytes is the intended object.
sc.tl.score_genes(Monocytes, Cytoxic_Inflammatory, ctrl_size = 50, gene_pool = None, n_bins = 25, score_name = 'Cytoxic_Inflammatory_score', random_state = 0, copy = False)
sc.pl.umap(Monocytes, color = 'Cytoxic_Inflammatory_score', vmin = 0, vmax = 1, size = 50, cmap = 'inferno', title = 'Cytotoxic-Inflammatory Score', save = 'Cytoxic_Inflammatory_Score monocytes Healthy Blood.png')