# <font color = '#FF003D'> ***Synchronous colorectal cancer-liver metastasis project***</font>

**_______________________________________________________________________________________________________________________________________________________________________________________________________________**

# <font color = '#FF003D'> ***==== CODE 1: Quality control filtering ====***</font>

# Python library

In [None]:
import os
import math
import warnings
import datetime

warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import scipy
import pandas as pd
import scanpy as sc 
import scanpy.external as sce
from cycler import cycler
import openpyxl
import scvi

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import rpy2

In [None]:
from ipywidgets import IntProgress
from IPython.display import display
import time
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Silence Python warnings and configure scanpy logging / figure defaults.
warnings.filterwarnings('ignore')
sc.settings.verbosity = 4
sc.set_figure_params(
    dpi=100,
    dpi_save=1000,
    facecolor='white',
)

# Output locations; `data_folder` is where the .h5ad checkpoints are written.
data_folder = '.../Analysis/Synchro/Data/'
result_folder = '.../Analysis/Synchro/'

# R library

In [None]:
! python -m rpy2.situation

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R 
# R packages attached in the R session provided by the rpy2 IPython extension.
# Seurat and its companion packages.
library(Seurat)
library(SeuratDisk)
library(SeuratData)
library(SeuratWrappers)
library(SeuratObject)

# Excel I/O and ggplot2-based plotting helpers.
library(openxlsx)
library(ggplot2)
library(ggraph)
library(ggrepel)

# Data manipulation, R <-> Python interop, and plot composition.
library(dplyr)
library(reticulate)
library(patchwork)

# Volcano plots.
library(EnhancedVolcano)

# 1. **Create the anndata**

In [None]:
# Sequencing instrument shared by every library listed below.
_INSTRUMENT = 'Illumina NovaSeq 6000'

# One row per 10x library: (sample ID, tissue of origin, patient ID, sequencing run).
# Row order defines the insertion order of `samples`.
_SAMPLE_TABLE = (
    ('GEX-29', 'Liver', 'CRLM-1', 'Run 1'),
    ('GEX-30', 'Colon', 'CRLM-1', 'Run 1'),
    ('GEX-36', 'Blood', 'CRLM-1', 'Run 1'),

    ('GEX-31', 'Liver', 'CRLM-2', 'Run 1'),
    ('GEX-32', 'Colon', 'CRLM-2', 'Run 2'),
    ('GEX-35', 'Blood', 'CRLM-2', 'Run 2'),

    ('GEX-33', 'Colon', 'CRLM-3', 'Run 2'),
    ('GEX-34', 'Liver', 'CRLM-3', 'Run 2'),
)

# Per-sample metadata keyed by sample ID; 'path' points at the filtered
# feature-barcode matrix (.h5) of that library.
samples = {
    sample_id: {
        'path': f'.../{sample_id}/outs/filtered_feature_bc_matrix.h5',
        'tissue_origin': tissue,
        'patient_ID': patient,
        'run': run,
        'instrument': _INSTRUMENT,
    }
    for sample_id, tissue, patient, run in _SAMPLE_TABLE
}

# Patient-level annotations keyed by patient ID (age is kept as a string).
patient_info = {
    patient: {'age': age, 'sex': sex}
    for patient, age, sex in (
        ('CRLM-1', '39', 'Female'),
        ('CRLM-2', '74', 'Male'),
        ('CRLM-3', '66', 'Female'),
    )
}

In [None]:
# Load every 10x library listed in `samples` and annotate each cell with its
# sample- and patient-level metadata; the per-sample objects are collected in
# `adata_list` for merging.
adata_list = []
for batch_id, meta in samples.items():
    # gex_only=False: do not restrict the matrix to 'Gene Expression' features.
    adata = sc.read_10x_h5(meta['path'], gex_only=False)
    adata.var_names_make_unique()

    # Sample-level annotations.
    adata.obs['batch_id'] = batch_id
    adata.obs['tissue_origin'] = meta['tissue_origin']
    adata.obs['patient_ID'] = meta['patient_ID']
    adata.obs['run'] = meta['run']

    # Patient-level annotations. The patient is stored under 'patient_ID' in
    # `samples`; looking up meta['patient'] raised KeyError.
    patient = patient_info[meta['patient_ID']]
    adata.obs['age'] = patient['age']
    adata.obs['sex'] = patient['sex']

    print(f"Initial count matrix for sample {batch_id} has {adata.shape[0]} cells and {adata.shape[1]} genes")

    adata_list.append(adata)

In [None]:
# Merge all per-sample objects into one AnnData: union of features (outer
# join), absent entries filled with 0.0, batch labels taken from the sample IDs.
first_sample = adata_list[0]
other_samples = adata_list[1:]
concat_options = dict(
    join='outer',
    batch_key='batch_id',
    batch_categories=list(samples.keys()),
    uns_merge=None,
    index_unique='-',
    fill_value=0.0,
)
adata = first_sample.concatenate(other_samples, **concat_options)

## 1.1. Save the object before QC filters

In [None]:
# Checkpoint the merged, unfiltered object to disk.
outfilename = os.path.join(data_folder, "Synchro_beforeQC.h5ad")
save_notice = "Saving h5ad data to file {}".format(outfilename)
print(save_notice)
adata.write(outfilename)
print("Done!")

# 2. **Quality control**

In [None]:
# Plot the 20 most highly expressed features, restricted to 'Gene Expression' features.
is_gene_expression = adata.var["feature_types"] == "Gene Expression"
sc.pl.highest_expr_genes(adata[:, is_gene_expression], n_top=20)

In [None]:
# Flag gene families by symbol, then compute QC metrics in place for each flag.
gene_symbols = adata.var_names
adata.var['mt'] = gene_symbols.str.startswith('MT-')               # mitochondrial: 'MT-' prefix
adata.var['ribo'] = gene_symbols.str.startswith(("RPS","RPL"))     # ribosomal: 'RPS' / 'RPL' prefix
# hemoglobin: 'HB' followed by any character other than '(', 'P' or ')'
adata.var['hb'] = gene_symbols.str.contains(("^HB[^(P)]"))
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo', 'hb'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

## 2.1. GEX-29

In [None]:
# Independent copy of the GEX-29 cells, so per-sample filtering does not touch `adata`.
is_GEX29 = adata.obs['batch_id'].isin(['GEX-29'])
GEX29 = adata[is_GEX29].copy()

In [None]:
# --- QC thresholds for sample GEX-29 ---
MIN_CELLS_GEX29 = 5                              # minimum number of cells per gene
MIN_GENES_GEX29, MAX_GENES_GEX29 = 400, 4300     # detected genes per cell
MIN_UMI_GEX29, MAX_UMI_GEX29 = 900, 12000        # total UMI counts per cell
MT_PERCENTAGE_GEX29 = 15                         # mitochondrial percentage cut-off
RIBO_PERCENTAGE_GEX29 = 0.05                     # ribosomal percentage cut-off

In [None]:
# QC density panels for sample GEX-29: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
# The sample object and thresholds are named GEX29 / *_GEX29; the DMA29 names
# used here before were never defined and raised NameError.
plt.figure(figsize = (15, 4), tight_layout = True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
ax = plt.subplot(1, 3, 1)
sns.kdeplot(GEX29.obs['pct_counts_mt'], shade = True, color = 'limegreen', ax = ax)
plt.axvline(MT_PERCENTAGE_GEX29, 0, 1, c = 'red')
ax.set_xlim([0, 30])
plt.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX29.obs['total_counts']), shade = True, color = 'orangered')
plt.axvline(np.log10(MIN_UMI_GEX29), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_UMI_GEX29), 0, 1, c = 'darkred')
plt.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX29.obs['n_genes_by_counts']), shade = True, color = 'darkorange')
plt.axvline(np.log10(MIN_GENES_GEX29), 0, 1, c = 'red')
plt.axvline(np.log10(MAX_GENES_GEX29), 0, 1, c = 'darkred')
plt.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-29 before QC filtering.
# Uses GEX29: no DMA29 object exists in this notebook.
print("Cell x Genes before filtering")
print(GEX29.n_obs, GEX29.n_vars)

In [None]:
# Apply the count-based QC filters to GEX-29 in place: bounds on detected genes
# and UMI counts per cell, then drop genes seen in too few cells.
# Uses GEX29 / *_GEX29: the DMA29 names were undefined and raised NameError.
sc.pp.filter_cells(GEX29, min_genes = MIN_GENES_GEX29)
sc.pp.filter_cells(GEX29, max_genes = MAX_GENES_GEX29)
sc.pp.filter_cells(GEX29, min_counts = MIN_UMI_GEX29)
sc.pp.filter_cells(GEX29, max_counts = MAX_UMI_GEX29)
sc.pp.filter_genes(GEX29, min_cells = MIN_CELLS_GEX29)

print("Cell x Genes after filtering")
print(GEX29.n_obs, GEX29.n_vars)

In [None]:
# Keep GEX-29 cells below the mitochondrial cut-off and above the ribosomal one.
# Uses GEX29 / *_GEX29: the DMA29 names were undefined and raised NameError.
GEX29 = GEX29[GEX29.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX29, :]
GEX29 = GEX29[GEX29.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX29, :]
print("Remaining cells %d"%GEX29.n_obs)

## 2.2. GEX-30

In [None]:
# Independent copy of the GEX-30 cells, so per-sample filtering does not touch `adata`.
is_GEX30 = adata.obs['batch_id'].isin(['GEX-30'])
GEX30 = adata[is_GEX30].copy()

In [None]:
# --- QC thresholds for sample GEX-30 ---
MIN_CELLS_GEX30 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX30, MAX_GENES_GEX30 = 500, 4300     # detected genes per cell
MIN_UMI_GEX30, MAX_UMI_GEX30 = 900, 17000        # total UMI counts per cell
MT_PERCENTAGE_GEX30 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX30 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-30: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX30.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX30, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX30.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX30, 'red'), (MAX_UMI_GEX30, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX30.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX30, 'red'), (MAX_GENES_GEX30, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-30 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX30.shape)

In [None]:
# Apply the count-based QC filters to GEX-30 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX30},
    {'max_genes': MAX_GENES_GEX30},
    {'min_counts': MIN_UMI_GEX30},
    {'max_counts': MAX_UMI_GEX30},
):
    sc.pp.filter_cells(GEX30, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX30 cells.
sc.pp.filter_genes(GEX30, min_cells=MIN_CELLS_GEX30)

print("Cell x Genes after filtering")
print(*GEX30.shape)

In [None]:
# Keep GEX-30 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX30.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX30
GEX30 = GEX30[low_mito, :]
enough_ribo = GEX30.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX30
GEX30 = GEX30[enough_ribo, :]
print("Remaining cells %d" % GEX30.shape[0])

## 2.3. GEX-31

In [None]:
# Independent copy of the GEX-31 cells, so per-sample filtering does not touch `adata`.
is_GEX31 = adata.obs['batch_id'].isin(['GEX-31'])
GEX31 = adata[is_GEX31].copy()

In [None]:
# --- QC thresholds for sample GEX-31 ---
MIN_CELLS_GEX31 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX31, MAX_GENES_GEX31 = 400, 4300     # detected genes per cell
MIN_UMI_GEX31, MAX_UMI_GEX31 = 800, 10000        # total UMI counts per cell
MT_PERCENTAGE_GEX31 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX31 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-31: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX31.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX31, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX31.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX31, 'red'), (MAX_UMI_GEX31, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX31.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX31, 'red'), (MAX_GENES_GEX31, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-31 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX31.shape)

In [None]:
# Apply the count-based QC filters to GEX-31 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX31},
    {'max_genes': MAX_GENES_GEX31},
    {'min_counts': MIN_UMI_GEX31},
    {'max_counts': MAX_UMI_GEX31},
):
    sc.pp.filter_cells(GEX31, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX31 cells.
sc.pp.filter_genes(GEX31, min_cells=MIN_CELLS_GEX31)

print("Cell x Genes after filtering")
print(*GEX31.shape)

In [None]:
# Keep GEX-31 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX31.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX31
GEX31 = GEX31[low_mito, :]
enough_ribo = GEX31.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX31
GEX31 = GEX31[enough_ribo, :]
print("Remaining cells %d" % GEX31.shape[0])

## 2.4. GEX-32

In [None]:
# Independent copy of the GEX-32 cells, so per-sample filtering does not touch `adata`.
is_GEX32 = adata.obs['batch_id'].isin(['GEX-32'])
GEX32 = adata[is_GEX32].copy()

In [None]:
# --- QC thresholds for sample GEX-32 ---
MIN_CELLS_GEX32 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX32, MAX_GENES_GEX32 = 600, 4300     # detected genes per cell
MIN_UMI_GEX32, MAX_UMI_GEX32 = 1500, 17000       # total UMI counts per cell
MT_PERCENTAGE_GEX32 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX32 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-32: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX32.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX32, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX32.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX32, 'red'), (MAX_UMI_GEX32, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX32.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX32, 'red'), (MAX_GENES_GEX32, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-32 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX32.shape)

In [None]:
# Apply the count-based QC filters to GEX-32 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX32},
    {'max_genes': MAX_GENES_GEX32},
    {'min_counts': MIN_UMI_GEX32},
    {'max_counts': MAX_UMI_GEX32},
):
    sc.pp.filter_cells(GEX32, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX32 cells.
sc.pp.filter_genes(GEX32, min_cells=MIN_CELLS_GEX32)

print("Cell x Genes after filtering")
print(*GEX32.shape)

In [None]:
# Keep GEX-32 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX32.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX32
GEX32 = GEX32[low_mito, :]
enough_ribo = GEX32.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX32
GEX32 = GEX32[enough_ribo, :]
print("Remaining cells %d" % GEX32.shape[0])

## 2.5. GEX-33

In [None]:
# Independent copy of the GEX-33 cells, so per-sample filtering does not touch `adata`.
is_GEX33 = adata.obs['batch_id'].isin(['GEX-33'])
GEX33 = adata[is_GEX33].copy()

In [None]:
# --- QC thresholds for sample GEX-33 ---
MIN_CELLS_GEX33 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX33, MAX_GENES_GEX33 = 600, 4600     # detected genes per cell
MIN_UMI_GEX33, MAX_UMI_GEX33 = 900, 14000        # total UMI counts per cell
MT_PERCENTAGE_GEX33 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX33 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-33: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX33.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX33, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX33.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX33, 'red'), (MAX_UMI_GEX33, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX33.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX33, 'red'), (MAX_GENES_GEX33, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-33 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX33.shape)

In [None]:
# Apply the count-based QC filters to GEX-33 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX33},
    {'max_genes': MAX_GENES_GEX33},
    {'min_counts': MIN_UMI_GEX33},
    {'max_counts': MAX_UMI_GEX33},
):
    sc.pp.filter_cells(GEX33, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX33 cells.
sc.pp.filter_genes(GEX33, min_cells=MIN_CELLS_GEX33)

print("Cell x Genes after filtering")
print(*GEX33.shape)

In [None]:
# Keep GEX-33 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX33.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX33
GEX33 = GEX33[low_mito, :]
enough_ribo = GEX33.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX33
GEX33 = GEX33[enough_ribo, :]
print("Remaining cells %d" % GEX33.shape[0])

## 2.6. GEX-34

In [None]:
# Independent copy of the GEX-34 cells, so per-sample filtering does not touch `adata`.
is_GEX34 = adata.obs['batch_id'].isin(['GEX-34'])
GEX34 = adata[is_GEX34].copy()

In [None]:
# --- QC thresholds for sample GEX-34 ---
MIN_CELLS_GEX34 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX34, MAX_GENES_GEX34 = 600, 4300     # detected genes per cell
MIN_UMI_GEX34, MAX_UMI_GEX34 = 1200, 17000       # total UMI counts per cell
MT_PERCENTAGE_GEX34 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX34 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-34: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX34.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX34, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX34.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX34, 'red'), (MAX_UMI_GEX34, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX34.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX34, 'red'), (MAX_GENES_GEX34, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-34 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX34.shape)

In [None]:
# Apply the count-based QC filters to GEX-34 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX34},
    {'max_genes': MAX_GENES_GEX34},
    {'min_counts': MIN_UMI_GEX34},
    {'max_counts': MAX_UMI_GEX34},
):
    sc.pp.filter_cells(GEX34, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX34 cells.
sc.pp.filter_genes(GEX34, min_cells=MIN_CELLS_GEX34)

print("Cell x Genes after filtering")
print(*GEX34.shape)

In [None]:
# Keep GEX-34 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX34.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX34
GEX34 = GEX34[low_mito, :]
enough_ribo = GEX34.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX34
GEX34 = GEX34[enough_ribo, :]
print("Remaining cells %d" % GEX34.shape[0])

## 2.7. GEX-35

In [None]:
# Independent copy of the GEX-35 cells, so per-sample filtering does not touch `adata`.
is_GEX35 = adata.obs['batch_id'].isin(['GEX-35'])
GEX35 = adata[is_GEX35].copy()

In [None]:
# --- QC thresholds for sample GEX-35 ---
MIN_CELLS_GEX35 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX35, MAX_GENES_GEX35 = 600, 5000     # detected genes per cell
MIN_UMI_GEX35, MAX_UMI_GEX35 = 900, 17000        # total UMI counts per cell
MT_PERCENTAGE_GEX35 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX35 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-35: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX35.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX35, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX35.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX35, 'red'), (MAX_UMI_GEX35, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX35.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX35, 'red'), (MAX_GENES_GEX35, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-35 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX35.shape)

In [None]:
# Apply the count-based QC filters to GEX-35 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX35},
    {'max_genes': MAX_GENES_GEX35},
    {'min_counts': MIN_UMI_GEX35},
    {'max_counts': MAX_UMI_GEX35},
):
    sc.pp.filter_cells(GEX35, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX35 cells.
sc.pp.filter_genes(GEX35, min_cells=MIN_CELLS_GEX35)

print("Cell x Genes after filtering")
print(*GEX35.shape)

In [None]:
# Keep GEX-35 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX35.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX35
GEX35 = GEX35[low_mito, :]
enough_ribo = GEX35.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX35
GEX35 = GEX35[enough_ribo, :]
print("Remaining cells %d" % GEX35.shape[0])

## 2.8. GEX-36

In [None]:
# Independent copy of the GEX-36 cells, so per-sample filtering does not touch `adata`.
is_GEX36 = adata.obs['batch_id'].isin(['GEX-36'])
GEX36 = adata[is_GEX36].copy()

In [None]:
# --- QC thresholds for sample GEX-36 ---
MIN_CELLS_GEX36 = 5                              # genes must be detected in at least this many cells
MIN_GENES_GEX36, MAX_GENES_GEX36 = 400, 4300     # detected genes per cell
MIN_UMI_GEX36, MAX_UMI_GEX36 = 1200, 13000       # total UMI counts per cell
MT_PERCENTAGE_GEX36 = 15                         # cells must stay below this pct_counts_mt
RIBO_PERCENTAGE_GEX36 = 0.05                     # cells must exceed this pct_counts_ribo

In [None]:
# QC density panels for sample GEX-36: mitochondrial %, log10 UMI counts and
# log10 detected genes per cell, with the chosen cut-offs as vertical lines.
plt.figure(figsize=(15, 4), tight_layout=True)

# Panel 1: mitochondrial percentage (x-axis limited to 0-30) with its cut-off.
mt_axis = plt.subplot(1, 3, 1)
sns.kdeplot(GEX36.obs['pct_counts_mt'], shade=True, color='limegreen', ax=mt_axis)
mt_axis.axvline(MT_PERCENTAGE_GEX36, 0, 1, c='red')
mt_axis.set_xlim([0, 30])
mt_axis.grid(False)

# Panel 2: log10 total UMI counts; lower bound in red, upper bound in dark red.
umi_axis = plt.subplot(1, 3, 2)
sns.kdeplot(np.log10(GEX36.obs['total_counts']), shade=True, color='orangered', ax=umi_axis)
for umi_cutoff, line_colour in ((MIN_UMI_GEX36, 'red'), (MAX_UMI_GEX36, 'darkred')):
    umi_axis.axvline(np.log10(umi_cutoff), 0, 1, c=line_colour)
umi_axis.grid(False)

# Panel 3: log10 detected genes; lower bound in red, upper bound in dark red.
gene_axis = plt.subplot(1, 3, 3)
sns.kdeplot(np.log10(GEX36.obs['n_genes_by_counts']), shade=True, color='darkorange', ax=gene_axis)
for gene_cutoff, line_colour in ((MIN_GENES_GEX36, 'red'), (MAX_GENES_GEX36, 'darkred')):
    gene_axis.axvline(np.log10(gene_cutoff), 0, 1, c=line_colour)
gene_axis.grid(False)

plt.show()

In [None]:
# Report the (cells, genes) dimensions of GEX-36 before QC filtering.
print("Cell x Genes before filtering")
print(*GEX36.shape)

In [None]:
# Apply the count-based QC filters to GEX-36 in place, one bound per call:
# detected genes (min, max) then UMI counts (min, max) per cell.
for cell_bound in (
    {'min_genes': MIN_GENES_GEX36},
    {'max_genes': MAX_GENES_GEX36},
    {'min_counts': MIN_UMI_GEX36},
    {'max_counts': MAX_UMI_GEX36},
):
    sc.pp.filter_cells(GEX36, **cell_bound)

# Drop genes detected in fewer than MIN_CELLS_GEX36 cells.
sc.pp.filter_genes(GEX36, min_cells=MIN_CELLS_GEX36)

print("Cell x Genes after filtering")
print(*GEX36.shape)

In [None]:
# Keep GEX-36 cells below the mitochondrial cut-off, then those above the ribosomal one.
low_mito = GEX36.obs['pct_counts_mt'] < MT_PERCENTAGE_GEX36
GEX36 = GEX36[low_mito, :]
enough_ribo = GEX36.obs['pct_counts_ribo'] > RIBO_PERCENTAGE_GEX36
GEX36 = GEX36[enough_ribo, :]
print("Remaining cells %d" % GEX36.shape[0])

## 2.9. adata_concatenated

In [None]:
# Merge the per-sample objects back into a single AnnData (outer join on
# features, absent entries filled with 0.0).
# The per-sample objects are named GEX29 ... GEX36; the DMA29 ... DMA36 names
# used here before were never defined and raised NameError. Batch labels reuse
# the 'GEX-NN' sample IDs that `batch_id` carries everywhere else in this notebook.
batch_list = ['GEX-29', 'GEX-30', 'GEX-31', 'GEX-32', 'GEX-33', 'GEX-34', 'GEX-35', 'GEX-36']
adata_concatenated = GEX29.concatenate(GEX30, GEX31, GEX32, GEX33, GEX34, GEX35, GEX36,
                                           join = 'outer',
                                           batch_key = 'batch_id',
                                           batch_categories = batch_list,
                                           uns_merge = None,
                                           index_unique = '-',
                                           fill_value = 0.0)
adata_concatenated

## 2.10. Doublets filtering

In [None]:
# Report the (cells, genes) dimensions before doublet removal.
print("Cell x Genes BEFORE doublets removal")
print(*adata_concatenated.shape)

In [None]:
import scrublet as scr

# Run Scrublet separately on each batch and collect, per batch, a DataFrame of
# doublet scores and doublet calls indexed by cell barcode.
batches = adata_concatenated.obs['batch_id'].cat.categories.tolist()
alldata = {}
for batch in batches:
    in_batch = adata_concatenated.obs['batch_id'] == batch
    batch_cells = adata_concatenated[in_batch, :]
    print(batch, ":", batch_cells.shape[0], " cells")

    scrub = scr.Scrublet(batch_cells.X)
    doublet_scores, doublet_calls = scrub.scrub_doublets(verbose=False, n_prin_comps=20)

    alldata[batch] = pd.DataFrame(
        {'doublet_score': doublet_scores, 'predicted_doublets': doublet_calls},
        index=batch_cells.obs.index,
    )
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")

In [None]:
# Stack the per-batch Scrublet tables and copy their columns into `.obs`
# (assignment aligns on the cell barcode index).
scrub_pred = pd.concat(alldata.values())
for obs_column, scrub_column in (
    ('doublet_scores', 'doublet_score'),
    ('predicted_doublets', 'predicted_doublets'),
):
    adata_concatenated.obs[obs_column] = scrub_pred[scrub_column]

# Cell output: total number of predicted doublets.
sum(adata_concatenated.obs['predicted_doublets'])

In [None]:
%matplotlib inline
# Store the doublet calls as strings so they can be used as a grouping key,
# then compare detected-gene counts between the two groups.
doublet_labels = adata_concatenated.obs["predicted_doublets"].astype(str)
adata_concatenated.obs['doublet_info'] = doublet_labels
sc.pl.violin(
    adata_concatenated,
    'n_genes_by_counts',
    groupby='doublet_info',
    jitter=0.4,
    rotation=45,
)

In [None]:
# Keep only the cells whose stringified doublet call is 'False'.
is_singlet = adata_concatenated.obs['doublet_info'] == 'False'
adata_concatenated = adata_concatenated[is_singlet, :]

print("Cell x Genes AFTER doublets removal")
print(*adata_concatenated.shape)

## 2.11. Save the object after QC filters

In [None]:
# Checkpoint the QC-filtered, doublet-free object to disk.
outfilename = os.path.join(data_folder, "Synchro_afterQC.h5ad")
save_notice = "Saving h5ad data to file {}".format(outfilename)
print(save_notice)
adata_concatenated.write(outfilename)
print("Done!")