# Scanpy: Scran Normalize

Use the Scran R package to normalize the data for later analysis.

### Setup and Load Data

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
import pandas as pd

import os
# Point rpy2 at the R installation. An R_HOME already present in the
# environment takes precedence, so the notebook also runs on machines where R
# is installed elsewhere; the Windows path below is only the fallback default.
os.environ.setdefault('R_HOME', r"C:\Program Files\R\R-4.4.1")
import anndata2ri # order matters, comes after defining 'R_HOME'

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri

# --- Scanpy session settings ---
# verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80)

# --- Input: h5ad file produced after filtering ---
sample_name = r'sc92'
h5_file_path = r'results/'
h5_file = f'{h5_file_path}{sample_name}_final_raw.h5ad'

# --- Output: h5ad file that will hold the log-normalized data ---
results_file_path = r'results/'
lognorm_results_file = f'{results_file_path}{sample_name}_lognorm_scran.h5ad'

scanpy==1.10.1 anndata==0.8.0 umap==0.5.3 numpy==1.26.4 scipy==1.11.4 pandas==2.2.2 scikit-learn==1.1.1 statsmodels==0.14.2 igraph==0.10.8 louvain==0.8.2 pynndescent==0.5.7


In [2]:
# Ignore R warning messages: only records at ERROR level or above from the
# rpy2 callback logger are shown.
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
# (anndata2ri is activated as well, after pandas2ri).
pandas2ri.activate()
anndata2ri.activate()
# Load the rpy2 IPython extension; the %%R cells further down rely on it.
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(6,6) #rescale figures
# Same verbosity value as in the setup cell; repeated here.
sc.settings.verbosity = 3
# Optional settings, left disabled:
#sc.set_figure_params(dpi=200, dpi_save=300)
# sc.logging.print_versions()

  anndata2ri.activate()


In [3]:
%%R
# Load R libraries
# scran is needed for the computeSumFactors() call in the size-factor cell later on.
library(scran)

Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'MatrixGenerics'

The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOr

In [4]:
# Load the input AnnData (h5_file is defined in the setup cell).
adata = sc.read_h5ad(h5_file)
# Densify X only when it is actually stored sparse: a dense ndarray has no
# .toarray(), so an unconditional call raises AttributeError on dense input.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray() # convert to full matrix

# Quick sanity report: cells per sample, storage format, matrix shape.
print(adata.obs['sample'].value_counts())
print()
print('Data matrix is sparse:', sp.sparse.issparse(adata.X))
print()
print('X size =', adata.X.shape)

sample
60day3    12058
60day1    10254
60day2     9609
Name: count, dtype: int64

Data matrix is sparse: False

X size = (31921, 24178)


In [5]:
# Per-sample summary table: number of cells, number of genes and total reads.
# Sample names are de-duplicated and ordered case-insensitively.
samples = sorted(set(adata.obs['sample']), key=str.lower)
num_cells = []
num_genes = []
num_reads = []
for sample in samples:
    # Subset once per sample and reuse it; building the same boolean subset
    # three times repeats an expensive selection on a large dense matrix.
    sample_adata = adata[adata.obs['sample'] == sample]
    num_cells.append(sample_adata.n_obs)
    num_genes.append(sample_adata.n_vars)
    num_reads.append(sample_adata.X.sum())

df = pd.DataFrame(list(zip(num_cells,num_genes,num_reads)), columns = ['Number of cells', 'Number of genes', 'Number of reads'], index = samples)
df

Unnamed: 0,Number of cells,Number of genes,Number of reads
60day1,10254,24178,47785892.0
60day2,9609,24178,56768236.0
60day3,12058,24178,57285870.0


#### Principal Component Analysis

In [None]:
# PCA on the raw, not-yet-normalized counts in adata.X (baseline before scran normalization).
sc.tl.pca(adata)

computing PCA
    with n_comps=50


In [None]:
# Plot the PCA of the raw counts, colored by the 'sample' annotation.
sc.pl.pca(adata, color="sample")

#### Examine initial distribution of total counts.

In [None]:
# Total counts per cell (row sums of adata.X) before any normalization.
adata.obs['n_counts'] = adata.X.sum(1)
# sb.histplot replaces the deprecated sb.distplot; with kde=False both draw a
# plain count histogram of the same values.
sb.histplot(adata.obs['n_counts'], bins=50, kde=False)

### Scran Normalization
scran estimates one size factor per cell, pooling cells within coarse clusters; each cell's counts are then divided by its size factor.

In [None]:
#Perform a clustering for scran normalization in clusters
# Work on a copy so the raw counts in `adata` stay untouched.
adata_pp = adata.copy()
# normalize_total replaces the deprecated normalize_per_cell. Unlike the old
# function it never drops cells, so adata_pp (and the 'groups' labels derived
# from it) stays aligned row-for-row with adata, which scran requires.
sc.pp.normalize_total(adata_pp, target_sum=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Coarse Louvain clustering; labels are written to adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

In [None]:
# Variables handed to the R cell through `%%R -i`:
#   data_mat     - adata.X transposed (genes x cells)
#   input_groups - the 'groups' column of adata_pp.obs from the clustering cell
data_mat = adata.X.transpose()
input_groups = adata_pp.obs['groups']

**Note:** the next cell (size-factor estimation with scran in R) is slow.

In [None]:
%%R -i data_mat -i input_groups -o size_factors

# Build a SingleCellExperiment from the count matrix, estimate pooled size
# factors within the supplied clusters, and extract them as a vector.
# local() keeps the intermediate object out of the global R environment.
size_factors <- local({
    sce <- SingleCellExperiment(list(counts=data_mat))
    sce <- computeSumFactors(sce, clusters=input_groups, min.mean=0.1)
    sizeFactors(sce)
})

In [None]:
#Delete adata_pp
# It was only needed for the clustering; the 'groups' labels were already
# taken from it into input_groups.
del adata_pp

In [None]:
# Visualize the estimated size factors
adata.obs['size_factors'] = size_factors
# Recompute total counts per cell from the (still raw) adata.X.
adata.obs['n_counts'] =  adata.X.sum(1)

sc.pl.scatter(adata, 'size_factors', 'n_counts')
# NOTE(review): 'n_genes' is not computed in this notebook; it is expected to
# already be present in adata.obs from the input file. Confirm.
sc.pl.scatter(adata, 'size_factors', 'n_genes')

# sb.histplot replaces the deprecated sb.distplot; with kde=False both draw a
# plain count histogram.
sb.histplot(size_factors, bins=50, kde=False)
plt.show()

In [None]:
#Keep the count data in a counts layer
# A copy is taken here because adata.X is modified in place by the
# normalization cell that follows.
adata.layers["counts"] = adata.X.copy()

In [None]:
#Normalize adata
# Divide each cell's raw counts by its size factor. Reading from the untouched
# "counts" layer (instead of `adata.X /= ...`) makes this cell idempotent:
# re-running it no longer divides already-normalized values a second time.
# out=adata.X writes into the existing buffer, so no extra matrix is allocated
# and the dtype of adata.X is unchanged.
size_factor_column = adata.obs['size_factors'].values[:, None]
adata.X = np.divide(adata.layers["counts"], size_factor_column, out=adata.X)

In [None]:
# Total per cell after dividing by the size factors.
adata.obs['n_counts'] = adata.X.sum(1)
print(adata.obs['n_counts'])
# sb.histplot replaces the deprecated sb.distplot; with kde=False both draw a
# plain count histogram.
sb.histplot(adata.obs['n_counts'], bins=50, kde=False)

#### Principal Component Analysis

In [None]:
# Recompute the PCA, now on the size-factor-normalized (not yet log-transformed) adata.X.
sc.tl.pca(adata)

In [None]:
# Plot the PCA of the normalized data, colored by the 'sample' annotation.
sc.pl.pca(adata, color="sample")

### Log Transform

In [None]:
# Apply the log1p transform to the size-factor-normalized values in adata.
sc.pp.log1p(adata)

In [None]:
# Per-cell row sums of adata.X after the log transform.
adata.obs['n_counts'] = adata.X.sum(1)
print(adata.obs['n_counts'])
# sb.histplot replaces the deprecated sb.distplot; with kde=False both draw a
# plain count histogram.
sb.histplot(adata.obs['n_counts'], bins=50, kde=False)

#### Principal Component Analysis
Differences in sequencing depth may re-emerge after the log transform, so the PCA is repeated here to check.

In [None]:
# Recompute the PCA on the log-transformed, normalized adata.X.
sc.tl.pca(adata)

In [None]:
# Plot the PCA of the log-normalized data, colored by the 'sample' annotation.
sc.pl.pca(adata, color="sample")

### Save Data

In [None]:
# save
# Write the log-normalized AnnData to lognorm_results_file (path built in the setup cell).
adata.write_h5ad(lognorm_results_file)