# Whole dataset QC + visualisation

In [1]:
import glob
import pandas as pd
import scanpy as sc
import scanpy.external as sce

In [2]:
# Tune scanpy's global settings for this very large dataset.

# Let scanpy use up to 6 cores wherever it supports parallel execution.
# The value is assigned on the shared `sc.settings` instance; assigning on the
# `ScanpyConfig` class instead would replace the `n_jobs` property itself and
# bypass its setter.
sc.settings.n_jobs = 6
# Rasterise dense plot layers (PNG) where possible so figures stay much smaller
# in file size than fully vectorised output.
# NOTE(review): `sc.set_figure_params(vector_friendly=True)` is the public route,
# but it presumably also resets other figure defaults, so only the flag is set here.
sc.settings._vector_friendly = True

In [3]:
# Load the concatenated AnnData object (the file name suggests 240 libraries
# with gene annotations already attached).
adata = sc.read(
    '/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/240_libraries_concatenated_gene_info.h5ad'
)

In [4]:
# Display a summary of the expression matrix (shape, dtype, storage format).
adata.X

<5084027x38592 sparse matrix of type '<class 'numpy.float32'>'
	with 13363152346 stored elements in Compressed Sparse Row format>

## QC plots for the whole dataset
Note that these data have already been filtered by the script `tenk10k_phase1/Scanpy/add_metadata_per_sample_no_norm.py`.

### Violin plot

In [None]:
# Violin plots of the three QC metrics listed in `keys`, one panel per metric.
sc.pl.violin(
    adata,
    keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)

### Scatter plots

In [None]:
# Plot total counts against each of the other two QC metrics in turn.
for _y_metric in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(adata, x="total_counts", y=_y_metric)

## Normalisation and scaling

In [None]:
# Normalise every cell to a total of 1e4 counts, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag highly variable genes using mean and dispersion cut-offs.
sc.pp.highly_variable_genes(
    adata,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
)

In [None]:
# Visualise the highly-variable-gene selection stored on `adata`.
sc.pl.highly_variable_genes(adata)

In [None]:
# Show the 20 genes with the highest expression.
# NOTE(review): in notebook order this runs after normalisation and log1p, so it
# sees transformed values rather than raw counts -- confirm that is intended.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# Keep the full (all-gene) matrix in `.raw` before subsetting to variable genes.
adata.raw = adata
# Restrict to the highly variable genes. `.copy()` materialises the subset so the
# in-place steps below operate on real data rather than on a view of the parent.
adata = adata[:, adata.var.highly_variable].copy()
# Regress out per-cell total counts and mitochondrial percentage.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
# Scale the data, clipping values above 10.
sc.pp.scale(adata, max_value=10)

## Integration
Use Harmony to remove batch effects