# Importing modules and settings

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
from matplotlib.pyplot import rc_context

In [None]:
import seaborn as sns

General settings of Scanpy

In [None]:
# Global scanpy configuration for this notebook.
# NOTE(review): verbosity 4 is presumably scanpy's most detailed ("debug") level — confirm against the scanpy settings docs.
sc.settings.verbosity = 4
# Log the header with package version information for reproducibility.
sc.logging.print_header()
# Figure defaults: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')


In [None]:
# Sequential colormap built from the 'xkcd:medium blue' colour; it is passed as
# `color_map` to the per-transcript UMAP plots further down.
umap_cmap = sns.light_palette('xkcd:medium blue', as_cmap = True)

# Declaring the input and output files

In [None]:
# Base name shared by the input file, the results file and the figure directory.
name_of_analysis = 'pristina_atlas'

In [None]:
# All figures saved by scanpy go to ./figures/<analysis name>_preprocessing.
sc.settings.figdir = f'./figures/{name_of_analysis}_preprocessing'

In [None]:
# Path of the .h5ad file that receives the processed object at the end of the notebook.
results_file = f'./{name_of_analysis}.h5ad'

In [None]:
# Load the "<analysis name>_prefiltering.h5ad" file from the working directory.
input_file = f'./{name_of_analysis}_prefiltering.h5ad'
adata = sc.read_h5ad(input_file)

In [None]:
# Display a summary of the freshly loaded AnnData object (notebook cell output).
adata

In [None]:
# Keep an independent copy of the matrix as loaded in the 'counts' layer, before
# the filtering / normalisation steps below modify adata.X.
# NOTE(review): presumably adata.X holds raw counts at this point — confirm for the input file.
adata.layers['counts'] = adata.X.copy()

# Preprocessing

In [None]:
# Top 20 most highly expressed genes BEFORE any cell/gene filtering.
# A distinct file-name suffix is used because the same plot is produced again
# after filtering with save=True; with save=True in both places the second call
# would overwrite this figure in sc.settings.figdir.
sc.pl.highest_expr_genes(adata, n_top=20, save='_prefiltering')

In [None]:
# Basic quality filters, applied in place:
# keep cells with at least 50 counts ...
sc.pp.filter_cells(adata, min_counts=50)
# ... and at least 50 detected genes,
sc.pp.filter_cells(adata, min_genes= 50)
# and drop genes whose total count exceeds 1,000,000.
sc.pp.filter_genes(adata, max_counts = 1000000)

In [None]:
# Display the per-gene annotation table after filtering (notebook cell output).
adata.var

In [None]:
# Top 20 most highly expressed genes AFTER the cell/gene filters; saved under
# scanpy's default file name for this plot in sc.settings.figdir.
sc.pl.highest_expr_genes(adata, n_top=20, save = True)

In [None]:
# NOTE(review): the 'MT-' prefix is the human gene-symbol convention. The gene IDs
# plotted later in this notebook look like 'PrileiEVm…', so this mask may be all
# False and 'pct_counts_mt' may be 0 for every cell — confirm.
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
# Compute per-cell / per-gene QC metrics in place (adds e.g. the n_genes_by_counts,
# total_counts and pct_counts_mt columns used below); no top-N percentages, no log1p variants.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of the three QC metrics, one panel each, on a log-scaled axis;
# the figure is saved to sc.settings.figdir.
# NOTE(review): values of exactly 0 (possible for pct_counts_mt) cannot be drawn on a log axis — confirm the panel is meaningful.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter= 0.7, multi_panel=True, log = True, save = True)

In [None]:
# Display the AnnData summary after QC metrics were added (notebook cell output).
adata

# Matrix slicing

In [None]:
# Total counts vs. number of detected genes per cell, before the upper-bound
# cut-offs below are applied (shown only, not saved).
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep only cells in which fewer than 700 genes were detected.
adata = adata[adata.obs['n_genes_by_counts'] < 700, :]

In [None]:
# Keep only cells with fewer than 900 total counts.
# Slicing an AnnData returns a view; the normalisation and log-transform steps
# that follow modify the object in place, so materialise the subset explicitly
# with .copy() instead of relying on an implicit view-to-copy conversion (and
# the warning that comes with it).
adata = adata[adata.obs.total_counts < 900, :].copy()

In [None]:
# Same scatter plot after the upper-bound cut-offs; this version is saved to sc.settings.figdir.
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save = True)

# Normalization and log transformation

The following two function calls normalise the counts of each cell to a common total and then log-transform the matrix.

In [None]:
# Rescale every cell, in place, so that its counts sum to 10,000 (target_sum=1e4).
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Apply log(1 + x) to the normalised matrix, in place.
sc.pp.log1p(adata)

# Selecting highly variable genes

In [None]:
# Flag the 18,000 most variable genes; the result lands in
# adata.var['highly_variable'], which is used for the subsetting below.
sc.pp.highly_variable_genes(adata, n_top_genes = 18000)

In [None]:
# Diagnostic plot of the highly-variable-gene selection, saved to sc.settings.figdir.
sc.pl.highly_variable_genes(adata, save = True)

In [None]:
# Freeze the current state (normalised, log-transformed, all remaining genes) in
# .raw before the object is restricted to highly variable genes and scaled.
adata.raw = adata

In [None]:
# Restrict the working matrix to the genes flagged as highly variable.
# Slicing an AnnData returns a view; the scaling step that follows modifies the
# object in place, so materialise the subset explicitly with .copy() instead of
# relying on an implicit view-to-copy conversion.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
# Display the AnnData summary after subsetting to highly variable genes (notebook cell output).
adata

In [None]:
# Display the gene table of the frozen .raw object (notebook cell output).
adata.raw.var

# Scaling the data

In [None]:
# Scale the genes in place without zero-centring (zero_center=False); no
# max_value is passed, so values are not clipped.
sc.pp.scale(adata, zero_center=False)

# Performing the PCA and kNN analysis

In [None]:
# Principal component analysis: 150 components, ARPACK solver.
sc.tl.pca(adata, svd_solver='arpack', n_comps = 150)

In [None]:
# Variance ratio of all 150 PCs on a log scale, saved to sc.settings.figdir; a
# visual aid for choosing how many PCs to feed into the neighbour graph.
sc.pl.pca_variance_ratio(adata, n_pcs=150, log=True, save = True)

In [None]:
# Neighbourhood graph with 45 neighbours per cell, computed on 105 principal components.
sc.pp.neighbors(adata, n_neighbors=45, n_pcs=105)

In [None]:
# UMAP embedding of the neighbour graph.
# NOTE(review): these values look like scanpy's defaults spelled out explicitly — confirm against the sc.tl.umap docs.
sc.tl.umap(adata, min_dist=0.5, spread = 1, alpha = 1, gamma = 1.0)

In [None]:
# Plain UMAP without any colouring (shown only, not saved).
sc.pl.umap(adata)

In [None]:
# UMAPs coloured by the expression of selected transcripts.
# The eight plots differ only in the transcript ID, so they are driven from one
# list instead of eight copy-pasted cells. Each plot is titled with the
# transcript ID, uses the shared blue colormap, and is saved as a PDF whose
# file-name suffix is the transcript ID.
transcripts_to_plot = [
    'PrileiEVm023936t1',
    'PrileiEVm008309t1',
    'PrileiEVm011741t1',
    'PrileiEVm021316t1',
    'PrileiEVm022250t1',
    'PrileiEVm000325t1',
    'PrileiEVm013699t1',
    'PrileiEVm020595t1',
]

for tr in transcripts_to_plot:
    sc.pl.umap(adata, color=tr, title=tr, color_map=umap_cmap, save=tr + '.pdf')

# Clustering

In [None]:
# Leiden resolutions to try; each one produces its own 'leiden_<resolution>' column in adata.obs.
resolutions = [0.5, 1, 1.5, 2]

In [None]:
# Run Leiden clustering once per resolution, store the labels under
# 'leiden_<resolution>' and show the resulting clusters on the UMAP.
for res in resolutions:
    cluster_key = f'leiden_{res}'
    sc.tl.leiden(adata, resolution=res, key_added=cluster_key)
    sc.pl.umap(adata, color=cluster_key)

In [None]:
# Names of all adata.obs columns that contain 'leiden', in column order.
leiden_names = [column for column in adata.obs.columns if 'leiden' in column]

In [None]:
# Display the collected clustering column names (notebook cell output).
leiden_names

In [None]:
# One UMAP panel per clustering column, with the cluster labels drawn on the
# embedding itself (font size 10) instead of in a side legend.
sc.pl.umap(adata, color=leiden_names, legend_loc = 'on data', legend_fontsize = 10)

In [None]:
# Large (15 x 15) frameless UMAP for every clustering, labels drawn on the data,
# each saved with the suffix '_<clustering column name>'.
with rc_context({'figure.figsize': (15, 15)}):
    for clustering in leiden_names:
        sc.pl.umap(
            adata,
            color=clustering,
            legend_loc='on data',
            title=str(clustering),
            size=50,
            frameon=False,
            save=f'_{clustering}',
        )

In [None]:
# Rank genes per cluster with logistic regression for every clustering; results
# are stored under 'rank_genes_groups_logreg_<clustering>' and the top 10 genes
# per cluster are plotted with independent y axes.
for clustering in leiden_names:
    logreg_key = f'rank_genes_groups_logreg_{clustering}'
    sc.tl.rank_genes_groups(adata, clustering, method='logreg', key_added=logreg_key)
    sc.pl.rank_genes_groups(adata, key=logreg_key, n_genes=10, sharey=False)

In [None]:
# Rank genes per cluster with the Wilcoxon test for every clustering; results
# are stored under 'rank_genes_groups_wilcox_<clustering>' and the top 10 genes
# per cluster are plotted with independent y axes.
for clustering in leiden_names:
    wilcox_key = f'rank_genes_groups_wilcox_{clustering}'
    sc.tl.rank_genes_groups(adata, clustering, method='wilcoxon', key_added=wilcox_key)
    sc.pl.rank_genes_groups(adata, key=wilcox_key, n_genes=10, sharey=False)

In [None]:
# Write the processed AnnData object to the results .h5ad file defined at the top of the notebook.
adata.write(results_file)