# Importing modules and settings

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
from matplotlib.pyplot import rc_context

In [None]:
import seaborn as sns

General settings of Scanpy

In [None]:
# Global Scanpy configuration for the whole notebook.
# NOTE(review): verbosity 4 is presumably the most detailed (debug) logging
# level — confirm against the scanpy settings documentation.
sc.settings.verbosity = 4
# Print the package-version header so the run can be traced later.
sc.logging.print_header()
# Default figure resolution and background for every plot produced below.
sc.settings.set_figure_params(dpi=80, facecolor='white')


In [None]:
# Light-to-medium-blue colormap; passed as the colormap of the gene-expression
# UMAP plots further down.
umap_cmap = sns.light_palette('xkcd:medium blue', as_cmap = True)

# Declaring the input and output files

In [None]:
name_of_analysis = 'pristina_subclustering_piwi_cells'

In [None]:
adata_processed = sc.read_h5ad('pristina_atlas_coloured.h5ad')

In [None]:
results_file_1 = 'pristina_atlas_coloured_subcl.h5ad'

In [None]:
results_file_2 = 'pristina_piwi_subcl.h5ad'

In [None]:
# Load the 10x-formatted count matrix from ./pristina/. This object is later
# restricted to the cells of the selected clusters of `adata_processed`.
adata = sc.read_10x_mtx(
    "./pristina/",
    var_names='gene_symbols',  # index genes by the gene-symbol column
    cache=True)  # keep a cache file so re-running the cell is faster

In [None]:
adata.var_names_make_unique()

In [None]:
adata.var_names

In [None]:
adata.var

In [None]:
adata

In [None]:
adata.obs

In [None]:
clusteringlayer = 'leiden_1.5'

# Subsetting clusters

In [None]:
subset = ['1', '2', '8']

In [None]:
subset_name = 'piwi_pos'

In [None]:
with rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(adata_processed, color=clusteringlayer, legend_loc='on data', legend_fontoutline = 5, title= 'Clustering layer '+str(clusteringlayer), size = 30,
        frameon=False, add_outline = True)

In [None]:
with rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(adata_processed, color=clusteringlayer, groups = subset, legend_loc='on data', legend_fontoutline = 5, title= 'Clustering layer '+str(clusteringlayer), size = 30,
        frameon=False, add_outline = True)

In [None]:
adata_processed.obs[clusteringlayer].isin(subset)

In [None]:
# Barcodes of every cell in the processed atlas whose cluster label (in the
# chosen clustering layer) is one of the clusters listed in `subset`.
in_subset = adata_processed.obs[clusteringlayer].isin(subset)
subset_ids = adata_processed.obs.index[in_subset.to_numpy()].to_list()

In [None]:
len(subset_ids)

In [None]:
adata = adata[adata.obs.index.isin(subset_ids)].copy()

In [None]:
adata

In [None]:
adata_processed

# Preprocessing

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20 )

In [None]:
# Basic QC filtering. Per the scanpy documentation, each filter_cells call also
# annotates adata.obs with the statistic it filtered on ('n_counts', then
# 'n_genes'); the per-cluster violin plots further down read those columns.
sc.pp.filter_cells(adata, min_counts=50)
sc.pp.filter_cells(adata, min_genes= 50)
# Drop genes whose total count exceeds 1,000,000.
sc.pp.filter_genes(adata, max_counts = 1000000)

In [None]:
adata.var

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20, )

In [None]:
# NOTE(review): the gene IDs used elsewhere in this notebook look like
# 'PrileiEVm…t1', so the 'MT-' prefix may match no gene and 'pct_counts_mt'
# would then be uninformative — confirm the mitochondrial naming for this dataset.
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
# Adds per-cell and per-gene QC columns in place (e.g. 'n_genes_by_counts',
# 'total_counts', 'pct_counts_mt', which the plots below use).
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True, log = True)

In [None]:
adata

# Matrix slicing

In [None]:
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep only cells with fewer than 700 detected genes. The slice is copied so
# that `adata` is a standalone AnnData object rather than a view: later cells
# modify it in place (normalisation, log transform), which on a view triggers
# an implicit copy and a warning.
adata = adata[adata.obs.n_genes_by_counts < 700, :].copy()

In [None]:
# Keep only cells with fewer than 900 total counts. The slice is copied so
# that `adata` is a standalone AnnData object rather than a view: later cells
# modify it in place (normalisation, log transform), which on a view triggers
# an implicit copy and a warning.
adata = adata[adata.obs.total_counts < 900, :].copy()

In [None]:
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

# Normalization and log transformation

The following 2 functions normalise and log transform the matrix

In [None]:
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
sc.pp.log1p(adata)

# Selecting highly variable genes

In [None]:
sc.pp.highly_variable_genes(adata, n_top_genes = 18000)

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
# Store the current state of the matrix (all genes, after the normalisation and
# log-transform cells above) in .raw before the gene subsetting that follows.
adata.raw = adata

In [None]:
# Restrict the working matrix to the genes flagged as highly variable. The
# slice is copied so that the in-place scaling and PCA that follow operate on a
# standalone AnnData object instead of a view.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
adata

In [None]:
adata.raw.var

# Scaling the data

In [None]:
# Scale the genes in place; zero_center=False skips the mean-centring step.
sc.pp.scale(adata, zero_center=False)

# Performing the PCA and kNN analysis

In [None]:
sc.tl.pca(adata, svd_solver='arpack', n_comps = 150)

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs=150, log=True)

In [None]:
sc.pp.neighbors(adata, n_neighbors=35, n_pcs=25)

In [None]:
sc.tl.umap(adata, min_dist=0.5, spread = 1, alpha = 1, gamma = 1)

In [None]:
sc.pl.umap(adata)

In [None]:
# Expression of PrileiEVm016887t1 on the subset UMAP; the title now names the
# gene that is actually plotted (it previously named PrileiEVm023936t1).
sc.pl.umap(adata, color='PrileiEVm016887t1', title = 'PrileiEVm016887t1', color_map = umap_cmap)

In [None]:
# Expression of PrileiEVm004300t1 on the subset UMAP; the title now names the
# gene that is actually plotted (it previously named PrileiEVm023936t1).
sc.pl.umap(adata, color='PrileiEVm004300t1', title = 'PrileiEVm004300t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm023936t1', title = 'PrileiEVm023936t1', color_map = umap_cmap)

In [None]:
# Expression of PrileiEVm003567t1 on the subset UMAP; the title now names the
# gene that is actually plotted (it previously named PrileiEVm023936t1).
sc.pl.umap(adata, color='PrileiEVm003567t1', title = 'PrileiEVm003567t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm008309t1', title = 'PrileiEVm008309t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm021040t1', title = 'PrileiEVm021040t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm017310t1', title = 'PrileiEVm017310t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm000199t1', title = 'PrileiEVm000199t1', color_map = umap_cmap)

In [None]:
sc.pl.umap(adata, color='PrileiEVm019805t1', title = 'PrileiEVm019805t1', color_map = umap_cmap)

piwi-related markers

In [None]:
tr = 'PrileiEVm016887t1'#nanos
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

In [None]:
tr = 'PrileiEVm004300t1'#vasa
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

In [None]:
tr = 'PrileiEVm002383t1'#pumilio
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

In [None]:
tr = 'PrileiEVm003567t1'#piwi
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

proliferation markers

In [None]:
tr = 'PrileiEVm016982t1'#pcna
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

In [None]:
tr = 'PrileiEVm003521t1'#mcm2
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

histones

In [None]:
tr = 'PrileiEVm023936t1'#h2b
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

In [None]:
tr = 'PrileiEVm022498t1'
sc.pl.umap(adata, color=tr, size = 20, title = tr, color_map = umap_cmap)

# Clustering

Set the Leiden resolution to use for the subclustering

In [None]:
res = 0.4

In [None]:
# Leiden clustering at the chosen resolution. The labels are stored in
# adata.obs under a key that encodes the subset name and the resolution, and
# are then shown on the UMAP.
leiden_key = 'leiden_' + subset_name + '_' + str(res)
sc.tl.leiden(adata, resolution=res, key_added=leiden_key)
sc.pl.umap(adata, color=leiden_key)

In [None]:
leiden_names = adata.obs.columns[adata.obs.columns.str.contains('leiden')].to_list()

In [None]:
leiden_names

In [None]:
# Name of the obs column holding the subclustering used downstream. It is built
# from `subset_name` and `res` with the same expression the Leiden cell uses
# for `key_added`, so changing either setting cannot leave this key pointing at
# a column that does not exist.
clusteringlayer_subset = 'leiden_' + subset_name + '_' + str(res)

In [None]:
with rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(adata, color=clusteringlayer_subset, legend_loc='on data', title=str(clusteringlayer_subset), size = 50, frameon=False)

In [None]:
# Marker genes per subcluster with the Wilcoxon test; results are stored in
# adata.uns under a key that names both the method and the clustering layer.
wilcox_key = 'rank_genes_groups_wilcox_' + str(clusteringlayer_subset)
sc.tl.rank_genes_groups(
    adata,
    clusteringlayer_subset,
    method='wilcoxon',
    key_added=wilcox_key,
)
sc.pl.rank_genes_groups(adata, key=wilcox_key, n_genes=10, sharey=False)

In [None]:
# Marker genes per subcluster with logistic regression; results are stored in
# adata.uns under a key that names both the method and the clustering layer.
logreg_key = 'rank_genes_groups_logreg_' + str(clusteringlayer_subset)
sc.tl.rank_genes_groups(
    adata,
    clusteringlayer_subset,
    method='logreg',
    key_added=logreg_key,
)
sc.pl.rank_genes_groups(adata, key=logreg_key, n_genes=10, sharey=False)

In [None]:
markers_w = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_'+str(clusteringlayer_subset)]['names']).head(15)

In [None]:
markers_w_pvals = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_'+str(clusteringlayer_subset)]['pvals_adj']).head(15)

In [None]:
markers_w

In [None]:
markers_l = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+str(clusteringlayer_subset)]['names']).head(15)

In [None]:
# Copy the subcluster labels onto the full atlas so they can be plotted there.
# NOTE(review): pandas aligns this assignment on the cell barcodes, so atlas
# cells that are not part of `adata` presumably end up as NaN in
# 'subset clusters' — confirm this is the intended behaviour.
adata_processed.obs['subset clusters'] = adata.obs[clusteringlayer_subset]

In [None]:
with rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(adata_processed, color= 'subset clusters', legend_fontoutline = 5, title= subset_name+' Clusters', size = 30,
        palette = adata.uns[clusteringlayer_subset+'_colors'], frameon=False, add_outline = False)

In [None]:
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(adata, color= clusteringlayer_subset, legend_fontoutline = 5, title= subset_name+' Clusters', size = 30,
        frameon=False, add_outline = False)

In [None]:
with rc_context({'figure.figsize': (15, 5)}):
    sc.pl.violin(adata, keys = "n_genes" , groupby = clusteringlayer_subset, jitter = False)

In [None]:
with rc_context({'figure.figsize': (15, 5)}):
    sc.pl.violin(adata, keys = "n_counts" , groupby = clusteringlayer_subset, jitter = False)

In [None]:
# For every subcluster, plot its top markers on the full-atlas UMAP: first the
# Wilcoxon markers, then the logistic-regression markers, and finally the genes
# reported by both methods (only when there are any).
for i in adata.obs[clusteringlayer_subset].cat.categories:
    print('Cluster '+i)
    cl_markers_w = markers_w[i].to_list()
    cl_markers_l = markers_l[i].to_list()
    # Genes shared by both methods, kept in Wilcoxon rank order. Converted to a
    # plain list so that all three marker sets reach sc.pl.umap as the same type.
    cl_markers_c = markers_w[i][markers_w[i].isin(markers_l[i])].to_list()
    sc.pl.umap(adata_processed, color= cl_markers_w, cmap = umap_cmap)
    sc.pl.umap(adata_processed, color= cl_markers_l, cmap = umap_cmap)
    if cl_markers_c:
        sc.pl.umap(adata_processed, color= cl_markers_c, cmap = umap_cmap)

In [None]:
# Score every atlas cell for each subcluster's Wilcoxon marker set, then show
# where that subcluster sits on the atlas UMAP next to its score.
for i in adata.obs[clusteringlayer_subset].cat.categories:
    cl_markers_w = markers_w[i].to_list()
    score_key = f'score_scl_{i}'
    sc.tl.score_genes(
        adata_processed,
        cl_markers_w,
        ctrl_size=len(cl_markers_w),
        gene_pool=None,
        n_bins=25,
        score_name=score_key,
    )
    sc.pl.umap(adata_processed, color='subset clusters', groups=i)
    sc.pl.umap(adata_processed, color=score_key, cmap='magma')

In [None]:
# Score every atlas cell for each subcluster's logistic-regression marker set,
# then show where that subcluster sits on the atlas UMAP next to its score.
for i in adata.obs[clusteringlayer_subset].cat.categories:
    cl_markers_l = markers_l[i].to_list()
    score_key = f'score_scl_{i}_l'
    sc.tl.score_genes(
        adata_processed,
        cl_markers_l,
        ctrl_size=len(cl_markers_l),
        gene_pool=None,
        n_bins=25,
        score_name=score_key,
    )
    sc.pl.umap(adata_processed, color='subset clusters', groups=i)
    sc.pl.umap(adata_processed, color=score_key, cmap='magma')

In [None]:
piwi_pos_genes = ['PrileiEVm003567t1', 'PrileiEVm016887t1', 'PrileiEVm016982t1', 'PrileiEVm003521t1', 'PrileiEVm022498t1', 'PrileiEVm023936t1']

In [None]:
sc.tl.score_genes(adata_processed, piwi_pos_genes, ctrl_size=len(piwi_pos_genes), gene_pool=None, n_bins=25, score_name='score_piwi_pos_genes')

In [None]:
sc.pl.umap(adata_processed, color= 'score_piwi_pos_genes', cmap = 'magma')

In [None]:
adata_processed.write(results_file_1)

In [None]:
adata.write(results_file_2)