# Importing modules and settings

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

General settings of Scanpy

In [None]:
# Set Scanpy's logging verbosity level to 3.
sc.settings.verbosity = 3 
# Print Scanpy's logging header (presumably package versions, useful for
# reproducibility -- confirm against the Scanpy docs).
sc.logging.print_header()
# Figure defaults for all Scanpy plots: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')


In [None]:
import seaborn as sns

In [None]:
# Colormap built from seaborn's light palette for 'xkcd:medium blue';
# get_plots passes it as color_map for the per-gene UMAP panels.
umap_cmap = sns.light_palette('xkcd:medium blue', as_cmap = True)

In [None]:
import os

# Functions for diamond blast querying

This set of functions explores the diamond blast annotation to return the most informative hit. It works like this:
If the diamond results contain a hit from the 'preferred species', that hit is returned. The preferred species is set below; the default is "Platynereis".
Otherwise, it looks for an informative hit, i.e. one whose description avoids terms such as "hypothetical protein". These terms are coded in the list of words (the words to avoid). If there is no such informative hit, it returns "no informative hits: " followed by the first hit listed for the transcript (the best hit, assuming the diamond table is ordered by p-value), which will then contain "hypothetical" or a similar word. If the transcript has no hits in the diamond table at all, it returns "not in Diamond".

In [None]:
def check_for_species(df, species):
    """Return the description of the top hit that mentions *species*.

    Parameters
    ----------
    df : pandas.DataFrame
        Diamond hits for one transcript. Column 4 holds the hit description
        that is searched and returned; column 3 is the value the matching
        rows are sorted by (ascending).
    species : str
        Pattern looked for in column 4. It is passed to
        ``Series.str.contains`` with default settings, so it is treated as
        a regular expression.

    Returns
    -------
    str or None
        Column-4 value of the matching row with the smallest column-3
        value, or ``None`` when no row matches (including an empty *df*).
    """
    # na=False: a row whose description is missing counts as "no match"
    # instead of putting NaN into the mask, which would make the boolean
    # row selection raise.
    matches = df[df[4].str.contains(species, na=False)]
    if matches.empty:
        return None
    return matches.sort_values(3).iloc[0, 4]

In [None]:
def contains_words(hit_string, li):
    """Tell whether *hit_string* contains any word from *li*, ignoring case.

    Parameters
    ----------
    hit_string : str
        Text to search, e.g. a diamond hit description.
    li : iterable of str
        Words to look for (matched as substrings).

    Returns
    -------
    bool
        ``True`` if at least one word occurs in *hit_string*; ``False``
        otherwise, including when *li* is empty.
    """
    # Lower-case both sides here so the comparison is case-insensitive even
    # when the caller has not pre-lowered the hit string.
    hit_lower = hit_string.lower()
    return any(word.lower() in hit_lower for word in li)

In [None]:
def get_informative(df, li):
    """Pick the first hit in *df* whose description avoids the words in *li*.

    Parameters
    ----------
    df : pandas.DataFrame
        Diamond hits for one transcript; column 4 holds the description.
    li : list of str
        Words that mark a description as uninformative.

    Returns
    -------
    str
        ``"not in Diamond"`` when *df* has no rows; otherwise the first
        description (in row order) for which ``contains_words`` is false;
        otherwise ``"no informative hits: "`` followed by the description
        of the first row.
    """
    n_hits = len(df.index)
    if n_hits == 0:
        return "not in Diamond"

    for row in range(n_hits):
        description = df.iloc[row, 4]
        # contains_words is given the lower-cased description.
        if not contains_words(description.lower(), li):
            return description

    # Every description contained one of the words to avoid.
    return "no informative hits: "+df.iloc[0, 4]

In [None]:
def get_best_hit(transcript, species, li):
    """Return the most useful diamond annotation for *transcript*.

    Parameters
    ----------
    transcript : str
        Identifier compared against column 0 of the module-level
        ``diamond`` table.
    species : str
        Preferred species, forwarded to ``check_for_species``.
    li : list of str
        Words to avoid, forwarded to ``get_informative``.

    Returns
    -------
    str
        The preferred-species hit when ``check_for_species`` finds one,
        otherwise whatever ``get_informative`` returns for the same rows.
    """
    search = diamond[diamond[0] == transcript]
    # Look up the preferred-species hit once; the previous version ran the
    # same filter-and-sort twice for every gene.
    species_hit = check_for_species(search, species)
    if species_hit is not None:
        return species_hit
    return get_informative(search, li)

In [None]:
# Words that mark a hit description as uninformative; passed to get_best_hit
# as its `li` argument in the export cells below.
list_of_words = ['hypothetical', 'uncharacterized', 'unnamed']

In [None]:
# Species whose hits take priority; passed to get_best_hit as its `species`
# argument in the export cells below.
preferred_species = 'Platynereis'

In [None]:
# Diamond annotation table, read as tab-separated text without a header row,
# so columns are labelled by position. The functions above use column 0
# (matched against the transcript in get_best_hit), column 3 (sort key in
# check_for_species) and column 4 (hit description).
diamond = pd.read_csv('diamond_pristina.tsv', sep='\t', header = None)

# Declaring the input and output files

In [None]:
# Base name of the analysis; used below to build the input .h5ad path and
# the name of the output folder under ./figures.
name_of_analysis = 'pristina_atlas'

In [None]:
# Load the annotated data object from ./<name_of_analysis>.h5ad.
adata = sc.read_h5ad(f'./{name_of_analysis}.h5ad')

In [None]:
# Collect every column of adata.obs whose name contains 'leiden'.
obs_columns = adata.obs.columns
is_leiden = obs_columns.str.contains('leiden')
leiden_names = obs_columns[is_leiden].to_list()

In [None]:
# Show the selected clustering column names as the cell's output.
leiden_names

In [None]:
# Create the output folder for the Excel workbooks and PDFs. makedirs also
# creates a missing ./figures parent, and exist_ok=True lets this cell be
# re-run without failing because the folder is already there.
os.makedirs('./figures/'+name_of_analysis+'_marker_pdfs_and_excels', exist_ok=True)

In [None]:
# For every leiden clustering, write one Excel workbook with one sheet per
# cluster. Each sheet lists the first 30 rows of the stored wilcoxon ranking
# (names, logfoldchanges, pvals, pvals_adj, scores) plus a 'diamond' column
# holding the annotation returned by get_best_hit for each gene name.
wilcoxon_fields = ['names', 'logfoldchanges', 'pvals', 'pvals_adj', 'scores']
for name in leiden_names:
    ranking = adata.uns['rank_genes_groups_wilcox_'+name]
    # Build the top-30 tables once per clustering instead of rebuilding all
    # of them for every single cluster.
    top = {field: pd.DataFrame(ranking[field]).head(30) for field in wilcoxon_fields}
    with pd.ExcelWriter('./figures/'+name_of_analysis+'_marker_pdfs_and_excels/'+name+'_markers_wilcoxon.xlsx') as writer:
        for i in adata.obs[name].cat.categories:
            columns = [top[field][i].rename(field) for field in wilcoxon_fields]
            columns.append(
                top['names'][i].apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')
            )
            df = pd.concat(columns, axis = 1)
            df.to_excel(writer, sheet_name='Cluster '+i)

In [None]:
# For every leiden clustering, write one Excel workbook with one sheet per
# cluster. Each sheet lists the first 30 rows of the stored logreg ranking
# (names, scores) plus a 'diamond' column with the annotation returned by
# get_best_hit for each of those gene names.
for name in leiden_names:
    ranking = adata.uns['rank_genes_groups_logreg_'+name]
    # Build the top-30 tables once per clustering, not once per cluster.
    top_names = pd.DataFrame(ranking['names']).head(30)
    top_scores = pd.DataFrame(ranking['scores']).head(30)
    with pd.ExcelWriter('./figures/'+name_of_analysis+'_marker_pdfs_and_excels/'+name+'_markers_logreg.xlsx') as writer:
        for i in adata.obs[name].cat.categories:
            a = top_names[i].rename('names')
            b = top_scores[i].rename('scores')
            # Annotate the logreg gene names shown in this sheet. The earlier
            # version looked up the wilcoxon names here, so the 'diamond'
            # column did not describe the genes in the 'names' column.
            e = top_names[i].apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')
            df = pd.concat([a, b, e], axis = 1)
            df.to_excel(writer, sheet_name='Cluster '+i)

In [None]:
def get_plots(clusteringlayer, cluster, li_markers):
    """Draw a 3x3 grid of UMAPs for one cluster and up to eight marker genes.

    The top-left panel shows the UMAP coloured by *clusteringlayer* with
    only *cluster* highlighted. The remaining eight panels, filled row by
    row, show the UMAP coloured by each marker gene using ``umap_cmap``.
    Reads the module-level ``adata`` and ``umap_cmap``.

    Parameters
    ----------
    clusteringlayer : str
        Name of the clustering to colour the first panel by; also used in
        that panel's title.
    cluster : str
        Cluster to highlight; concatenated into the title, so it must be a
        string.
    li_markers : list
        Marker gene names. Only the first eight are plotted. When fewer
        are given, the spare panels are drawn with ``color=None``. The
        list is not modified.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, left open. The caller is responsible for saving and
        closing it.
    """
    fig, axs = plt.subplots(3, 3, figsize = (15, 15))

    # Title from the parameter rather than a global loop variable, so the
    # function gives the right title no matter where it is called from.
    sc.pl.umap(adata, color = clusteringlayer, legend_loc = 'on data', groups = cluster,
               size = 5, legend_fontsize = 7, title = clusteringlayer+' cluster '+cluster,
               show = False, ax = axs[0, 0])

    # Pad a copy up to eight entries so the caller's list stays untouched.
    genes = list(li_markers[:8])
    genes += [None] * (8 - len(genes))

    # ravel() walks the grid row by row; skip the first (cluster) panel.
    for gene, ax in zip(genes, axs.ravel()[1:]):
        sc.pl.umap(adata, color = gene, title = gene, color_map = umap_cmap, show = False, ax = ax)

    return fig

In [None]:
# For every cluster of every leiden clustering, plot the genes that appear
# in the first 30 rows of both the wilcoxon and the logreg ranking, and save
# the figure as a PDF in the output folder.
for name in leiden_names:
    # The top-30 name tables depend only on the clustering, so build them
    # once here rather than once per cluster.
    wilcoxon_top = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_'+name]['names']).head(30)
    logreg_top = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+name]['names']).head(30)
    for i in adata.obs[name].cat.categories:
        wl = wilcoxon_top[i]
        lr = logreg_top[i]
        # Keep the wilcoxon order, restricted to genes also ranked by logreg.
        li = wl[wl.isin(lr)].to_list()
        figure = get_plots(name, i, li)
        figure.savefig('./figures/'+name_of_analysis+'_marker_pdfs_and_excels/umap_'+name+'_cluster_'+i+'.pdf',format = 'pdf')
        # Clear and close the figure so open figures do not pile up.
        figure.clf()
        plt.close(figure)
