In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
# Never truncate cell contents when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Scanpy session setup: verbosity level 4, print scanpy's logging header, and
# default all scanpy figures to 300 dpi PDFs on a white background.
sc.settings.verbosity = 4
sc.logging.print_header()
sc.settings.set_figure_params(dpi=300, facecolor='white', format = 'pdf', vector_friendly = True)

In [None]:
figure = "Figure_1"

In [None]:
sc.settings.figdir = './'+figure

In [None]:
umap_cmap = sns.blend_palette(['lightgrey', 'xkcd:medium blue'], as_cmap = True)

# Functions for querying DIAMOND BLAST hits

In [None]:
def check_for_species(df, species):
    """Return the description of the top-ranked hit that mentions ``species``.

    Parameters
    ----------
    df : pandas.DataFrame
        Hits for a single query, with integer column labels. Column 4 holds
        the hit description and column 3 the value hits are ranked by
        (ascending). Presumably an e-value or similar; confirm against the
        DIAMOND output format used to produce the table.
    species : str
        Pattern searched for in column 4. It is handed to
        ``Series.str.contains`` unchanged, so it is interpreted as a regular
        expression.

    Returns
    -------
    str or None
        Column-4 text of the matching hit with the smallest column-3 value,
        or ``None`` when no description matches.
    """
    # na=False treats missing descriptions as non-matching; without it the
    # mask contains NaN and boolean indexing raises.
    matches = df[4].str.contains(species, na=False)
    if not matches.any():
        return None
    # Positional column 4 equals label 4 for a header-less table.
    return df[matches].sort_values(3).iloc[0, 4]

In [None]:
def contains_words (hit_string, li):
    """Tell whether any word of ``li`` occurs as a substring of ``hit_string``.

    Each word is lower-cased before the substring test; ``hit_string`` is used
    as given, so callers that want case-insensitive matching must pass it
    already lower-cased.

    Returns ``True`` if at least one word matches, otherwise ``False``
    (including for an empty ``li``).
    """
    # Evaluate every word (no short-circuit) before reducing to one flag.
    word_found = [term.lower() in hit_string for term in li]
    return any(word_found)

In [None]:
def get_informative (df, li):
    """Pick the first hit description that contains none of the words in ``li``.

    Rows are scanned in their existing order; the column-4 text of each row is
    lower-cased and checked with ``contains_words``.

    Returns
    -------
    str
        * ``"not in Diamond"`` when ``df`` has no rows;
        * the column-4 text of the first row free of all listed words;
        * ``"no informative hits: "`` followed by the first row's column-4
          text when every row contains at least one listed word.
    """
    n_hits = len(df.index)
    if n_hits == 0:
        return "not in Diamond"
    for row in range(n_hits):
        description = df.iloc[row, 4]
        if not contains_words(description.lower(), li):
            return description
    return "no informative hits: "+df.iloc[0, 4]

In [None]:
def get_best_hit (transcript, species, li):
    """Return the most useful hit description for one transcript.

    The rows of the module-level ``diamond`` table whose column 0 equals
    ``transcript`` are selected. A hit mentioning ``species`` is preferred
    (see ``check_for_species``); otherwise the result of
    ``get_informative`` for the same rows is returned.

    Parameters
    ----------
    transcript : value compared against column 0 of ``diamond``.
    species : str, pattern of the preferred species.
    li : list of str, words that mark a description as uninformative.
    """
    search = diamond[diamond[0] == transcript]
    # Look the preferred species up once and reuse the result; the lookup
    # scans and sorts the hits, so repeating it is wasted work.
    species_hit = check_for_species(search, species)
    if species_hit is not None:
        return species_hit
    return get_informative (search, li)

In [None]:
# Words that mark a hit description as uninformative. get_informative skips
# descriptions containing any of them; matching is case-insensitive because
# both the description and the words are lower-cased before comparison.
list_of_words = ['hypothetical', 'uncharacterized', 'unnamed', 'Dimorphilus']

In [None]:
# Hits whose description mentions this pattern are preferred by get_best_hit.
preferred_species = 'Platynereis'

In [None]:
# Header-less DIAMOND table, so columns are labelled 0, 1, 2, ...
# The helpers above use column 0 (query transcript), column 3 (sort key)
# and column 4 (hit description).
diamond = pd.read_csv('../../diamond_pristina.tsv', sep='\t', header = None)

# Eggnog annotation

In [None]:
# Annotation table indexed by its "query" column; looked up by gene name in Panel C.
annot = pd.read_csv('../../annot.tsv', sep='\t', index_col = "query")

# Input file

In [None]:
# Load the atlas AnnData object used for Panels B and C.
adata = sc.read_h5ad('../../pristina_atlas_coloured_subcl.h5ad')

In [None]:
# Show the AnnData summary.
adata

In [None]:
# Per-gene table; Panel C reads its 'Preferred_name' column.
adata.var

In [None]:
# Key passed as `color` to the Panel B UMAP.
clusteringlayer = 'leiden_1.5'

In [None]:
# List the keys stored in adata.uns.
list(adata.uns)

# Panel B

In [None]:
# UMAP coloured by the clustering layer, with labels drawn on the embedding;
# saved through scanpy into the configured figure directory.
with plt.rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(
        adata,
        color=clusteringlayer,
        title=f'Clustering layer {clusteringlayer}',
        legend_loc='on data',
        legend_fontsize=10,
        legend_fontoutline=3,
        size=10,
        frameon=False,
        add_outline=False,
        save=f'_{figure}_colored.pdf',
    )

# Panel C

In [None]:
list(adata.uns['rank_genes_groups_wilcox_broad_names'])

In [None]:
best_broad_markers = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_broad_names']['names']).head(5).transpose()

In [None]:
best_broad_markers

In [None]:
group = 'epidermis'
n = 3
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'gut'
n = 2
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'muscle'
n = 0
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'neurons'
n = 0
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'globin+ cells'
n = 3
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'polycystin cells'
n = 0
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'eleocytes'
n = 1
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'chaetal sacs'
n = 1
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

In [None]:
group = 'lipoxygenase+ cells'
n = 1
sc.pl.umap(adata, color= best_broad_markers.loc[group][n], color_map = umap_cmap, frameon = False,
               title = group +': '+ best_broad_markers.loc[group][n] + ' ' + str(adata.var.loc[best_broad_markers.loc[group][n]]['Preferred_name']),
           save = '_feature_'+group+'.pdf')

In [None]:
get_best_hit (best_broad_markers.loc[group][n], preferred_species, list_of_words)

In [None]:
if best_broad_markers.loc[group][n] in annot.index:
    print(annot.loc[best_broad_markers.loc[group][n]])

# Panel E

In [None]:
# Cluster metadata spreadsheet, indexed by its 'Cluster' column.
df = pd.read_excel('../../Names, Broad Names, Color Palette, Order, Pristina.xlsx', index_col = 'Cluster')

In [None]:
df

In [None]:
# Keep the three broad-type columns and drop every row where any of them is missing.
perc_groups = df[['Broad Type', '% cells group', 'Colour group' ]].dropna()

In [None]:
perc_groups

In [None]:
# Horizontal bar chart of '% cells group' per broad type, one colour per bar
# taken from the 'Colour group' column, with a thin line under every bar.
with plt.rc_context({'figure.figsize': (5, 20)}):
    sns.barplot(y="Broad Type",  x="% cells group", data=perc_groups, palette = perc_groups['Colour group'].to_list())
    for i in range(len(perc_groups.index)):
        # Bar i is centred on y = i, so i + 0.5 lies between bar i and bar i + 1.
        plt.axhline(i+0.5, color = 'black', linewidth = 0.5)
    # Ticks at 0, 4, ..., 20; set once, as the call does not depend on the loop variable.
    plt.xticks(list(range(0,24,4)))
    plt.savefig('./'+figure+'/barplot_'+figure+'_broad.pdf')

## Percentage of cells per cluster

In [None]:
df.index = df.index.astype('string')

In [None]:
df.loc['0']['Names (leiden_1.5)']

In [None]:
df

In [None]:
df['Number and Name'] = df.index + ' ' + df['Names (leiden_1.5)']

In [None]:
df[df['Names (leiden_1.5)'] == 'unnanotated'].index

In [None]:
dff = df.drop(df[df['Names (leiden_1.5)'] == 'unnanotated'].index)

In [None]:
# Horizontal bar chart of '% cells' per cluster. Separator lines are drawn at
# hand-picked positions (y = i + 0.5 lies between bar i and bar i + 1);
# presumably they mark broad-type boundaries; confirm against the sheet order.
with plt.rc_context({'figure.figsize': (5, 20)}):
    sns.barplot(y='Number and Name', x="% cells", data=dff, palette=dff['Colours'])
    for separator_y in (2.5, 10.5, 19.5, 23.5, 30.5, 32.5, 34.5, 37.5, 39.5,
                        40.5, 41.5, 42.5, 43.5, 45.5, 46.5, 47.5, 48.5):
        plt.axhline(separator_y, color='black', linewidth=0.5)
    plt.xticks(list(range(11)))
    plt.savefig('./'+figure+'/barplot_'+figure+'_clusters.pdf', bbox_inches='tight')

# Panel D PAGA

In [None]:
# Separate AnnData file used only for the PAGA graph of Panel D.
adata_paga = sc.read_h5ad('../../pristina_atlas_cut_PAGA.h5ad')

In [None]:
# PAGA graph drawn from the 'connectivities_tree' edges with the 'rt' layout
# rooted at node 1; saved through scanpy into the configured figure directory.
with plt.rc_context({'figure.figsize': (10, 10)}):
    sc.pl.paga(
        adata_paga,
        threshold=0.25,
        solid_edges='connectivities_tree',
        # dashed_edges='connectivities',  (left disabled)
        root=1,
        layout='rt',
        node_size_scale=1,
        node_size_power=1,
        max_edge_width=3,
        edge_width_scale=0.5,
        # min_edge_width=3,  (left disabled)
        fontsize=15,
        fontoutline=2,
        frameon=False,
        save=f'_{figure}_colored.pdf',
    )

In [None]:
# List the per-cell metadata columns available in the atlas.
print(adata.obs.columns.to_list())

In [None]:
# Category labels of the named leiden_1.5 clustering.
adata.obs['leiden_1.5_names'].cat.categories