In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
# Do not truncate long cell contents when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Scanpy session setup: logging verbosity level 4, print the version header,
# and set the figure defaults used by every plot in this notebook.
sc.settings.verbosity = 4
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=300,
    facecolor='white',
    format='pdf',
    vector_friendly=True,
)

In [None]:
# Name of the figure this notebook produces panels for; reused below to build
# the scanpy figure directory and the barplot output path.
figure = "Figure_4"

In [None]:
# Point scanpy's figure directory at ./<figure>.
sc.settings.figdir = './'+figure

In [None]:
# Continuous colormap blending light grey into medium blue; passed as
# `color_map` to the UMAP feature plots below.
umap_cmap = sns.blend_palette(['lightgrey', 'xkcd:medium blue'], as_cmap = True)

In [None]:
# Continuous colormap blending whitesmoke -> light gray -> mint green -> kelly green.
# NOTE(review): `mod_cmap` is not referenced in the visible part of this notebook.
mod_cmap = sns.blend_palette(['whitesmoke', 'lightgray', 'xkcd:mint green', 'xkcd:kelly green'], as_cmap = True)

# Functions for diamond blast querying

In [None]:
def check_for_species(df, species):
    """Return the description of the top-ranked hit that mentions `species`.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit table addressed by integer column labels. Column 4 holds the
        hit description that is searched; column 3 is the sort key
        (ascending, so the smallest value wins).
    species : str
        Pattern looked for in column 4. It is passed to
        ``Series.str.contains`` unchanged, so pandas' default pattern
        handling applies.

    Returns
    -------
    str or None
        Column-4 value of the matching row with the smallest column-3
        value, or ``None`` when no row matches.
    """
    # Build the mask once. na=False makes rows with a missing description
    # count as "no match"; without it the mask contains NaN and boolean
    # indexing raises ValueError.
    mask = df[4].str.contains(species, na=False)
    if not mask.any():
        return None
    contained = df[mask].sort_values(3)
    return contained.iloc[0, 4]

In [None]:
def contains_words(hit_string, li):
    """Tell whether any word of `li` occurs as a substring of `hit_string`.

    Each word is lower-cased before the substring test; `hit_string` is used
    as given, so callers that want case-insensitive matching must lower-case
    it themselves.

    Returns True if at least one word matches, False otherwise (including
    for an empty `li`).
    """
    # Evaluate every word (no short-circuit) before reducing to one boolean.
    matches = [word.lower() in hit_string for word in li]
    return any(matches)

In [None]:
def get_informative(df, li):
    """Return the first hit description that contains none of the words in `li`.

    Rows of `df` are scanned in their current order; column position 4 holds
    the description. Matching against `li` is done on the lower-cased
    description via `contains_words`.

    Returns
    -------
    str
        * "not in Diamond" when `df` has no rows;
        * the first description free of all words in `li`;
        * "no informative hits: " followed by the first row's description
          when every row contains one of the words.
    """
    n_hits = len(df.index)
    if n_hits == 0:
        return "not in Diamond"
    for row in range(n_hits):
        description = df.iloc[row, 4]
        if not contains_words(description.lower(), li):
            return description
    # Every hit was uninformative: report the first one, flagged as such.
    return "no informative hits: "+df.iloc[0, 4]

In [None]:
def get_best_hit (transcript, species, li):
    """Return the best available hit description for `transcript`.

    Rows of the module-level `diamond` table whose column 0 equals
    `transcript` are selected. If `check_for_species` finds a hit mentioning
    `species`, that description is returned; otherwise the result of
    `get_informative` (first description without any word of `li`, or one of
    its fallback strings) is returned.

    Parameters
    ----------
    transcript : value compared against column 0 of `diamond`.
    species : pattern forwarded to `check_for_species`.
    li : list of words forwarded to `get_informative`.
    """
    search = diamond[diamond[0] == transcript]
    # Evaluate the species lookup once; it filters and sorts the hit table.
    species_hit = check_for_species(search, species)
    if species_hit is not None:
        return species_hit
    return get_informative(search, li)

In [None]:
# Words that mark a hit description as uninformative; passed as `li` to
# get_best_hit. contains_words lower-cases each word before matching.
list_of_words = ['hypothetical', 'uncharacterized', 'unnamed', 'Dimorphilus']

In [None]:
# Passed as `species` to get_best_hit: a hit mentioning it is returned in
# preference to any other hit.
preferred_species = 'Platynereis'

In [None]:
# Read without a header row, so columns get integer labels; the query helpers
# defined above address columns 0, 3 and 4 of this table.
diamond = pd.read_csv('../../diamond_pristina.tsv', sep='\t', header = None)

# Eggnog annotation

In [None]:
# Annotation table indexed by its "query" column; the cells below look genes
# up in this index by ID.
annot = pd.read_csv('../../annot.tsv', sep='\t', index_col = "query")

# Input file

In [None]:
# Load the h5ad file that every UMAP plot and summary below reads from.
adata = sc.read_h5ad('../../pristina_atlas_coloured_subcl.h5ad')

In [None]:
# Display the object summary.
adata

In [None]:
# Display the variable (var) table.
adata.var

In [None]:
# Display the observation (obs) table.
adata.obs

In [None]:
# Presumably the name of the clustering this notebook is based on.
# NOTE(review): this variable is not referenced in the visible cells; the
# barplot further down hard-codes 'leiden_1.5_colors_sorted' instead — confirm.
clusteringlayer = 'leiden_1.5'

In [None]:
# IDs from the 'gID' column of the curated TF table, with the literal text
# '.p1' removed. regex=False is explicit because the default changed between
# pandas versions: as a regex, '.p1' would match ANY character followed by 'p1'.
tfs = (
    pd.read_csv('../../TFs/20220708_pristina_TFs_curated.tsv', sep="\t")['gID']
    .str.replace('.p1', '', regex=False)
)

In [None]:
# Display the TF ID series.
tfs

In [None]:
# Check whether this ID (plotted as 'Hnf4' in the Gut section) is in the TF list.
tfs[tfs.isin(['PrileiEVm006891t1'])]

# Epidermis, muscle and neurons

In [None]:
ctype = 'epidermis'
gene = 'PrileiEVm000093t1'
name = 'zinc-finger'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')


In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'muscle'
gene = 'PrileiEVm008071t1'
name = 'myoD1'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')


In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'neurons'
gene = 'PrileiEVm003917t1'
name = 'pou6f'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

# Gut

In [None]:
ctype = 'gut'
gene = 'PrileiEVm006891t1'
name = 'Hnf4'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'gut'
gene = 'PrileiEVm004244t1'
name = 'Nkx-2-b'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'gut_and_lumbrokinase'
gene = 'PrileiEVm004832t1'
name = 'Gata4'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'gut_stomach'
gene = 'PrileiEVm005298t1'
name = 'Nkx-2-b'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

# Other

In [None]:
ctype = 'eleocytes'
gene = 'PrileiEVm005230t1'
name = 'ets-4'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'vigilin'
gene = 'PrileiEVm005896t1'
name = 'prdm13'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10,
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'globin'
gene = 'PrileiEVm002870t1'
name = 'mlx'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'polycystin'
gene = 'PrileiEVm010161t1'
name = 'tubby'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
ctype = 'lipoxygenase'
gene = 'PrileiEVm001926t1'
name = 'zinc finger'

sc.pl.umap(adata, color= gene, color_map = umap_cmap, frameon = False, size = 10, 
           save = '_feature_'+gene+'_'+ctype+'_'+name+'.pdf')

In [None]:
get_best_hit (gene, preferred_species, list_of_words)

In [None]:
if gene in annot.index:
    print(annot.loc[gene])

In [None]:
# Transcript IDs whose best DIAMOND hits are tabulated and exported to
# 'centralTFs.xlsx' in the following cells.
tf_st = ['PrileiEVm005896t1', 'PrileiEVm005572t1', 'PrileiEVm009906t1', 'PrileiEVm005298t1', 'PrileiEVm004755t1', 'PrileiEVm018460t1', 'PrileiEVm006139t1', 'PrileiEVm010521t1', 'PrileiEVm010508t1', 'PrileiEVm014508t1', 'PrileiEVm013863t1', 'PrileiEVm002396t1', 'PrileiEVm010952t1', 'PrileiEVm004056t1', 'PrileiEVm002845t1', 'PrileiEVm010713t1', 'PrileiEVm003625t1', 'PrileiEVm009731t1', 'PrileiEVm008918t1', 'PrileiEVm004625t1', 'PrileiEVm008132t1', 'PrileiEVm008338t1', 'PrileiEVm008469t1', 'PrileiEVm001919t1', 'PrileiEVm008466t1', 'PrileiEVm007971t1', 'PrileiEVm005955t1', 'PrileiEVm004665t1', 'PrileiEVm001926t1', 'PrileiEVm010363t1', 'PrileiEVm008553t1', 'PrileiEVm007012t1', 'PrileiEVm008019t1', 'PrileiEVm005469t1', 'PrileiEVm001532t1', 'PrileiEVm009536t1', 'PrileiEVm005524t1', 'PrileiEVm002359t1', 'PrileiEVm007750t1', 'PrileiEVm011864t1', 'PrileiEVm007240t1', 'PrileiEVm008125t1', 'PrileiEVm006586t1', 'PrileiEVm008067t1', 'PrileiEVm006705t1', 'PrileiEVm004545t1', 'PrileiEVm017429t1', 'PrileiEVm006596t1', 'PrileiEVm004337t1', 'PrileiEVm008526t1', 'PrileiEVm007974t1']

In [None]:
pd.concat([pd.Series(tf_st).rename('id'), pd.Series(tf_st).apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')], axis = 1)

In [None]:
pd.concat([pd.Series(tf_st).rename('id'), pd.Series(tf_st).apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')], axis = 1).to_excel('centralTFs.xlsx')

In [None]:
# Tab-separated table, presumably counts-per-million values (per the file
# name) — TODO confirm. The cells below threshold every column numerically.
cpms = pd.read_csv('../../20221118_plei_cpms.tsv', sep="\t")

In [None]:
cpms

In [None]:
# Index labels of the rows whose value in column '01_piwi_pos_cells_1' exceeds 10.
cpms.index[cpms['01_piwi_pos_cells_1'] > 10]

In [None]:
th = 5
genes_quantified = {}
for i in cpms.columns:
    genes_quantified[i] = len(cpms.index[cpms[i] > th])
    

In [None]:
genes_q = pd.DataFrame.from_dict(data = genes_quantified, orient = 'index', columns = ['genes'])

In [None]:
# Horizontal barplot with one bar per row of genes_q, drawn on a temporary
# 20x20 figure and saved under ./<figure>/.
# NOTE(review): the palette is taken as-is from adata.uns; this presumably
# relies on it listing colours in the same order as the rows of genes_q — confirm.
with plt.rc_context({'figure.figsize': (20, 20)}):
    sns.barplot(
        data=genes_q,
        x='genes',
        y=genes_q.index,
        palette=adata.uns['leiden_1.5_colors_sorted'],
    )
    plt.savefig('./'+figure+'/barplot_genes_per_cluster.pdf')

In [None]:
# Colour list used as the barplot palette above.
adata.uns['leiden_1.5_colors_sorted']

In [None]:
# Counts of rows above the threshold, one row per cpms column.
genes_q

In [None]:
# Summary statistics of those counts.
genes_q.describe()

In [None]:
# Mean of the 'n_genes' column over all observations.
adata.obs['n_genes'].mean()

In [None]:
# Mean of the 'n_counts' column over all observations.
adata.obs['n_counts'].mean()