In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
import matplotlib.pyplot as plt

In [None]:
#import matplotlib as mpl

In [None]:
import seaborn as sns

In [None]:
# Raise scanpy's logging verbosity to level 4 and print scanpy's logging header.
sc.settings.verbosity = 4
sc.logging.print_header()
# Defaults for the scanpy figures below: 300 dpi, white background, PDF output.
sc.settings.set_figure_params(dpi=300, facecolor='white', format = 'pdf', vector_friendly = True)

In [None]:
# Show up to 100 characters per cell when pandas displays a table.
pd.options.display.max_colwidth = 100

In [None]:
# Figure label; reused below as the output directory and inside output file names.
figure = "Figure_6"

In [None]:
# Directory scanpy uses for plots written through `save=...`.
sc.settings.figdir = './'+figure

In [None]:
# Continuous light grey -> blue colour map, used for the gene-expression UMAPs below.
umap_cmap = sns.blend_palette(['lightgrey', 'xkcd:medium blue'], as_cmap = True)

In [None]:
# Continuous colour map blended through the listed colours (pale greys -> greens),
# used for the 'score FC ...' UMAPs below.
f_cat_cmap = sns.blend_palette(['whitesmoke', 'whitesmoke', 'lightgray', 'lime', 'green', 'green'], as_cmap = True)

In [None]:
# Display the colour map to check it visually.
f_cat_cmap

In [None]:
# Continuous colour map blended through the listed colours (pale greys -> pink -> red).
# NOTE(review): not referenced again in the visible part of this notebook.
z_sco_cmap = sns.blend_palette(['whitesmoke', 'whitesmoke', 'lightgray', 'pink', 'red'], as_cmap = True)

In [None]:
# Display the colour map to check it visually.
z_sco_cmap

In [None]:
def check_for_species(df, species):
    """Return the annotation of the top-ranked hit that mentions ``species``.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit table. The column labelled ``4`` holds the annotation text that is
        searched, and the column labelled ``3`` is sorted ascending to rank
        the matching hits. The returned value is read from the column at
        position 4.
    species : str
        Pattern looked for in the annotation text. It is passed to
        ``Series.str.contains`` unchanged, so pandas treats it as a regular
        expression.

    Returns
    -------
    str or None
        Annotation of the matching hit with the smallest value in column 3,
        or ``None`` when no hit mentions ``species``.
    """
    # na=False: a hit without annotation text counts as "no match". Without it
    # the mask contains NaN and boolean indexing raises a ValueError.
    matches = df[4].str.contains(species, na=False)
    if not matches.any():
        return None
    return df[matches].sort_values(3).iloc[0, 4]

In [None]:
def contains_words(hit_string, li):
    """Tell whether ``hit_string`` contains any of the words in ``li``.

    Each word is lower-cased before the substring test; ``hit_string`` is used
    as given, so callers are expected to pass it already lower-cased.

    Returns ``True`` if at least one word matches, ``False`` otherwise
    (including when ``li`` is empty).
    """
    return any(word.lower() in hit_string for word in li)

In [None]:
def get_informative(df, li):
    """Pick the first hit whose annotation contains none of the words in ``li``.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit table; the annotation text is read from the column at position 4,
        and rows are scanned in their existing order.
    li : list of str
        Words that mark an annotation as uninformative.

    Returns
    -------
    str
        * ``"not in Diamond"`` when ``df`` has no rows;
        * the annotation of the first row that contains none of the words;
        * ``"no informative hits: "`` followed by the first row's annotation
          when every row contains at least one of the words.
    """
    n_hits = len(df.index)
    if n_hits == 0:
        return "not in Diamond"

    for row in range(n_hits):
        annotation = df.iloc[row, 4]
        # The comparison is case-insensitive: contains_words lower-cases the words.
        if not contains_words(annotation.lower(), li):
            return annotation

    return "no informative hits: "+df.iloc[0, 4]

In [None]:
def get_best_hit(transcript, species, li):
    """Return the annotation to report for ``transcript``.

    Looks up the rows of the module-level ``diamond`` table whose column ``0``
    equals ``transcript``. A hit mentioning ``species`` (see
    ``check_for_species``) is preferred; otherwise the result of
    ``get_informative`` is returned.

    Parameters
    ----------
    transcript : str
        Identifier matched against column ``0`` of ``diamond``.
    species : str
        Preferred species pattern, forwarded to ``check_for_species``.
    li : list of str
        Words marking uninformative annotations, forwarded to
        ``get_informative``.

    Returns
    -------
    str
        The chosen annotation, or one of the fallback messages produced by
        ``get_informative``.
    """
    search = diamond[diamond[0] == transcript]
    # Look the species hit up once; the previous version scanned the table twice.
    species_hit = check_for_species(search, species)
    if species_hit is not None:
        return species_hit
    return get_informative(search, li)

In [None]:
# Words that mark a hit description as uninformative (passed to get_best_hit as `li`).
list_of_words = ['hypothetical', 'uncharacterized', 'unnamed']

In [None]:
# Hits whose description mentions this species are preferred by get_best_hit.
preferred_species = 'Platynereis'

In [None]:
# Hit table used by get_best_hit. The file is read without a header row, so
# the columns are labelled 0, 1, 2, ...
# Forward slashes resolve on every operating system (same convention as the
# h5ad path below). The earlier backslash form only resolved on Windows and
# relied on the invalid escape sequences '\.' and '\d'.
diamond = pd.read_csv('../../diamond_pristina.tsv', sep='\t', header = None)

# Input file

In [None]:
# Load the atlas from its h5ad file; `adata` is the object every UMAP below is drawn from.
adata = sc.read_h5ad('../../pristina_atlas_coloured_subcl.h5ad')

In [None]:
# Show a summary of the loaded object.
adata

In [None]:
# Show the `obs` table (the 'score FC ...' columns plotted below are read from it).
adata.obs

In [None]:
# NOTE(review): presumably the name of a clustering column in adata.obs; it is not
# referenced again in the visible part of this notebook - confirm it is still needed.
clusteringlayer = 'leiden_1.5'

# Panel FC scores

In [None]:
# List the adata.obs columns whose name contains 'score FC'.
adata.obs.columns[adata.obs.columns.str.contains('score FC')]

In [None]:
# One UMAP per functional-category score, coloured with f_cat_cmap and saved
# by scanpy as 'umap_score_<column name>.pdf' inside sc.settings.figdir.
# The six panels used to be six copy-pasted cells; they differ only in the
# obs column and the marker size, so both live in this table.
# NOTE(review): only the first panel uses size 10 - confirm this is intentional.
fc_score_panels = [
    # (adata.obs column, marker size)
    ('score FC Cell cycle control, cell division, chromosome partitioning', 10),
    ('score FC Chromatin structure and dynamics', 5),
    ('score FC RNA processing and modification', 5),
    ('score FC Replication, recombination and repair', 5),
    ('score FC Transcription', 5),
    ('score FC Nuclear structure', 5),
]

for sco, point_size in fc_score_panels:
    sc.pl.umap(adata, color = sco, color_map = f_cat_cmap, frameon = False, size = point_size,
               title = sco,
               save = '_score_'+sco+'.pdf')

# Panel TFs

In [None]:
# Curated transcription-factor table; its 'id' column is used below to select TF rows.
tfs = pd.read_csv('20221025_pristina_TFs_curated.tsv', sep="\t")

In [None]:
tfs

In [None]:
# Normalised CPM table. Below, its index is matched against tfs['id'] and its columns
# are compared against each other (e.g. 'piwi_pos_cells', 'gut', 'muscle').
# NOTE(review): no index_col is given, so this assumes read_csv picks up the gene ids
# as the index from the file layout - confirm.
norm_cpms = pd.read_csv('plei_counts_broad_norm_cpm.tsv', sep="\t")

In [None]:
norm_cpms

In [None]:
#he = norm_cpms[norm_cpms.sum(axis = 1) > 200] # high expression filter

In [None]:
#he

In [None]:
#norm_cpms[norm_cpms.index.isin(he)]

In [None]:
#cpms = he

In [None]:
# For every column of norm_cpms, compute log2(value + 1) minus log2(mean of all
# other columns + 1), row by row. The result has the same index and columns
# as norm_cpms.
log_ratios = pd.DataFrame(columns = norm_cpms.columns, index = norm_cpms.index)
for group in norm_cpms.columns:
    other_groups_mean = norm_cpms.drop(columns = group).mean(axis = 1)
    log_ratios[group] = np.log2(norm_cpms[group] + 1) - np.log2(other_groups_mean + 1)

In [None]:
log_ratios

In [None]:
# Keep only the rows whose index value is listed in tfs['id'].
log_ratios = log_ratios[log_ratios.index.isin(tfs['id'])] #filtering only tfs

In [None]:
log_ratios

In [None]:
# Coefficient of variation of each row of norm_cpms (standard deviation / mean across columns).
cv = norm_cpms.std(axis = 1) /  norm_cpms.mean(axis = 1)

In [None]:
# Keep only the rows whose coefficient of variation is above 1.
log_ratios = log_ratios[log_ratios.index.isin(cv[(cv > 1)].index)] #filtering high cv

In [None]:
log_ratios

In [None]:
# Rows with the 200 largest 'piwi_pos_cells' log-ratios. The isin() filter keeps
# them in their original row order rather than sorted by value.
top_piwi = log_ratios[log_ratios.index.isin(log_ratios['piwi_pos_cells'].sort_values(ascending = False).head(200).index)]

In [None]:
top_piwi

In [None]:
# Index values of the ten rows of top_piwi with the largest 'gut' log-ratio.
top_piwi['gut'].sort_values(ascending = False).head(10).index.to_list()

In [None]:
# Keep only the seven listed columns ('piwi_pos_cells' is not among them).
# .copy() gives an independent table, so a column can be added to it later.
top_piwi_other = top_piwi[['eleocytes', 'epidermis', 'globin_pos_cells', 'gut', 'muscle', 'neurons', 'polycystin_cells']].copy()

In [None]:
# Clustered heat map of the transposed table (the seven selected columns become rows),
# with z_score = 1 and Euclidean distance; saved into the figure directory.
# NOTE: this cell has to run before the 'Diamond blast annotation' text column is
# added to top_piwi_other further down; otherwise that column is passed to clustermap too.
sns.clustermap(data = top_piwi_other.transpose(), z_score = 1, metric="euclidean", cmap = 'cividis', figsize=(30, 5), dendrogram_ratio=0.1, colors_ratio = 0.01)
plt.savefig(figure+'/clustermap_'+figure+'_main_types.pdf')

In [None]:
# Number of rows in the selection.
len(top_piwi_other.index.to_list())

In [None]:
# Preview: annotation chosen by get_best_hit for every row id of the selection.
pd.Series(top_piwi_other.index, index = top_piwi_other.index).apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')

In [None]:
# Same computation as the preview above, stored as a new column of the table.
top_piwi_other['Diamond blast annotation'] = pd.Series(top_piwi_other.index, index = top_piwi_other.index).apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')

In [None]:
top_piwi_other

In [None]:
# Write the annotated table to an Excel file inside the figure directory.
top_piwi_other.to_excel('./'+figure+"/top_piwi_in_main_types_pristina_6.xlsx")

In [None]:
# UMAPs for the rows of top_piwi_other, 40 per saved PDF
# ('umap_top_piwi_1.pdf', 'umap_top_piwi_2.pdf', ...). This replaces five
# copy-pasted cells with hard-coded slices 0:40 ... 160:200.
# As before, at most the first 200 rows are plotted; pages that would be
# empty (fewer than 200 rows) are no longer requested.
genes_per_page = 40
n_plotted = min(len(top_piwi_other.index), 200)

for page, start in enumerate(range(0, n_plotted, genes_per_page), start = 1):
    page_genes = top_piwi_other.index[start:start + genes_per_page]
    sc.pl.umap(adata, color = page_genes, cmap = umap_cmap, size = 10,
               save = '_top_piwi_'+str(page)+'.pdf')

In [None]:
# UMAPs of individual transcripts, each saved by scanpy as
# 'umap_feature_<transcript id>.pdf'. This replaces seven copy-pasted cells
# that differed only in the transcript id.
# The trailing comments keep the two notes each original cell carried, in
# order (note above the code; note below the code).
# NOTE(review): presumably "where it is expressed; what it is annotated as" -
# confirm with the author.
selected_tfs = [
    'PrileiEVm008001t1',  # epidermis and gut; interferon regulatory factor
    'PrileiEVm002913t1',  # polycystin; PRA1
    'PrileiEVm006670t1',  # vigilin; insulinoma-associated protein 1a
    'PrileiEVm010521t1',  # gut; even-skipped
    'PrileiEVm008837t1',  # muscle; paired
    'PrileiEVm007974t1',  # gut; parahox
    'PrileiEVm010996t1',  # muscle epidermis; myelocytomatosis transcription factor
]

for tfid in selected_tfs:
    sc.pl.umap(adata, color = tfid, color_map = umap_cmap, frameon = False, size = 5,
               save = '_feature_'+tfid+'.pdf')

# Piwi vs all limma analysis

In [None]:
# Differential-expression table; the 'logFC' and 'adj.P.Val' columns are used below.
limma = pd.read_csv('plei_piwi_vs_all_DGE.tsv', sep="\t")

In [None]:
# -log2 of the adjusted p-value, used as the y axis of the volcano plots.
limma['-Log2_adj_p_value'] = -np.log2(limma['adj.P.Val'])

In [None]:
# Rows with logFC above 1.5.
limma[limma['logFC'] > 1.5]

In [None]:
# Rows with logFC below -1.5.
limma[limma['logFC'] < -1.5]

In [None]:
# Table whose index (the 'Unnamed: 0' column of the sheet) lists the genes that are
# highlighted and labelled in the volcano plot below.
annot_set = pd.read_excel('annot.xlsx', index_col = 'Unnamed: 0')

In [None]:
annot_set

In [None]:
# Thresholds drawn as dashed lines and used to colour the points; they are
# also read by the table cells that follow.
logFC_tr = 1.5
p_tr = 0.05


def plot_limma_volcano(outfile, label_annotated):
    """Draw the limma volcano plot on a new figure and save it.

    Replaces two copy-pasted cells that differed only in whether the genes of
    ``annot_set`` were labelled and in the output file name.

    Parameters
    ----------
    outfile : str
        Path the figure is saved to.
    label_annotated : bool
        When true, every row of ``limma`` whose index is in ``annot_set.index``
        is labelled with its index value.

    Reads the module-level ``limma``, ``annot_set``, ``logFC_tr`` and ``p_tr``.
    """
    y_col = '-Log2_adj_p_value'
    significant = limma[y_col] > -np.log2(p_tr)
    up = limma['logFC'] > logFC_tr
    down = limma['logFC'] < -logFC_tr
    # Computed once; the old label loop re-filtered limma three times per gene.
    annotated = limma[limma.index.isin(annot_set.index)]

    # Drawn in this order, so later layers sit on top of earlier ones.
    layers = [
        (limma, 'lightgrey'),
        (limma[up], 'palegreen'),
        (limma[down], 'lightpink'),
        (limma[significant & up], 'seagreen'),
        (limma[significant & down], 'crimson'),
        (annotated, 'darkgreen'),
    ]

    # An explicit figure per call keeps the two plots from sharing one canvas.
    fig, ax = plt.subplots(figsize = (2, 4))
    ax.axhline(-np.log2(p_tr), color = 'black', linewidth = 0.5, dashes = (5,5))
    ax.axvline(logFC_tr, color = 'black', linewidth = 0.5, dashes = (5,5))
    ax.axvline(-logFC_tr, color = 'black', linewidth = 0.5, dashes = (5,5))
    # grid(None) toggles the grid, as in the original cells.
    # NOTE(review): presumably this switches off a grid enabled by the scanpy
    # figure settings - confirm.
    ax.grid(None)
    ax.set_xticks(range(-4, 6, 2))
    ax.set_yticks(range(0, 24, 4))

    for subset, colour in layers:
        sns.scatterplot(data = subset, x = 'logFC', y = y_col, s = 5, c = [colour], linewidth = 0, ax = ax)

    if label_annotated:
        for gene, x_pos, y_pos in zip(annotated.index, annotated['logFC'], annotated[y_col]):
            ax.text(x_pos, y_pos, gene, verticalalignment = 'top', horizontalalignment = 'left',
                    size = 2, color = 'black', weight = 'semibold')

    fig.savefig(outfile)


plot_limma_volcano(figure+'/vulcano_'+figure+'_limma.pdf', label_annotated = True)
plot_limma_volcano(figure+'/vulcano_'+figure+'_limma_unnanotated.pdf', label_annotated = False)

In [None]:
# Rows whose adjusted p-value passes the threshold (p_tr is set in the volcano cell).
limma[limma['-Log2_adj_p_value'] > -np.log2(p_tr)]

In [None]:
# Rows that pass the p-value threshold and have logFC above logFC_tr.
limma[(limma['-Log2_adj_p_value'] > -np.log2(p_tr)) & (limma['logFC'] > logFC_tr)]

In [None]:
# a: index values of those rows.
a = limma[(limma['-Log2_adj_p_value'] > -np.log2(p_tr)) & (limma['logFC'] > logFC_tr)].index

In [None]:
# b: their logFC values.
b = limma[(limma['-Log2_adj_p_value'] > -np.log2(p_tr)) & (limma['logFC'] > logFC_tr)]['logFC']

In [None]:
# c: their -log2 adjusted p-values.
c = limma[(limma['-Log2_adj_p_value'] > -np.log2(p_tr)) & (limma['logFC'] > logFC_tr)]['-Log2_adj_p_value']

In [None]:
# d: annotation chosen by get_best_hit for each of those ids, as a Series named 'diamond'.
d = pd.Series(a, index = a).apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond')
      

In [None]:
# Combine logFC, -log2 adjusted p-value and annotation side by side (aligned on the index).
df = pd.concat([b, c, d], axis = 1)

In [None]:
df

In [None]:
# Write the combined table to an Excel file inside the figure directory.
df.to_excel('./'+figure+'/top_piwi_dge.xlsx')

# Epigenetic factors

In [None]:
# Epigenetic factor table indexed by its 'Pristina IDs' column. Its columns
# 'HAT', 'MLL', 'PcG and PcG-like', 'SWI/SNF', 'HDAC', 'ISWI' and 'FACT' are
# used below to pick the genes highlighted in the volcano plot.
epig = pd.read_csv('../../epigenetic regulators/Pristina_epigenetic_factor.csv', index_col = 'Pristina IDs')

In [None]:
epig

In [None]:
# Index values of the table as a plain list.
# NOTE(review): epig_li is not referenced again in the visible part of this notebook.
epig_li = epig.index.to_list()

In [None]:
epig_li

In [None]:
limma

In [None]:
epig

In [None]:
logFC_tr = 1.5
p_tr = 0.05

# One entry per highlighted group, drawn in this order:
# (column of `epig`, exact cell value that marks membership, point colour).
# NOTE: the 'HAT' and 'MLL' values end with four spaces; the comparison is an
# exact string match, so they are reproduced here as they were written.
complex_layers = [
    ('HAT', 'HAT    ', 'magenta'),
    ('MLL', 'MLL    ', 'pink'),
    ('PcG and PcG-like', 'PcG and PcG-like', 'purple'),
    ('SWI/SNF', 'SWI/SNF', 'green'),
    ('HDAC', 'HDAC', 'blue'),
    ('ISWI', 'ISWI', 'red'),
    ('FACT', 'FACT', 'orange'),
]

with plt.rc_context({'figure.figsize': (2, 4)}):
    # Dashed guides at the p-value threshold and at +/- the logFC threshold.
    plt.axhline(-np.log2(p_tr), color = 'black', linewidth = 0.5, dashes = (5,5))
    for threshold in (logFC_tr, -logFC_tr):
        plt.axvline(threshold, color = 'black', linewidth = 0.5, dashes = (5,5))
    plt.grid(None)
    plt.xticks(range(-4, 6, 2))
    plt.yticks(range(0, 24, 4))

    # All genes in grey as background.
    sns.scatterplot(data = limma, x = 'logFC', y = '-Log2_adj_p_value', s = 5, c = ['lightgrey'], linewidth = 0)

    # Members of each group on top, in the group's colour.
    for column, label, colour in complex_layers:
        members = epig[epig[column] == label].index
        sns.scatterplot(data = limma[limma.index.isin(members)], x = 'logFC', y = '-Log2_adj_p_value', s = 5, c = [colour], linewidth = 0)

    plt.savefig(figure+'/vulcano_'+figure+'_limma_epigenetic.pdf')

In [None]:
# UMAPs of four selected transcripts, each saved by scanpy as
# 'umap_feature_<transcript id>_<tag>_<gene name>.pdf'. This replaces four
# copy-pasted cells. The tag and gene name are only used in the file name.
# NOTE(review): 'srrp1' may be a typo for 'ssrp1'; left unchanged because it
# is part of the output file name.
epigenetic_markers = [
    # (transcript id, tag, gene name)
    ('PrileiEVm005208t1', 'fact', 'srrp1'),
    ('PrileiEVm009735t1', 'hdac', 'hdac1'),
    ('PrileiEVm019269t1', 'PcG', 'cbx3'),
    ('PrileiEVm000012t1', 'mll', 'kmt2a'),
]

for comp, comp_id, gene_na in epigenetic_markers:
    sc.pl.umap(adata, color = comp, color_map = umap_cmap, frameon = False, size = 5,
               save = '_feature_'+comp+'_'+comp_id+'_'+gene_na+'.pdf')