In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ete3
from ete3 import TreeStyle, PhyloTree, Tree, faces, NodeStyle, AttrFace, TextFace
from ete3.treeview.faces import add_face_to_node
import glob

import seaborn as sns
import re

In [15]:
import matplotlib 
# For higher resoltion figures
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
plt.rcParams["font.family"] = "Helvetica" #somethings this one doesnt work
plt.rcParams['pdf.fonttype'] = 42

In [16]:
# Read in the central ADs
with open("../data/fasta_files/AllSeqs_IntegralAround_WxxLF_-50_+20_top138.fasta", "r") as f:
    lines = f.readlines()

seq = ""
seqs = []
names = []

for line in lines:
    if line.startswith(">"):
        names.append(line.strip().replace(">", ""))
        if len(names) > 1:
            seqs.append(seq)
            seq = ""
    else:
        seq += line.strip()

seqs.append(seq)

central_ADs = pd.DataFrame({'name' : names, 'aa_seq' : seqs})
central_ADs


Unnamed: 0,name,aa_seq
0,Eurotiomycetes_jgi|Penatra1|34162|e_gw1.2.1160.1,MFTDLDVAGHEDWPSLFDHSSEPLNAFDLATL
1,Scas_Scas663.23,SEEQEDDQFMTIPPLNELDSNVVDAFFSSSTDSTPMFEFESLDESN...
2,Cten_EGV62856.1_CandidaTenuisATCC10573,ESAPGLSAPDSSLAFHSEVLDSVFSNDEAVDHTPMFDELDFMLESS...
3,Eurotiomycetes_jgi|Asptam1|203219|CE203218_32345,PKDLFMDASAPPSASFTDLSTPSFESPGYFSQDTSPMFATDMELGP...
4,Eurotiomycetes_jgi|Penbr2|60600|gm1.7763_g,ISPRDLMMDTSVPPSGTFTDLSTPSFESPGNFSQNASPMFTDMDLV...
...,...,...
133,Sordariomycetes_jgi|Hypfra2|223471|CE223470_11...,QDHFTSAPNSSAITNLTSPSMYGESPDLHDSYEVSPNYGGSDFDHG...
134,Blastocladiomycota_jgi|Catan2|1506241|gm1.11555_g,TSTSAPPSPVPSLELCAATSGRALSKSPSPFDFGAAPPADATLVVS...
135,Saccharomycotina_jgi|Ascru1|80718|fgenesh1_pm....,NSNRRNNAISHDDVVAPKQTISREELLLITDQVFSNDNVSIFDEKL...
136,Saccharomycotina_jgi|Hanpo2|10524|gm1.4782_g,DDHNADQALRSNPGLTISPFEIHSSVIGSIFEDPDAETAPMFEPRE...


In [17]:
# Count SP/TP motifs

central_ADs["SP_count"] = [len(re.findall('[ST]P', seq)) for seq in central_ADs['aa_seq']]

In [18]:
SP_dict = {}

for i in central_ADs.index:
    SP_dict[central_ADs.loc[i, "name"]] = central_ADs.loc[i, "SP_count"]

In [19]:
central_ADs[central_ADs['name'] == 'Orbiliomycetes_tr|S8AP85|S8AP85_DACHA']['aa_seq'].values[0]

'TFSPNNPLFHDFSPFIGTIPYTGRTNDTLDDSPTGLDTPLFGVDDAALDDWTPLFSSEEQMDFAANQPAP'

In [20]:
re.findall('[ST]P', 'ALPQTATAPDAKTVLPIPELDDAVVESFFSSSTDSTPMFEYENLEDNSKEWTSLFDNDIPVTTDDVSLAD')

['TP']

In [21]:
len("ALPQTATAPDAKTVLPIPELDDAVVESFFSSSTDSTPMFEYENLEDNSKEWTSLFDNDIPVTTDDVSLAD")

70

In [22]:
name_key = pd.read_csv("../data/phylogenetic_info/top138_phyloinfo.csv")

name_dict = {}

for idx in name_key.index:
    name_dict[name_key.loc[idx, "name"]] = name_key.loc[idx, "SpeciesName"]

In [23]:
name_key[name_key["SpeciesName"] == "Dactylellina haptotyla"]

Unnamed: 0,id,name,TreeLeaf,Validated,SpeciesName,NCBI Taxon,Note,AD_seq,full_sequence
389,72.0,Orbiliomycetes_tr|S8AP85|S8AP85_DACHA,Monha1,Yes,Dactylellina haptotyla,430498.0,Equivalent to monacrosporium haptotylum,TFSPNNPLFHDFSPFIGTIPYTGRTNDTLDDSPTGLDTPLFGVDDA...,MARRVRLTSSLACALLLLTTATIPTITITITAAAAAAVAIIVGAVS...


In [24]:
# Put central ADs on the tree
i = 0

def mylayout(node):
    global i
    if node.is_leaf():
        name_face = AttrFace("name", fsize=60)
        node.add_face(name_face, column=0, position="aligned")

        nst = NodeStyle()
        nst["size"] = 0  # Size of the node circle, set to 0 to hide
        nst["vt_line_width"] = 10  # Vertical line thickness
        nst["hz_line_width"] = 10  # Horizontal line thickness
        nst["vt_line_color"] = "#333333"  # Vertical line color
        nst["hz_line_color"] = "#333333"  # Horizontal line color

        name = node.get_leaf_names()
        # if name[0].replace("'","") == "Saccharomycotina_sp|P03069|GCN4_YEAST": 
        #     nst["bgcolor"] = "black"
        if name[0] in SP_dict.keys():
            if SP_dict[name[0]] > 4:
                nst["bgcolor"] = "#75beff"
            elif SP_dict[name[0]] == 4:
                nst["bgcolor"] = "#a1ffe6"
            elif SP_dict[name[0]] == 3:
                nst["bgcolor"] = "#fffca9"
            elif SP_dict[name[0]] == 2:
                nst["bgcolor"] = "#ffb375"
            elif SP_dict[name[0]] == 1:
                nst["bgcolor"] = "#ffd1d1"
        elif name[0] in SP_dict.keys():
            if SP_dict[name[0]] > 4:
                nst["bgcolor"] = "#75beff"
            elif SP_dict[name[0]] == 4:
                nst["bgcolor"] = "#a1ffe6"
            elif SP_dict[name[0]] == 3:
                nst["bgcolor"] = "#fffca9"
            elif SP_dict[name[0]] == 2:
                nst["bgcolor"] = "#ffb375"
            elif SP_dict[name[0].replace("'","")] == 1:
                nst["bgcolor"] = "#ffd1d1"
        else:
            print(name)

        node.set_style(nst)
                

t = PhyloTree("../data/phylogenetic_info/gcn4_gene_phylogeny.treefile", format=1)

filtered_names = [name for name in central_ADs["name"].values if t.search_nodes(name=name)]
t.prune(filtered_names)
ts = TreeStyle()

ts.layout_fn = mylayout
ts.mode = "c"
ts.show_leaf_name = False
# t.show(tree_style = ts)
t.render("../figures/central_SP_on_tree.pdf", tree_style = ts)

{'nodes': [[3656.15904439829,
   3887.4096766354587,
   3660.614879596371,
   3891.8655118335396,
   0,
   None],
  [4091.056327599071,
   3805.174188083623,
   4095.9021400184165,
   3810.0200005029683,
   3,
   None],
  [4134.035884130446,
   3747.575065793204,
   4139.272247107289,
   3752.8114287700473,
   4,
   None],
  [4142.8920692997435,
   3665.1312874415503,
   4148.462066618508,
   3670.701284760315,
   5,
   None],
  [4079.878206653605,
   3574.20239096689,
   4085.4828143539326,
   3579.806998667218,
   6,
   None],
  [3927.2067403537576,
   3495.951129477403,
   3931.8449653498783,
   3500.5893544735236,
   7,
   None],
  [3621.6285267956314,
   3552.697865743888,
   3627.2413052901334,
   3558.31064423839,
   8,
   None],
  [3495.9297329153915,
   4059.861684311126,
   3501.3519713785076,
   4065.283922774242,
   9,
   None],
  [4293.709504725485,
   4059.150179690888,
   4299.010093519649,
   4064.450768485051,
   10,
   None],
  [4318.605239914579,
   4043.702367164654