In [1]:
#ete kernel
import pandas as pd
import matplotlib.pyplot as plt
import difflib
import os
import re
from ete3 import ClusterTree, TreeStyle, ProfileFace, Tree, TextFace
from ete3.treeview.faces import add_face_to_node
import numpy as np

import loess
from loess import loess_1d
from collections import defaultdict 
import math

# os.chdir("/Users/claireleblanc/Documents/grad_school/staller_lab/Evolution_stuff/activity_heatmaps/")

# Prepare data

In [107]:
# Read in activity data
seq_data = pd.read_pickle('../data/FullOrthologDF_20240930')
seq_data = seq_data.rename(columns={"SpeciesName" : "name"})

In [108]:
# Read in species name data
species_mapping = pd.read_csv("../data/phylogenetic_info/top138_phyloinfo.csv")
merged_data = species_mapping.merge(seq_data, on="name")

# Longest first
merged_data = merged_data.sort_values("Length", ascending=False)

# Removing the really long seqs
# merged_data = merged_data[merged_data['Length'] < 1500]

In [162]:
# Finds the start of the WxxLF motif, which the sequences are aligned on
def find_WLF(s):
    p = re.compile("W..LF")
    for m in p.finditer(s):
        return m.start()
    return -1

In [110]:
# Get location of all WxxLF motifs
merged_data.loc[:, "Location_WxxLF"] = merged_data["Seq"].apply(find_WLF)

# Getting the WxxLF motif that is the furthest into a sequence, all other sequence will be aligned to this
align_to = max(merged_data["Location_WxxLF"])
align_to

1413

In [111]:
merged_data[merged_data['Location_WxxLF'] == max(merged_data["Location_WxxLF"])]

Unnamed: 0,id,name,TreeLeaf,Validated,SpeciesName,NCBI Taxon,Note,AD_seq,full_sequence,Seq,Length,WxxLF_loc,SmoothedActivites,LinearCharge,LinearHydrophobicityKD,SmoothedActivitesLoess,Location_WxxLF
490,,Pezizomycetes_jgi|Ascni1|352744|fgenesh1_pg.76...,Ascni1,Yes,Ascodesmis nigricans,,,,,MSDSRHMRRPSHASDTDDEDDQSSTLHRTTSNNPYSISPSSPHRGN...,1543,1413,"[35776.33308908159, 35776.33308908159, 35776.3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.6, 0.4, 0.4, ...","[0.0, 0.0, 0.3288888888888889, 0.2155555555555...","[31684.47651162896, 31684.47651162896, 31684.4...",1413


In [112]:
# Calculate padding i.e. how much "sequence" needs to be added to the front to align all the WxxLF motifs
merged_data.loc[:,"pad_by"] = list(merged_data["Location_WxxLF"] * -1 + align_to)
merged_data

Unnamed: 0,id,name,TreeLeaf,Validated,SpeciesName,NCBI Taxon,Note,AD_seq,full_sequence,Seq,Length,WxxLF_loc,SmoothedActivites,LinearCharge,LinearHydrophobicityKD,SmoothedActivitesLoess,Location_WxxLF,pad_by
115,,Sordariomycetes_jgi|Acral2|2047914|estExt_Gene...,Acral2,Yes,Sodiomycetes alcalophilus,,,,,MADTCGGSTPLKNFSQYGSQDRSLQQDRVVHGFHGSAAAGPSTFRS...,2943,1021,"[49761.33963354764, 49761.33963354764, 49761.3...","[0.0, 0.0, -0.2, -0.2, -0.2, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.5444444444444445, 0.4933333333333...","[36959.46539862907, 36959.46539862907, 36959.4...",1021,392
110,109.0,Sordariomycetes_jgi|Acral2|2019554|gm1.4974_g,Acral2,Yes,Sodiomycetes alcalophilus,398408.0,,DLLIQDPYMSAPNSTALTALTSPSLYESPDFGYDVSPGFGSNDFDT...,MALRIEVYNRIESSTASTALQRQDLRYTFRSNARAASGQANANYQA...,MALRIEVYNRIESSTASTALQRQDLRYTFRSNARAASGQANANYQA...,2928,1006,"[50849.75653537431, 50849.75653537431, 50849.7...","[0.0, 0.0, 0.2, 0.0, 0.0, 0.0, -0.2, 0.0, 0.2,...","[0.0, 0.0, 0.6666666666666666, 0.5466666666666...","[34049.34344014826, 34049.34344014826, 34049.3...",1006,407
490,,Pezizomycetes_jgi|Ascni1|352744|fgenesh1_pg.76...,Ascni1,Yes,Ascodesmis nigricans,,,,,MSDSRHMRRPSHASDTDDEDDQSSTLHRTTSNNPYSISPSSPHRGN...,1543,1413,"[35776.33308908159, 35776.33308908159, 35776.3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.6, 0.4, 0.4, ...","[0.0, 0.0, 0.3288888888888889, 0.2155555555555...","[31684.47651162896, 31684.47651162896, 31684.4...",1413,0
55,136.0,Xylonomycetes_jgi|Trigu1|1079454|gm1.247_g,Trigu1,Yes,Trinosporium guianense,1196434.0,,FRDPLASAPPSAAFTNLTSPSIFDSPDVAESFETSPLFANADHDLA...,MADALCGPSNPLQNLQKHTSVDRTLQQDRLVGPRHSPVQDFRSRSS...,MADALCGPSNPLQNLQKHTSVDRTLQQDRLVGPRHSPVQDFRSRSS...,1328,1185,"[23437.5357660965, 23437.5357660965, 23437.535...","[0.0, 0.0, -0.2, -0.2, -0.2, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.6288888888888889, 0.6422222222222...","[29193.548646199746, 29193.548646199746, 29193...",1185,228
15,137.0,Xylonomycetes_jgi|Xylhe1|264767|gm1.473_g,Xylhe1,Yes,Xylona heveae,1217826.0,,LRDPLASAPPSAAFTNLTSPSIFDSPDVAESFETSPLFANADADLA...,MTDALCGPSNPLQHLQKHTSVDRTLQQDRLVGPRHSPVQDFRSQSQ...,MTDALCGPSNPLQHLQKHTSVDRTLQQDRLVGPRHSPVQDFRSQSQ...,1255,1111,"[26567.768783458843, 26567.768783458843, 26567...","[0.0, 0.0, -0.2, -0.2, -0.2, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.5733333333333334, 0.5866666666666...","[32604.845533062748, 32604.845533062748, 32604...",1111,302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,21.0,Dothideomycetes_jgi|Didsa1|381788|estExt_Genew...,Didsa1,Yes,Didymocrea sadasivanii,372059.0,Splicing isoform / truncation (?) of other Did...,MFQGDSCFPTGTDTWYSLFPEEESRTVMPAVTTP,MFQGDSCFPTGTDTWYSLFPEEESRTVMPAVTTPLAAPALERTVSS...,MFQGDSCFPTGTDTWYSLFPEEESRTVMPAVTTPLAAPALERTVSS...,157,14,"[190922.41852774887, 161032.45391660908, 15386...","[0.0, 0.0, -0.2, -0.2, -0.2, -0.2, -0.2, 0.0, ...","[0.0, 0.0, 0.43999999999999995, 0.38, 0.373333...","[170312.56492479687, 170312.56492479687, 17031...",14,1399
181,43.0,Eurotiomycetes_jgi|Penatra1|34162|e_gw1.2.1160.1,Penatra1,Yes,Penicillium atramentosum,36652.0,,MFTDLDVAGHEDWPSLFDHSSEPLNAFDLATL,MFTDLDVAGHEDWPSLFDHSSEPLNAFDLATLDAAAAYSVEPKKPI...,MFTDLDVAGHEDWPSLFDHSSEPLNAFDLATLDAAAAYSVEPKKPI...,157,12,"[261940.99846290916, 261910.30458108656, 26191...","[0.0, 0.0, -0.2, -0.4, -0.4, -0.4, -0.2, -0.2,...","[0.0, 0.0, 0.5955555555555556, 0.4755555555555...","[262143.0, 262143.0, 262143.0, 262143.0, 26214...",12,1401
180,42.0,Eurotiomycetes_jgi|Penatra1|32388|gw1.2.1490.1,Penatra1,Yes,Penicillium atramentosum,36652.0,,SPMFTDLDVAGHEDWPSLFDHSSEPLNAFDLATL,SPMFTDLDVAGHEDWPSLFDHSSEPLNAFDLATLDAAAAYSVEPKK...,SPMFTDLDVAGHEDWPSLFDHSSEPLNAFDLATLDAAAAYSVEPKK...,154,14,"[173113.6358480084, 215442.77937959705, 230942...","[0.0, 0.0, 0.0, -0.2, -0.2, -0.4, -0.4, -0.4, ...","[0.0, 0.0, 0.5355555555555556, 0.4755555555555...","[238988.135652909, 238988.135652909, 238988.13...",14,1399
201,64.0,Leotiomycetes_jgi|Oidma1|18868|fgenesh1_kg.6_#...,Oidma1,Yes,Oidiodendron maius,913774.0,,MDHSLAGDPWYPLFPPEDQFEVPKIEDSS,MDHSLAGDPWYPLFPPEDQFEVPKIEDSSPLLPEEELEVSEALRTS...,MDHSLAGDPWYPLFPPEDQFEVPKIEDSSPLLPEEELEVSEALRTS...,142,9,"[142319.55237852785, 98529.82136074522, 83943....","[0.0, 0.0, -0.2, -0.2, 0.0, -0.2, -0.2, -0.2, ...","[0.0, 0.0, 0.45999999999999996, 0.457777777777...","[76349.80070605988, 76349.80070605988, 76349.8...",9,1404


In [177]:
def make_activity_matrix(data, duplicates=True):
    """
    Used in method activity_heatmap_on_tree
    Creates a pandas dataframe of the activites where now each position
    in the sequence occupies its own cell. Entries are padded with -1
    Also merges species with multiple genes (so there is one row per specie)

    Returns 
    -------
    pd.DataFrame
        The reformatted pandas dataframe, as described above

    """ 
         

    if duplicates: 
        used_names = []
        padded_dict = {}   
        # For each column
        while len(used_names) < len(merged_data):
            current_data = merged_data[[n not in used_names for n in merged_data['name']]].sort_values("Length", ascending=True).drop_duplicates("TreeLeaf", keep='first').reset_index()
            align_to = max(current_data["Location_WxxLF"])
            current_data.loc[:,"pad_by"] = list(current_data["Location_WxxLF"] * -1 + align_to)
            end_align = max(current_data["Length"] + current_data["pad_by"])

            for row_index in current_data.index:
                sub_list = current_data.loc[row_index, 'SmoothedActivitesLoess']
                new_list = [-1] * current_data.loc[row_index, 'pad_by'] + list(sub_list) + [-1] * (end_align - current_data.loc[row_index, 'pad_by'] - len(sub_list) + 50)
                # print(align_to)
                curr_name = current_data.loc[row_index, 'TreeLeaf']
                if curr_name in padded_dict.keys():
                    padded_dict[curr_name] = padded_dict[curr_name] + new_list
                else:
                    padded_dict[curr_name] = new_list
                used_names.append(current_data.loc[row_index, "name"])

        activity_position_matrix = pd.DataFrame(padded_dict.values())
        activity_position_matrix.index = padded_dict.keys()
        activity_position_matrix = activity_position_matrix.fillna(-1)

    else:
        positional_activity_df_padded = []
        names = []
        for group_name, df_group in data.groupby("TreeLeaf"):
            new_list = []
            df_group = df_group.sort_values("Length", ascending=False).reset_index()

            sub_list = df_group.loc[0, 'SmoothedActivitesLoess']

            # new_list = [-1] * df_group.loc[0, 'pad_by'] + list(sub_list) 
            new_list = [-1] * df_group.loc[0, 'pad_by'] + list(sub_list) 

            positional_activity_df_padded.append(new_list)
            names.append(group_name)

        activity_position_matrix = pd.DataFrame(positional_activity_df_padded)
        activity_position_matrix.index = names
        activity_position_matrix = activity_position_matrix.fillna(-1)

    return activity_position_matrix

In [178]:
activity_position_matrix = make_activity_matrix(merged_data)
activity_position_matrix.to_csv("temp_position_activities.tsv", sep='\t')

with open("temp_position_activities.tsv",'r') as f:
    matrix = f.readlines()
    matrix = ''.join(matrix)
    matrix = '#Names' + matrix

# Random code that modifies the ete functionality

In [170]:
# Replacing some of ete's built in functions with our own so that we can use the colors/formatting that we want

def get_color_gradient(self,colorscheme='Reds'):
    from PyQt5 import QtGui
    import matplotlib.colors as colors
    import matplotlib.cm as cmx
    cmap0 = colors.LinearSegmentedColormap.from_list('', ['white', 'darkblue'])
    cNorm  = colors.Normalize(vmin=0, vmax=1)
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap0)
    color_scale = []
    for scale in np.linspace(0, 1, 201):
        # rgba = scalarMap.to_rgba(scale)
        # hex_color = '#%02x%02x%02x' % (int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255))
        # # hex_color = '#%02x%02x%02x' %scalarMap.to_rgba(scale)[:3]
        [r,g,b,a] = scalarMap.to_rgba(scale, bytes=True)
        color_scale.append( QtGui.QColor( r, g, b, a ) )

    return color_scale

# Replacing the get_color_gradient method with our custom method
ProfileFace.get_color_gradient = get_color_gradient

def draw_heatmap_profile(self):
    try:
        from numpy import isfinite as _isfinite, ceil
    except ImportError:
        pass
    else:
        isfinite = lambda n: n and _isfinite(n)

    from PyQt5.QtGui import QColor, QBrush, QPainter, QPixmap
    from PyQt5.QtCore import QRectF

    # Calculate vector
    vector = self.node.profile
    deviation = self.node.deviation
    # If no vector, skip
    if vector is None:
        return

    colors = self.get_color_gradient()

    leaves = self.node.get_leaves()

    vlength = len(vector)
    # pixels per array position
    img_height = self.height * len(leaves)
    profile_width = self.width
    profile_height= img_height

    x_alpha = float( profile_width / (len(vector)) )

    # Creates a pixmap
    self.pixmap = QPixmap(self.width, img_height)
    self.pixmap.fill(QColor("white"))
    p = QPainter(self.pixmap)

    x2 = 0
    y  = 0
    y_step = self.height
    for leaf in leaves:
        mean_vector = leaf.profile
        deviation_vector = leaf.deviation
        # Draw heatmap
        for pos in range(vlength):
            # first and second X pixel positions
            x1 = x2
            x2 = x1 + x_alpha
            dev1 = self.fit_to_scale(deviation_vector[pos])
            mean1 = self.fit_to_scale(mean_vector[pos])
            # Set heatmap color
            # if not np.isfinite(mean1):
            #     customColor = QColor("white")
            if (mean_vector[pos] == -1.0) or (math.isnan(mean1)):
                # color_index = abs(int(ceil(((self.center_v-mean1)*100)/(self.max_value-self.center_v))))
                customColor = QColor('white') # Color of the padding values
            elif mean1>self.center_v:
                color_index = abs(int(ceil(((self.center_v-mean1)*100)/(self.max_value-self.center_v))))
                customColor = colors[100 + color_index]
            elif mean1<self.center_v:
                color_index = abs(int(ceil(((self.center_v-mean1)*100)/(self.min_value-self.center_v))))
                customColor = colors[100 - color_index]
            else:
                # color_index = abs(int(ceil(((self.center_v-mean1)*100)/(self.max_value-self.center_v))))
                customColor = colors[100]

            # Fill bar with custom color
            p.fillRect(QRectF(x1, y, x_alpha, y_step), QBrush(customColor))
        y+= y_step
        x2 = 0
    p.end()

# Replacing the draw_heatmap_profile with our custom function
ProfileFace.draw_heatmap_profile = draw_heatmap_profile

In [175]:
# Read in gene tree
t_base = Tree('../data/phylogenetic_info/myco-species-tree_added.nwk', format=1)

# For some reason, ClusterTree doesn't have the format option
t = ClusterTree(t_base.write(format=0),text_array=matrix)

In [176]:
# t = ClusterTree('Phylogeny_test.tree', text_array=matrix)
length=20000
width=20

# Prune the tree to only contain the species we are interested in
leafs = t.get_leaf_names()
clade_leaves = [leaf for leaf in leafs if leaf in matrix]
# clade_leaves = [leaf for leaf in clade_leaves if leaf in yeast_clade]
# t.prune(clade_leaves)

array =  t.arraytable

# Calculates some stats on the matrix. Needed to establish the color gradients.
matrix_dist = [i for r in range(len(array.matrix))\
               for i in array.matrix[r] if np.isfinite(i)]
matrix_max = np.max(matrix_dist)
matrix_min = np.min(matrix_dist)
matrix_avg = matrix_min+((matrix_max-matrix_min)/2)


def mylayout(node):
    # Creates a profile face that will represent node's profile as a
    # heatmap
    profileFace  = ProfileFace(matrix_max, 0, matrix_avg, \
                                            length, width, "heatmap", colorscheme=0)
    
    # This is the beatufil line that allows the heatmap to look good on the whole tree!
    profileFace.rotable = False
    # Creates my own layout function that uses previous faces
    # If node is a leaf
    if node.is_leaf():
        name = TextFace(node.name, fsize=15)
        add_face_to_node(name, node, column=0, position="aligned")

        # And a line profile
        add_face_to_node(profileFace, node, 1, position="aligned")
        node.img_style["size"]=0


# Use my layout to visualize the tree
ts = TreeStyle()
ts.layout_fn = mylayout

# ts.mode = "c"
ts.show_leaf_name = False    
ts.draw_guiding_lines = True

# Use my layout to visualize the tree
# ts = TreeStyle()
# ts.layout_fn = mylayout
# t.show(tree_style=ts)

t.render("../figures/yeast_species_tree_heatmaps_all_genes_loess_smoothing.pdf", tree_style = ts)

{'nodes': [[0.5, 2135.33243560791, 4.5, 2139.33243560791, 0, None],
  [12.289671698643396,
   1801.6648712158203,
   16.289671698643396,
   1805.6648712158203,
   1,
   None],
  [29.907277893126434,
   1313.5933532714844,
   33.907277893126434,
   1317.5933532714844,
   2,
   None],
  [36.725127620712804,
   1118.1867065429688,
   40.725127620712804,
   1122.1867065429688,
   3,
   None],
  [55.62971259971588,
   811.1234130859375,
   59.62971259971588,
   815.1234130859375,
   4,
   None],
  [61.89680284746398,
   547.778076171875,
   65.89680284746399,
   551.778076171875,
   5,
   None],
  [66.63712787049035,
   426.71240234375,
   70.63712787049035,
   430.71240234375,
   6,
   None],
  [72.26915871801677,
   379.4248046875,
   76.26915871801677,
   383.4248046875,
   7,
   None],
  [81.11769986106124,
   339.849609375,
   85.11769986106124,
   343.849609375,
   8,
   None],
  [90.07676756638618, 305.69921875, 94.07676756638618, 309.69921875, 9, None],
  [94.6897060430554, 282.3984

In [142]:
activity_position_matrix = make_activity_matrix(merged_data, duplicates=False)
activity_position_matrix.to_csv("temp_position_activities.tsv", sep='\t')

with open("temp_position_activities.tsv",'r') as f:
    matrix = f.readlines()
    matrix = ''.join(matrix)
    matrix = '#Names' + matrix

# Read in gene tree
t_base = Tree('../data/phylogenetic_info/myco-species-tree_added.nwk', format=1)

# For some reason, ClusterTree doesn't have the format option
t = ClusterTree(t_base.write(format=0),text_array=matrix)

3335


In [143]:
# t = ClusterTree('Phylogeny_test.tree', text_array=matrix)
length=1000
width=10

# Prune the tree to only contain the species we are interested in
leafs = t.get_leaf_names()
clade_leaves = [leaf for leaf in leafs if leaf in matrix]
# clade_leaves = [leaf for leaf in clade_leaves if leaf in yeast_clade]
# t.prune(clade_leaves)

array =  t.arraytable

# Calculates some stats on the matrix. Needed to establish the color gradients.
matrix_dist = [i for r in range(len(array.matrix))\
               for i in array.matrix[r] if np.isfinite(i)]
matrix_max = np.max(matrix_dist)
matrix_min = np.min(matrix_dist)
matrix_avg = matrix_min+((matrix_max-matrix_min)/2)


def mylayout(node):
    # Creates a profile face that will represent node's profile as a
    # heatmap
    profileFace  = ProfileFace(matrix_max, 0, matrix_avg, \
                                            length, width, "heatmap", colorscheme=0)
    
    # This is the beatufil line that allows the heatmap to look good on the whole tree!
    profileFace.rotable = False
    # Creates my own layout function that uses previous faces
    # If node is a leaf
    if node.is_leaf():
        name = TextFace(node.name, fsize=15)
        add_face_to_node(name, node, column=0, position="aligned")

        # And a line profile
        add_face_to_node(profileFace, node, 1, position="aligned")
        node.img_style["size"]=0


# Use my layout to visualize the tree
ts = TreeStyle()
ts.layout_fn = mylayout

# ts.mode = "c"
ts.show_leaf_name = False    
ts.draw_guiding_lines = True

# Use my layout to visualize the tree
# ts = TreeStyle()
# ts.layout_fn = mylayout
# t.show(tree_style=ts)

t.render("../figures/yeast_species_tree_heatmaps_all_genes_loess_smoothing_one_per_species.pdf", tree_style = ts)

{'nodes': [[0.5, 2028.5158138275146, 4.5, 2032.5158138275146, 0, None],
  [12.289671698643396,
   1711.5316276550293,
   16.289671698643396,
   1715.5316276550293,
   1,
   None],
  [29.907277893126434,
   1247.8636856079102,
   33.907277893126434,
   1251.8636856079102,
   2,
   None],
  [36.725127620712804,
   1062.2273712158203,
   40.725127620712804,
   1066.2273712158203,
   3,
   None],
  [55.62971259971588,
   770.5172424316406,
   59.62971259971588,
   774.5172424316406,
   4,
   None],
  [61.89680284746398,
   520.3391723632812,
   65.89680284746399,
   524.3391723632812,
   5,
   None],
  [66.63712787049035,
   405.3267822265625,
   70.63712787049035,
   409.3267822265625,
   6,
   None],
  [72.26915871801677,
   360.403564453125,
   76.26915871801677,
   364.403564453125,
   7,
   None],
  [81.11769986106124,
   322.80712890625,
   85.11769986106124,
   326.80712890625,
   8,
   None],
  [90.07676756638618,
   290.3642578125,
   94.07676756638618,
   294.3642578125,
   9,
  