# Adesoji Adeshina #
# BIOE 190 Programming Project#
## Comparing topology of phylogenetic trees generated using different techniques ##

In [7]:
### Preamble: imports and stuff like that ###

import scipy
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
from Bio import AlignIO
from Bio.SubsMat.MatrixInfo import blosum62 as blosum
from ete3 import Tree
from __future__ import division

# The 20 amino acids as one-letter codes. A residue's position in this list is
# its row/column index in BLOSUM62 and its row index in every profile matrix.
aas = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

# Add the mirrored (b, a) key for every (a, b) entry so that a residue pair can
# be looked up in either order. The mirrored entries are collected into a list
# first so the dict is never resized while it is still being iterated.
blosum.update([((b, a), val) for (a, b), val in blosum.items()])

# Dense substitution matrix: BLOSUM62[i, j] is the score of aas[i] vs aas[j].
BLOSUM62 = np.zeros((len(aas), len(aas)))
for i, row_aa in enumerate(aas):
    for j, col_aa in enumerate(aas):
        BLOSUM62[i, j] = blosum[(row_aa, col_aa)]

# Column written into a BLOSUM profile for any alignment symbol that is not
# one of the residues in aas (see blosum62_profile).
gap_penalties = np.array([-4] * len(aas))

## Part 1 - Multiple Sequence Alignment ## 

In [133]:
#MSA for TLR
# The with-block closes the file as soon as the alignment has been parsed.
with open("tlrs.fasta.txt", "rU") as tlr_handle:
    tlrs = AlignIO.read(tlr_handle, "fasta")

In [134]:
#MSA for KCNA
# The with-block closes the file as soon as the alignment has been parsed.
with open("kcns.fasta.txt", "rU") as kcn_handle:
    kcns = AlignIO.read(kcn_handle, "fasta")

In [137]:
#storing human readable gene names
# acc_id maps an accession (second "|"-separated field of a FASTA header) to
# the entry name (third field, up to its first space).
acc_id = {}
# Record names of the aligned sequences, in alignment row order.
tlr_acc = [tlr.name for tlr in tlrs]
kcn_acc = [kcn.name for kcn in kcns]
# Both header files share one format, so they are parsed by the same loop;
# the with-block makes sure each file is closed after it has been read.
for fasta_path in ("tlrs_with_id.fasta.txt", "kcns_with_id.fasta.txt"):
    with open(fasta_path, "rU") as fasta_file:
        for line in fasta_file:
            if ">" in line:
                arr = line.split("|")
                acc_id[arr[1]] = arr[2].split(" ")[0]

{'Q9EPQ1': 'TLR1_MOUSE', 'Q9EPW9': 'TLR6_MOUSE', 'P08510': 'KCNAS_DROME', 'P10499': 'KCNA1_RAT', 'Q689D1': 'TLR2_CANLF', 'P50638': 'KCNA5_RABIT', 'P19024': 'KCNA5_RAT', 'Q28527': 'KCNA4_MUSPF', 'P22739': 'KCNA2_XENLA', 'Q17ST2': 'KCNA7_MOUSE', 'Q9MYW3': 'TLR4_HORSE', 'Q9BXR5': 'TLR10_HUMAN', 'P15385': 'KCNA4_RAT', 'Q8SPE9': 'TLR4_PONPY', 'Q9R1F8': 'TLR2_CRIGR', 'O60603': 'TLR2_HUMAN', 'Q9QUK6': 'TLR4_MOUSE', 'P22460': 'KCNA5_HUMAN', 'P17659': 'KCNA6_RAT', 'P17658': 'KCNA6_HUMAN', 'P16390': 'KCNA3_MOUSE', 'Q6T752': 'TLR2_HORSE', 'B3Y613': 'TLR2_PANTR', 'B3Y615': 'TLR2_GORGO', 'B3Y614': 'TLR2_PANPA', 'Q09470': 'KCNA1_HUMAN', 'B3Y618': 'TLR2_MACMU', 'B5T267': 'TLR2_BOSIN', 'Q8I4B0': 'KCNSK_CAEEL', 'P22001': 'KCNA3_HUMAN', 'Q9I830': 'KCNA2_ONCMY', 'Q704V6': 'TLR6_BOVIN', 'Q68Y56': 'TLR4_PIG', 'Q96RP8': 'KCNA7_HUMAN', 'Q2V897': 'TLR2_BOSTR', 'Q7T199': 'KCA10_CHICK', 'Q2V898': 'TLR4_BOSTR', 'Q8SPE8': 'TLR4_GORGO', 'Q61423': 'KCNA4_MOUSE', 'Q9QUN7': 'TLR2_MOUSE', 'Q61923': 'KCNA6_MOUSE', 'Q9D

## Part 2 - Profile Construction ##

In [10]:
def construct_profile(msa, **kwargs):
    """Build one profile per row of a multiple sequence alignment.

    Args:
        msa: alignment object supporting ``len(msa)`` and ``msa[i, :]`` row
            access (in this file, the result of ``AlignIO.read``).
        **kwargs: ``profile_type`` selects how each row is converted:
            ``"addone"`` uses ``add_n_profile`` with ``n = 0.1``;
            ``"blosum"`` uses ``blosum62_profile``.

    Returns:
        A list with one Profile per alignment row, or ``None`` when
        ``profile_type`` is missing or is not one of the names above.
    """
    profile_type = kwargs.get("profile_type")
    if profile_type == "addone":
        n = 0.1
        # range (rather than xrange) keeps this runnable on Python 2 and 3.
        return [add_n_profile(msa[i, :], n) for i in range(len(msa))]
    if profile_type == "blosum":
        return [blosum62_profile(msa[i, :]) for i in range(len(msa))]
    # Unknown or absent profile type: kept as a None result for compatibility.
    return None
        
def add_n_profile(aligned_seq, n):
    """Build a pseudocount profile for a single aligned sequence.

    Every cell of a 20 x len(aligned_seq) matrix starts at 1. For each
    position whose symbol is in ``aas``, ``n`` is added to that residue's row.
    The whole matrix is then divided by 21 and wrapped in a Profile tagged
    "addone". Symbols not in ``aas`` leave their column at the uniform value.

    NOTE(review): the divisor is fixed at 21, so a residue column sums to
    (20 + n) / 21, which is 1 only when n == 1 -- confirm this is intended.
    """
    counts = np.ones((20, len(aligned_seq)))
    for column, symbol in enumerate(aligned_seq):
        if symbol not in aas:
            continue
        counts[aas.index(symbol), column] += n
    return Profile(counts / 21, "addone")

def blosum62_profile(aligned_seq):
    """Build a substitution-score profile for a single aligned sequence.

    Column i of the returned 20 x len(aligned_seq) matrix is the BLOSUM62 row
    of the residue at position i, or ``gap_penalties`` when the symbol at that
    position is not in ``aas``. The matrix is wrapped in a Profile tagged
    "blosum".
    """
    scores = np.zeros((20, len(aligned_seq)))
    for column, symbol in enumerate(aligned_seq):
        if symbol not in aas:
            scores[:, column] = gap_penalties
        else:
            scores[:, column] = BLOSUM62[aas.index(symbol), :]
    return Profile(scores, "blosum")

class Profile(object):
    """A profile matrix together with a label saying how it was built."""

    def __init__(self, data, kind):
        # data: the profile values; the builders in this file pass a numpy
        # array with 20 rows and one column per alignment position.
        self.data = data
        # kind: label of the construction method, e.g. "addone" or "blosum".
        self.kind = kind

    def __repr__(self):
        # The method was previously spelled with three trailing underscores,
        # so repr() never used it; repr() of a Profile is now its kind label.
        return str(self.kind)

## Part 3 - Profile Scoring ##

In [89]:
#profile data structure is a numpy matrix with 20 rows (one per amino acid, ordered as in aas) and one column per alignment position

def score(profile1, profile2, **kwargs):
    """Score two profiles with the scoring function named in ``scoring_fn``.

    Args:
        profile1, profile2: objects whose ``data`` attribute holds the profile
            matrix handed to the scoring function.
        **kwargs: ``scoring_fn`` is ``"kullback_leibler"`` or
            ``"jensen_shannon"``.

    Returns:
        The value returned by the selected scoring function, or ``None`` when
        no ``scoring_fn`` keyword is given.

    Raises:
        ValueError: if ``scoring_fn`` is given but is not a known name. (This
            path previously failed with a NameError on an undefined variable.)
    """
    if "scoring_fn" not in kwargs:
        return None
    scoring_fn = kwargs["scoring_fn"]
    if scoring_fn == "kullback_leibler":
        return kullback_leibler_score(profile1.data, profile2.data)
    if scoring_fn == "jensen_shannon":
        return jensen_shannon_score(profile1.data, profile2.data)
    raise ValueError("unknown scoring_fn: %r" % (scoring_fn,))

#symmetrized version is D_kl = D(P||Q) + D(Q||P)
def kullback_leibler_score(p, q):
    """Return the symmetrized Kullback-Leibler divergence summed over columns.

    ``p`` and ``q`` are arrays of equal shape. D(P||Q) and D(Q||P) are each
    computed per column (summing over axis 0), added together, and the
    per-column totals are then summed into a single number.
    """
    forward = (p * np.log(p / q)).sum(axis=0)
    backward = (q * np.log(q / p)).sum(axis=0)
    per_column = forward + backward
    return sum(per_column)

#D_js = 1/2 (D(P||M) + D(Q||M))
#M = 1/2 (P + Q)
def jensen_shannon_score(p, q):
    """Return the Jensen-Shannon divergence summed over columns.

    ``p`` and ``q`` are arrays of equal shape. Each is compared against their
    element-wise mean M with a per-column KL term (summing over axis 0); the
    two terms are averaged per column and the columns are summed.
    """
    midpoint = (p + q) * 0.5
    p_vs_mid = (p * np.log(p / midpoint)).sum(axis=0)
    q_vs_mid = (q * np.log(q / midpoint)).sum(axis=0)
    per_column = (p_vs_mid + q_vs_mid) * 0.5
    return sum(per_column)

## Part 4 - Tree Construction ##

In [187]:
def merge(profile1, profile2):
    """Combine two profiles into one by averaging their data element-wise.

    The result is a new Profile that carries the ``kind`` of ``profile1``.
    """
    averaged = (profile1.data + profile2.data) * 0.5
    return Profile(averaged, profile1.kind)

def agglomeration(profiles, accs):
    """Cluster profiles bottom-up and return the resulting tree as a string.

    The closest pair (per ``find_min_score``) is merged repeatedly until one
    cluster remains. Input profiles have ids 0..len(profiles)-1; each merged
    cluster gets the next unused id, which is also its index in the working
    profile list. The list of merged id pairs is printed before returning.

    Args:
        profiles: list of Profile objects, one per sequence. It is not
            modified; merged profiles are appended to an internal copy.
        accs: names of the sequences, indexed like ``profiles``; passed
            through to ``get_tree_input``.

    Returns:
        The string produced by ``get_tree_input`` for the recorded merges.

    Raises:
        ValueError: if fewer than two profiles are given (no merge can be
            made, so there is no tree to serialise).
    """
    if len(profiles) < 2:
        raise ValueError("at least two profiles are required to build a tree")

    # Copy so repeated calls with the same list never see stale merged
    # profiles appended by an earlier run.
    profiles = list(profiles)
    clusters = len(profiles)
    new_profile_id = len(profiles) - 1
    # A real list is needed because ids are removed/appended below.
    profile_ids = list(range(len(profiles)))
    merged = []          # merged id pairs, in merge order
    merged_dict = {}     # (id, id) pair -> id of the cluster it formed
    reverse_merge = {}   # cluster id -> the (id, id) pair it was formed from

    while clusters > 1:
        closest1, closest2, idx = find_min_score(profiles, profile_ids)
        merged.append(idx)
        clusters -= 1
        new_profile_id += 1
        merged_dict[idx] = new_profile_id
        reverse_merge[new_profile_id] = idx
        # The two merged clusters are replaced by the new one.
        profile_ids.remove(idx[0])
        profile_ids.remove(idx[1])
        profile_ids.append(new_profile_id)
        profiles.append(merge(closest1, closest2))
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(merged)
    return get_tree_input(merged, merged_dict, reverse_merge, accs)

def find_min_score(profiles, ids, scoring_fn="kullback_leibler"):
    """Find the pair of active profiles with the lowest score.

    Args:
        profiles: list of Profile objects, indexed by profile id.
        ids: list of the profile ids still eligible for merging.
        scoring_fn: name of the scoring function forwarded to ``score``;
            defaults to the previously hard-coded "kullback_leibler".

    Returns:
        ``(profile_a, profile_b, (id_a, id_b))`` for the lowest-scoring pair.
        Pairs are visited in the order they appear in ``ids`` and only a
        strictly lower score replaces the current best, so the first minimum
        wins on ties.

    Raises:
        ValueError: if ``ids`` holds fewer than two entries (previously this
            silently returned the last profile twice with ids (-1, -1)).
    """
    if len(ids) < 2:
        raise ValueError("need at least two profile ids to find a closest pair")

    min_score = np.inf
    indices = (-1, -1)
    for position, first in enumerate(ids):
        for second in ids[position + 1:]:
            curr_score = score(profiles[first], profiles[second], scoring_fn=scoring_fn)
            if curr_score < min_score:
                min_score = curr_score
                indices = (first, second)
    return (profiles[indices[0]], profiles[indices[1]], indices)
    

#iterate through the list of merged id pairs and generate the tree string
def get_tree_input(merged, merged_dict, reverse_merge, acc):
    """Serialise recorded merges as a nested "(a,b)" string ending in ";".

    Starting from the first recorded merge, the function follows the chain of
    clusters that contain it, each time wrapping the string built so far
    together with the string of the cluster it was merged with.

    Args:
        merged: sequence of (id, id) tuples; merged[0] is the starting point.
        merged_dict: maps an (id, id) tuple to the id of the cluster it formed.
        reverse_merge: maps a cluster id back to the (id, id) tuple it was
            formed from.
        acc: sequence of names indexed by id; ids below len(acc) are treated
            as leaves. Each name is translated through the module-level
            ``acc_id`` dict before being written.

    Returns:
        The tree string, terminated by ";".
    """
    def get_pairing(label, merged):
        # First tuple in `merged` that has `label` as a member; (0, 0) when
        # there is none.
        for merger in merged:
            if label in merger:
                return merger
        return (0, 0)
    def edge(a, b):
        # Join two subtree strings as "(a,b)".
        return "(" + a + "," + b + ")"
    
    # NOTE: this local helper shadows the module-level merge() inside
    # get_tree_input. It expands the pair (a, b) into a string: ids below
    # `thresh` are written as names, other ids are looked up in reverse_merge
    # and expanded recursively.
    def merge(a, b, thresh, acc):
        if a < thresh:
            if b < thresh:
                return edge(acc_id[acc[a]], acc_id[acc[b]])
            newa, newb = reverse_merge[b]
            return edge(acc_id[acc[a]], merge(newa, newb, thresh, acc))
        if b < thresh:
            newa, newb = reverse_merge[a]
            return edge(merge(newa, newb, thresh, acc), acc_id[acc[b]])
        a1, b1 = reverse_merge[a]
        a2, b2 = reverse_merge[b]
        return edge(merge(a1, b1, thresh, acc), merge(a2, b2, thresh, acc))
    
    merger = merged[0]
    # merged[0][0] is used directly as an index into acc, i.e. it is assumed
    # to be a leaf id -- TODO confirm against the caller.
    left = acc_id[acc[merger[0]]]
    right_val = merger[1]
    while merger in merged_dict:
        # label: id of the cluster formed by the current pair.
        label = merged_dict[merger]
        # Build the string for the other member of the current pair.
        if right_val < len(acc):
            right = acc_id[acc[right_val]]
        else:
            (a, b) = reverse_merge[right_val]
            right = merge(a, b, len(acc), acc)

        # Next pair up the chain: the one that contains the cluster just formed.
        merger = get_pairing(label, merged)
        # get_pairing never returns a falsy value ((0, 0) is a non-empty
        # tuple), so this break does not fire; the loop instead ends when
        # `merger` is not a key of merged_dict.
        if not merger:
            break
        left = edge(left, right)
        # The member of the new pair that is not `label` is the next sibling.
        right_val = merger[0] if label == merger[1] else merger[1]
    out_str = left + ";"
    return out_str
            



In [188]:
# Build both profile flavours ("addone" and "blosum") for each alignment.
tlr_addone_profiles = construct_profile(tlrs, profile_type="addone")
tlr_blosum_profiles = construct_profile(tlrs, profile_type="blosum")

kcn_addone_profiles = construct_profile(kcns, profile_type="addone")
kcn_blosum_profiles = construct_profile(kcns, profile_type="blosum")

# Cluster the "addone" profiles of each family; each call returns the tree
# string for that family (the blosum profiles are not used in this cell).
tlr_tree = agglomeration(tlr_addone_profiles, tlr_acc)
kcn_tree = agglomeration(kcn_addone_profiles, kcn_acc)

[(16, 17), (18, 20), (46, 47), (37, 38), (13, 14), (36, 49), (23, 25), (22, 52), (12, 15), (24, 53), (19, 55), (21, 56), (39, 51), (31, 32), (48, 50), (54, 57), (11, 61), (42, 58), (26, 60), (9, 64), (62, 65), (40, 41), (8, 10), (27, 28), (66, 68), (6, 7), (35, 67), (44, 59), (63, 73), (33, 74), (34, 75), (72, 76), (2, 3), (4, 78), (5, 79), (0, 1), (80, 81), (69, 70), (71, 82), (83, 84), (77, 85), (43, 86), (29, 87), (30, 88), (45, 89)]
[(3, 4), (1, 2), (5, 33), (10, 12), (6, 35), (15, 16), (7, 37), (27, 28), (0, 34), (13, 14), (38, 42), (20, 21), (17, 43), (25, 40), (11, 36), (30, 31), (8, 39), (19, 23), (18, 50), (44, 51), (9, 49), (41, 53), (47, 54), (26, 29), (22, 55), (46, 57), (48, 58), (56, 59), (52, 60), (45, 61), (24, 62), (32, 63)]


In [189]:
# Parse the tree strings returned by agglomeration into ete3 Tree objects.
t_tlr = Tree(tlr_tree)
t_kcn = Tree(kcn_tree)

In [190]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(t_tlr)


                                       /-TLR2_HUMAN
                                    /-|
                                   |   \-TLR2_GORGO
                                 /-|
                                |  |   /-TLR2_PANTR
                                |   \-|
                              /-|      \-TLR2_PANPA
                             |  |
                             |  |   /-TLR2_MACMU
                           /-|   \-|
                          |  |      \-TLR2_MACFA
                        /-|  |
                       |  |   \-TLR2_CANLF
                       |  |
                     /-|   \-TLR2_HORSE
                    |  |
                    |  |   /-TLR2_GIRCA
                    |  |  |
                    |   \-|      /-TLR2_SHEEP
                    |     |   /-|
                    |     |  |   \-TLR2_CAPIB
                    |      \-|
                    |        |   /-TLR2_BOSTR
                    |         \-|
                  /-|           |

In [186]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(t_kcn)


                                             /-KCNA2_RAT
                                          /-|
                                       /-|   \-KCNA2_MOUSE
                                      |  |
                                    /-|   \-KCNA2_HUMAN
                                   |  |
                                 /-|   \-KCNA2_RABIT
                                |  |
                              /-|   \-KCNA2_CANLF
                             |  |
                           /-|   \-KCNA2_XENLA
                          |  |
                        /-|   \-KCNA2_ONCMY
                       |  |
                       |  |   /-KCNA1_HUMAN
                       |   \-|
                     /-|     |   /-KCNA1_MOUSE
                    |  |      \-|
                    |  |         \-KCNA1_RAT
                    |  |
                    |  |   /-KCNA3_HUMAN
                  /-|   \-|
                 |  |     |   /-KCNA3_RAT
                 |  |      \-|
      

In [None]:
# Minimal three-leaf example tree, used to check the Tree string format.
t = Tree( "((a,b),c);" )

In [6]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(t)


      /-a
   /-|
--|   \-b
  |
   \-c
