In [None]:
'''
In this chapter, we define the length of a path in a tree as the sum of the lengths of its edges (rather than the number of edges 
on the path). As a result, the evolutionary distance between two present-day species corresponding to leaves i and j in a tree T is 
equal to the length of the unique path connecting i and j, denoted di,j(T).
Distances Between Leaves Problem: Compute the distances between leaves in a weighted tree.
     Input:  An integer n followed by the adjacency list of a weighted tree with n leaves.
     Output: An n x n matrix (di,j), where di,j is the length of the path between leaves i and j.
Code Challenge: Solve the Distances Between Leaves Problem. The tree is given as an adjacency list of a graph whose leaves are 
integers between 0 and n - 1; the notation a->b:c means that node a is connected to node b by an edge of weight c. The matrix 
you return should be space-separated.
Sample Input:
4
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
Sample Output:
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0

filename = trial.txt-
4
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
'''

from collections import defaultdict, deque
# Input file named in the header note of this cell: an integer n on the first
# line followed by one "a->b:c" edge per line.
filename = 'trial.txt'

with open(filename) as file:
    # strip() instead of line[:-1]: the last line of a file may have no
    # trailing newline, in which case [:-1] would cut off a real character.
    # Blank lines are ignored so a trailing empty line does not break int().
    data = [line.strip() for line in file if line.strip()]

# Number of leaves announced by the input.
n = int(data[0])

# Parallel lists describing the directed edge records "a->b:c".
from_node = []
to_node = []
weight = []

for line in data[1:]:
    source, rest = line.split('->')
    target, cost = rest.split(':')
    from_node.append(int(source))
    to_node.append(int(target))
    weight.append(int(cost))

# Distinct nodes that appear on the left-hand side of an edge record.
nodes = list(set(from_node))

class Graph(object):
    """Weighted graph: a node set, adjacency lists and an edge-length table.

    ``edges`` maps a node to the list of its neighbours (every ``add_edge``
    call records the neighbour on both endpoints), while ``distances`` is
    keyed by the ordered pair ``(from_node, to_node)`` exactly as given.
    """

    def __init__(self):
        self.nodes, self.edges, self.distances = set(), defaultdict(list), {}

    def add_node(self, value):
        """Register ``value`` as a node of the graph."""
        self.nodes.add(value)

    def add_edge(self, from_node, to_node, distance):
        """Record an edge of length ``distance`` from ``from_node`` to ``to_node``."""
        # Adjacency is stored symmetrically; the length only for the given direction.
        for tail, head in ((from_node, to_node), (to_node, from_node)):
            self.edges[tail].append(head)
        self.distances[(from_node, to_node)] = distance


def dijkstra(graph, initial):
    """Single-source shortest paths over ``graph`` starting at ``initial``.

    Args:
        graph: object exposing ``nodes`` (iterable of nodes), ``edges``
            (mapping node -> list of neighbours) and ``distances``
            (mapping ``(node, neighbour)`` -> edge length).
        initial: the source node.

    Returns:
        ``(visited, path)`` where ``visited`` maps each reached node to its
        distance from ``initial`` and ``path`` maps each reached node (other
        than the source) to its predecessor on a shortest path.
    """
    visited = {initial: 0}
    path = {}

    nodes = set(graph.nodes)

    while nodes:
        # Only nodes that already have a tentative distance can be settled.
        reachable = [node for node in nodes if node in visited]
        if not reachable:
            break
        min_node = min(reachable, key=visited.get)

        nodes.remove(min_node)
        current_weight = visited[min_node]

        for edge in graph.edges[min_node]:
            try:
                step = graph.distances[(min_node, edge)]
            except KeyError:
                # The adjacency list is symmetric but a length may be stored
                # for one direction only; skip the direction without a length.
                # Any other error (e.g. a bad weight type) is a real bug and
                # is allowed to propagate.
                continue
            weight = current_weight + step
            if edge not in visited or weight < visited[edge]:
                visited[edge] = weight
                path[edge] = min_node

    return visited, path


def shortest_path(graph, origin, destination):
    """Return the length of the shortest path from ``origin`` to ``destination``.

    Returns 0 when both endpoints are the same node; otherwise runs
    ``dijkstra`` from ``origin`` and looks up the distance of ``destination``.

    Raises:
        KeyError: if ``destination`` was not reached from ``origin``.
    """
    if origin == destination:
        return 0
    # Only the distance is reported, so the predecessor map is not needed and
    # no explicit node path is rebuilt.
    visited, _ = dijkstra(graph, origin)
    return visited[destination]


graph = Graph()

for node in nodes:
    graph.add_node(node)

for source, target, cost in zip(from_node, to_node, weight):
    graph.add_edge(source, target, cost)

# A leaf has exactly one distinct neighbour.  Counting distinct neighbours
# (instead of raw list length) does not depend on every edge being listed in
# both directions, and sorting makes row/column i correspond to leaf label i
# regardless of the order in which edges appeared in the input.
leaf_nodes = sorted(v for v in graph.edges if len(set(graph.edges[v])) == 1)

for i in leaf_nodes:
    # One shortest-path run per leaf yields the whole matrix row.
    row_distances, _ = dijkstra(graph, i)
    print(' '.join(str(row_distances[j]) for j in leaf_nodes))

In [None]:
'''
We now have an algorithm for solving the Limb Length Problem. For each j, we can compute LimbLength(j) by finding the minimum value 
of (D_{i,j} + D_{j,k} - D_{i,k})/2 over all pairs of leaves i and k.
Code Challenge: Solve the Limb Length Problem.
     Input: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated
     additive distance matrix D (whose elements are integers).
     Output: The limb length of the leaf in Tree(D) corresponding to row j of this distance
     matrix (use 0-based indexing).
Sample Input:
4
1
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
Sample Output:
2
'''

from itertools import permutations

def limbLength(mat,n,j):
    """Limb length of leaf ``j`` in the tree fitting the ``n`` x ``n`` matrix ``mat``.

    Evaluates (mat[j][i] + mat[j][k] - mat[i][k]) / 2 for every unordered pair
    of leaves i < k different from ``j`` and returns the smallest value
    (``float("inf")`` when no such pair exists).
    """
    others = [leaf for leaf in range(n) if leaf != j]
    best = float("inf")
    for position, i in enumerate(others):
        # Pair i only with later leaves so each unordered pair is seen once.
        for k in others[position + 1:]:
            candidate = (mat[j][i] + mat[j][k] - mat[i][k]) / 2
            if candidate < best:
                best = candidate
    return best

# Sample input from the problem statement of this cell.
mat = [
    [0, 13, 21, 22],
    [13, 0, 12, 13],
    [21, 12, 0, 13],
    [22, 13, 13, 0],
]
n, j = 4, 1

print(limbLength(mat, n, j))

In [None]:
'''
Implement AdditivePhylogeny to solve the Distance-Based Phylogeny Problem.
     Input: An integer n followed by a space-separated n x n distance matrix.
     Output: A weighted adjacency list for the simple tree fitting this matrix.
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be labeled 0, 1, 
..., n - 1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled in any order but must start 
from n and increase consecutively.
Sample Input:
4
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
Sample Output:
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
AdditivePhylogeny(D, n)
    if n = 2
        return the tree consisting of a single edge of length D1,2
    limbLength ← Limb(D, n)
    for j ← 1 to n - 1
        Dj,n ← Dj,n - limbLength
        Dn,j ← Dj,n
    (i, k) ← two leaves such that Di,k = Di,n + Dn,k
    x ← Di,n
    remove row n and column n from D
    T ← AdditivePhylogeny(D, n - 1)
    v ← the (potentially new) node in T at distance x from i on the path between i and k
    add leaf n back to T by creating a limb (v, n) of length limbLength
    return T
'''
from itertools import permutations
from collections import defaultdict
'''Refer to the AdditivePhylogeny pseudocode above for the idea of re-attaching a limb (adding a branch); this is a concept I still need to master.'''

def additive_phylogence(D,n):
    """Build the simple tree fitting the additive distance matrix ``D``.

    The tree is not returned: it is stored in the module-level ``Tree``, a
    ``defaultdict(list)`` mapping each node to a list of
    ``(neighbour, weight)`` pairs (every edge is recorded on both endpoints).
    Labels for new internal nodes come from the module-level counter
    ``nodesnum``, which must be initialised by the caller before the first
    call (the driver code sets it to the number of leaves).

    Args:
        D: square distance matrix (list of lists).  Only the leading
            ``n`` x ``n`` part is read, and the caller's matrix is left
            unmodified.
        n: number of leaves to place; leaves are labelled 0 .. n-1.

    Raises:
        ValueError: if no pair of leaves (i, k) satisfies
            D[i][k] == D[i][n-1] + D[k][n-1] once the limb has been
            subtracted, or if the attachment point cannot be located on the
            i-k path.  Both indicate that ``D`` is not additive.
    """
    global Tree
    global nodesnum
    if n == 2:
        Tree = defaultdict(list)
        Tree[0].append((1,D[0][1]))
        Tree[1].append((0,D[0][1]))
        return

    # Work on a private n x n copy: the "bald" adjustment below must not leak
    # into the caller's matrix, and stale columns of already removed leaves
    # must not be visible to this level of the recursion.
    D = [list(row[:n]) for row in D[:n]]
    new_leaf = n - 1
    limblen = limbLength(D, new_leaf)

    # D_bald: shorten the limb of the last leaf to zero.
    for j in range(new_leaf):
        D[j][new_leaf] = D[j][new_leaf] - limblen
        D[new_leaf][j] = D[j][new_leaf]

    # Find leaves i, k whose path passes through the attachment point.
    attachment = None
    candidates = [l for l in range(new_leaf) if D[new_leaf][l] != 0]
    for i, k in permutations(candidates, 2):
        if i < k and D[i][new_leaf] + D[k][new_leaf] == D[i][k]:
            attachment = (i, k, D[i][new_leaf])
            break
    if attachment is None:
        raise ValueError("distance matrix is not additive: no leaves i, k with "
                         "D[i][k] == D[i][n-1] + D[n-1][k]")
    i_leaf, k_leaf, x = attachment

    # D_trimmed: drop the last row AND the last column, then recurse.
    additive_phylogence([row[:new_leaf] for row in D[:new_leaf]], new_leaf)

    # Walk from i towards k until distance x has been consumed, then hang the
    # removed leaf there (on an existing node, or on a new node that splits
    # the edge currently being crossed).
    start_node = i_leaf
    for node, dist in pathBetweenLeaves(Tree, i_leaf, k_leaf):
        if dist == 0:
            continue
        if x > dist:
            x -= dist
            start_node = node
        elif x == dist:
            Tree[node].append((new_leaf, limblen))
            Tree[new_leaf].append((node, limblen))
            break
        else:
            new_node = nodesnum
            nodesnum += 1
            Tree[new_node].append((new_leaf, limblen))
            Tree[new_leaf].append((new_node, limblen))
            # Replace edge (start_node, node) by the two halves around new_node.
            Tree[start_node].remove((node, dist))
            Tree[node].remove((start_node, dist))
            Tree[start_node].append((new_node, x))
            Tree[new_node].append((start_node, x))
            Tree[node].append((new_node, dist - x))
            Tree[new_node].append((node, dist - x))
            break
    else:
        raise ValueError("distance matrix is not additive: attachment point "
                         "lies beyond the path between leaves %r and %r"
                         % (i_leaf, k_leaf))

def pathBetweenLeaves(graph,i,k):
    """Return the path from node ``i`` to node ``k`` in ``graph``.

    ``graph`` maps a node to a list of ``(neighbour, weight)`` entries.  The
    result is the list produced by ``pathTo``: ``(node, weight of the edge
    leading into it)`` pairs starting with ``(i, 0)``.
    """
    # Split the combined adjacency into parallel neighbour / weight lists.
    tree = defaultdict(list)
    weights = defaultdict(list)
    for nodei, childList in graph.items():
        for entry in childList:
            tree[nodei].append(entry[0])
            weights[nodei].append(entry[1])
    edgeTo, _ = depthFirstPaths(tree, i)
    return pathTo(tree, weights, k, i, edgeTo)

def depthFirstPaths(tree,s):
    """Run a depth-first search of ``tree`` from ``s``.

    Returns ``(edgeTo, marked)``: both are keyed by node, ``edgeTo`` holding
    the node from which each node was first reached (``None`` if never set)
    and ``marked`` whether the node was visited.
    """
    edgeTo = {key: None for key in tree}
    marked = {key: False for key in tree}
    return dfs(tree, s, edgeTo, marked)

def dfs(tree,v,edgeTo,marked):
    """Depth-first visit of ``v``; fills ``edgeTo`` and ``marked`` in place.

    Returns the same ``(edgeTo, marked)`` objects for convenience.
    """
    marked[v] = True
    for neighbour in tree[v]:
        if marked[neighbour]:
            continue
        edgeTo[neighbour] = v
        dfs(tree, neighbour, edgeTo, marked)
    return edgeTo, marked

def pathTo(tree,weights,v,s,edgeTo):
    """Rebuild the path from ``s`` to ``v`` using the DFS parents in ``edgeTo``.

    Returns a list of ``(node, weight)`` pairs ordered from ``s`` to ``v``;
    each weight is that of the edge entering the node from its predecessor,
    and the first entry is ``(s, 0)``.
    """
    hops = []
    child = v
    while True:
        parent = edgeTo[child]
        # weights[parent] is aligned with tree[parent], so locate child there.
        hops.append((child, weights[parent][tree[parent].index(child)]))
        if not (parent != s):
            break
        child = parent
    hops.append((s, 0))
    hops.reverse()
    return hops

def limbLength(mat,j):
    """Limb length of leaf ``j`` for the distance matrix ``mat``.

    The number of leaves is ``len(mat)``.  Returns the minimum of
    (mat[j][i] + mat[j][k] - mat[i][k]) / 2 over all pairs i < k of leaves
    other than ``j`` (``float("inf")`` if there is no such pair).
    """
    others = [leaf for leaf in range(len(mat)) if leaf != j]
    best = float("inf")
    for position, i in enumerate(others):
        # Only later leaves: every unordered pair is evaluated exactly once.
        for k in others[position + 1:]:
            candidate = (mat[j][i] + mat[j][k] - mat[i][k]) / 2
            if candidate < best:
                best = candidate
    return best

# Sample input from the problem statement of this cell.
D = [
    [0, 13, 21, 22],
    [13, 0, 12, 13],
    [21, 12, 0, 13],
    [22, 13, 13, 0],
]
n = 4
# Internal nodes are numbered starting right after the last leaf.
nodesnum = n
additive_phylogence(D, n)
# Print the adjacency list as "a->b:weight" lines.
for key, vals in Tree.items():
    for val in vals:
        print('{0}->{1}:{2}'.format(key, val[0], int(val[1])))

In [None]:
'''
If we had a molecular clock measuring evolutionary time, then we could assign an age to every node v in a rooted binary tree 
(denoted Age(v)), where all of the leaves of the tree have age 0 because they correspond to present-day species. We could then 
define the weight of an edge (v, w) in the tree as the difference Age(v) - Age(w). Consequently, the length of a path between 
the root and any node would be equal to the difference between their ages. Such a tree, in which the distance from the root to 
any leaf is the same, is called ultrametric.
Our aim is to derive an ultrametric tree that explains a given distance matrix (even if it does so only approximately). UPGMA 
(which stands for Unweighted Pair Group Method with Arithmetic Mean) is a simple clustering heuristic that introduces a hypothetical 
molecular clock for constructing an ultrametric evolutionary tree.
Given an n × n matrix D, UPGMA (which is illustrated in the figure on the next step) first forms n trivial clusters, each containing 
a single leaf. The algorithm then finds a pair of “closest” clusters. To clarify the notion of closest clusters, UPGMA defines the 
distance between clusters C1 and C2 as the average pairwise distance between elements of C1 and C2.
Once UPGMA has identified a pair of closest clusters C1 and C2, it merges them into a cluster C with |C1| + |C2| elements and then 
creates a node for C, which it connects to each of C1 and C2 by a directed edge. The age of C is set to be DC1, C2 /2. UPGMA then 
iterates this process of merging the two closest clusters until only a single cluster remains, which corresponds to the root.
Implement UPGMA.
     Input: An integer n followed by a space separated n x n distance matrix.
     Output: An adjacency list for the ultrametric tree returned by UPGMA. Edge weights should be accurate to two decimal places
     (answers in the sample dataset below are provided to three decimal places).
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be labeled 
0, 1, ..., n - 1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled in any order but 
must start from n and increase consecutively.
Sample Input:
4
0 20 17 11
20 0 20 13
17 20 0 10
11 13 10 0
Sample Output:
0->5:7.000
1->6:8.833
2->4:5.000
3->4:5.000
4->2:5.000
4->3:5.000
4->5:2.000
5->0:7.000
5->4:2.000
5->6:1.833
6->5:1.833
6->1:8.833
UPGMA(D, n) 
        Clusters ← n single-element clusters labeled 1, ... , n
        construct a graph T with n isolated nodes labeled by single elements 1, ... , n
    for every node v in T 
        Age(v) ← 0
    while there is more than one cluster
        find the two closest clusters Ci and Cj
        merge Ci and Cj into a new cluster Cnew with |Ci| + |Cj| elements
        add a new node labeled by cluster Cnew to T
        connect node Cnew to Ci and Cj by directed edges 
        Age(Cnew) ← DCi, Cj / 2
        remove the rows and columns of D corresponding to Ci and Cj
        remove Ci and Cj from Clusters
        add a row/column to D for Cnew by computing D(Cnew, C) for each C in Clusters
        add Cnew to Clusters
    root ← the node in T corresponding to the remaining cluster
    for each edge (v, w) in T
        length of (v, w) ← Age(v) - Age(w)
    return T
'''

from __future__ import division
import sys
from collections import defaultdict

''' In a UPGMA tree, every internal node must have degree three. '''
'''2015-10-30: it took me a while to realize that the size of a cluster is its number of leaves, not its number of nodes.'''
def dfs(tree,v,edgeTo,marked,count):
    """Depth-first visit of ``v`` that also counts the nodes visited.

    ``edgeTo`` and ``marked`` are updated in place.  Returns
    ``(edgeTo, marked, count)`` where ``count`` is the incoming counter plus
    the number of nodes visited by this call (including ``v``).
    """
    marked[v] = True
    visited_total = count + 1
    for w in tree[v]:
        if marked[w]:
            continue
        edgeTo[w] = v
        visited_total = dfs(tree, w, edgeTo, marked, visited_total)[2]
    return edgeTo, marked, visited_total

def count(tree,s):
    """Number of leaves in the cluster rooted at ``s``.

    A node without neighbours counts as one leaf.  Otherwise the nodes
    reachable from ``s`` are counted with ``dfs`` and converted with
    (nodes + 1) / 2, i.e. the relation nodes = 2 * leaves - 1.
    """
    if len(tree[s]) == 0:
        return 1
    # Both bookkeeping dicts are keyed by node.
    parents = {key: None for key in tree}
    seen = {key: False for key in tree}
    reached = dfs(tree, s, parents, seen, 0)[2]
    return (reached+1)/2

def upgma1(D,nodes):
    """Build an ultrametric tree from distance matrix ``D`` with UPGMA.

    Args:
        D: square distance matrix (list of lists); row/column ``i``
            corresponds to ``nodes[i]``.
        nodes: sequence of leaf labels, one per row of ``D``.

    Returns:
        dict mapping every node to a list of ``(neighbour, edge_length)``
        pairs.  Each merge adds a new internal node (labelled from
        ``len(D)`` upwards) linked to the two merged clusters, and an edge
        length is the absolute difference of the ages of its endpoints.
    """
    T = {}
    age = {}
    # Number of leaves per cluster, maintained incrementally so that no tree
    # traversal is needed inside the merge loop.
    size = {}
    for node in nodes:
        T[node] = []
        age[node] = 0
        size[node] = 1

    nodeNum = len(D)
    while len(nodes)>1:
        cv,ci,cj = findMin(D) # ci, cj are row/column indices of D, not node labels
        newnode = nodeNum
        nodeNum += 1

        nodei, nodej = nodes[ci], nodes[cj]
        numi, numj = size[nodei], size[nodej]
        # Rows/columns that survive the merge.
        keep = [m for m in range(len(D)) if m != ci and m != cj]

        # Distance from the merged cluster to every other cluster: average of
        # the two old distances weighted by the number of leaves.
        newRow = [(D[m][ci]*numi + D[m][cj]*numj) / float(numi+numj) for m in keep]
        newRow.append(0)

        T[nodei].append(newnode)
        T[nodej].append(newnode)
        T[newnode] = [nodei, nodej]
        age[newnode] = cv/2.0
        size[newnode] = numi + numj

        Dnew = [[D[i][j] for j in keep] for i in keep]
        for row, value in zip(Dnew, newRow):
            row.append(value)
        Dnew.append(newRow)
        nodes = [nodes[m] for m in keep]
        nodes.append(newnode)
        D = Dnew

    for v in T.keys():
        T[v] = [ (w,abs(age[v]-age[w])) for w in T[v] ]
    return T

def findMin(D):
    """Locate the smallest off-diagonal entry of the square matrix ``D``.

    Returns ``(value, row, column)``; the indices are positions in ``D``, not
    necessarily node labels.  Ties are resolved in favour of the first entry
    in row-major order.  The diagonal is skipped by position, so genuine
    zero distances between different clusters are considered.  For a matrix
    without off-diagonal entries the result is ``(inf, None, None)``.
    """
    imin = None
    jmin = None
    vmin = float("inf")
    for i, row in enumerate(D):
        for j, value in enumerate(row):
            if j != i and value < vmin:
                vmin, imin, jmin = value, i, j
    return (vmin,imin,jmin)

def printTree(T):
    """Print the adjacency list ``T`` as ``v->w:weight`` lines (three decimals)."""
    for v in T:
        for w, edge in T[v]:
            print("%s->%s:%.3f" % (v, w, edge))
            
# Sample input from the problem statement of this cell.
n = 4
D = [
    [0, 20, 17, 11],
    [20, 0, 20, 13],
    [17, 20, 0, 10],
    [11, 13, 10, 0],
]
nodes = range(n)

tree = upgma1(D, nodes)
printTree(tree)

In [None]:
'''
In 1987, Naruya Saitou and Masatoshi Nei developed the neighbor-joining algorithm for evolutionary tree reconstruction. Given 
an additive distance matrix, this algorithm, which we call NeighborJoining, finds a pair of neighboring leaves and substitutes 
them by a single leaf, thus reducing the size of the tree. NeighborJoining can thus recursively construct a tree fitting the 
additive matrix. This algorithm also provides a heuristic for non-additive distance matrices that performs well in practice.
The central idea of NeighborJoining is that although finding a minimum element in a distance matrix D is not guaranteed to 
yield a pair of neighbors in Tree(D), we can convert D into a different matrix whose minimum element does yield a pair of 
neighbors. First, given an n × n distance matrix D, we define TotalDistanceD(i) as the sum ∑1≤k≤n Di,k of distances from 
leaf i to all other leaves. The neighbor-joining matrix D* (see below) is defined such that for any i and j, D*i,i = 0 
and D*i,j = (n - 2) · Di,j - TotalDistanceD(i) - TotalDistanceD(j).
Implement NeighborJoining.
     Input: An integer n, followed by an n x n distance matrix.
     Output: An adjacency list for the tree resulting from applying the neighbor-joining algorithm. Edge-weights should be 
     accurate to two decimal places (they are provided to three decimal places in the sample output below).
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be 
labeled 0, 1, ..., n - 1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled 
in any order but must start from n and increase consecutively.
Sample Input:
4
0 23 27 20
23 0 30 28
27 30 0 30
20 28 30 0
Sample Output:
0->4:8.000
1->5:13.500
2->5:16.500
3->4:12.000
4->5:2.000
4->0:8.000
4->3:12.000
5->1:13.500
5->2:16.500
5->4:2.000
'''

from __future__ import division
import sys
from collections import defaultdict
#Chunyu Zhao 20151031

def neighbor_joining(D,nodes):
    """Build a tree from distance matrix ``D`` with the neighbor-joining algorithm.

    Args:
        D: square distance matrix (list of lists); row/column ``i``
            corresponds to ``nodes[i]``.
        nodes: sequence of node labels, one per row of ``D``.

    Returns:
        ``defaultdict(list)`` mapping each node to ``(neighbour, length)``
        pairs, every edge being stored on both endpoints.

    Labels for new internal nodes are drawn from the module-level counter
    ``nodeNum``, which is incremented once per merge and must be set by the
    caller beforehand.
    """
    global nodeNum
    size = len(D)
    if size == 2:
        # Two clusters left: join them with a single edge.
        pair_tree = defaultdict(list)
        pair_tree[nodes[0]].append((nodes[1], D[0][1]))
        pair_tree[nodes[1]].append((nodes[0], D[1][0]))
        return pair_tree

    # The minimum of the neighbor-joining matrix D* identifies two neighbours.
    ci, cj = findMin(getDstar(D))

    newnode = nodeNum
    nodeNum += 1

    row_sums = [sum(row) for row in D]
    delta = (row_sums[ci] - row_sums[cj]) / (size-2)
    limbi = (D[ci][cj] + delta) / 2.0
    limbj = (D[ci][cj] - delta) / 2.0
    nodei, nodej = nodes[ci], nodes[cj]

    # Replace rows/columns ci and cj by a single row/column for the new parent.
    kept = [k for k in range(size) if k != ci and k != cj]
    to_parent = [(D[k][ci] + D[k][cj] - D[ci][cj]) / 2.0 for k in kept]
    reduced = [[D[r][c] for c in kept] for r in kept]
    for row, value in zip(reduced, to_parent):
        row.append(value)
    reduced.append(to_parent + [0])
    remaining = [nodes[k] for k in kept]
    remaining.append(newnode)

    # Solve the smaller problem, then hang the two limbs on the new parent.
    T = neighbor_joining(reduced, remaining)

    T[nodei].append((newnode, limbi))
    T[nodej].append((newnode, limbj))
    T[newnode].append((nodei, limbi))
    T[newnode].append((nodej, limbj))

    return T

def getDstar(D):
    """Return the neighbor-joining matrix D* of the square matrix ``D``.

    Off-diagonal entries are (n - 2) * D[i][j] - TotalDistance(i) -
    TotalDistance(j), with TotalDistance(i) the sum of row ``i``; diagonal
    entries are 0.
    """
    n = len(D)
    totals = [sum(row) for row in D]

    def star(i, j):
        return (n-2)*D[i][j] - totals[i] - totals[j]

    cells = [[star(i, j) for j in range(n)] for i in range(n)]
    for i in range(n):
        cells[i][i] = 0
    return cells

def findMin(D):
    """Locate the smallest off-diagonal entry of the square matrix ``D``.

    Returns ``(row, column)``; the indices are positions in ``D``, not
    necessarily node labels.  Ties are resolved in favour of the first entry
    in row-major order.  The diagonal is skipped by position rather than by
    value, so an off-diagonal entry equal to 0 is a valid candidate and an
    all-zero row no longer raises.  For a matrix without off-diagonal
    entries the result is ``(None, None)``.
    """
    imin = None
    jmin = None
    vmin = float("inf")
    for i, row in enumerate(D):
        for j, value in enumerate(row):
            if j != i and value < vmin:
                vmin, imin, jmin = value, i, j
    return (imin,jmin)

def printTree(T):
    """Print the adjacency list ``T`` as ``v->w:weight`` lines (three decimals)."""
    for v in T:
        for w, edge in T[v]:
            print("%s->%s:%.3f" % (v, w, edge))
            
# Sample input from the problem statement of this cell.
n = 4
D = [
    [0, 23, 27, 20],
    [23, 0, 30, 28],
    [27, 30, 0, 30],
    [20, 28, 30, 0],
]
nodes = range(n)
# Internal nodes are numbered starting right after the last leaf.
nodeNum = n
T = neighbor_joining(D, nodes)
printTree(T)

In [None]:
'''
Implement SmallParsimony to solve the Small Parsimony Problem.
     Input: An integer n followed by an adjacency list for a rooted binary tree with n leaves labeled by DNA strings.
     Output: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to labeling
     internal nodes by DNA strings in order to minimize the parsimony score of the tree.
Note: Remember to run SmallParsimony on each individual index of the strings at the leaves of the tree.
Sample Input:
4
4->CAAATCCC
4->ATTGCGAC
5->CTGCGCTG
5->ATGGACGA
6->4
6->5
Sample Output:
16
ATTGCGAC->ATAGCCAC:2
ATAGACAA->ATAGCCAC:2
ATAGACAA->ATGGACTA:2
ATGGACGA->ATGGACTA:1
CTGCGCTG->ATGGACTA:4
ATGGACTA->CTGCGCTG:4
ATGGACTA->ATGGACGA:1
ATGGACTA->ATAGACAA:2
ATAGCCAC->CAAATCCC:5
ATAGCCAC->ATTGCGAC:2
ATAGCCAC->ATAGACAA:2
CAAATCCC->ATAGCCAC:5
SmallParsimony(T, Character)
    for each node v in tree T
        Tag(v) ← 0
        if v is a leaf
            Tag(v) ← 1
            for each symbol k in the alphabet
                if Character(v) = k
                    sk(v) ← 0
                else
                    sk(v) ← ∞
    while there exist ripe nodes in T
        v ← a ripe node in T
        Tag(v) ← 1
        for each symbol k in the alphabet
            sk(v) ← min over all symbols i {si(Daughter(v)) + δi,k} + min over all symbols j {sj(Son(v)) + δj,k}
    return minimum over all symbols k {sk(v)}
    
trial.txt --

4
4->CAAATCCC
4->ATTGCGAC
5->CTGCGCTG
5->ATGGACGA
6->4
6->5

'''

import sys
from collections import defaultdict

# Nucleotide symbols considered at every position of a sequence.
alphabet = list('ACGT')


def dist(a, b):
    """Mismatch cost between two symbols: 0 when equal, 1 otherwise."""
    return 0 if a == b else 1


def seq_dist(a, b):
    """Count the positions of ``a`` at which ``b`` holds a different symbol.

    Positions are taken from ``a``; ``b`` is indexed at each of them.
    """
    return sum(1 for pos in range(len(a)) if a[pos] != b[pos])


class Node(object):
    """Tree node used by the small-parsimony scoring.

    Children are stored as indices into the module-level ``nodelist``;
    ``edges`` is kept parallel to ``children`` and holds edge weights.
    ``text`` is the sequence label, ``score`` the per-symbol score table of
    the position currently being processed and ``scored`` whether that table
    has been filled in.
    """

    def __init__(self, index, leaf):
        self.index = index
        self.leaf = leaf
        self.children = []
        self.edges = []
        self.text = ''
        self.score = defaultdict(int)
        self.scored = False

    def reinit(self):
        """Forget the scores of the previous position."""
        self.scored = False
        self.score = defaultdict(int)

    def is_ripe(self):
        """True for a leaf, or when every child has already been scored."""
        if self.leaf:
            return True
        return all(nodelist[item].scored for item in self.children)

    def add_node(self, c_index):
        """Attach child ``c_index`` with an initial edge weight of 0."""
        self.children.append(c_index)
        self.edges.append(0)

    def scoring(self, curr_letter):
        """Fill ``score`` for sequence position ``curr_letter``.

        A leaf scores 0 for the symbol it carries at that position and
        infinity for the others; an internal node adds, for each child, the
        cheapest child score plus mismatch cost.
        """
        if self.leaf:
            for letter in alphabet:
                matches = letter == self.text[curr_letter]
                self.score[letter] = 0 if matches else float('inf')
        else:
            for letter in alphabet:
                for item in self.children:
                    child_score = nodelist[item].score
                    self.score[letter] += min(
                        [child_score[i]+dist(letter, i) for i in alphabet])
        self.scored = True


def interpreter(conn):
    """Parse a rooted-tree description from the open text stream ``conn``.

    Expected layout: a line with the number of leaves ``nn``, then ``nn``
    lines ``parent->SEQUENCE`` (the j-th of them defines leaf ``j``), then
    lines ``parent->child`` linking internal nodes by integer index.  Blank
    lines (before the header, between records or at the end of the file) are
    ignored.

    Returns:
        list of ``Node`` objects indexed by node number; leaves come first.
    """
    # Skip leading blank lines so the header is the first line with content.
    header = conn.readline()
    while header and not header.strip():
        header = conn.readline()
    nn = int(header.strip())

    nl = [Node(i, True) for i in range(nn)]
    stripped = (raw_line.strip() for raw_line in conn)
    # j counts records with content only, so blank lines cannot shift the
    # leaf numbering.
    for j, text in enumerate(t for t in stripped if t):
        line = text.split('->')
        node_val = int(line[0])
        if node_val == len(nl):
            # First mention of the next internal node: create it.
            nl.append(Node(node_val, False))
        if node_val > len(nl):
            print("Unexpected node")
        if j < nn:
            nl[node_val].add_node(j)
            nl[j].text = line[1]
        if j >= nn:
            nl[node_val].add_node(int(line[1]))
    return nl


def tree_scoring(curr_letter):
    """Score every node of ``nodelist`` for sequence position ``curr_letter``.

    Leaves are scored first; internal nodes are then scored as soon as they
    are ripe, until the last node of ``nodelist`` has been scored.
    """
    for leaf in (candidate for candidate in nodelist if candidate.leaf):
        leaf.scoring(curr_letter)
    while not nodelist[-1].scored:
        for candidate in nodelist:
            if candidate.scored:
                continue
            if candidate.is_ripe():
                candidate.scoring(curr_letter)
    return None


def tree_pruning(node, parent_letter=''):
    """Choose a symbol for ``node`` and, recursively, for its descendants.

    For each symbol the node's score is increased by 1 when the symbol
    differs from ``parent_letter``; the first symbol with the lowest total is
    appended to ``node.text``.  Leaves are left untouched.
    """
    if node.leaf:
        return None
    best_cost = float('inf')
    best_letter = ''
    for letter in alphabet:
        penalty = 0 if parent_letter == letter else 1
        cost = node.score[letter] + penalty
        if cost < best_cost:
            best_cost, best_letter = cost, letter
    node.text += best_letter
    for child_index in node.children:
        tree_pruning(nodelist[child_index], best_letter)
    return None


def tree_edge_weighter(node):
    """Set the weight of every edge below ``node`` and add it to ``running_sum``.

    The weight of an edge is the number of mismatching positions between the
    labels of its endpoints; the module-level ``running_sum`` accumulates the
    total.
    """
    global running_sum
    if node.leaf:
        return None
    for position in range(len(node.children)):
        child = nodelist[node.children[position]]
        mismatches = seq_dist(node.text, child.text)
        running_sum += mismatches
        node.edges[position] = mismatches
        tree_edge_weighter(child)
    return None


def tree_printer(node, conn):
    """Write the labelled edges below ``node`` to the stream ``conn``.

    Each edge is emitted twice (parent->child and child->parent) as
    ``label->label:weight`` lines, in depth-first order.
    """
    if node.leaf:
        return None
    for position in range(len(node.children)):
        child = nodelist[node.children[position]]
        suffix = ':'+str(node.edges[position])+'\n'
        conn.write(node.text+'->'+child.text+suffix)
        conn.write(child.text+'->'+node.text+suffix)
        tree_printer(child, conn)
    return None

# Driver for the rooted Small Parsimony problem.
# NOTE(review): input and output locations are absolute, machine-specific
# paths; they have to be adjusted before running this cell elsewhere.
filename = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'
with open(filename, 'r') as f:
    nodelist = interpreter(f)
# All labels are assumed to have the length of the first leaf's sequence.
seq_len = len(nodelist[0].text)
# Process one sequence position at a time: score the tree, pick a symbol for
# every internal node starting from the last node of nodelist (treated as the
# root), then clear the scores before moving to the next position.
for i in range(seq_len):
    tree_scoring(i)
    tree_pruning(nodelist[-1])
    for node in nodelist:
        node.reinit()
        
# Total parsimony score, accumulated by tree_edge_weighter.
running_sum = 0
tree_edge_weighter(nodelist[-1])
# Output: the score on the first line, followed by the labelled adjacency list.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/out_final.txt', 'w') as g:
    g.write(str(running_sum)+'\n')
    tree_printer(nodelist[-1], g)

In [None]:
'''
When the position of the root in the tree is unknown, we can simply assign the root to any edge that we like, apply 
SmallParsimony to the resulting rooted tree, and then remove the root. It can be shown that this method provides a solution 
to the following problem.
Solve the Small Parsimony in an Unrooted Tree Problem.
     Input: An integer n followed by an adjacency list for an unrooted binary tree with n leaves labeled by DNA strings.
     Output: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to labeling
     internal nodes by DNA strings in order to minimize the parsimony score of the tree.
Sample Input:
4
TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4
Sample Output:
17
TCGGCCAA->CCAGGCAC:4
CCTGGCTG->CCAGGCAC:3
TGAGTACC->CAAGGAAC:4
CCAGGCAC->CCTGGCTG:3
CCAGGCAC->CAAGGAAC:2
CCAGGCAC->TCGGCCAA:4
CACAGGAT->CAAGGAAC:4
CAAGGAAC->CACAGGAT:4
CAAGGAAC->TGAGTACC:4
CAAGGAAC->CCAGGCAC:2


trial.txt --

4
TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4
'''
from collections import defaultdict

# Nucleotide symbols considered at every position of a sequence.
alphabet = list('ACGT')


def dist(a, b):
    """Mismatch cost between two symbols: 0 when equal, 1 otherwise."""
    if not (a == b):
        return 1
    return 0


def seq_dist(a, b):
    """Count the positions of ``a`` at which ``b`` holds a different symbol.

    Positions are taken from ``a``; ``b`` is indexed at each of them.
    """
    mismatches = [pos for pos in range(len(a)) if a[pos] != b[pos]]
    return len(mismatches)


def is_int(s):
    """Tell whether ``int(s)`` succeeds (a ``ValueError`` means it does not)."""
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True


class Node(object):
    """Tree node used by the small-parsimony scoring.

    Children are stored as keys of the module-level ``nodelist``; ``edges``
    is kept parallel to ``children`` (``edges[p]`` is the weight of the edge
    to ``children[p]``).  ``text`` is the sequence label, ``score`` the
    per-symbol score table of the position currently being processed and
    ``scored`` whether that table has been filled in.
    """

    def __init__(self, index, leaf):
        self.index = index
        self.leaf = leaf
        self.children = []
        self.edges = []
        self.text = ''
        self.score = defaultdict(int)
        self.scored = False

    def reinit(self):
        """Forget the scores of the previous position."""
        self.score = defaultdict(int)
        self.scored = False
        return None

    def is_ripe(self):
        """True for a leaf, or when every child has already been scored."""
        if self.leaf:
            return True
        return all(nodelist[item].scored for item in self.children)

    def add_node(self, c_index):
        """Attach child ``c_index`` with an initial edge weight of 0."""
        self.children.append(c_index)
        self.edges.append(0)
        return None

    def remove_node(self, c_index):
        """Detach child ``c_index`` together with the weight of its edge.

        Raises:
            ValueError: if ``c_index`` is not a child of this node.
        """
        # Remove the weight at the child's own position (not simply the last
        # one) so that ``children`` and ``edges`` stay aligned.
        position = self.children.index(c_index)
        del self.children[position]
        del self.edges[position]
        return None

    def scoring(self, curr_letter):
        """Fill ``score`` for sequence position ``curr_letter``.

        A leaf scores 0 for the symbol it carries at that position and
        infinity for the others; an internal node adds, for each child, the
        cheapest child score plus mismatch cost.
        """
        if self.leaf:
            for letter in alphabet:
                if letter == self.text[curr_letter]:
                    self.score[letter] = 0
                else:
                    self.score[letter] = float('inf')
        else:
            for letter in alphabet:
                for item in self.children:
                    self.score[letter] += min([nodelist[item].score[i]+dist(letter, i) for i in alphabet])
        self.scored = True
        return None


def interpreter(conn):
    """Parse an unrooted-tree description from the open text stream ``conn``.

    The first line (an integer) is read and validated but otherwise unused.
    Every following line ``a->b`` whose left side is an integer is processed:
    an integer ``b`` links internal node ``a`` to internal node ``b``, while
    any other ``b`` is taken as the sequence of a new leaf, numbered
    consecutively from 0 in order of appearance.  Lines whose left side is
    not an integer are skipped.

    Returns:
        dict mapping node number to ``Node``.
    """
    nn = int(conn.readline().strip())
    nl = {}
    next_leaf = 0
    for raw_line in conn:
        fields = raw_line.strip().split('->')
        if not is_int(fields[0]):
            continue
        a = int(fields[0])
        if a not in nl:
            nl[a] = Node(a, False)
        target = fields[1]
        if is_int(target):
            # Internal node -> internal node.
            b = int(target)
            if b not in nl:
                nl[b] = Node(b, False)
            nl[a].add_node(b)
            continue
        # Internal node -> leaf sequence: register a fresh leaf.
        nl[next_leaf] = Node(next_leaf, True)
        nl[next_leaf].text = target
        nl[a].add_node(next_leaf)
        next_leaf += 1
    return nl


def tree_scoring(curr_letter):
    """Score every node of ``nodelist`` for sequence position ``curr_letter``.

    Leaves are scored first; internal nodes are then scored as soon as they
    are ripe, until the node stored under ``new_root`` has been scored.
    """
    for candidate in nodelist.values():
        if candidate.leaf:
            candidate.scoring(curr_letter)
    while not nodelist[new_root].scored:
        for candidate in nodelist.values():
            if candidate.scored:
                continue
            if candidate.is_ripe():
                candidate.scoring(curr_letter)
    return None


def tree_pruning(node, parent_letter=''):
    """Choose a symbol for ``node`` and, recursively, for its descendants.

    For each symbol the node's score is increased by 1 when the symbol
    differs from ``parent_letter``; the first symbol with the lowest total is
    appended to ``node.text``.  Leaves are left untouched.
    """
    if node.leaf:
        return None
    best_cost = float('inf')
    best_letter = ''
    for letter in alphabet:
        penalty = 0 if parent_letter == letter else 1
        cost = node.score[letter] + penalty
        if cost < best_cost:
            best_cost, best_letter = cost, letter
    node.text += best_letter
    for child_key in node.children:
        tree_pruning(nodelist[child_key], best_letter)
    return None


def tree_edge_weighter(node):
    """Set the weight of every edge below ``node`` and add it to ``running_sum``.

    The weight of an edge is the number of mismatching positions between the
    labels of its endpoints; the module-level ``running_sum`` accumulates the
    total.
    """
    global running_sum
    if node.leaf:
        return None
    for position in range(len(node.children)):
        child = nodelist[node.children[position]]
        mismatches = seq_dist(node.text, child.text)
        running_sum += mismatches
        node.edges[position] = mismatches
        tree_edge_weighter(child)
    return None


def tree_printer(node, conn):
    """Write the labelled edges below ``node`` to the stream ``conn``.

    Each edge is emitted twice (parent->child and child->parent) as
    ``label->label:weight`` lines, in depth-first order.
    """
    if node.leaf:
        return None
    for position in range(len(node.children)):
        child = nodelist[node.children[position]]
        suffix = ':'+str(node.edges[position])+'\n'
        conn.write(node.text+'->'+child.text+suffix)
        conn.write(child.text+'->'+node.text+suffix)
        tree_printer(child, conn)
    return None


def tree_rooting(node):
    """Turn the undirected adjacency below ``node`` into parent->child links.

    For each child of ``node`` the back-reference to ``node`` (if present) is
    removed from the child's own child list, then the child is processed the
    same way.  Leaves end the recursion.
    """
    if node.leaf:
        return None
    own_index = node.index
    for child_key in node.children:
        child = nodelist[child_key]
        if own_index in child.children:
            child.remove_node(own_index)
        tree_rooting(child)
        
# Driver: read the tree, root it temporarily, label the internal nodes one
# position at a time, weight the edges, then remove the temporary root and
# write the result.
# NOTE(review): both paths are absolute and machine-specific.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt', 'r') as f:
    nodelist = interpreter(f)
# Number of positions to process, taken from the text stored on node 0
# (presumably a leaf sequence -- confirm against interpreter()).
seq_len = len(nodelist[0].text)
# Tree rooting process
# A new node is inserted on the edge between `last_node` and its largest-
# numbered neighbour; assumes the existing keys are 0..len(nodelist)-1 so
# that `new_root` is a free key -- TODO confirm.
new_root = len(nodelist)
last_node = new_root - 1
connected_node = max(nodelist[last_node].children)
nodelist[new_root] = Node(new_root, False)
nodelist[new_root].add_node(last_node)
nodelist[new_root].add_node(connected_node)
# Cut the original edge in both directions so it only runs through the root.
nodelist[last_node].remove_node(connected_node)
nodelist[connected_node].remove_node(last_node)
# Remove child->parent back-references everywhere below the new root.
tree_rooting(nodelist[new_root])

# Tree scoring
# One full score / choose-letter / reset cycle per position.
for i in range(seq_len):
    tree_scoring(i)
    tree_pruning(nodelist[new_root])
    for _, node in nodelist.items():
        node.reinit()
# tree_edge_weighter accumulates into this module-level total.
running_sum = 0
tree_edge_weighter(nodelist[new_root])

# Tree uprooting
# The two root edges are merged into one edge whose weight is their sum.
top_edge_weight = sum(nodelist[new_root].edges)
del nodelist[new_root]
nodelist[last_node].add_node(connected_node)
# add_node appended the child, so its edge slot is the last one.
nodelist[last_node].edges[-1] = top_edge_weight
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/out.txt', 'w') as g:
    g.write(str(running_sum)+'\n')
    tree_printer(nodelist[last_node], g)

In [None]:
'''
Solve the Nearest Neighbors of a Tree Problem.
     Input: Two internal nodes a and b specifying an edge e, followed by an adjacency list of an unrooted binary tree.
     Output: Two adjacency lists representing the nearest neighbors of the tree with respect to e. Separate the
     adjacency lists with a blank line.
Sample Input:
5 4
0->4
4->0
1->4
4->1
2->5
5->2
3->5
5->3
4->5
5->4
Sample Output:
1->4
0->5
3->4
2->5
5->2
5->4
5->0
4->1
4->5
4->3
1->5
0->4
3->4
2->5
5->2
5->4
5->1
4->0
4->5
4->3

trial.txt --

5 4
0->4
4->0
1->4
4->1
2->5
5->2
3->5
5->3
4->5
5->4
'''

from collections import defaultdict
from copy import copy, deepcopy


class Node(object):
    """Tree node holding parallel ``children`` / ``edges`` lists.

    ``children[k]`` is the index of the k-th neighbour and ``edges[k]`` is the
    weight stored for the edge to that neighbour.
    """

    def __init__(self, index, leaf):
        """Create a node with the given index and leaf flag and no neighbours."""
        self.index = index
        self.leaf = leaf
        self.children = []
        self.edges = []
        self.text = ''
        self.score = defaultdict(int)
        self.scored = False

    def reinit(self):
        """Reset the scoring state (``score`` and ``scored``)."""
        self.score = defaultdict(int)
        self.scored = False
        return None

    def is_ripe(self):
        """Return True for a leaf, or when every child in the global
        ``nodelist`` has already been scored."""
        if self.leaf:
            return True
        else:
            for item in self.children:
                if not nodelist[item].scored:
                    return False
        return True

    def add_node(self, c_index):
        """Append neighbour ``c_index`` together with an edge weight of 0."""
        self.children.append(c_index)
        self.edges.append(0)
        return None

    def remove_node(self, c_index):
        """Remove neighbour ``c_index`` and the edge weight stored for it.

        Raises ValueError when ``c_index`` is not a neighbour.
        """
        # Drop the edge at the SAME position as the child.  Popping the last
        # edge instead would shift every later weight onto the wrong child.
        position = self.children.index(c_index)
        del self.children[position]
        del self.edges[position]
        return None


def interpreter(conn):
    """Parse an edge followed by an integer adjacency list.

    conn -- open text stream.  The first line holds the two endpoint labels
            of the edge; every following non-blank line has the form ``a->b``.

    Returns ``(edge, nl)`` where ``edge`` is the list of whitespace-separated
    tokens of the first line (still strings) and ``nl`` maps each integer
    label to a ``Node`` whose children are filled in file order.
    """
    edge = conn.readline().strip().split()
    nl = dict()
    for raw_line in conn:
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline) carry no edge.
            continue
        line = stripped.split('->')
        a = int(line[0])
        if a not in nl:
            nl[a] = Node(a, False)
        b = int(line[1])
        if b not in nl:
            nl[b] = Node(b, False)
        nl[a].add_node(b)
    return edge, nl


def tree_printer(nodelist, conn):
    """Write one ``index->child`` line per stored adjacency, then a blank line.

    nodelist -- mapping whose values expose ``index`` and ``children``.
    conn     -- writable text stream.
    """
    for node in nodelist.values():
        for child in node.children:
            conn.write('{}->{}\n'.format(node.index, child))
    conn.write('\n')
    return None


def exchanger(aa, bb, tree):
    """Build the two nearest-neighbour trees around the edge ``aa``-``bb``.

    The first remaining neighbour of ``aa`` stays attached to ``aa`` and the
    second one moves to ``bb``.  In exchange one of the two remaining
    neighbours of ``bb`` moves to ``aa``: the first of them in the first
    result, the second in the second result.  ``tree`` itself is not
    modified; each result is a deep copy with fresh nodes for ``aa``/``bb``.
    """
    a_side = list(tree[aa].children)
    b_side = list(tree[bb].children)
    a_side.remove(bb)
    b_side.remove(aa)
    stays_on_a, moves_to_b = a_side[0], a_side[1]
    results = list()
    for moves_to_a, stays_on_b in ((b_side[0], b_side[1]), (b_side[1], b_side[0])):
        rebuilt_a = Node(aa, False)
        for neighbour in (bb, stays_on_a, moves_to_a):
            rebuilt_a.add_node(neighbour)
        rebuilt_b = Node(bb, False)
        for neighbour in (aa, moves_to_b, stays_on_b):
            rebuilt_b.add_node(neighbour)
        variant = deepcopy(tree)
        variant[aa], variant[bb] = rebuilt_a, rebuilt_b
        # Re-point the two moved subtrees at their new attachment node.
        variant[moves_to_b].remove_node(aa)
        variant[moves_to_b].add_node(bb)
        variant[moves_to_a].remove_node(bb)
        variant[moves_to_a].add_node(aa)
        results.append(variant)
    return results

# Driver for the Nearest Neighbors of a Tree Problem.
# NOTE(review): input and output paths are absolute and machine-specific.
filename = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'

# `edge` holds the two endpoint labels as strings; `nodelist` maps int -> Node.
with open(filename, 'r') as f:
    edge, nodelist = interpreter(f)
a, b = int(edge[0]), int(edge[1])
# exchanger returns the two rearranged copies of the tree.
treelist = exchanger(a, b, nodelist)
# tree_printer ends each adjacency list with a blank line, which separates
# the two trees in the output file.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/out.txt', 'w') as g:
    for nodes in treelist:
        tree_printer(nodes, g)

In [None]:
'''
Implement the nearest neighbor interchange heuristic for the Large Parsimony Problem.
     Input: An integer n, followed by an adjacency list for an unrooted binary tree whose n leaves are labeled by DNA strings and
     whose internal nodes are labeled by integers.
     Output: The parsimony score and unrooted labeled tree obtained after every step of the nearest neighbor interchange heuristic.
     Each step should be separated by a blank line.
Note: Depending on how your code breaks ties, you may obtain a different solution than the one we provide.  As a result, the parsimony 
score at each step may vary.
Sample Input:
5
GCAGGGTA->5
TTTACGCG->5
CGACCTGA->6
GATTCCAC->6
5->TTTACGCG
5->GCAGGGTA
5->7
TCCGTAGT->7
7->5
7->6
7->TCCGTAGT
6->GATTCCAC
6->CGACCTGA
6->7
Sample Output:
22
TCCGTAGT->TCAGCGGA:4
GATTCCAC->GAACCCGA:4
CGACCTGA->GAACCCGA:3
TTTACGCG->TCAGCGGA:5
TCAGCGGA->TTTACGCG:5
TCAGCGGA->GCAGCGGA:1
TCAGCGGA->TCCGTAGT:4
GCAGGGTA->GCAGCGGA:2
GCAGCGGA->GAACCCGA:3
GCAGCGGA->GCAGGGTA:2
GCAGCGGA->TCAGCGGA:1
GAACCCGA->GATTCCAC:4
GAACCCGA->CGACCTGA:3
GAACCCGA->GCAGCGGA:3
21
TCCGTAGT->TCTGCGGA:4
GATTCCAC->GCTGCGGA:5
CGACCTGA->GCAGCGGA:4
TTTACGCG->TCTGCGGA:4
TCTGCGGA->TTTACGCG:4
TCTGCGGA->GCTGCGGA:1
TCTGCGGA->TCCGTAGT:4
GCAGGGTA->GCAGCGGA:2
GCTGCGGA->GCAGCGGA:1
GCTGCGGA->GATTCCAC:5
GCTGCGGA->TCTGCGGA:1
GCAGCGGA->CGACCTGA:4
GCAGCGGA->GCAGGGTA:2
GCAGCGGA->GCTGCGGA:1


trial.txt --

5
GCAGGGTA->5
TTTACGCG->5
CGACCTGA->6
GATTCCAC->6
5->TTTACGCG
5->GCAGGGTA
5->7
TCCGTAGT->7
7->5
7->6
7->TCCGTAGT
6->GATTCCAC
6->CGACCTGA
6->7

'''

import sys
import queue
import numpy as np
from copy import deepcopy
class LargeParsimony:
    """Nearest-neighbor-interchange heuristic for the Large Parsimony Problem.

    Trees are stored as ``adj``, a list with one ``{neighbour: weight}`` dict
    per node index, plus ``nodes``, the list of string labels.  The first
    ``n`` indices are the leaves, numbered in the order their labels first
    appear on the left-hand side of an input line.

    Instantiating the class runs the whole pipeline: read the input file,
    run the heuristic, save the trees and print them.
    """
    def __init__(self):
        n, adj, nodes, lastEdge = self.readFromFile()
        trees = self.runNearestNeighborInterchange(n, adj, nodes, lastEdge)
        self.saveTrees(trees)
        self.printTrees(trees)

    def _parseInput(self, data):
        """Turn the input lines into ``(n, adj, nodes, lastEdge)``.

        data -- list of lines: the leaf count, then ``a->b`` adjacency lines.
                Blank lines are ignored.

        ``lastEdge`` is parsed from the last non-blank line, which therefore
        has to join two integer-labelled nodes.  Raises ValueError when there
        is no data or that last line is not ``int->int``.
        """
        data = [d for d in data if d.strip()]
        if not data:
            raise ValueError('input contains no data')
        n = int(data[0])
        adj = [dict() for _ in range(n)]
        nodes = ['' for _ in range(n)]
        currNode = 0
        for d in data[1:]:
            d = d.split('->')
            try:
                p = int(d[0])
            except ValueError:
                # Non-integer left-hand side: a leaf label gets the next index.
                p = currNode
                nodes[p] = d[0]
                currNode += 1
            try:
                c = int(d[1])
            except (ValueError, IndexError):
                # Right-hand side is a leaf label (or missing); the edge is
                # recorded from the line that has the leaf on the left.
                continue
            if p > len(adj)-1 or c > len(adj)-1:
                adj.extend([dict() for _ in range(max([p,c])-len(adj)+1)])
            adj[p][c] = 0
            adj[c][p] = 0
        # One extra slot so the temporary root added later has a label.
        nodes.extend(['' for _ in range(len(adj)-n+1)])
        lastEdge = [int(i) for i in data[-1].split('->')]
        return n, adj, nodes, lastEdge

    def _input(self):
        """Read the problem from standard input; see ``_parseInput``."""
        data = sys.stdin.read().strip().split('\n')
        return self._parseInput(data)

    def readFromFile(self, path='/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'):
        """Read the problem from ``path``; see ``_parseInput``.

        The file is closed even when parsing fails.
        """
        with open(path, 'r') as f:
            data = [line.strip() for line in f]
        return self._parseInput(data)

    def _edgeLines(self, adj, nodes):
        """Yield one ``label->label:weight`` string per stored adjacency."""
        for i, d in enumerate(adj):
            for j, w in d.items():
                yield nodes[i]+'->'+nodes[j]+':'+str(w)

    def printResults(self, s, adj, nodes):
        """Print score ``s`` followed by the edges of a single tree."""
        print(s)
        for line in self._edgeLines(adj, nodes):
            print(line)
    
    def printTrees(self, trees):
        """Print every ``(score, adj, nodes)`` tree, each followed by a blank line."""
        for s, adj, nodes in trees:
            print(s)
            for line in self._edgeLines(adj, nodes):
                print(line)
            print('')
    
    def saveTrees(self, trees, path='/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/result.txt'):
        """Write every ``(score, adj, nodes)`` tree to ``path``.

        Uses the same layout as ``printTrees``; the file is closed on exit.
        """
        with open(path, 'w') as f:
            for s, adj, nodes in trees:
                f.write(str(s)+'\n')
                for line in self._edgeLines(adj, nodes):
                    f.write(line+'\n')
                f.write('\n')
    
    def charIndConversion(self):
        """Return the letter->index and index->letter tables for A, C, G, T."""
        char2ind = {'A':0, 'C':1, 'G':2, 'T':3}
        ind2char = {0:'A', 1:'C', 2:'G', 3:'T'}
        return char2ind, ind2char
    
    def singleSmallParsimony(self, n, adjC, adjP, adj, nodes, char2ind, ind2char, charInd):
        """Solve small parsimony for position ``charInd`` on a rooted tree.

        n      -- number of leaves (indices ``0..n-1``).
        adjC   -- per-node list of child indices.
        adjP   -- per-node list holding the parent index (empty for the root).
        adj    -- edge weights; incremented in place on every edge whose two
                  endpoints receive different letters.
        nodes  -- labels; the chosen letter is appended in place to the label
                  of every node that has children.

        Returns the minimum score found at the last node processed.
        """
        s = [[np.inf]*4 for _ in range(len(adjC))]
        backtrack = [[(-1, -1) for _ in range(4)] for __ in range(len(adjC))]
        processed = [0 for _ in range(len(adjC))]
        ripe = set()
        for i in range(n):
            s[i][char2ind[nodes[i][charInd]]] = 0
            processed[i] = 1
            if len(adjP[i]) > 0:
                ripe.add(adjP[i][0])
        
        # A node is recomputed each time it re-enters `ripe`, i.e. whenever
        # one of its children is processed while all its children are marked.
        while len(ripe) > 0:
            v = ripe.pop()
            for k in range(4):
                l = [s[adjC[v][0]][i] + (0 if k == i else 1) for i in range(4)]
                r = [s[adjC[v][1]][i] + (0 if k == i else 1) for i in range(4)]
                largmin = np.argmin(l)
                rargmin = np.argmin(r)
                backtrack[v][k] = (largmin, rargmin)
                s[v][k] = l[largmin] + r[rargmin]
            processed[v] = 1
            if len(adjP[v]) > 0 and all([processed[u] for u in adjC[adjP[v][0]]]):
                ripe.add(adjP[v][0])

        # `v` is the node handled last by the loop above.
        ind = np.argmin(s[v])
        nodes[v] += ind2char[ind]
        smin = s[v][ind]

        # Walk back down, fixing the letters recorded in `backtrack`.
        q = queue.Queue()
        q.put((v, ind))
        while not q.empty():
            v, k = q.get()
            if len(adjC[v]) > 0:
                u, w = adjC[v]
                l, r = backtrack[v][k]
                
                if k != l:
                    adj[v][u] += 1
                    adj[u][v] += 1
                if k != r:
                    adj[v][w] += 1
                    adj[w][v] += 1

                if len(adjC[u]) > 0:
                    nodes[u] += ind2char[l]
                    q.put((u, l))
                if len(adjC[w]) > 0:
                    nodes[w] += ind2char[r]
                    q.put((w, r))
      
        return smin
    
    def runSmallParsimony(self, n, adj, nodes, lastEdge): # for unrooted binary tree
        """Label an unrooted tree and return ``(score, adj, nodes)``.

        A temporary root with index ``len(adj)`` is inserted on ``lastEdge``,
        every position is solved with ``singleSmallParsimony``, and the root
        is removed again.  ``lastEdge`` then gets the number of differing
        positions between its two endpoint labels as weight.  ``adj`` and
        ``nodes`` are modified in place and are the objects returned.
        """
        def dist(v, w):
            d = 0
            l = len(v)
            for i in range(l):
                if v[i] != w[i]:
                    d += 1
            return d

        char2ind, ind2char = self.charIndConversion()
        root = len(adj)
        del adj[lastEdge[0]][lastEdge[1]]
        del adj[lastEdge[1]][lastEdge[0]]
        adj.append(dict())
        adj[root][lastEdge[0]] = 0
        adj[lastEdge[0]][root] = 0
        adj[root][lastEdge[1]] = 0
        adj[lastEdge[1]][root] = 0
        adjC = [[] for _ in range(len(adj))]
        adjP = [[] for _ in range(len(adj))]
        # Breadth-first search from the root fixes each node's parent.
        q = queue.Queue()
        q.put(root)
        visited = [False for _ in range(len(adj))]
        visited[root] = True
        while not q.empty():
            curr = q.get()
            for v in adj[curr].keys():
                if not visited[v]:
                    adjP[v].append(curr)
                    visited[v] = True
                    q.put(v)
        for u, d in enumerate(adjP):
            for v in d:
                adjC[v].append(u)
        s = 0
        for i in range(len(nodes[0])):
            s += self.singleSmallParsimony(n, adjC, adjP, adj, nodes, char2ind, ind2char, i)
        d = dist(nodes[lastEdge[0]], nodes[lastEdge[1]])
        del adj[root]
        del adj[lastEdge[0]][root]
        del adj[lastEdge[1]][root]
        adj[lastEdge[0]][lastEdge[1]] = d
        adj[lastEdge[1]][lastEdge[0]] = d
        return s, adj, nodes
    
    def findNearestNeighbors(self, edge, adj):
        """Return the two nearest-neighbour rearrangements around ``edge``.

        Both results are deep copies of ``adj``; ``adj`` is left untouched.
        The first other neighbour of ``edge[0]`` is exchanged with the first
        (``adj1``) or second (``adj2``) other neighbour of ``edge[1]``.
        Edges created by the exchange get weight 0.
        """
        adj1 = deepcopy(adj)
        adj2 = deepcopy(adj)

        del adj1[edge[0]][edge[1]]
        del adj1[edge[1]][edge[0]]
        e0 = list(adj1[edge[0]].keys())
        e1 = list(adj1[edge[1]].keys())
        adj1[edge[0]][e1[0]] = 0
        adj1[edge[1]][e0[0]] = 0
        adj1[e1[0]][edge[0]] = 0
        adj1[e0[0]][edge[1]] = 0
        del adj1[e1[0]][edge[1]]
        del adj1[e0[0]][edge[0]]
        del adj1[edge[0]][e0[0]]
        del adj1[edge[1]][e1[0]]
        adj1[edge[0]][edge[1]] = 0
        adj1[edge[1]][edge[0]] = 0

        # adj2 never drops the central edge, so it keeps its original weight.
        adj2[edge[0]][e1[1]] = 0
        adj2[edge[1]][e0[0]] = 0
        adj2[e1[1]][edge[0]] = 0
        adj2[e0[0]][edge[1]] = 0
        del adj2[e1[1]][edge[1]]
        del adj2[e0[0]][edge[0]]
        del adj2[edge[0]][e0[0]]
        del adj2[edge[1]][e1[1]]
        return adj1, adj2

    def runNearestNeighborInterchange(self, n, adj, nodes, lastEdge):
        """Repeat the best-improving interchange until the score stops falling.

        Returns the list of ``(score, adj, nodes)`` tuples for every step that
        lowered the score.
        NOTE(review): the starting tree itself is never appended, so the list
        is empty when no rearrangement improves on it -- confirm this is the
        intended output.
        """
        trees = []
        score = np.inf
        newScore, newAdj, newNodes = self.runSmallParsimony(n, adj, deepcopy(nodes), lastEdge)
        while newScore < score:
            score = newScore
            adj = newAdj
            visited = set()
            # Only edges joining two internal nodes (index >= n) are tried.
            for v in range(n, len(adj)):
                for u in adj[v].keys():
                    if u >= n and not (v, u) in visited:
                        adj1, adj2 = self.findNearestNeighbors([v, u], adj)
                        # Reset every weight before re-scoring the candidates.
                        for i, a in enumerate(adj1):
                            adj1[i] = dict.fromkeys(a, 0)
                        for i, a in enumerate(adj2):
                            adj2[i] = dict.fromkeys(a, 0)
                        neighborScore, neighborAdj, neighborNodes = self.runSmallParsimony(n, adj1, deepcopy(nodes), [v, u])
                        if neighborScore < newScore:
                            newScore = neighborScore
                            newAdj = neighborAdj
                            newNodes = neighborNodes
                        neighborScore, neighborAdj, neighborNodes = self.runSmallParsimony(n, adj2, deepcopy(nodes), [v, u])
                        if neighborScore < newScore:
                            newScore = neighborScore
                            newAdj = neighborAdj
                            newNodes = neighborNodes                
                        visited.add((v, u))
                        visited.add((u, v))
            if newScore < score:
                trees.append((newScore, newAdj, newNodes))
        return trees

In [None]:
'''
We represent the masses in a spectrum as a sequence Spectrum of integers s_1, ..., s_m in increasing order, where s_1 is 
zero and s_m is the total mass of the (unknown) peptide. We define a labeled graph Graph(Spectrum) by forming a node for each 
element of Spectrum, then connecting nodes s_i and s_j by a directed edge labeled by an amino acid a if s_j - s_i is equal 
to the mass of a. As we assumed when sequencing antibiotics, we do not distinguish between amino acids having the same integer 
masses (i.e., the pairs K/Q and I/L).
Construct the graph of a spectrum.
     Given: A space-delimited list of integers Spectrum.
     Return: Graph(Spectrum).
Note: Throughout this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty 
amino acids. Examples sometimes use the toy amino acid alphabet {X, Z} whose masses are 4 and 5, respectively.
Sample Input:
57 71 154 185 301 332 415 429 486
Sample Output:
0->57:G
0->71:A
57->154:P
57->185:K
71->185:N
154->301:F
185->332:F
301->415:N
301->429:K
332->429:P
415->486:A
429->486:G
'''

massTable = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113,
    'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97,
    'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}

def spectrum_graph(Spectrum):
    """Print the edges of Graph(Spectrum), one ``lo->hi:residue`` per line.

    The masses are sorted first.  A pair is printed when its difference is
    the mass of a residue in ``massTable``; when several residues share that
    mass, the one listed first in ``massTable`` is used.
    """
    ordered = sorted(Spectrum)
    for lo_pos, lo in enumerate(ordered):
        for hi in ordered[lo_pos + 1:]:
            gap = hi - lo
            label = next((res for res, m in massTable.items() if m == gap), None)
            if label is not None:
                print(str(lo) + '->' + str(hi) + ':' + label)
                
# Sample spectrum from the problem statement.
Spectrum = [57,71,154,185,301,332,415,429,486]
# The input omits the zero mass, so the source node 0 is added here.
Spectrum = [0] + Spectrum
spectrum_graph(Spectrum)

In [None]:
'''
integer_mass_table.txt --

G 57
A 71
S 87
P 97
V 99
T 101
C 103
I 113
L 113
N 114
D 115
K 128
Q 128
E 129
M 131
H 137
F 147
R 156
Y 163
W 186

'''

'''
Given an amino acid string Peptide, its ideal spectrum, denoted IdealSpectrum(Peptide), is the collection of integer masses of 
all its prefixes and suffixes. Note that an ideal spectrum may have repeated masses; for example, IdealSpectrum(GPG) = {0, 57, 
57, 154, 154, 211}. We say that an amino acid string Peptide explains a collection of integers Spectrum if 
IdealSpectrum(Peptide) = Spectrum.
Decoding an Ideal Spectrum Problem: Reconstruct a peptide from its ideal spectrum.
     Input: A collection of integers Spectrum.
     Output: An amino acid string Peptide that explains Spectrum.
Sample Input:
57 71 154 185 301 332 415 429 486
Sample Output:
GPFNA
'''

class Node(object):
    """Spectrum-graph vertex: a mass value and its labelled outgoing edges."""

    def __init__(self, value):
        """Create a vertex for mass ``value`` with no outgoing edges."""
        self.children = dict()
        self.value = value

    def add_child(self, child, acid):
        """Record an edge to ``child`` labelled ``acid``; a repeated
        ``child`` overwrites the earlier label."""
        self.children.update({child: acid})


def interpreter(conn):
    """Read one line of integer masses from ``conn`` and prepend 0.

    Tokens may be separated by any run of whitespace.  Returns the list of
    ints with a leading 0; an empty line yields ``[0]``.  Raises ValueError
    for a token that is not an integer.
    """
    # split() without arguments tolerates repeated spaces and tabs, which
    # split(' ') would turn into empty tokens that int() rejects.
    tokens = conn.readline().split()
    return [0] + [int(token) for token in tokens]


def graph_build(spec, conn):
    """Build the spectrum graph of ``spec`` and log its edges to ``conn``.

    One ``Node`` is created per distinct mass.  For every ordered pair
    ``i < j`` whose difference is a key of the global ``mass_to_amino``, the
    line ``i->j:residue`` is written and the edge is stored on node ``i``.
    Returns the mass -> Node mapping.
    """
    graph = {}
    for mass_value in spec:
        graph[mass_value] = Node(mass_value)
    for lower in spec:
        for upper in spec:
            if not lower < upper:
                continue
            gap = upper - lower
            if gap not in mass_to_amino:
                continue
            residue = mass_to_amino[gap]
            conn.write(str(lower) + '->' + str(upper) + ':' + residue + '\n')
            graph[lower].add_child(upper, residue)
    return graph


def mass(seq):
    """Return the summed mass of the residues in ``seq``, looked up in the
    global ``amino_to_mass``; an empty ``seq`` gives 0."""
    return sum(amino_to_mass[residue] for residue in seq)

def ideal_spec(seq):
    """Return the sorted masses of all prefixes and suffixes of ``seq``.

    Zero and the full mass are always included.  Repeated masses are
    collapsed, so each value appears once in the result.
    """
    masses = {0, mass(seq)}
    for cut in range(1, len(seq)):
        masses.add(mass(seq[:cut]))
        masses.add(mass(seq[cut:]))
    return sorted(masses)

In [None]:
'''
trial.txt --

57 71 154 185 301 332 415 429 486

'''
# Driver for the Decoding an Ideal Spectrum Problem.
# NOTE(review): all three paths are absolute and machine-specific.

# Load the mass table in both directions.  When two residues share a mass,
# the later line wins in mass_to_amino.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/integer_mass_table.txt') as e:
        mass_to_amino = dict()
        amino_to_mass = dict()
        for item in e:
            temp = item.strip().split(' ')
            try:
                mass_to_amino[int(temp[1])] = temp[0]
                amino_to_mass[temp[0]] = int(temp[1])
            except IndexError:
                # Lines without a second field (e.g. blank lines) are skipped.
                pass

# The spectrum comes back with a leading 0; its largest mass is the sink.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt', 'r') as f:
    spectrum = interpreter(f)
    final = max(spectrum)

# graph_build also writes the edge list to out.txt as a side effect.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/out.txt', 'w') as g:
    spec_graph = graph_build(spectrum, g)

# Depth-first enumeration of every path from node 0, using an explicit
# stack of (node, peptide-so-far) pairs.
possibles = [(spec_graph[0], '')]
out_pep = []
while possibles:
    curr = possibles.pop()
    if not curr[0].children:
        # Dead end: keep the peptide only if the path stopped at the sink.
        if curr[0].value == final:
            out_pep.append(curr[1])
    else:
        for child, acid in curr[0].children.items():
            possibles.append((spec_graph[child], curr[1]+acid))

# Print only candidates whose ideal spectrum equals the input list exactly.
for item in out_pep:
    spec_out = ideal_spec(item)
    if spec_out == spectrum:
        print(item)

In [None]:
'''
Given an amino acid string Peptide = a1 . . . an of length n, we will represent its prefix masses using a binary peptide 
vector Peptide' with mass(Peptide) coordinates. This vector contains a 1 at each of the n prefix coordinates
mass(a1), mass(a1 a2), . . . , mass(a1 a2 . . . an ) , and it contains a 0 in each of the remaining noise coordinates. The 
toy peptide XZZXX, whose prefix masses are 4, 9, 14, 18, and 22, corresponds to the peptide vector 
(0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1) of length 22.
1. Converting a Peptide into a Peptide Vector Problem: Convert a peptide into a peptide vector.
     Input: An amino acid string Peptide.
     Output: The peptide vector Peptide'.
Sample Input:
XZZXX
Sample Output:
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1
2. Converting a Peptide Vector into a Peptide Problem: Convert a peptide vector into a peptide.
     Input: A binary vector P.
     Output: A peptide whose peptide vector is equal to P (if such a peptide exists).
Sample Input:
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1
Sample Output:
XZZXX
'''

In [None]:
#1. Converting a Peptide into a Peptide Vector Problem

massTable = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113,
    'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97,
    'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163, 'X': 4, 'Z': 5,
}

def get_mass(peptide):
    """Return the total mass of ``peptide`` according to ``massTable``
    (0 for an empty peptide)."""
    total = 0
    if len(peptide) != 0:
        for residue in peptide:
            total += massTable[residue]
    return total

def peptide_vector(peptide):
    """Convert ``peptide`` into its binary peptide vector.

    The result has one coordinate per unit of mass and holds a 1 at every
    prefix mass (position ``mass - 1``) and 0 elsewhere.  An empty peptide
    has no prefix masses and gives an empty vector.  Raises KeyError for a
    residue that is missing from ``massTable``.
    """
    if len(peptide) == 0:
        # max() of an empty list would raise; the empty vector is the answer.
        return []
    # Running total instead of re-summing every prefix from scratch.
    prefix_masses = []
    running = 0
    for residue in peptide:
        running += massTable[residue]
        prefix_masses.append(running)
    pepVec = [0] * max(prefix_masses)
    for prefix_mass in prefix_masses:
        pepVec[prefix_mass - 1] = 1
    return pepVec

# Sample from the problem statement, using the toy alphabet {X: 4, Z: 5}.
peptide = 'XZZXX'
ret = peptide_vector(peptide)
# Output format: the vector entries separated by single spaces.
print(' '.join(map(str,ret)))

In [None]:
#2. Converting a Peptide Vector into a Peptide Problem

massTable = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113,
    'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97,
    'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163, 'X': 4, 'Z': 5,
}

def get_mass(peptide):
    """Sum the ``massTable`` masses of the residues in ``peptide``; an empty
    peptide weighs 0."""
    if len(peptide) > 0:
        return sum(map(massTable.__getitem__, peptide))
    return 0

def spectrum_graph(Spectrum):
    Spectrum = sorted(Spectrum)
    specGraph = defaultdict(list)
    weighGraph = defaultdict(list)
    for i in range(len(Spectrum)):
        for j in range(i+1,len(Spectrum)):
            if Spectrum[j]-Spectrum[i] in massTable.values():
                aa = [ key for key,val in massTable.items() if val == Spectrum[j]-Spectrum[i]]
                specGraph[Spectrum[i]].append(Spectrum[j])
                weighGraph[Spectrum[i]].append(aa[0])
    specGraph[max(Spectrum)] = []
    weighGraph[max(Spectrum)] = []
    return specGraph,weighGraph

def depthFirstPaths(graph,s):
    """Prepare the bookkeeping for a depth-first search from ``s`` and run it.

    Every key of ``graph`` starts with no predecessor and unmarked.  The
    search itself is delegated to ``dfs``, whose three results
    ``(edgeTo, marked, postorder)`` are returned unchanged.
    """
    edgeTo = {vertex: None for vertex in graph.keys()}
    marked = {vertex: False for vertex in graph.keys()}
    edgeTo, marked, postorder = dfs(graph, s, edgeTo, marked, [])
    return edgeTo, marked, postorder

def pathTo(graph,weights,s,v,edgeTo):
    """Follow ``edgeTo`` back from ``v`` to ``s`` and return the path.

    Returns ``(vertices, labels)``: the vertices from ``s`` to ``v`` in
    forward order and the edge labels along that path joined into one
    string.  The label of an edge ``w -> x`` is found in ``weights[w]`` at
    the position ``x`` has in ``graph[w]``.
    """
    backwards = [v]
    labels = []
    child, parent = v, edgeTo[v]
    while True:
        labels.append(weights[parent][graph[parent].index(child)])
        if not parent != s:
            break
        backwards.append(parent)
        child, parent = parent, edgeTo[parent]
    backwards.append(s)
    backwards.reverse()
    labels.reverse()
    return backwards, ''.join(labels)


def from_peptide_vector(vector):
    """Return the masses marked in ``vector``, preceded by 0.

    A 1 at position ``i`` (0-based) stands for mass ``i + 1``.
    """
    marked = []
    for position in range(len(vector)):
        if vector[position] == 1:
            marked.append(position + 1)
    return [0] + marked

def ideal_spectrum(peptide):
    """Return the list of prefix masses of ``peptide``, shortest prefix
    first and ending with the full mass (computed with ``get_mass``)."""
    return [get_mass(peptide[:end]) for end in range(1, len(peptide) + 1)]

def decode_ideal_spec(spec):
    """Return a peptide spelled by a path from ``min(spec)`` to ``max(spec)``.

    The spectrum graph is built, a depth-first search from the source
    provides the predecessor links, and the residues along the path found
    by ``pathTo`` are read off the graph and joined.
    """
    specGraph, weighGraph = spectrum_graph(spec)
    source, sink = min(spec), max(spec)
    edgeTo, _marked, _postorder = depthFirstPaths(specGraph, source)
    paths, _labels = pathTo(specGraph, weighGraph, source, sink, edgeTo)
    return ''.join(
        weighGraph[paths[step]][specGraph[paths[step]].index(paths[step + 1])]
        for step in range(len(paths) - 1)
    )

# Sample peptide vector from the problem statement (toy alphabet X=4, Z=5).
pepvector = [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
# Turn the vector into a list of masses with a leading 0, then decode it.
spec = from_peptide_vector(pepvector)
print(decode_ideal_spec(spec))

In [None]:
'''
Given a spectral vector Spectrum', our goal is to find a peptide Peptide maximizing Score(Peptide, Spectrum'). Since the mass 
of a peptide and the parent mass of the spectrum that it generates should be the same, a peptide vector should have the same 
length as the spectral vector under consideration. We will therefore define the score between a peptide vector and a spectral 
vector of different length as −∞.
Peptide Sequencing Problem: Given a spectral vector, find a peptide with maximum score against this spectrum.
     Input: A spectral vector Spectrum'.
     Output: An amino acid string Peptide that maximizes Score(Peptide', Spectrum') among all possible amino acid strings.
Given a spectral vector Spectrum' = (s1, . . . , sm), we will construct a DAG on m + 1 nodes, labeled with the integers from 
0 (source) to m (sink), and then connect node i to node j by a directed edge if j − i is equal to the mass of an amino acid. 
We will further assign weight si to node i (for 1 ≤ i ≤ m) and assign weight zero to node 0.
Any path connecting source to sink in this DAG corresponds to an amino acid string Peptide, and the total weight of nodes on 
this path is equal to Score(Peptide', Spectrum'). We have therefore reduced the Peptide Sequencing Problem to the problem of 
finding a maximum-weight path from source to sink in a node-weighted DAG.
Solve the Peptide Sequencing Problem.
     Given: A space-delimited spectral vector Spectrum'.
     Return: An amino acid string with maximum score against Spectrum'. For masses
     with more than one amino acid, any choice may be used.
Note: When a spectral vector Spectrum' = s1 ... sm is given, it does not have a zero-th element; in your implementations, you 
should assume that s0 is equal to zero.
Input
20 2 -14 -4 -10 -4 5 16 20 -12 -1 -9 11 -11 12 3 -1 0 3 21 3 -2 10 11 -11 15 17 2 4 8 -19 28 28 29 1 21 27 -15 25 -15 10 10 26 -9 -13 7 -6 9 27 -3 2 -12 0 20 26 -14 15 29 10 30 -17 25 21 -6 26 25 24 6 9 29 -13 27 16 -5 27 25 -3 -20 3 12 27 5 29 3 -13 9 6 12 14 -14 -17 -8 -13 17 6 20 0 -20 -7 -4 -12 18 7 11 3 -8 23 0 -6 27 6 -20 6 1 15 -14 -20 -3 22 -13 6 10 10 -18 -6 -1 -14 8 16 -4 -12 -12 -11 30 11 -4 -11 10 -15 23 -19 27 -17 -3 -3 13 12 -19 5 22 -8 19 21 -6 21 -20 -6 4 -20 -5 17 0 2 13 24 22 6 20 -3 17 -12 24 -18 5 -12 -9 -17 17 -3 29 9 -12 1 -17 -4 8 2 17 -20 28 -15 -11 26 16 19 4 23 -18 22 -15 1 16 14 30 3 12 -5 27 14 -5 29 5 13 16 -3 4 -9 -20 11 11 12 13 2 13 -2 -15 3 4 -20 28 -8 4 28 5 28 16 -3 3 19 17 6 -5 6 -15 6 15 -8 29 22 -3 -5 -5 16 24 14 26 -19 13 -17 22 16 -20 -7 14 -11 -10 -19 30 14 28 20 14 12 -11 -19 -4 -19 -20 22 2 -5 9 -10 -4 -12 15 8 22 27 5 9 18 -13 27 27 -12 8 28 29 -12 -9 29 -15 -7 7 18 -6 -4 -15 -11 23 25 27 -8 3 22 29 7 16 -8 11 -17 -9 5 22 4 -17 7 -18 2 0 -8 2 0 3 -6 23 -3 -17 21 9 -6 7 3 20 9 24 -11 21 1 25 30 -17 -11 12 -13 8 7 -12 -17 0 -1 19 20 -11 13 -6 -14 6 -4 27 6 -18 -4 21 20 -11 21 16 -15 2 -8 8 24 24 -15 -12 -5 -20 -15 27 29 25 -19 -20 14 -16 7 18 -15 25 11 -12 -5 1 -4 12 14 -3 -11 14 -14 25 4 21 -2 -1 -17 23 -3 -9 19 26 8 -18 2 3 20 19 -15 -20 3 26 -3 11 -14 8 25 -16 8 8 2 23 29 -10 28 -16 13 -1 11 22 -20 8 18 2 -16 -17 18 30 0 -19 23 -6 -10 11 24 16 27 19 -16 -17 -9 -8 -5 1 5 0 15 9 24 10 18 10 -7 -9 12 10 21 27 -10 4 -13 -17 -1 3 -17 0 -7 -20 20 29 -13 -11 -4 -12 11 3 25 26 20 27 14 29 17 19 -2 8 17 -8 -8 14 -15 -5 0 23 0 23 22 24 13 1 25 -17 -16 7 18 4 24 -9 10 -12 17 8 28 3 29 -2 26 -8 -3 22 23 -19 1 18 -8 -4 21 29 29 6 -5 5 24 -9 -12 -9 -10 3 5 0 -7 23 22 13 11 16 -20 28 -8 -17 -3 13 13 24 8 13 14 6 -5 -3 20 24 -19 14 -9 -6 16 3 28 -20 -18 11 -18 15 7 29 5 11 -14 21 6 16 -4 16 -10 -17 7 10 2 25 -16 1 16 -19 20 5 1 -3 5 11 21 21 -1 6 -11 -7 28 -8 8 -10 3 24 -9 -4 21 -6 1 -1 11 -13 -12 20 20 -13 -14 -20 -19 26 15 -16 -5 -9 
10 4 5 -10 23 14 22 9 -16 16 21 9 15 3 30 6 8 22 25 24 22 1 -16 -3 19 6 -4 -7 -6 25 -13 13 15 21 10 30 -12 19 -1 -2 -19 14 29 -16 12 -17 -8 -12 5 8 25 18 22 7 14 13 25 -20
Output
GGPGGPGGAGG
'''

class PeptideSequencing:
    """Solve the Peptide Sequencing Problem for one spectral vector.

    Instantiating the class reads the vector from the input file, finds a
    maximum-score peptide, prints it and writes it to ``result.txt``.
    """
    def __init__(self):
        massDict, aaDict = self.AminoAcidMassDict()
        spectralVector = self.readFromFile()
        peptide = self.findPeptide(spectralVector, massDict)
        print(peptide)
        with open('result.txt', 'w') as f:
            f.write(peptide)
        
    def AminoAcidMassDict(self):
        """Return ``(mass -> residue, residue -> mass)`` for the 20 amino acids.

        Where two residues share a mass, the mass -> residue table keeps the
        one listed later.
        """
        massTable = '''
G 57
A 71
S 87
P 97
V 99
T 101
C 103
I 113
L 113
N 114
D 115
K 128
Q 128
E 129
M 131
H 137
F 147
R 156
Y 163
W 186'''
        mass = massTable.split()
        return {int(mass[i+1]):mass[i] for i in range(0, len(mass), 2)}, {mass[i]:int(mass[i+1]) for i in range(0, len(mass), 2)}
    
    def _input(self):
        """Read the spectral vector from standard input and prepend s0 = 0."""
        data = sys.stdin.read().strip().split()
        spectralVector = list(map(int, data))
        spectralVector.insert(0, 0)
        return spectralVector
    
    def readFromFile(self, path='/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/peptide_sequencing.txt'):
        """Read the spectral vector from ``path`` and prepend s0 = 0.

        Every whitespace-separated integer in the file is used, matching
        ``_input``, so a vector wrapped over several lines is read in full.
        The file is closed on exit.
        """
        with open(path, 'r') as f:
            data = f.read().split()
        spectralVector = list(map(int, data))
        spectralVector.insert(0, 0)
        return spectralVector
    
    def findPeptide(self, spectralVector, massDict):
        """Return a peptide whose prefix masses maximise the summed scores.

        spectralVector -- node weights with ``spectralVector[0]`` for the
                          source; the last index is the sink.
        massDict       -- mapping mass -> residue letter.

        Returns '' when the vector is empty or the sink cannot be reached
        from the source with the given masses.
        """
        l = len(spectralVector)
        if l == 0:
            return ''
        masses = sorted(m for m in massDict if m > 0)

        # Every edge goes from a smaller to a larger index, so one pass in
        # index order is already a topological-order longest-path pass.
        unreachable = float('-inf')
        dist = [unreachable] * l
        parent = [None] * l
        dist[0] = 0
        for u in range(l):
            if dist[u] == unreachable:
                continue
            for m in masses:
                v = u + m
                if v >= l:
                    break
                candidate = dist[u] + spectralVector[v]
                # Strict comparison: on ties the smallest predecessor stays.
                if candidate > dist[v]:
                    dist[v] = candidate
                    parent[v] = u

        u = l-1
        if dist[u] == unreachable:
            return ''
        path = [u]
        while 0 != u:
            u = parent[u]
            path.append(u)
        path.reverse()

        peptide = ''.join([massDict[path[i+1]-path[i]] for i in range(len(path)-1)])
        return peptide
    
# Constructing the object runs the whole read / solve / print / save pipeline.
PeptideSequencing()

In [None]:
'''
Despite many attempts, researchers have still not devised a scoring function that reliably assigns the highest score to the 
biologically correct peptide, i.e., the peptide that generated the spectrum. Fortunately, although the correct peptide often 
does not achieve the highest score among all peptides, it typically does score highest among all peptides limited to the 
species’s proteome. As a result, we can transition from peptide sequencing to peptide identification by limiting our search to 
peptides present in the proteome, which we concatenate into a single amino acid string Proteome.
Peptide Identification Problem: Find a peptide from a proteome with maximum score against a spectrum.
     Input: A spectral vector Spectrum and an amino acid string Proteome.
     Output: An amino acid string Peptide that maximizes Score(Peptide, Spectrum) among all substrings of Proteome.
Sample Input:
0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 8
XZZXZXXXZXZZXZXXZ
Sample Output:
ZXZXX

'''
massTable = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, \
            'I': 113, 'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, \
            'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, \
            'V': 99, 'Y': 163,'X':4,'Z':5}

def get_mass(peptide):
    """Return the total mass of `peptide`.

    Each residue symbol is looked up in the module-level massTable and the
    masses are added up; an empty peptide has mass 0.
    Raises KeyError for a symbol that is not in massTable.
    """
    total = 0
    for residue in peptide:
        total += massTable[residue]
    return total

def peptide_vector(peptide, table=None):
    """Return the binary peptide vector of `peptide`.

    The vector has one entry per unit of mass up to the largest prefix mass;
    entry m-1 is 1 when m is the mass of some prefix of the peptide, else 0.

    peptide -- string (or sequence) of residue symbols.
    table   -- optional residue -> integer mass mapping; defaults to the
               module-level massTable.

    Returns a list of 0/1 ints, or [] for an empty peptide (this used to
    raise ValueError from max() on an empty list).
    Raises KeyError for a residue missing from the table.
    """
    masses = massTable if table is None else table
    # Running total instead of re-summing every prefix from scratch.
    prefixMasses = []
    total = 0
    for residue in peptide:
        total += masses[residue]
        prefixMasses.append(total)
    if not prefixMasses:
        return []
    pepVec = [0] * max(prefixMasses)
    for mass in prefixMasses:
        pepVec[mass - 1] = 1
    return pepVec

def peptide_identification(proteome, specVec, table=None):
    """Find the substring of `proteome` that scores best against `specVec`.

    A substring is a candidate when its total mass equals len(specVec). Its
    score is the sum of the spectral-vector entries at its prefix masses
    (mass m uses specVec[m-1]). The first candidate, in order of start
    position, that reaches the highest score is returned.

    proteome -- string of residue symbols.
    specVec  -- list of ints, one per unit of mass (no leading padding entry).
    table    -- optional residue -> integer mass mapping; defaults to the
                module-level massTable. Masses are assumed to be positive, so
                prefix masses grow strictly with peptide length.

    Returns the winning peptide as a string, or [] when no substring has the
    required mass (the empty list is kept so existing `len(...) != 0` checks
    keep working).
    Raises KeyError for a residue missing from the table.
    """
    masses = massTable if table is None else table
    targetMass = len(specVec)
    bestScore = float('-inf')
    pepHit = []
    for start in range(len(proteome)):
        prefixMass = 0
        score = 0
        # `end` is exclusive and must be allowed to reach len(proteome);
        # stopping one short would ignore every substring that ends on the
        # last residue.
        for end in range(start + 1, len(proteome) + 1):
            prefixMass += masses[proteome[end - 1]]
            if prefixMass > targetMass:
                # The prefix is already too heavy; longer ones only get heavier.
                break
            score += specVec[prefixMass - 1]
            if prefixMass == targetMass:
                if score > bestScore:
                    bestScore = score
                    pepHit = proteome[start:end]
                break
    return pepHit

# Sample spectral vector and proteome from the problem statement above.
specVec = [int(token) for token in '0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 8'.split()]
proteome = 'XZZXZXXXZXZZXZXXZ'

# Bare last expression: the notebook displays the returned peptide.
peptide_identification(proteome, specVec)

In [None]:
'''
Like peptide sequencing algorithms, peptide identification algorithms may return an erroneous peptide, particularly if the 
score of the highest-scoring peptide found in the proteome is much lower than the score of the highest-scoring peptide over 
all peptides. For this reason, biologists usually establish a score threshold and only pay attention to a solution of the 
Peptide Identification Problem if its score is at least equal to the threshold.
Given a set of spectral vectors SpectralVectors, an amino acid string Proteome, and a score threshold threshold, we will solve 
the Peptide Identification Problem for each vector Spectrum' in SpectralVectors and identify a peptide Peptide having maximum 
score for this spectral vector over all peptides in Proteome (ties are broken arbitrarily). If Score(Peptide, Spectrum) is 
greater than or equal to threshold, then we conclude that Peptide is present in the sample and call the pair 
(Peptide, Spectrum') a Peptide-Spectrum Match (PSM). The resulting collection of PSMs for SpectralVectors is denoted 
PSM_threshold(Proteome, SpectralVectors).
PSM Search Problem: Identify all Peptide-Spectrum Matches scoring above a threshold for a set of spectra and a proteome.
     Input: A set of spectral vectors SpectralVectors, an amino acid string Proteome, and an integer threshold.
     Output: The set PSMthreshold(Proteome, SpectralVectors).
Sample Input:
-1 5 -4 5 3 -1 -4 5 -1 0 0 4 -1 0 1 4 4 4
-4 2 -2 -4 4 -5 -1 4 -1 2 5 -3 -1 3 2 -3
XXXZXZXXZXZXXXZXXZX
5 
Sample Output:
XZXZ
PSMSearch(SpectralVectors, Proteome, threshold).
    PSMSet ← an empty set
    for each vector Spectrum' in SpectralVectors
          Peptide ← PeptideIdentification(Spectrum', Proteome)
          if Score(Peptide, Spectrum') ≥ threshold
              add the PSM (Peptide, Spectrum') to PSMSet
    return PSMSet
    
'''
# Integer residue masses, including the two low-mass symbols 'X' (4) and
# 'Z' (5) that the sample proteome in this cell uses. Key order is kept as in
# the original literal.
massTable = dict(
    A=71, C=103, E=129, D=115, G=57, F=147,
    I=113, H=137, K=128, M=131, L=113, N=114,
    Q=128, P=97, S=87, R=156, T=101, W=186,
    V=99, Y=163, X=4, Z=5,
)

def get_mass(peptide):
    """Return the total mass of `peptide`.

    Each residue symbol is looked up in the module-level massTable and the
    masses are added up; an empty peptide has mass 0.
    Raises KeyError for a symbol that is not in massTable.
    """
    total = 0
    for residue in peptide:
        total += massTable[residue]
    return total

def peptide_vector(peptide, table=None):
    """Return the binary peptide vector of `peptide`.

    The vector has one entry per unit of mass up to the largest prefix mass;
    entry m-1 is 1 when m is the mass of some prefix of the peptide, else 0.

    peptide -- string (or sequence) of residue symbols.
    table   -- optional residue -> integer mass mapping; defaults to the
               module-level massTable.

    Returns a list of 0/1 ints, or [] for an empty peptide (this used to
    raise ValueError from max() on an empty list).
    Raises KeyError for a residue missing from the table.
    """
    masses = massTable if table is None else table
    # Running total instead of re-summing every prefix from scratch.
    prefixMasses = []
    total = 0
    for residue in peptide:
        total += masses[residue]
        prefixMasses.append(total)
    if not prefixMasses:
        return []
    pepVec = [0] * max(prefixMasses)
    for mass in prefixMasses:
        pepVec[mass - 1] = 1
    return pepVec

def peptide_identification(proteome, specVec, table=None):
    """Find the substring of `proteome` that scores best against `specVec`.

    A substring is a candidate when its total mass equals len(specVec). Its
    score is the sum of the spectral-vector entries at its prefix masses
    (mass m uses specVec[m-1]). The first candidate, in order of start
    position, that reaches the highest score is returned.

    proteome -- string of residue symbols.
    specVec  -- list of ints, one per unit of mass (no leading padding entry).
    table    -- optional residue -> integer mass mapping; defaults to the
                module-level massTable. Masses are assumed to be positive, so
                prefix masses grow strictly with peptide length.

    Returns the winning peptide as a string, or [] when no substring has the
    required mass (the empty list is kept so existing `len(...) != 0` checks
    keep working).
    Raises KeyError for a residue missing from the table.
    """
    masses = massTable if table is None else table
    targetMass = len(specVec)
    bestScore = float('-inf')
    pepHit = []
    for start in range(len(proteome)):
        prefixMass = 0
        score = 0
        # `end` is exclusive and must be allowed to reach len(proteome);
        # stopping one short would ignore every substring that ends on the
        # last residue.
        for end in range(start + 1, len(proteome) + 1):
            prefixMass += masses[proteome[end - 1]]
            if prefixMass > targetMass:
                # The prefix is already too heavy; longer ones only get heavier.
                break
            score += specVec[prefixMass - 1]
            if prefixMass == targetMass:
                if score > bestScore:
                    bestScore = score
                    pepHit = proteome[start:end]
                break
    return pepHit

def PSMSearch(SpectralVectors, Proteome, threshold):
    """Collect the peptides that form a Peptide-Spectrum Match.

    For every spectral vector the best-scoring proteome substring is looked up
    with peptide_identification(); it is kept when its score against that
    vector is at least `threshold`.

    SpectralVectors -- iterable of strings, each a whitespace-separated list
                       of integers.
    Proteome        -- string of residue symbols.
    threshold       -- minimum score (inclusive) for a match.

    Returns a set of peptide strings.

    NOTE: a class with the same name is defined in a later cell of this
    notebook and rebinds `PSMSearch` once that cell has run.
    """
    PSMset = set()
    for rawVector in SpectralVectors:
        specVec = list(map(int, rawVector.split()))
        peptide = peptide_identification(Proteome, specVec)
        # peptide_identification signals "no candidate" with an empty result.
        if len(peptide) == 0:
            continue
        # Only the peptide is returned, so its score is recomputed here.
        pepVec = peptide_vector(peptide)
        score = sum(pepVec[i] * specVec[i] for i in range(len(pepVec)))
        if score >= threshold:
            PSMset.add(peptide)
    return PSMset

# Sample input from the problem statement above. The second vector previously
# began with an extra '0' that is not part of the stated sample.
specVecs = [
    '-1 5 -4 5 3 -1 -4 5 -1 0 0 4 -1 0 1 4 4 4',
    '-4 2 -2 -4 4 -5 -1 4 -1 2 5 -3 -1 3 2 -3',
]
proteome = 'XXXZXZXXZXZXXXZXXZX'
threshold = 5

peptides = PSMSearch(specVecs, proteome, threshold)
print('\n'.join(peptides))

In [None]:
#Use this for PSM search
class PSMSearch:
    """File-driven Peptide-Spectrum Match search.

    Constructing an instance runs the whole pipeline: read the spectral
    vectors, proteome and threshold from the input file, find the
    best-scoring proteome substring for every spectrum, print the matches and
    write them to the output file, one per line.
    """

    # Locations used when no path is given (the ones the notebook always used).
    DEFAULT_INPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'
    DEFAULT_OUTPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/result.txt'

    def __init__(self, inputPath=None, outputPath=None):
        """Run the search.

        inputPath  -- file to read; defaults to DEFAULT_INPUT.
        outputPath -- file to write; defaults to DEFAULT_OUTPUT.
        """
        _, aaDict = self.AminoAcidMassDict()
        spectralVectorSet, threshold, proteome = self.readFromFile(inputPath)
        PSMSet = self.searchPSM(spectralVectorSet, threshold, proteome, aaDict)
        for peptide in PSMSet:
            print(peptide)
        target = self.DEFAULT_OUTPUT if outputPath is None else outputPath
        with open(target, 'w') as f:
            for peptide in PSMSet:
                f.write(peptide+'\n')

    def AminoAcidMassDict(self):
        """Return (massDict, aaDict) for the 20 standard amino acids.

        massDict maps integer mass -> one-letter code; where two residues
        share a mass (I/L at 113, K/Q at 128) the later one in the table wins.
        aaDict maps one-letter code -> integer mass.
        """
        pairs = (
            ('G', 57), ('A', 71), ('S', 87), ('P', 97), ('V', 99),
            ('T', 101), ('C', 103), ('I', 113), ('L', 113), ('N', 114),
            ('D', 115), ('K', 128), ('Q', 128), ('E', 129), ('M', 131),
            ('H', 137), ('F', 147), ('R', 156), ('Y', 163), ('W', 186),
        )
        massDict = {mass: aa for aa, mass in pairs}
        aaDict = {aa: mass for aa, mass in pairs}
        return massDict, aaDict

    def readFromFile(self, path=None):
        """Parse the input file.

        Expected layout: one spectral vector per line (whitespace-separated
        integers), then the proteome on the second-to-last line and the
        threshold on the last line. Blank lines are ignored.

        Returns (spectralVectorSet, threshold, proteome); every spectral
        vector gets a leading 0 so that index m holds the entry for mass m.
        """
        source = self.DEFAULT_INPUT if path is None else path
        # The context manager closes the file, which the old code never did.
        with open(source, 'r') as f:
            data = [line.split() for line in f if line.strip()]
        threshold = int(data[-1][0])
        proteome = data[-2][0]
        spectralVectorSet = [[0]+list(map(int, d)) for d in data[:-2]]
        return spectralVectorSet, threshold, proteome

    def findPeptide(self, spectralVector, proteome, aaDict):
        """Best-scoring substring of `proteome` whose mass fits the spectrum.

        spectralVector -- list with a leading 0; index m is the entry for mass m.
        proteome       -- string of residue symbols.
        aaDict         -- residue -> integer mass (assumed positive).

        A substring qualifies when its mass equals len(spectralVector) - 1.
        Its score is the sum of the spectrum entries at its prefix masses.
        Returns (score, peptide) for the first substring reaching the highest
        score, or (-inf, '') when none qualifies.
        """
        l = len(spectralVector) - 1
        n = len(proteome)
        score = float('-inf')
        peptide = ''
        # Every start position is tried, including the last residue, which
        # the former range(n-1) left out.
        for i in range(n):
            mass = 0
            currScore = 0
            for j in range(i, n):
                # Running mass and score instead of re-summing proteome[i:j+1].
                mass += aaDict[proteome[j]]
                if mass > l:
                    break
                currScore += spectralVector[mass]
                if mass == l:
                    if currScore > score:
                        score = currScore
                        peptide = proteome[i:j+1]
                    break
        return score, peptide

    def searchPSM(self, spectralVectorSet, threshold, proteome, aaDict):
        """Return the distinct peptides whose best score is >= threshold.

        The result is a list in order of first appearance, without duplicates.
        """
        PSMSet = []
        for spectralVector in spectralVectorSet:
            score, peptide = self.findPeptide(spectralVector, proteome, aaDict)
            if score >= threshold and peptide not in PSMSet:
                PSMSet.append(peptide)
        return PSMSet

# Instantiating the class runs the whole search: __init__ reads the input file,
# prints the matching peptides and writes them to the result file.
PSMSearch()

In [None]:
'''
Define Pr(i, t) as the sum of probabilities of all peptides with mass i for which Score(Peptide, Spectrum'_i) is equal to t. 
The set of peptides contributing to Pr(i, t) can be split into 20 subsets depending on their final amino acid. Each peptide 
Peptide ending in a specific amino acid a results in a shorter peptide Peptide_a if we remove a; Peptide_a has mass i − |a| and 
score t − s_i. Since the probability of Peptide is 20 times smaller than the probability of Peptide_a, the contribution of Peptide 
to Pr(i, t) is 20 times smaller than the contribution of Peptide_a to Pr(i − |a|, t − s_i). Therefore, Pr(i, t) can be computed as
    Pr(i, t) = ∑_{all amino acids a} (1/20) · Pr(i − |a|, t − s_i),
which differs from the recurrence for computing Size(i, t) only in the presence of the factor 1/20. We can now compute the 
probability of a spectral dictionary as 
    Pr(Dictionary_threshold(Spectrum')) = ∑_{t ≥ threshold} Pr(m, t).
Solve the Probability of Spectral Dictionary Problem.
     Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
     Return: The probability of the dictionary Dictionarythreshold(Spectrum').
Note: Use the provided max_score for the height of your table.
Input - trial.txt --
-10 11 3 10 11 12 -6 -5 4 4 -2 9 6 -8 9 -6 -1 10 -6 14 4 13 1 -6 5 -7 13 0 -1 12 -2 11 7 -10 9 13 14 -7 7 -9 -6 4 14 2 -9 1 12 13 15 6 15 13 -6 -10 -10 -8 -8 -7 -10 -7 -6 -4 6 9 -6 7 11 -1 -8 1 9 -5 6 7 -3 -10 -9 -1 4 7 7 -6 14 -6 12 15 7 8 11 -5 8 -8 12 -3 -1 -7 -6 9 13 12 -3 7 7 6 3 1 2 4 10 11 -10 -3 14 9 6 8 -9 1 5 -6 -8 5 -7 6 -6 -7 4 1 -3 7 5 10 11 12 0 -10 12 13 11 3 9 8 -10 9 -8 0 15 4 1 1 -4 12 2 4 0 15 -10 4 -10 -10 6 -5 -5 0 10 -5 8 1 14 6 -3 12 9 -7 -4 -9 -9 7 2 6 4 -10 -9 8 -4 -5 0 7 -4 -3 5 12 -10 3 -6 -10 6 10 -6 3 -5 15 4 14 -1 10 -9 13 11 -7 -5 -3 14 15 6 -3 -8 -5 0 12 0 12 2 8 -1 6 2 4 -6 3 11 -4 -10 1 -5 0 14 -5 -6 -1 15 13 12 -10 6 4 0 14 -1 5 15 13 4 -6 13 12 7 14 6 15 10 -9 1 -8 10 9 6 6 2 9 -2 5 11 -4 -6 -10 -7 10 9 8 -6 1 -8 2 -1 -1 -4 -2 0 9 11 -6 9 11 5 5 14 7 -10 14 -4 7 4 14 14 14 8 2 5 14 -4 13 7 10 14 -7 -6 11 -7 -2 -6 -3 1 -7 7 10 15 -6 -2 0 14 1 9 -7 5 -3 -5 5 -5 0 -4 1 3 11 9 -4 -3 -4 0 1 -4 15 -8 -3 0 0 11 -9 11 5 -9 1 -1 -7 -3 8 -9 11 5 4 4 -7 11 -1 -4 -5 7 -7 7 3 6 13 -1 11 -3 13 11 4 3 2 3 0 12 -6 3 12 -10 -8 -9 12 -2 12 5 -3 5 11 5 1 -2 3 5 1 11 6 -6 -2 0 -7 15 14 15 -10 0 6 13 9 10 -2 10 2 8 6 -6 5 -2 1 13 8 14 1 -4 11 11 -8 0 8 5 5 9 -1 -7 3 15 -7 -8 -3 11 9 0 10 2 1 13 4 0 -6 15 15 -1 10 3 1 2
30
200
Output
0.00132187890625
'''

class SpectralDictProb:
    """Probability of a spectral dictionary, computed from an input file.

    Constructing an instance reads the spectral vector, threshold and maximum
    score, computes the probability, prints it and writes it to the output
    file.
    """

    # Locations used when no path is given (the ones the notebook always used).
    DEFAULT_INPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/dataset_11866_11.txt'
    DEFAULT_OUTPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/result.txt'

    def __init__(self, inputPath=None, outputPath=None):
        """Run the computation.

        inputPath  -- file to read; defaults to DEFAULT_INPUT.
        outputPath -- file to write; defaults to DEFAULT_OUTPUT.
        """
        massList = self.AminoAcidMassList()
        sVector, threshold, maxScore = self.readFromFile(inputPath)
        p = self.dictProb(sVector, threshold, maxScore, massList)
        print(p)
        target = self.DEFAULT_OUTPUT if outputPath is None else outputPath
        with open(target, 'w') as f:
            f.write(str(p))

    def AminoAcidMassList(self):
        """Return the integer masses of the 20 standard amino acids.

        Masses shared by two residues (113 and 128) appear twice, so the list
        has 20 entries.
        """
        return [57, 71, 87, 97, 99, 101, 103, 113, 113, 114,
                115, 128, 128, 129, 131, 137, 147, 156, 163, 186]

    def readFromFile(self, path=None):
        """Parse the input file.

        Expected layout: the spectral vector (whitespace-separated integers),
        then the threshold, then the maximum score, one per line. Blank lines
        are ignored.

        Returns (sVector, threshold, maxScore); sVector gets a leading 0 so
        that index i holds the entry for mass i.
        """
        source = self.DEFAULT_INPUT if path is None else path
        # The context manager closes the file, which the old code never did.
        with open(source, 'r') as f:
            data = [line.split() for line in f if line.strip()]
        sVector = [0] + list(map(int, data[0]))
        threshold = int(data[1][0])
        maxScore = int(data[2][0])
        return sVector, threshold, maxScore

    def dictProb(self, sVector, threshold, maxScore, massList):
        """Sum Pr(m, t) over threshold <= t <= maxScore, m = len(sVector) - 1."""
        # Memo table for getProb; the empty peptide (mass 0, score 0) has
        # probability 1.
        prob = {(0, 0): 1}
        m = len(sVector) - 1
        return sum([self.getProb(m, t, sVector, massList, prob)
                    for t in range(threshold, maxScore+1)])

    def getProb(self, i, t, sVector, massList, prob):
        """Memoised Pr(i, t).

        Pr(i, t) is the sum over all masses a in massList of
        Pr(i - a, t - sVector[i]) / 20. Negative mass or negative score
        contributes 0. `prob` is the shared cache and is updated in place.
        """
        key = (i, t)
        if key in prob:
            return prob[key]
        if i < 0 or t < 0:
            prob[key] = 0
            return 0
        prevScore = t - sVector[i]
        # Each term is divided on its own, as before, so floating-point
        # results are unchanged.
        p = sum([self.getProb(i-m, prevScore, sVector, massList, prob)/20 for m in massList])
        prob[key] = p
        return p
    
# Instantiating the class runs the computation: __init__ reads the input file,
# prints the probability and writes it to the result file.
SpectralDictProb()

In [None]:
#or u can use this as well for spectral dict probability
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
# Key order is kept as in the original literal.
massTable = dict(
    A=71, C=103, E=129, D=115, G=57, F=147,
    I=113, H=137, K=128, M=131, L=113, N=114,
    Q=128, P=97, S=87, R=156, T=101, W=186,
    V=99, Y=163,
)

#massTable = {'X':4,'Z':5}

def get_size_spectral_dictionary(spectrum, threshold, maxscore, table=None):
    """Return the probability of the spectral dictionary of `spectrum`.

    Despite its name this computes the probability, not the size: every
    amino acid added to a peptide multiplies its weight by
    1 / (number of entries in the mass table), which is 1/20 for the default
    table.

    spectrum  -- list of ints with a leading 0, so index i is the entry for
                 mass i.
    threshold -- lowest final score (inclusive) that is counted.
    maxscore  -- height of the table; only scores in 0..maxscore are tracked,
                 for intermediate prefixes as well as the final peptide.
    table     -- optional residue -> integer mass mapping; defaults to the
                 module-level massTable.

    Returns the sum of Pr(m, t) for threshold <= t <= maxscore, where
    m = len(spectrum) - 1.
    """
    masses = list((massTable if table is None else table).values())
    alphabetSize = len(masses)
    width = len(spectrum)

    # prob[t][i]: total probability of peptides of mass i with score t.
    prob = [[0] * width for _ in range(maxscore + 1)]
    prob[0][0] = 1  # the empty peptide

    for i in range(1, width):
        for t in range(maxscore + 1):
            prevScore = t - spectrum[i]
            total = 0
            if 0 <= prevScore <= maxscore:
                for m in masses:
                    if i - m >= 0:
                        total = total + prob[prevScore][i - m]
            # Extending a peptide by one residue divides its probability by
            # the alphabet size.
            prob[t][i] = float(total) / alphabetSize

    ret = 0
    for t in range(maxscore + 1):
        if t >= threshold:
            ret += prob[t][width - 1]
    return ret

# Input layout: spectral vector on the first line, threshold on the second,
# maximum score on the third.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt') as f:
    lines = f.read().splitlines()
threshold = int(lines[1])
maxscore = int(lines[2])

# split() without an argument tolerates repeated or trailing blanks, which
# split(' ') would turn into empty tokens that int() rejects. The leading 0
# makes index i of the vector correspond to mass i.
spectrum = [0] + [int(token) for token in lines[0].split()]
specSize = get_size_spectral_dictionary(spectrum,threshold,maxscore)
print(specSize)

In [None]:
'''
Size of Spectral Dictionary Problem: Find the size of a spectral dictionary for a given spectrum and score threshold.
     Input: A spectral vector Spectrum' and an integer threshold.
     Output: The number of peptides in Dictionarythreshold(Spectrum').
We will use dynamic programming to solve the Size of Spectral Dictionary Problem. Given a spectral vector Spectrum' = 
(s1, . . . , sm), we define its i-prefix (for i between 1 and m) as Spectrum'i = (s1, . . . , si ) and introduce a variable 
Size(i, t) as the number of peptides Peptide of mass i such that Score(Peptide, Spectrum'i) is equal to t.
The key to establishing a recurrence relation for computing Size(i, t) is to realize that the set of peptides contributing 
to Size(i, t) can be split into 20 subsets depending on their final amino acid a. Each peptide ending in a specific amino 
acid a results in a shorter peptide with mass i − |a| and score t − si   if we remove a from the peptide (here, |a| denotes 
the mass of a). Thus,
    Size(i, t) = ∑_{all amino acids a} Size(i − |a|, t − s_i).
Since there is a single “empty” peptide of length zero, we initialize Size(0, 0) = 1. We also define Size(0, t) = 0 for all 
possible scores t, and set Size(i, t) = 0 for negative values of i. Using the above recurrence, we can compute the size of a 
spectral dictionary of Spectrum' = (s1, . . . , sm) as
    |Dictionary_threshold(Spectrum')| = ∑_{t ≥ threshold} Size(m, t).
Solve the Size of Spectral Dictionary Problem.
     Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
     Return: The size of the dictionary Dictionarythreshold(Spectrum').
Note: Use the 20 amino acid alphabet as well as the provided max_score for the height of your table. Your answer should be the 
number of peptides whose score is at least T and at most max_score.

Input -- trial.txt
14 -4 -3 -3 5 9 0 14 2 1 -4 6 -1 13 2 -5 13 -8 -8 3 0 -10 14 4 14 14 8 -8 1 3 -10 -2 2 -9 3 6 13 -10 6 -8 12 2 8 -1 -5 -6 -6 10 3 -3 12 -4 14 3 11 14 15 12 -7 -5 -2 11 13 -9 15 -8 -10 5 -8 5 6 -9 -2 7 -6 -1 -2 12 12 -3 9 0 3 0 5 -6 3 3 -7 6 0 -6 5 8 -7 5 3 13 13 -2 -9 0 2 13 13 12 7 -2 -10 -5 -7 7 13 11 14 -4 -9 15 -10 5 -7 -6 -7 -6 11 5 9 8 -4 7 1 -9 12 2 8 12 -6 0 2 -5 10 11 14 15 -1 3 -3 3 -3 12 15 4 -2 14 13 8 -10 2 -3 0 -6 8 3 10 0 9 10 13 15 6 9 -10 -9 1 -3 -10 8 1 -10 2 -1 14 -3 15 -1 0 1 6 -7 5 12 6 -9 2 1 -2 14 -5 1 -8 -6 11 -5 2 -3 -8 7 -6 -10 8 6 13 -8 -5 -10 12 -5 -9 8 9 0 10 15 -1 4 2 -8 9 1 -9 -6 -8 -1 -1 5 10 -4 7 3 11 4 12 6 6 13 -3 12 -3 1 7 11 6 13 8 3 -6 5 11 4 -1 15 10 -8 -7 0 4 7 5 -4 8 -3 -4 -8 9 -2 -3 13 1 12 4 -1 13 -1 -5 -5 7 7 -7 -5 6 6 -2 -5 7 10 14 11 12 -9 6 -3 4 15 -8 11 -3 -7 5 -4 7 9 15 -9 8 13 6 -2 -3 9 6 5 14 10 -7 -9 -8 10 2 -3 -1 2 3 12 13 6 -2 8 -5 5 -3 -8 10 3 0 12 -7 10 6 15 8 7 -2 8 14 -2 13 -1 8 15 -7 -7 -7 7 -3 -2 5 -4 -3 15 11 -4 9 11 13 15 8 4 -6 7 12 14 6 -10 -5 -9 4 -9 13 -3 0 12 3 12 -5 11 1 15 -8 5 3 -5 7 15 -2 -9 0 0 1 1 -1 -4 -1 5 12 12 -5 8 5 14 12 5 -9 2 -10 -9 4 -2 6 5 -3 -7 7 5 -8 -10 8 -1 7 3 6 -6 14 -8 6 -5 -8 -10 14 -2 12 4 5 -2 9 -4 1 -5 -3 -6 -8 -9 -10 10 4 9 11 -7 6 -4 5 13 -8 -7 -3 10 14 4 10 6 4 0 13 -3 11 -9 2 -8 6 -8 4 -1
37
200
Output
330
'''

class SpectralDictSize:
    """Size of a spectral dictionary, computed from an input file.

    Constructing an instance reads the spectral vector, threshold and maximum
    score, counts the peptides and prints the count.
    """

    # Location used when no path is given (the one the notebook always used).
    DEFAULT_INPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'

    def __init__(self, inputPath=None):
        """Run the computation; `inputPath` defaults to DEFAULT_INPUT."""
        massList = self.AminoAcidMassList()
        sVector, threshold, maxScore = self.readFromFile(inputPath)
        s = self.dictSize(sVector, threshold, maxScore, massList)
        print(s)

    def AminoAcidMassList(self):
        """Return the integer masses of the 20 standard amino acids.

        Masses shared by two residues (113 and 128) appear twice, so the list
        has 20 entries and both residues are counted.
        """
        return [57, 71, 87, 97, 99, 101, 103, 113, 113, 114,
                115, 128, 128, 129, 131, 137, 147, 156, 163, 186]

    def readFromFile(self, path=None):
        """Parse the input file.

        Expected layout: the spectral vector (whitespace-separated integers),
        then the threshold, then the maximum score, one per line. Blank lines
        are ignored.

        Returns (sVector, threshold, maxScore); sVector gets a leading 0 so
        that index i holds the entry for mass i.
        """
        source = self.DEFAULT_INPUT if path is None else path
        # The context manager closes the file, which the old code never did.
        with open(source, 'r') as f:
            data = [line.split() for line in f if line.strip()]
        sVector = [0] + list(map(int, data[0]))
        threshold = int(data[1][0])
        maxScore = int(data[2][0])
        return sVector, threshold, maxScore

    def dictSize(self, sVector, threshold, maxScore, massList):
        """Sum Size(m, t) over threshold <= t <= maxScore, m = len(sVector) - 1."""
        # Memo table for getSize; there is exactly one empty peptide
        # (mass 0, score 0).
        size = {(0, 0): 1}
        m = len(sVector) - 1
        return sum([self.getSize(m, t, sVector, massList, size)
                    for t in range(threshold, maxScore+1)])

    def getSize(self, i, t, sVector, massList, size):
        """Memoised Size(i, t).

        Size(i, t) is the sum over all masses a in massList of
        Size(i - a, t - sVector[i]). Negative mass or negative score
        contributes 0. `size` is the shared cache and is updated in place.
        """
        key = (i, t)
        if key in size:
            return size[key]
        if i < 0 or t < 0:
            size[key] = 0
            return 0
        prevScore = t - sVector[i]
        s = sum([self.getSize(i-m, prevScore, sVector, massList, size) for m in massList])
        size[key] = s
        return s
    
# Instantiating the class runs the computation: __init__ reads the input file
# and prints the dictionary size.
SpectralDictSize()

In [None]:
import sys
import queue
import numpy as np
from copy import deepcopy

'''
We will use the term block indel to refer to the addition or removal of a block of consecutive zeroes from a binary vector. 
Thus, applying k modifications to an amino acid string Peptide corresponds to applying k block indels to its peptide vector 
Peptide'. We define Variants_k(Peptide) as the set of all modified variants of Peptide with up to k modifications.
Given a peptide Peptide and a spectral vector Spectrum', our goal is to find a modified peptide from Variants_k(Peptide) with 
maximum score against Spectrum'.
Spectral Alignment Problem: Given a peptide and a spectral vector, find a modified variant of this peptide that maximizes the 
peptide-spectrum score, among all variants of the peptide with up to k modifications.
     Input: An amino acid string Peptide, a spectral vector Spectrum', and an integer k.
     Output: A peptide of maximum score against Spectrum' among all peptides in Variantsk(Peptide).
     
Input -- trial.txt
LVWSTE
-3 1 12 1 5 12 -4 10 0 9 -4 -9 -6 14 -10 8 11 -3 3 2 11 12 -10 -4 15 -5 2 1 4 -10 -10 7 9 10 8 -9 15 13 -9 7 7 1 13 15 0 -3 -2 7 6 6 7 15 -5 10 7 -4 3 11 14 10 -10 2 -8 12 0 0 -6 15 1 14 3 12 4 11 -4 -7 5 3 8 -10 3 7 11 8 5 15 7 13 3 1 -1 -1 9 5 7 3 9 10 3 -9 10 6 13 8 5 2 -5 14 3 14 12 -7 11 -3 15 11 -8 10 -4 14 -4 5 -2 8 -3 2 -7 12 -10 0 -3 14 0 6 14 0 14 7 -5 3 -3 -1 -3 7 8 -4 2 -6 10 5 13 -9 -3 10 1 4 0 11 12 5 -10 -6 1 -9 1 13 -3 7 -5 6 0 5 3 -9 3 -4 1 -6 -4 11 -10 8 -2 15 6 5 -7 12 3 11 5 15 -8 6 -4 8 10 -7 12 -3 -4 10 11 -4 -3 9 0 -5 1 -6 11 7 -3 -7 -9 13 10 -7 -6 14 15 -10 14 15 -8 5 1 10 -5 2 -7 4 14 10 7 -1 1 12 -7 7 -8 -1 12 3 -9 4 -7 6 8 10 8 5 6 12 4 4 4 12 -4 6 7 -10 7 -7 0 -1 -2 6 -10 -5 -4 5 -3 -1 1 2 10 -9 -6 14 9 -3 -8 -2 -4 -1 0 -10 6 10 -6 -3 4 -1 -7 0 6 3 -3 5 4 -4 0 3 0 -1 -1 -4 -10 2 -3 -9 -10 5 7 10 5 2 -8 2 -6 13 6 -7 5 5 -5 15 8 11 12 4 8 5 11 3 -7 14 3 8 0 2 -4 6 -7 1 8 13 -1 14 3 -9 12 -3 12 14 0 -2 11 11 13 3 15 -9 -5 2 1 -10 7 -8 -10 -6 -7 1 0 7 9 5 -9 5 0 -3 -2 13 14 -10 4 3 5 13 -3 -4 -5 11 14 5 1 -10 -1 4 -6 13 -6 1 -5 2 -2 3 12 0 -7 15 10 -10 11 7 0 9 -7 -5 9 2 -8 -9 -5 -6 13 7 11 -3 10 1 -2 0 6 -2 -4 5 5 -4 15 11 4 -7 10 -1 9 -2 5 13 -10 14 7 6 4 -8 1 12 12 2 5 13 7 6 -4 -2 7 4 -8 -7 8 14 -8 14 -2 -5 -9 -5 11 10 7 3 1 11 1 14 4 -9 14 11 12 -1 -3 -7 10 13 -7 4 6 8 2 1 -3 -3 -9 11 7 -1 5 -5 -6 -9 2 11 8 13 1 -6 7 5 3 2 2 6 12 -9 12 3 -3 -1 -10 -4 -4 10 10 14 12 -2 7 14 9 3 10 -5 5 -7 4 9 1 13 10 13 13 -8 6 4 10 -4 15 -9 -1 14 9 -10 3 15 0 -3 2 4 -5 1 8 3 15 12 3 0 9 5 1 3 5 14 -6 13 0 13 -7 4 -9 -2 -9 -3 8 8 13 13 8 4 -2 0 5 -9 -2 0 11 7 0 -2 -6 -6 11 2 -3 7 1 5 -10 7 -10 0 12 12 -2 7 -3 2 -6 13 8 -4 11 -10 -9 4 9 -8 14 -3 7 -2 -4 8 4 -3 -1 -2 -5 12 -9 8 15 10 10 13 10 6 4 -8 7 -9 -1 7 -2 14 -10 2 6 -9 6 9 0 6 15 10 2 6 12 10 9 -2 5 -7 4 -3 9 -6 6 1 6 14 13 14 3 4 9 -8 0 10 7 -6 3 11 7 9 -8 8 5 -8 8 4 10 0
3
Output
L(-61)VW(-9)STE(+69)
'''

class SpectralAlignment:
    """Spectral alignment of a peptide against a spectral vector.

    Constructing an instance reads the peptide, spectral vector and maximum
    number of modifications k from the input file, finds the best-scoring
    variant of the peptide with at most k modifications, prints it and writes
    it to the output file.
    """

    # Locations used when no path is given (the ones the notebook always used).
    DEFAULT_INPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/trial.txt'
    DEFAULT_OUTPUT = '/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/result.txt'

    def __init__(self, inputPath=None, outputPath=None):
        """Run the alignment.

        inputPath  -- file to read; defaults to DEFAULT_INPUT.
        outputPath -- file to write; defaults to DEFAULT_OUTPUT.
        """
        _, aaDict = self.AminoAcidMassDict()
        peptide, sVector, k = self.readFromFile(inputPath)
        mPeptide = self.constructAlignment(peptide, sVector, k, aaDict)
        print(mPeptide)
        target = self.DEFAULT_OUTPUT if outputPath is None else outputPath
        with open(target, 'w') as f:
            f.write(mPeptide)

    def AminoAcidMassDict(self):
        """Return (massDict, aaDict) for the 20 standard amino acids.

        massDict maps integer mass -> one-letter code; where two residues
        share a mass (I/L at 113, K/Q at 128) the later one in the table wins.
        aaDict maps one-letter code -> integer mass.
        """
        pairs = (
            ('G', 57), ('A', 71), ('S', 87), ('P', 97), ('V', 99),
            ('T', 101), ('C', 103), ('I', 113), ('L', 113), ('N', 114),
            ('D', 115), ('K', 128), ('Q', 128), ('E', 129), ('M', 131),
            ('H', 137), ('F', 147), ('R', 156), ('Y', 163), ('W', 186),
        )
        massDict = {mass: aa for aa, mass in pairs}
        aaDict = {aa: mass for aa, mass in pairs}
        return massDict, aaDict

    def readFromFile(self, path=None):
        """Parse the input file.

        Expected layout: the peptide, then the spectral vector
        (whitespace-separated integers), then k, one per line. Blank lines
        are ignored.

        Returns (peptide, sVector, k); sVector gets a leading 0 so that index
        j holds the entry for mass j.
        """
        source = self.DEFAULT_INPUT if path is None else path
        # The context manager closes the file, which the old code never did.
        with open(source, 'r') as f:
            data = [line.split() for line in f if line.strip()]
        peptide = data[0][0]
        sVector = [0] + list(map(int, data[1]))
        k = int(data[2][0])
        return peptide, sVector, k

    def getScore(self, node, score):
        """Return score[node]; an unknown node is recorded as -inf and scores -inf."""
        if node not in score:
            score[node] = float('-inf')
        return score[node]

    def constructAlignment(self, peptide, sVector, k, aaDict):
        """Return the best variant of `peptide` with at most k modifications.

        Nodes are triples (i, j, t): i is a prefix mass of the unmodified
        peptide, j the mass that prefix has after modification (an index into
        sVector) and t the number of modifications used so far. The score of
        a node is the best sum of sVector entries over the prefixes on a path
        from (0, 0, 0).

        peptide -- string of residue symbols.
        sVector -- list with a leading 0; its last index is the mass the
                   modified peptide must have.
        k       -- maximum number of modifications.
        aaDict  -- residue -> integer mass (assumed positive).

        Returns the peptide with each mass shift written right after the
        residue it applies to, e.g. 'L(-61)VW(-9)STE(+69)'. Of several
        equally good solutions the one with the fewest modifications is kept.
        Raises KeyError when the peptide cannot be fitted to the spectrum
        with at most k modifications.
        """
        prefixMasses = [0]
        for aa in peptide:
            prefixMasses.append(prefixMasses[-1] + aaDict[aa])
        # diff[m] is the mass of the residue that completes the prefix of mass m.
        diff = {prefixMasses[i+1]: prefixMasses[i+1] - prefixMasses[i]
                for i in range(len(peptide))}

        score = {(0, 0, 0): 0}
        backtrack = dict()

        # Layer 0: no modification, so i == j along the diagonal.
        for i in prefixMasses[1:]:
            if i >= len(sVector):
                break
            prev = i - diff[i]
            score[(i, i, 0)] = sVector[i] + score[(prev, prev, 0)]
            backtrack[(i, i, 0)] = (prev, prev, 0)

        # Layers 1..k. A node is reached either without a new modification
        # (same layer, j shifted by the residue mass) or with one (any
        # smaller j1 in the layer below).
        for t in range(1, k+1):
            for i in prefixMasses[1:]:
                prevMass = i - diff[i]
                # Running maximum over (prevMass, j1, t-1) for j1 < j, so the
                # candidate list does not have to be rebuilt for every j.
                # The strict '>' keeps the smallest j1 on ties.
                bestJump = float('-inf')
                bestJumpNode = None
                for j in range(1, len(sVector)):
                    jumpNode = (prevMass, j-1, t-1)
                    jumpScore = self.getScore(jumpNode, score)
                    if jumpScore > bestJump:
                        bestJump = jumpScore
                        bestJumpNode = jumpNode
                    # The unmodified step wins ties against a modification.
                    bestNode = (prevMass, j - diff[i], t)
                    best = self.getScore(bestNode, score)
                    if bestJump > best:
                        best = bestJump
                        bestNode = bestJumpNode
                    score[(i, j, t)] = sVector[j] + best
                    backtrack[(i, j, t)] = bestNode

        # Best end node; max() returns the first maximum, i.e. the smallest t.
        lastNodes = [(prefixMasses[-1], len(sVector)-1, t) for t in range(k+1)]
        lastScores = [self.getScore(node, score) for node in lastNodes]
        prevNode = lastNodes[max(range(k+1), key=lambda t: lastScores[t])]

        # Walk back to the origin, emitting residues from last to first.
        mPeptide = ''
        pList = list(peptide)
        while (0, 0, 0) != prevNode:
            node = backtrack[prevNode]
            if node[2] != prevNode[2]:
                # A change of layer is a modification; its size is how far
                # the observed mass step differs from the residue mass.
                indel = prevNode[1] - node[1] - diff[prevNode[0]]
                if indel > 0:
                    mPeptide = '(+'+str(indel)+')'+mPeptide
                else:
                    mPeptide = '('+str(indel)+')'+mPeptide
            mPeptide = pList.pop() + mPeptide
            prevNode = node
        return mPeptide
    
# Instantiating the class runs the alignment: __init__ reads the input file,
# prints the modified peptide and writes it to the result file.
SpectralAlignment()