In [None]:
# Given a string Text, its k-mer composition Compositionk(Text) is the collection of all k-mer substrings of Text (including repeated k-mers)
# Generate the k-mer composition of a string.
def composition(text, k):
    """Return every length-k substring of text, in order of occurrence.

    Repeated k-mers are kept. The result is empty when k is larger than
    len(text).
    """
    window_count = len(text) - k + 1
    return [text[start:start + k] for start in range(window_count)]

In [None]:
# consecutive 3-mers in TAATGCCATGGGATGTT are linked together to form this string's genome path.
# Reconstruct a string from its genome path
def string_reconstruct(dna_list):
    """Spell the string described by a genome path of overlapping k-mers.

    The first k-mer is used whole; every later k-mer contributes only its
    final character.
    """
    trailing = [kmer[-1] for kmer in dna_list[1:]]
    return dna_list[0] + ''.join(trailing)

# Load the input k-mers, one per line, from a hard-coded local path.
# dna_list is reused by the overlap-graph cell below.
with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/dna_reconstruct.txt") as f:
    dna_list  = f.read().splitlines()

In [None]:
# To generalize the construction of the above graph to an arbitrary collection of k-mers Patterns, we form a node for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph on these k-mers, denoted Overlap(Patterns)
# The overlap graph Overlap(Patterns), in the form of an adjacency list

def suffix(t):
    """Return t with its first character removed."""
    tail = t[1:]
    return tail

def prefix(t):
    """Return t with its last character removed."""
    head = t[:-1]
    return head

def overlap_graph(reads):
    """Build the overlap graph of a collection of k-mers as an adjacency dict.

    A read ``i`` points to a read ``j`` when the suffix of ``i`` (``i[1:]``)
    equals the prefix of ``j`` (``j[:-1]``).

    Args:
        reads: iterable of equal-length strings (duplicates allowed).

    Returns:
        dict mapping each read that has at least one successor to a single
        string of its successors joined by ``', '``. Keys and successors are
        in sorted order. Returns an empty dict when nothing overlaps.
    """
    from collections import Counter, defaultdict

    reads = list(reads)

    # Index the reads by prefix once so each suffix lookup is O(1) instead of
    # comparing every pair of reads.
    by_prefix = defaultdict(list)
    for read in sorted(reads):
        by_prefix[read[:-1]].append(read)

    # A read occurring c times contributes each of its successors c times,
    # which is what pairing every occurrence with every successor yields.
    multiplicity = Counter(reads)

    overlaps = {}
    for read in sorted(multiplicity):
        successors = by_prefix.get(read[1:])
        if not successors:
            continue
        repeat = multiplicity[read]
        overlaps[read] = ', '.join(
            nxt for nxt in successors for _ in range(repeat)
        )
    return overlaps

# Print the overlap graph of the loaded reads, one "read -> successors" line
# per read that has at least one successor.
for key, value in overlap_graph(dna_list).items():
    print(key + " -> " + value)

In [None]:
# Solve the De Bruijn Graph from a String Problem.
# The de Bruijn graph DeBruijnk(Text) is formed by gluing identically labeled nodes in PathGraphk(Text).

def deBruijn(dna, k):
    """Return the de Bruijn graph of a string as sorted adjacency lines.

    Every k-mer of ``dna`` adds an edge from its (k-1)-prefix to its
    (k-1)-suffix. Successors are stored in a set, so an edge that occurs more
    than once in the text is listed only once.

    Args:
        dna: the text to decompose.
        k: k-mer length.

    Returns:
        list of ``'node -> succ1,succ2'`` strings, sorted by node, with the
        successors of each node sorted as well so that the output is
        deterministic across runs.
    """
    graph = {}
    for start in range(len(dna) - k + 1):
        kmer = dna[start:start + k]
        graph.setdefault(kmer[:-1], set()).add(kmer[1:])

    # Joining a set directly would depend on string hash ordering, which
    # differs between interpreter runs; sort the successors explicitly.
    return [
        ' -> '.join([node, ','.join(sorted(graph[node]))])
        for node in sorted(graph)
    ]

In [None]:
# For every k-mer in Patterns, we connect its prefix node to its suffix node by a directed edge in order to produce DeBruijn(Patterns).
# Construct the de Bruijn graph from a set of k-mers.

def build_deGruijnGraphFromKmers(kmers):
    """Print and return the de Bruijn graph of a collection of k-mers.

    Each k-mer adds an edge from its (k-1)-prefix to its (k-1)-suffix;
    repeated k-mers add repeated edges.

    Returns:
        list of 'node -> succ1,succ2' strings, one per node that has outgoing
        edges; the same lines are printed as a side effect.
    """
    ordered = sorted(kmers)

    # Prefixes and suffixes are merged so that a node which only ever
    # receives edges still gets an id.
    labels = sorted({kmer[:-1] for kmer in ordered} | {kmer[1:] for kmer in ordered})
    nodes = dict(enumerate(labels))
    invnodes = {label: idx for idx, label in nodes.items()}

    edges = {}
    for kmer in ordered:
        edges.setdefault(invnodes[kmer[:-1]], []).append(invnodes[kmer[1:]])

    lines = []
    for src, targets in edges.items():
        joined = ','.join([nodes[dst] for dst in targets])
        print(nodes[src], '->', joined)
        lines.append(nodes[src] + ' -> ' + joined)
    return lines

In [None]:
# EulerianCycle(Graph)
#     form a cycle Cycle by randomly walking in Graph (don't visit the same edge twice!)
#     while there are unexplored edges in Graph
#         select a node newStart in Cycle with still unexplored edges
#         form Cycle’ by traversing Cycle (starting at newStart) and then randomly walking 
#         Cycle ← Cycle’
#     return Cycle

In [None]:
# Solve the Eulerian Cycle Problem.
# Read an adjacency list whose lines look like "node -> t1,t2,..." and turn
# it into a dict mapping each integer node to its list of integer successors.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/dataset_203_2-2.txt') as f:
    lines = f.read().splitlines()
adjlist = {}
for line in lines:
    node = int(line.split(' -> ')[0])
    edge = list(map(int,line.split(' -> ')[1].split(',')))
    adjlist[node] = edge
    
def eulerian_cycle(adjlist):
    """Find an Eulerian cycle with an iterative Hierholzer-style walk.

    Args:
        adjlist: dict mapping each node to the list of its successors
            (repeated successors are repeated edges). The dict is not
            modified.

    Returns:
        list of nodes visited in order; for a graph that has an Eulerian
        cycle the first and last node coincide and every edge is used exactly
        once. The starting node is chosen at random among the keys. An empty
        graph yields an empty list.
    """
    import random  # local import: the visible file never imports it at top level

    if not adjlist:
        return []

    # Edges of a node are always consumed in list order, so a single
    # "next unused edge" index per node replaces a per-edge marker list that
    # would have to be rescanned on every step.
    next_edge = dict.fromkeys(adjlist, 0)

    stack = [random.choice(list(adjlist))]
    finished = []
    while stack:
        u = stack[-1]
        # Nodes that appear only as targets have no entry of their own; treat
        # them as having no outgoing edges instead of raising KeyError.
        successors = adjlist.get(u, ())
        position = next_edge.get(u, 0)
        if position < len(successors):
            next_edge[u] = position + 1
            stack.append(successors[position])
        else:
            # Dead end: u is complete, emit it while backtracking.
            finished.append(stack.pop())
    return finished[::-1]

cycle = eulerian_cycle(adjlist)
# Append '->' after every node, then drop the trailing arrow when printing.
printcycle = ''.join([str(c) + '->' for c in cycle])
print(printcycle[:-2])

In [None]:
#find eulerian path
from functools import reduce
def eulerian_path(edge_dict):
    """Return an Eulerian path through the given directed graph.

    The path is found by joining the path's end node to its start node with
    one extra edge, taking an Eulerian cycle of the result, and cutting the
    cycle at that extra edge.

    Args:
        edge_dict: dict mapping each node to the list of its successors.
            The dict and its lists are left untouched.

    Returns:
        list of nodes along the path. If every node is already balanced the
        Eulerian cycle itself is returned; an empty graph yields ``[]``.

    Raises:
        ValueError: if the degrees do not allow an Eulerian path (anything
            other than exactly one node with one surplus incoming edge and
            one node with one surplus outgoing edge).
    """
    from collections import Counter  # local import keeps the cell self-contained

    if not edge_dict:
        return []

    # Work on a copy so the caller's adjacency lists are not modified.
    graph = {node: list(targets) for node, targets in edge_dict.items()}

    out_degree = {node: len(targets) for node, targets in graph.items()}
    in_degree = Counter(t for targets in graph.values() for t in targets)

    # The path ends at the node with a surplus incoming edge and starts at
    # the node with a surplus outgoing edge.
    path_ends = [n for n in in_degree if in_degree[n] > out_degree.get(n, 0)]
    path_starts = [n for n in out_degree if out_degree[n] > in_degree[n]]

    if not path_ends and not path_starts:
        return eulerian_cycle(graph)

    if len(path_ends) != 1 or len(path_starts) != 1:
        raise ValueError('graph has no Eulerian path: expected exactly one '
                         'start node and one end node')
    unbalanced_from, unbalanced_to = path_ends[0], path_starts[0]
    if (in_degree[unbalanced_from] - out_degree.get(unbalanced_from, 0) != 1
            or out_degree[unbalanced_to] - in_degree[unbalanced_to] != 1):
        raise ValueError('graph has no Eulerian path: start/end nodes are '
                         'unbalanced by more than one edge')

    # Add an edge from the end back to the start to make the graph balanced.
    graph.setdefault(unbalanced_from, []).append(unbalanced_to)

    # Get the Eulerian cycle, including the added edge.
    cycle = eulerian_cycle(graph)

    # Find where the cycle crosses an end -> start edge.
    divide_point = next(
        i for i in range(len(cycle) - 1)
        if cycle[i] == unbalanced_from and cycle[i + 1] == unbalanced_to
    )

    # Cut at that edge and rotate; cycle[0] == cycle[-1], so the duplicated
    # head is skipped when the two halves are glued together.
    return cycle[divide_point + 1:] + cycle[1:divide_point + 1]

# Parse "node -> t1,t2,..." lines into an integer adjacency dict, compute an
# Eulerian path, print it as "a->b->c" and also write it to ans.txt.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/dataset_203_6.txt') as f:
    lines = f.read().splitlines()
adjlist = {}
for line in lines:
    node = int(line.split(' -> ')[0])
    edge = list(map(int,line.split(' -> ')[1].split(',')))
    adjlist[node] = edge
path = eulerian_path(adjlist)
print('->'.join(map(str,path)))
with open('ans.txt', 'w') as output_data:
    output_data.write('->'.join(map(str,path)))

In [None]:
#a method to assemble a genome, since the String Reconstruction Problem reduces to finding an Eulerian path in the de Bruijn graph generated from reads
#Solve the String Reconstruction Problem
def make_edges(seqs):
    """Turn each read into a 'prefix -> suffix' edge string.

    The prefix is the read without its last character and the suffix is the
    read without its first character.
    """
    return [read[:-1] + ' -> ' + read[1:] for read in seqs]

def reconstruct_string(edges):
    """Spell the string described by a simple chain of 'src -> dst' edges.

    Each source is mapped to a single destination (a later edge with the same
    source overwrites an earlier one), so this only reconstructs inputs whose
    edges form one unbranched chain.

    Args:
        edges: iterable of strings of the form ``'src -> dst'``.

    Returns:
        the string obtained by walking from the head (a source that is never
        a destination) to the tail (a destination that is never a source).

    Raises:
        ValueError: if no head or no tail exists (for example empty input or
            a pure cycle).
    """
    string_dict = {}
    for line in edges:
        parts = line.strip().split(' -> ')
        string_dict[parts[0]] = parts[1]

    # Set and dict membership keep the head/tail search linear; scanning
    # dict views for every candidate would be quadratic.
    destinations = set(string_dict.values())
    head = next((src for src in string_dict if src not in destinations), None)
    tail = next((dst for dst in string_dict.values() if dst not in string_dict), None)
    if head is None or tail is None:
        raise ValueError('edges do not form a chain with a distinct head and tail')

    pieces = [head[0]]
    current_str = head
    while current_str != tail:
        current_str = string_dict[current_str]
        pieces.append(current_str[0])
    # The tail's first character was added by the loop; add the rest.
    pieces.append(tail[1:])
    return ''.join(pieces)

# Read k on the first line followed by one read per line, then reconstruct
# and print the string.
# Only the newline is stripped from each line: slicing off the last character
# unconditionally would drop a base from the final read whenever the file
# does not end with a newline.
with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/dataset_203_7.txt") as input_data:
    data = [line.rstrip('\n') for line in input_data]
k = int(data[0])
reads = data[1:]
edges = make_edges(reads)
ans = reconstruct_string(edges)
print(ans)

In [1]:
# Solve the k-Universal Circular String Problem.
def build_deGruijnGraphFromKmers(kmers):
    """Build the de Bruijn graph of a collection of k-mers with integer ids.

    Each k-mer adds an edge from its (k-1)-prefix to its (k-1)-suffix;
    repeated k-mers add repeated edges. The adjacency lines are printed as a
    side effect.

    Returns:
        (edges, nodes): edges maps a node id to the list of successor ids;
        nodes maps every id to its (k-1)-mer label.
    """
    ordered = sorted(kmers)

    # Prefixes and suffixes are merged so that a node which only ever
    # receives edges still gets an id.
    labels = sorted({kmer[:-1] for kmer in ordered} | {kmer[1:] for kmer in ordered})
    nodes = dict(enumerate(labels))
    invnodes = {label: idx for idx, label in nodes.items()}

    edges = {}
    for kmer in ordered:
        edges.setdefault(invnodes[kmer[:-1]], []).append(invnodes[kmer[1:]])

    for src, targets in edges.items():
        print(nodes[src], '->', ','.join([nodes[dst] for dst in targets]))
    return edges, nodes

def universal_string(k):
    """Print and return a k-universal circular binary string.

    All binary k-mers are turned into a de Bruijn graph, an Eulerian cycle is
    taken through it, and the cycle is spelled out and cut to 2**k symbols.

    Args:
        k: length of the binary words that must each occur exactly once in
            the circular string; must be at least 1.

    Returns:
        the universal string (also printed). Because the cycle starts at a
        random node, different calls may return different valid strings.

    Raises:
        ValueError: if k is smaller than 1.
    """
    from itertools import product  # local import: not imported at file level

    if k < 1:
        raise ValueError('k must be at least 1')

    length = 2 ** k
    if k == 1:
        # The (k-1)-mer nodes are empty strings here, so the cycle carries no
        # symbols to spell; the answer is simply both symbols.
        uni_string = '01'
    else:
        kmers = [''.join(bits) for bits in product('01', repeat=k)]
        adjlist, nodes = build_deGruijnGraphFromKmers(kmers)
        cycle = eulerian_cycle(adjlist)

        epath = [nodes[p] for p in cycle]
        # First node whole, then the last symbol of every following node.
        spelled = epath[0] + ''.join(pa[-1] for pa in epath[1:])
        # The cycle closes on itself, so only the first 2**k symbols are kept.
        uni_string = spelled[:length]
    print(uni_string)
    return uni_string

In [None]:
def build_deGruijnGraphFromPairedKmers(pairedKmers):
    """Build the paired de Bruijn graph of 'first|second' read pairs.

    Every pair adds an edge from its paired prefix (both halves without their
    last character) to its paired suffix (both halves without their first
    character).

    Returns:
        (adjlist, nodes): adjlist maps a node id to the list of successor
        ids, in input order; nodes maps every id to its 'a|b' label, with ids
        assigned in sorted label order.
    """
    def endpoints(halves):
        # Paired prefix and paired suffix of one read pair.
        first, second = halves[0], halves[1]
        return first[:-1] + '|' + second[:-1], first[1:] + '|' + second[1:]

    # Some input lines carry trailing whitespace; strip it before splitting.
    links = [endpoints(pk.strip().split('|')) for pk in pairedKmers]

    labels = sorted({label for link in links for label in link})
    nodes = dict(enumerate(labels))
    invnodes = {label: idx for idx, label in nodes.items()}

    adjlist = {}
    for src, dst in links:
        adjlist.setdefault(invnodes[src], []).append(invnodes[dst])
    return adjlist, nodes

In [None]:
def stringSpelledByGappedPatterns(gappedPatterns, k, d):
    """Spell the string described by a path of gapped 'first|second' patterns.

    The first halves and the second halves are each spelled into a string;
    the second string must equal the first one shifted by k + d positions.

    Args:
        gappedPatterns: sequence of 'first|second' strings forming a path.
        k: pattern length used for the shift.
        d: gap length used for the shift.

    Returns:
        the spelled string, or None when the two strings disagree on their
        overlapping region.
    """
    firstPatterns = [pat.split('|')[0] for pat in gappedPatterns]
    secondPatterns = [pat.split('|')[1].rstrip() for pat in gappedPatterns]
    prefixString = stringSpelledByPatterns(firstPatterns, k)
    suffixString = stringSpelledByPatterns(secondPatterns, k)

    shift = k + d
    # Everything in the prefix string from position k + d onwards must match
    # the start of the suffix string. The two strings have to be compared
    # with each other; comparing the prefix string with itself is always true.
    overlap = max(len(prefixString) - shift, 0)
    if prefixString[shift:] != suffixString[:overlap]:
        return None
    return prefixString + suffixString[-shift:]


def stringSpelledByPatterns(patterns, k):
    """Spell a path of overlapping patterns into one string.

    The first pattern is used whole and each later pattern contributes its
    last character. ``k`` is accepted for interface compatibility and is not
    used. An empty path spells the empty string.
    """
    if not patterns:
        return ''
    return patterns[0] + ''.join(pat[-1] for pat in patterns[1:])

In [None]:
#Solve the String Reconstruction from Read-Pairs Problem.
#reconstruct a string from read-pairs 
# Small inline example; it is overwritten below by the pairs read from file.
pairedKmers = ['GAGA|TTGA', 'TCGT|GATG', 'CGTG|ATGT', 'TGGT|TGAG', 'GTGA|TGTT', 'GTGG|GTGA', 'TGAG|GTTG', 'GGTC|GAGA', 'GTCG|AGAT']

# NOTE(review): `filename` is not defined anywhere in the visible part of the
# file — it presumably has to be set before this cell runs; confirm.
# The first line of the file holds "k d"; the remaining lines are read pairs.
with open(filename) as f:
    lines = f.read().splitlines()
k = int(lines[0].split()[0])
d = int(lines[0].split()[1])
pairedKmers = lines[1:]


# Build the paired de Bruijn graph and report its size.
adjlist,nodes = build_deGruijnGraphFromPairedKmers(pairedKmers)
print("number of nodes:",len(nodes))
numedge = 0
for v in adjlist.keys():
    numedge += len(adjlist[v])
print("number of outgoing edges:",numedge)

# Walk an Eulerian path, map node ids back to their 'a|b' labels and spell
# the string from the resulting gapped patterns.
path = eulerian_path(adjlist)
patterns = [nodes[p] for p in path]
string = stringSpelledByGappedPatterns(patterns,k,d)
print(string)

In [None]:
# Generate the contigs from a collection of reads (with imperfect coverage).
# Solve the Contig Generation Problem
# Read one k-mer per line, stripping surrounding whitespace.
# NOTE(review): `filename` is not defined in the visible part of the file —
# confirm where it is set.
with open(filename) as file:
    kmers = [line.strip() for line in file.readlines()]
    
import collections
def flatten(x):
    """Recursively flatten nested iterables into a single flat list.

    Strings are treated as atoms and are never split into characters.

    Args:
        x: an iterable whose elements may themselves be iterables.

    Returns:
        list of the non-iterable (or string) leaves, in depth-first order.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    from collections.abc import Iterable

    result = []
    for el in x:
        # The element decides whether to recurse, not the container: the
        # container is always iterable, so testing it would recurse into
        # scalars and fail.
        if isinstance(el, Iterable) and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result

# De Bruijn graph on (k-1)-mer strings: prefix -> list of suffixes, one entry
# per k-mer (so repeated k-mers give repeated edges).
edges = {}
for kmer in kmers:
    if kmer[:-1] in edges:
        edges[kmer[:-1]].append(kmer[1:])
    else:
        edges[kmer[:-1]] = [kmer[1:]]

# Split the nodes into "1-in-1-out" nodes and all others.
# Note the naming: `out_value` counts how often the node occurs as a target
# (incoming edges) and `in_value` is the length of its own successor list
# (outgoing edges). The test below is symmetric, so the swap is harmless.
balanced, unbalanced = [], []
out_values = reduce(lambda a,b: a+b, edges.values())
for node in set(out_values+list(edges.keys())):
    out_value = out_values.count(node)
    if node in edges:
        in_value = len(edges[node])
    else:
        in_value = 0
    if in_value == out_value == 1:
        balanced.append(node)
    else:
        unbalanced.append(node)

# get_contigs(s, c): c is the contig spelled so far, ending at node s.
# For every successor e the contig grows by e's last character; it stops
# there if e is not a 1-in-1-out node, otherwise the walk continues from e.
get_contigs = lambda s, c: flatten([c+e[-1] if e not in balanced else get_contigs(e,c+e[-1]) for e in edges[s]])
# Contigs start only at non-1-in-1-out nodes that have outgoing edges.
# NOTE(review): a cycle made solely of 1-in-1-out nodes is never used as a
# start here — confirm whether such isolated cycles can occur in the input.
contigs = sorted(flatten([get_contigs(start,start) for start in set(unbalanced) & set(edges.keys())]))

print('\n'.join(contigs))

In [None]:
# CODE CHALLENGE: Solve the String Reconstruction Problem.
#      Input: An integer k followed by a list of k-mers Patterns.
#      Output: A string Text with k-mer composition equal to Patterns. 
#      (If multiple answers exist, you may return any one.)

def build_deGruijnGraphFromKmers(kmers):
    """Build the de Bruijn graph of a collection of k-mers with integer ids.

    Each k-mer adds an edge from its (k-1)-prefix to its (k-1)-suffix;
    repeated k-mers add repeated edges. The adjacency lines are printed as a
    side effect.

    Returns:
        (edges, nodes): edges maps a node id to the list of successor ids;
        nodes maps every id to its (k-1)-mer label.
    """
    ordered = sorted(kmers)

    # Prefixes and suffixes are merged so that a node which only ever
    # receives edges still gets an id.
    labels = sorted({kmer[:-1] for kmer in ordered} | {kmer[1:] for kmer in ordered})
    nodes = dict(enumerate(labels))
    invnodes = {label: idx for idx, label in nodes.items()}

    edges = {}
    for kmer in ordered:
        edges.setdefault(invnodes[kmer[:-1]], []).append(invnodes[kmer[1:]])

    for src, targets in edges.items():
        print(nodes[src], '->', ','.join([nodes[dst] for dst in targets]))
    return edges, nodes

from functools import reduce
def eulerian_path(edge_dict):
    """Return an Eulerian path through the given directed graph.

    The path is found by joining the path's end node to its start node with
    one extra edge, taking an Eulerian cycle of the result, and cutting the
    cycle at that extra edge.

    Args:
        edge_dict: dict mapping each node to the list of its successors.
            The dict and its lists are left untouched.

    Returns:
        list of nodes along the path. If every node is already balanced the
        Eulerian cycle itself is returned; an empty graph yields ``[]``.

    Raises:
        ValueError: if the degrees do not allow an Eulerian path (anything
            other than exactly one node with one surplus incoming edge and
            one node with one surplus outgoing edge).
    """
    from collections import Counter  # local import keeps the cell self-contained

    if not edge_dict:
        return []

    # Work on a copy so the caller's adjacency lists are not modified.
    graph = {node: list(targets) for node, targets in edge_dict.items()}

    out_degree = {node: len(targets) for node, targets in graph.items()}
    in_degree = Counter(t for targets in graph.values() for t in targets)

    # The path ends at the node with a surplus incoming edge and starts at
    # the node with a surplus outgoing edge.
    path_ends = [n for n in in_degree if in_degree[n] > out_degree.get(n, 0)]
    path_starts = [n for n in out_degree if out_degree[n] > in_degree[n]]

    if not path_ends and not path_starts:
        return eulerian_cycle(graph)

    if len(path_ends) != 1 or len(path_starts) != 1:
        raise ValueError('graph has no Eulerian path: expected exactly one '
                         'start node and one end node')
    unbalanced_from, unbalanced_to = path_ends[0], path_starts[0]
    if (in_degree[unbalanced_from] - out_degree.get(unbalanced_from, 0) != 1
            or out_degree[unbalanced_to] - in_degree[unbalanced_to] != 1):
        raise ValueError('graph has no Eulerian path: start/end nodes are '
                         'unbalanced by more than one edge')

    # Add an edge from the end back to the start to make the graph balanced.
    graph.setdefault(unbalanced_from, []).append(unbalanced_to)

    # Get the Eulerian cycle, including the added edge.
    cycle = eulerian_cycle(graph)

    # Find where the cycle crosses an end -> start edge.
    divide_point = next(
        i for i in range(len(cycle) - 1)
        if cycle[i] == unbalanced_from and cycle[i + 1] == unbalanced_to
    )

    # Cut at that edge and rotate; cycle[0] == cycle[-1], so the duplicated
    # head is skipped when the two halves are glued together.
    return cycle[divide_point + 1:] + cycle[1:divide_point + 1]

#example
kmers = ['CTTA','ACCA','TACC','GGCT','GCTT','TTAC']

# Build the de Bruijn graph, take an Eulerian path through it and map the
# node ids on the path back to their (k-1)-mer labels.
adjlist,nodes = build_deGruijnGraphFromKmers(kmers)
path = eulerian_path(adjlist)
epath = [nodes[p] for p in path]
# Spell the text: first label whole, then the last character of each later one.
string = [epath[0]]
for pa in epath[1:]:
    string.append(pa[-1])
print(''.join(string))

In [None]:
#Translate an RNA string into an amino acid string.
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

text = 'AUGGCAUUGGAUCUCGUUCAAUACCCUAUCAUCGGUGUAAGCUUGUGUCAUUGCGCCGGCUUGCGCGGACGUCCGAAAAUCGGUAGUGCGCUACCAUUGCCGUGUGCAUCGCUUCGCAUGGUUUACGAAAGUGGAUGCAUACCCUACCACAAGGUUACCCAUUCCAACUGGAGCUACGAAGCACUUGGGGCAAGUAUUUCCUUGACAGUAUGGAUUAACGGAUACAUUUAUGGAUGUUCAAAAGGUCGAACAGUGACCGACCGGUUUCUACUUGUUGUGCCCAAAGGAUACGGUCUGCUUUAUAAUGAAGCCAAGGAACUCUUACCAAUGGGAAUAUAUACUGUCAGCAUGCUGCCACCCAAGUACCUAGUAUUUAACGGACCCAAGGCGUUCCGGGCGAAGAGGGGUAACCUGCGGCCCAACAGUAAUAAUGAGCCACCCAGUGGGCCGCAACGGGAACUUAUUAGGAGAGUGCCAUCGUAUUAUUACGACUUGUCACGGGCCUACUCCGUAGAGAACACGCCACCGCCUGACUUUGCCAACACCGCUAUCUCCCCGGGAGUUGCAUCCCCAAUGUAUCAUGCAGGGUUACAAAGCCCCCUCUCUUACCUUCAUCAACGUGAUAUGCCUGCCGGAUUGAAACCUUUUCUGAGAAGGGCGAGUUCCCUCCUGACAGCACAUUAUCUUAUUUUAACUCAUCAGGCUGGGGGGGACCUCUGGCUGAGCAAUCUUAUUUGCCACCCAACACCCACACGGUUACACGUGAUUAGCGCACAUGCUUUUGCACUAGGAAAUAACCCUGGAGGCAGGCUAGGCCUUUGCCAGUGGAAUGGAUCAUCGCAUUUUUACCCACAUCCCGCGAGCGCGAUCCACGACUACAUCGGGGUCGGUAUAGCUACAUUACGGGUAUCGUUGAAAAGCGUAGUUACAGAGACCAUAGCAGGUACCGAGCCUAGCGCAUCCGAAUCAUGUUUACGCCUGCCGGGUGCUCGCUCAUACUUAUUCCAGGUUCGCACCGGGCGUAUAGACAGAAAGAUCCUAUUCGAUCCGAUAUGCCGGAGGGCAGAUCUAUGUUUUGAGCGUGUAAGAAGCCGAAGAGCCCGUACUCCUGACGGGGUCUUAAGAUUACCCCUUCGCCCUGAAGACUUGUACGCGCAAUGCCGUCUUCUGUCCCUAACGCUGAGGACAUAUACUCAGUUCCGACCUAAAUGUGCGGGAGCUCCGAGCGUGGGCACGCGUGUUCCUGAUAUGGUAUUAGAUCUGUCUUGUCACGAUUUAAGACUUCAAAGCGACGGCAUGAACAGUAAGUCACAUCGCCUGGCACUUUGCGUGCGUCAUACGGGCUGCCAUUCAAAAAGAUCAGCCCAGAAAGAACCCCCGGAAUGGUUCGGCGCAAUGGAUCCCAAAUUUAUGGGUCCGCUUUGGCUCUACAAUGACGUGGGAGAUCCUGUUAGGUUGCGGCCUUUAGUACACGAGUAUAGGACCCAAGUGGGACGGUACAGAGGGUCACUGACGGAUCCAAAACCCCACCUCUCAGCGGUGACUCCCGCCAGCGGCUCACUCUGGCUACUUAAAGACGUCGGGAUCGUUAGCUGUCCGCCCCAACCCAAGACUCUAUACGAAAUUAAAAAUCGGCACCGUCCCAGUGUUGAAGGACCUGUGCAGAAAGGCUGGUACGAUGCCUCUGAAAACAGCUGUCAAACCCGGAGACGUCGGUGUUCUCCGCAGAUGCCACGAUCCCAACAGCGCAACGGACAUGCGCUGCCUGCUGUCUGGCCGAGAAUACAGGAAAAGUUAUUACAUUUAAGAGUUGGUAUCGUCCUGCCUGAAAUCCCGUUGAAUUAUGAGGGUCAUUUGUGCACUGCAGACGACCGAGUCGGUAUCCUCCCUACCAGGAUAGCAUGGAAGUGGGAAGACACCAAUGAUACGGCAUCGGCACCCGGACCAGACGGCGCUAUUAGUCGCGAGGCAUUUUACACAUCCGUCCCAACA
GCGGAUUUAAUCCUGAGGGGGUGUCAGGAAUCCCUAUCUACCGCCAGAGAGUGGAAUAAGUUGCACGCACGAGUGGACCAGGAUGGGGUCCCUCCCGCAUGUUCUGCUUGCAGAGCAACAUUUCGGCUAAGGUCAAGCGAUAUUCAUACUGCUUUCGACCCCGAAUUCGCGAACUGCCCCCCGUCCACUAUAUCGCAUCGACCGAUCGGGCGUGCUCGGCACCUAUCUUCACCGAAAACUGUGUUCAGUCUUUUCCAGAAUGCACUAUCAACGGGACCGCGGCGAGCCGGCGGUACUUACUCUGGCCCAACGGACAUGAUACGAAGAGUGAAUUCAAGAGCGUCUUGUAGCGAGGAACCGACGAACUGUAUCCAGGCAGCACAAGCCGCUCAUGCUCCUGGGCUAAUGGUACAUUUCAGGAGUCCCCGGAUGUUACAACGCCGGGAAAUUCAAGGCAAAAGACGGCGAACCUGGAGUACUCAGAGGGUGGCCCAGUGCCGAGCAUAUACCAUAGACCGUCUACACAGACUCCUAACCCGUUGUAUUACGGUCAGAGCAGUCCCACAUAAAUAUUCAGAAUACUUCGGGACAACCCCGCAUAGUUGCGAGGCGUCUCGUCAGGCCCAUUUGCGGGCUCCAAGUAGCCUACUUGGCGACUUAUACCUACGUCGCCUCUCACUACACACGAAUAUGGCAUGCUGGCCGUGCGGGAUUGGACUUUUCAGAAAUUAUACAGCUGCAACCGGCCUUAAUUGCGGUCUUACUUUCCUGAGAGGAACCUUUAUGGGUCGUGCCAGUGGACGUCCCGUGGAAAAUGGCAUCCUGGAUUCAACAUCGCAGCACAUCCCCGUUUGCACUUACCCGCCAACCCGCCUUGAAUCGACGACAUGCUCGGCCACUAGGCGGGCAAGCAGAGAGAAAUCAGGGACCCCCCAACACGUAUUUAACGUGACAUGUAAUCGGGUAAUUAGGGGCGGUAGACGGAACUCAUUCCACCACCGCACUUCGGGUUCAGUGUCGGCAGUAACUGAUUGCUCUGCCUUGCGGCACGUGCAUUGGAUGCUACUCACGUUCAGUGAAAGUACCGCUAGACUCCUGAGGGUAUUACGAACAAACCCGAAGCCACUAACGCGCUUGCAGGCGCGCCUCCGUUCUCGGGGUAUCGUGCAGUGUAAACAACGUCACAAACUGUCGUACGAGUUGCUGUACGUAUGUAAGAGUAGCCACUUGUGCUUCUCGGGAUAUGAGACCGCCCCAUUGGGGUCAUCCGACUGGAUAAAGUUUAUGGUCUGCGGCAAGGUAAGGUUGAUGGCCAUCGCUGGGCCAAACGAGCCUCCAAGGGUGGUUAGCUUUCACGCCGUGCGAACGGGAAAGGAGAUUGCUAUUCUGUUCGGUGCGACUAAGGUUACACCCCAUUACCAACCCGCGGCCCAAACCACCGCGUUCACAGCUCCGCUAUCGCAAAUCCCUCUUUCUAAAUGCCGACGAAGUCCCACCCCUCCAUCAUGGACGUGCUACCUAAGGCCCGUUUAUCACACUACUCAUCAGAGCCGACCCCCGAAAGGUUUGCAAAAGACCAGGACUACGAAGCUGCAAACACGACGAGAGCUGGACAAAGAUCGUACGUGGAGUGUGACGUCCAUUAUCACCACGCAGCCUUCAGUGUUUAAGCACCAAGGACUAAAUGGACUAUCAAUCGAUGUAAUGGACGAUCGGCUGACCAACGUAUACCACUCAAGUACGCCGGACAGUGGGAGUGUAGAUGCGAAUAUCAGACCGGAGCACAGCCCAAGAAACUACAACCUACUGUGUUACUUAGAGUCAGGCCUAAUGCUUAGGCGGUCGGGAUUCUCCGCAGUAACCCCGCCAGCGGAGCCGAGGAAGAUGGAGAUCUACUUACUCCAACAGUGCAUAUCGCUACCCAACGUGAGCGUACAUCCUCAGGGCGGCAUUUCAGGGCUUCGCAUGGACUGGGAAGAUGUGCGGAACAUACCUGAAGA
GUCAGUCAAUUGGGGGGACGGACGUCCGGACGGGACUGAAGGUUUAUUAAGAAGAAGCGUAUCUCCACUUGGGGUAUUCCAAAUAGAAACUCCUAUACAAGACAGGGAAUGGGAGGAUUCACCCAACGCACGUUACAAUGUGGCCGCUAUGCCUUGCACGAAGGGUCAAAUAUGCCCGGAGUGCCAAUCUAACAAGAACCGAACCUGGGCGUUUGCAUUUAACUGGGGACCGCUACUAAUGUGCACAUCUAACCCUGACCGUCUGGACACCGUACGAGUAACGAAUCUCCCGCUGCAGAUCAUAGAAUUCAGCUCCAUCUCACCAUGUGCAGUAUCACAGCCUGAACUCCGAUCCGGGUCUAAUCUGAGGGACAGCCGGGGAAACGAGUCAUGGUUAGCGAUUAUGACCCGGUAUGAGACGGAUUCCCUACAAUCUGUUAUAUUCAUUCACCGUCGGACGAGUCUAACGACAAACCCGGACGUCGAGGGUAUCGCAUACAGACCGGUGUACGGGCGUUAUGUAGACGCUCCAGUAGAUCAACAAAAGUGGAAUGUCACGACCCGAAGCAUCCAUCGAGAUUUUGGCGGUGGAUUCGAGCACGACAACGCUGAGAGUCCAAGAGGGACACAUUGCUCCCGCUGCAGUACAGCGAACACUUUUCUCGGUGUAUCAGCGUUUGGCACAGCUGAGACCCUGAAGAAGAGUGCCUCGAGGGUACAGGAACACGGAUGUAUUCGUCAUCAUGAUAUCAGCGCUUUUCGCGGUAGUAGGGGAACGUCUUCUCUUGAGGGCAUAAACGUUGUGGCGUGUCCUGGUACAAUAUCCCUCGUUUUACACAGAAUAACCGACGAAGCGGUAGUGGUGAAAACCGUCUGGCUGCGUCCGGAAAAGUUCGUCAGCGCCAAGAUCGCAUCAACGGGGGGAAGCCAGACUAGAGGUAUGAGUCAUCUCUACCAGAUACUUUUGAGAAGCCCCAUGAUGGACGCAAGGAGUGAGUCCGAUCAAGCCGGUAUGUCGGAGCGCAUUAAAAUACGUCAACGUCCCUGCGUUAGACCCUACGUAUCGAGCCCCCAGAGCCUCACCUGCCAGAUAGAUUCCUUUCUUCCAUCCGGCUUGGCUUACUCUAGACUACGAGAAAGAGUUCUUAGUAUGAGGACUAUAUGUUAUAUAGCAACCAAAUCUGAGGCCUGGAAGGGAGCGCGUAUGCCUACGCAGGGCCUAAACUCUCCCACGACAACCGAUUUCUUCAGCCGAGGAGGACAGCUCAAACGCGAUGAACUCGUCCACUUUUGCCAGCGUGUUAUAUACCCCUCGCCAUGGGGACUGGGGGCGCAACGAGGUUACCUGGCGUGGCAUGAGGGAAAUGCUUCCCAUACGUACGUUUCUGUGAUUGAAAACAGAACAAGCUCGCUCGUGCGCUUGGGGUGGGAAGAUUUCAUCUAUAACGGCCCUUCUGCUUUCUGGACGUCAGCUUAUCCACUUUCCCACUCUCCUUCCGAGAUAAUCAGUAACAGUGUCCUCAUUGCUUCUGUUGGUACCAAGUACAAUCUACUCGCCGUAACUGUUUCCAAUAGCCAUAAGAGUUUCGAUAACGUCACAGCUAAUCGACUAAAGUACGACAAAGAAUCCACUGCAACCGUUUUUCAGAAACUCGUGUCGACCCAUAAAUUACGCUACAGCAAUUGUAUAACGAGAACAGACUCCAAUCUGCGAGUUAAGAUCACGCAGCCGCCAAGCUCUCAUUGGGCGAGGCGGACUCGCUGCGAGUUAAUAUUAUCUCCGAUUCGAGCUUUUCAAAUGUGUAUCGACAGGUGUAGUUUCCCGAGUAGAAGUGGACAUCCAAGAUCCACUCGAACAAAGCCCCCAUCAACCGACGUUGUCUCCAACGCGGGCGAAUGUUCGCCCAUCCUUAGACAUUCGAGAGGGUUGGCAUGUACGGGAAUUGUUCUUGUCGGCUGUCUAGCGAUCGCAGCCCUGAGCGUUGUGCAGUGUGAGUCCU
UUGUUCCGCGGCUCAAAGAAGCUAUCGUGUUCACGAUUGCUACUCUAGUCAAUCCAUUUUUCAUUCGUAUUCGCACCGAACUGUACGGGUUAGCUUGGGAUCGUCGUUUAUACGCACCCGUACAACGUAGGGCUGGACCCCUAGAGUAUCUCUGGGGAGGCAAAAUAGUGAGCAAUAUAACACGGUUAGACCACUGGAUAUACAGGCAGGCGGGUCAGUUUUUUACGUCCUCUCAUCGAGGCGGCAGUCGAUCUGGAUGGACCUCGCCUAAAAACGAGCGCGGAUUGCACGCGUACGAUAAGUUUACCCGGGGUUGUUGUCAUGGCCUUCUCUCCCCCACGGUUGGCAGUUGCGAAAAAACUGCUGUCGACCAGCCACGCUGGUACAAUAACGAUGAUCUAAUGAAUCUACAUGCUCCCCAUGCAUCCGUAUUAGUGCACACUCCCCUGCGCAAGGCAUUUCUAGUGUGGUAUAAGACUCUUGCAUCCACAUCAAAUCCCGCUCAGAUGGUCUUCGAAGAACUUGGUGCGAAAGUCCUACCUUGUAUAAUUGUACUGGGUUUGCUCGACGAACGUGUUGCGAGCUGGGCACAACUGCCGCCGUCCAUCGGCAAUAUAGCCAAACGUUCGCAACGCUCCGACGAUGAGACUCGCUCGUCUGCCUUGGAUCACCGGAAAGUAAGACGGCCUUAUAUUAUUUCAGAGAGUGAAGUCAGACUACGCUGCGUUUCGGAAUAUAAUCACUUUUUAACCUUUCUGCUCCCAAUAGAUCGGUCCAGGGGUAUCCAGGCUUGGUCAGAUAUGUGGUUUACAGAGUCGGACACGAACACCGCGAAGGAACCUUACGGCAUCUAUAAACCGCACGCUGGAAAUCAAGGGCACUGUAUUGUGUUGAACCUUAGUAUGCGGGCUUAUGUCUUGCUGUCCAUUAAGACAAUAUACGAUAUCGUCAGACAAUCGGGGGGGUGUGAUUUCGUAAGCGACCCCCGAAGUCGUCUACGCCCGCCUGAAACUCCCCCCCGCGAGCAUCUGUUCCCUUAUACGCGGGAAUGCGUGCGCCUACAUGUCACUGCACCCGAAGCCAGGGUAAUGAUGCUAAGGAUUCCGUGGCGCUUAAUGGCUUCCGCAAAACAUAAUUUCAAUCCAUGUGGCCUGUCGCAUGACAAUCGCAUGGAGGCGGGGCCCGCUUACCAGCCGCUACUACGUAUUGCAGAGAUCUUUAAGUGCGUUCAUCUAUGCUGUCACUUACUCCAUUGCCUAGGGGUGUCCUUUGGUACUAAGUUUUCUACAUACGAGGGGAGACCGUUAUCGGCUUCUACUAAGAUACAGUCUCACACUCUAUCGCUCCCUACGACCCUCUUGACGACUAGACUCACGUCUGGACACUUGUGUAUUGUGGGACUAGUCGUCCGGUUUGAUGCAACGCGUCCAAUGCCUAGCGGGUGUUUAGGAACUAACAUAGGAGGCCUUCAGCCUGGAGGGGUAAGAGAUAAAAAGGUGCGUUUAAGGUCGUCCGAGGUUGAGCCAAGAAGCAAGCAACGCAUUGUAAGUGGCCAAAGUGCCAAUCUUCUCGGCGCAGGCGUUUCCAGCGAGCUGGAGGCGGCUGCAACUGAACGAAUGCCCGUCAUCCGCUUAUCUCCCUGGGUAGUACCGACGGAUCAGAGUGUAUAUUUGUCCCAGGAUUUUCAACUGCGCAAACUCAGCCGCGAUAACCUCGCUAAUUCCUCAACGCUUCUGGAAGGGGAGGUGACACCGUCCCAUGUUGGGAAUACCCCGGCUGCAAGACACACUUCUCCGAUCAUACAGGGAUAUCGCACAGGUCGGAAUAUGAGCGCUCUCUUCGUUCACGUUCGCAGAGAUACCAGAGGUCCGGAGGUACGAAGCUUAGCAUCAACAUUCAGUACUCCAGAGGGCACCAUGCAGAACCCGCAGUUGUGCUUCCGCGCUAAUCGGGCCUAUCCUAUACUAUCAUCGGAAAUGCCUGCAAACCAGUGCACUUGC
GGAUGUUGCAUUAAAAUGCCCCACGCUGCACGGGAUGGUCUCUCUCUUUUGAGUCCUCCGAGCUUAACGUAUCAUGCCCACAGACCAUCAGCGAGCAAUGAAAAUUCGAGGUCGACGGAGUUAACUCGGAUUACCGCUCCCGACUGCACCGAGACAUCUCCCGAAAUCGAACGAUCUACGGGCUUAACCCCUACGUCUAGGAGCGCUUUUGCAAGCGUAGAGAUUGCAAAAUUCCAAACAGCCUCACCCGGCACUCCGCGUACCGGGCAAAAGUCUCCAAUUGGUAGGUGCCAAAGCGGAGCGGAGGCAGCUAUAGCAGGGUUUGCGGGAGGUUGCAUUUCUUGUAUCGUAAGUAGCAAACUUCAAAUCGUCACUUUACUCAGCACGAUUUACAAACUUGCCGUAAUGCAAUUGCUUACUGCGAGUGUACGUCUAAGAGGAUCCCGUCUUAUUAUCUGUAGUGUCCAUGGCCAGUCCGGCUUCCACCAGUUAUACCAAGGCCAGAUCUUCUUAACAAAACCUGUUAUAGACCUCUGCCUGAAGUACCAUGCAUUCCCGGUCUACUUUGAGAUGGUCCUAGGCGCGUGGUCGCCCGUGGCGAGGGAGUUUGUAUGGUCCUCCGGGCCAAGUAAGACCGGAGUCCAGAAAUUCGUACCCAGCCAGCGGAGAACGGGGUUUCAUAAGCACAAGGCCUCGACGUCAGGCUUGGACCCGCUAGUGUGGGCAGCUCUAAAUUGUAAAGCUCAGCUCUGCCGUUUUGGCGUUGGACUCAUCGAUCAGUUUCUCCGGGGUCACUCUGCGAUCGUAGCUGAGUGCGAUCCUCAACGUAUAUCUCGACAUGGUUACCCCUCAUCCUCCUACAAGGGCAGCCUGGAUGACCGCCCGGAACCGCCUAGCCACGCAUGGUGUUGUCACAGCUUUUACAAGGAAAAGAGUUAUUUGCAUACUAAUUUAGGAAGCAACUUGGCGAAAGUGGCAGCAGCAUAUGAUUUUCCCACUGUCCCCCCGGGAGAUCAUCUAAAGAUACGCUGUGGUUCUAUUGAUAACGAGUUACAAGUGCAGGUCACAGACGACACGGCCCAGGCUGAUCAUGUAAACCAUUUGCUUAGGCAAAGUGACGGCCGGAGCUCUGACACGCACUCCAUAUUUCUGACCCCGACACCCGCUAUUGGCGCCUGUUUACGAAGGAAUAUCAUUUUGAACGUCAGGGGAAGUGCAUUAAUCGAAAACGUUGAAUCGGUGGCCCCAUCCCAGAUAUGCCAUCUACGGGAUAUGAGUACGACGGUUUCGCUUCCCAUUGUAUAUGCUCCUAGAGGCCAGAUCAACUUCUGGAGCGUUACGCGAUAA'

# Wrap the RNA text in a Biopython Seq tagged with the unambiguous RNA alphabet.
# NOTE(review): Bio.Alphabet appears to have been removed from recent
# Biopython releases — confirm against the installed version.
coding_rna = Seq(text, IUPAC.unambiguous_rna)
# Bare expression: only has a visible effect as notebook cell output.
coding_rna

# Translate the RNA into an amino-acid sequence and print it.
protein_seq = coding_rna.translate()
print(protein_seq)


In [None]:
# Find substrings of a genome encoding a given amino acid sequence.
# A DNA string Text, an amino acid string Peptide, and the array GeneticCode
# find All substrings of Text encoding Peptide (if any such substrings exist).
# RNA codon -> one-letter amino acid. The stop codons UGA, UAA and UAG map to
# the empty string.
GeneticCode = {'ACC': 'T', 'GCA': 'A', 'AAG': 'K', 'AAA': 'K', 'GUU': 'V', 'AAC': 'N', 'AGG': 'R', 'UGG': 'W', 'GUC': 'V', 'AGC': 'S', 'ACA': 'T', 'AGA': 'R', 'AAU': 'N', 'ACU': 'T', 'GUG': 'V', 'CAC': 'H', 'ACG': 'T', 'AGU': 'S', 'CCA': 'P', 'CAA': 'Q', 'CCC': 'P', 'UGU': 'C', 'GGU': 'G', 'UCU': 'S', 'GCG': 'A', 'CGA': 'R', 'CAG': 'Q', 'CGC': 'R', 'UAU': 'Y', 'CGG': 'R', 'UCG': 'S', 'CCU': 'P', 'GGG': 'G', 'GGA': 'G', 'GGC': 'G', 'CCG': 'P', 'UCC': 'S', 'UAC': 'Y', 'CGU': 'R', 'GAA': 'E', 'AUA': 'I', 'AUC': 'I', 'CUU': 'L', 'UCA': 'S', 'AUG': 'M', 'UGA': '', 'CUG': 'L', 'GAG': 'E', 'AUU': 'I', 'CAU': 'H', 'CUA': 'L', 'UAA': '', 'GCC': 'A', 'UUU': 'F', 'GAC': 'D', 'GUA': 'V', 'UGC': 'C', 'GCU': 'A', 'UAG': '', 'CUC': 'L', 'UUG': 'L', 'UUA': 'L', 'GAU': 'D', 'UUC': 'F'}

def rna_translate_protein(rna, genetic_code=None):
    """Translate an RNA string into an amino-acid string.

    Codons are read from the start in steps of three; translation stops at
    the first stop codon (a codon mapped to the empty string). Up to two
    trailing bases that do not form a full codon are ignored.

    Args:
        rna: RNA string over the codon table's alphabet.
        genetic_code: optional codon -> amino-acid mapping; defaults to the
            module-level ``GeneticCode`` table.

    Returns:
        the translated peptide as a string. The result is a string on every
        path, including when a stop codon ends translation early.

    Raises:
        KeyError: if a codon is missing from the table.
    """
    table = GeneticCode if genetic_code is None else genetic_code
    protein = []
    for start in range(0, len(rna) - 2, 3):
        amino = table[rna[start:start + 3]]
        if not amino:
            # Stop codon: end translation here.
            break
        protein.append(amino)
    return ''.join(protein)

def peptide_encoding(text,peptide):
    """Find substrings of a DNA text that encode the given peptide.

    Both strands are searched. Hits on the reverse strand are reported as
    the corresponding substring of the original text.

    Returns:
        list of matching substrings: forward-strand hits first, then
        reverse-strand hits in the order they occur on the reverse strand.
    """
    width = 3 * len(peptide)

    def encodes(window):
        # Transcribe the DNA window to RNA before translating it.
        return rna_translate_protein(window.replace('T','U')) == peptide

    hits = []
    for start in range(len(text) - width + 1):
        window = text[start:start + width]
        if encodes(window):
            hits.append(window)

    flipped = reverse_complement(text)
    for start in range(len(flipped) - width + 1):
        window = flipped[start:start + width]
        if encodes(window):
            # Map the hit back onto the original strand.
            hits.append(reverse_complement(window))
    return hits

def count_seq(peptide):
    """Print how many distinct RNA strings translate into the peptide.

    The count is the product, over the peptide's residues, of the number of
    codons in GeneticCode that map to that residue.
    """
    total = 1
    for residue in peptide:
        codon_count = sum(1 for amino in GeneticCode.values() if amino == residue)
        total *= codon_count
    print(total)

def reverse_complement(dna):
    """Return the reverse complement of a DNA string over A, C, G, T."""
    pairing = {'A':'T','C':'G','G':'C','T':'A'}
    complemented = ''.join([pairing[base] for base in dna])
    return complemented[::-1]

# text = "ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA"
# peptide = "MA"
# NOTE(review): `text` and `peptide` must already be defined when this runs
# (the sample values above are commented out) — confirm the intended inputs.
# Print every substring of text that encodes peptide, one per line.
protein = peptide_encoding(text,peptide)
print('\n'.join(protein))

In [None]:
#How many subpeptides does a cyclic peptide of length n have
def subpeptide(n):
    """Return n * (n - 1), the subpeptide count used for a cyclic peptide of length n."""
    return n * (n - 1)

subpeptide(35046)

In [None]:
#Generate the theoretical spectrum of a cyclic/linear/other peptide.
#The theoretical spectrum of a cyclic peptide Peptide, denoted Cyclospectrum(Peptide), is the collection of all of the masses of its subpeptides, in addition to the mass 0 and the mass of the entire peptide, with masses ordered from smallest to largest
# Integer mass of each amino acid, keyed by its one-letter code.
MassTable = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163}

def theoretical_spectrum(peptide):
    """Return the sorted masses of all contiguous (linear) subpeptides plus 0.

    Every substring of the peptide, from the whole peptide down to single
    residues, contributes the sum of its residue masses from MassTable.
    """
    size = len(peptide)
    masses = [0]
    for length in range(size, 0, -1):
        for start in range(size - length + 1):
            fragment = peptide[start:start + length]
            masses.append(sum(MassTable[residue] for residue in fragment))
    masses.sort()
    return masses

def linearSpectrum(peptide):
    """Return the sorted linear spectrum of a peptide.

    Subpeptide masses are obtained as differences of prefix masses; the
    mass 0 is included once.
    """
    size = len(peptide)
    # prefix[i] is the mass of the first i residues.
    prefix = [0]
    for residue in peptide:
        prefix.append(prefix[-1] + MassTable[residue])

    spectrum = [0]
    spectrum.extend(
        prefix[hi] - prefix[lo]
        for lo in range(size)
        for hi in range(lo + 1, size + 1)
    )
    spectrum.sort()
    return spectrum

def cyclicSpectrum(peptide):
    """Return the sorted cyclic spectrum of a peptide.

    Contains 0, the mass of every contiguous segment, and for every segment
    that touches neither end of the peptide the mass of the complementary
    wrap-around subpeptide.
    """
    size = len(peptide)
    # prefix[i] is the mass of the first i residues.
    prefix = [0]
    for residue in peptide:
        prefix.append(prefix[-1] + MassTable[residue])
    total = prefix[size]

    spectrum = [0]
    for lo in range(size):
        for hi in range(lo + 1, size + 1):
            segment = prefix[hi] - prefix[lo]
            spectrum.append(segment)
            # An interior segment leaves a wrap-around remainder whose mass
            # is the total minus the segment.
            if lo > 0 and hi < size:
                spectrum.append(total - segment)
    spectrum.sort()
    return spectrum

In [None]:
# Print the cyclic spectrum of a sample peptide as space-separated masses.
peptide = 'MKIMSAFWIDSSED'
spec = cyclicSpectrum(peptide)
print(' '.join(map(str,spec)))

In [5]:
#How many subpeptides does a linear peptide of given length n have
def linear_subpeptide(n):
    """Return the number of subpeptides of a linear peptide of length n.

    That is n * (n + 1) / 2 non-empty contiguous substrings plus one for the
    empty peptide.

    The computation uses integer arithmetic only: n * (n + 1) is always even,
    so floor division is exact, whereas going through float division loses
    precision once the result exceeds 2**53.
    """
    return n * (n + 1) // 2 + 1

linear_subpeptide(10074)

50747776

In [None]:
#match peptides with the spectrum. Returns index of target peptides that matches the spectrum             
def linearspectrum(peptide):
    """Return [subpeptide, mass] pairs of a linear peptide, sorted by mass.

    The list contains the empty peptide, every proper contiguous subpeptide
    and, for a non-empty input, the whole peptide. Entries of equal mass keep
    their generation order (shorter first, then by start position).
    """
    amino_acid_mass_table = {'G':57,'A':71,'S':87,'P':97,'V':99,'T':101,'C':103,'I':113,'L':113,'N':114,'D':115,'K':128,'Q':128,'E':129,'M':131,'H':137,'F':147,'R':156,'Y':163,'W':186}
    size = len(peptide)

    fragments = ['']
    for length in range(1, size):
        fragments.extend(peptide[pos:pos + length] for pos in range(size - length + 1))
    if peptide:
        fragments.append(peptide)

    scored = [[fragment, _get_peptide_mass(amino_acid_mass_table, fragment)]
              for fragment in fragments]
    scored.sort(key=lambda entry: entry[1])
    return scored

def _get_peptide_mass(amino_acid_mass_table, peptide):
    mass = 0
    for pos in range(len(peptide)):
        mass += amino_acid_mass_table[peptide[pos]]
    return mass

def quiz6():
    """Print the 1-based positions of candidate peptides consistent with a spectrum.

    A candidate is printed when every mass of its linear spectrum occurs in
    the hard-coded experimental spectrum (compared as strings, ignoring
    multiplicity).
    """
    print("----- Quiz 6 -----")
    #peptides = ["TCE", "AQV", "VAQ", "ETC", "CTV", "CET"]
    peptides = ["QCV", "ETC", "TVQ", "AVQ", "TCE", "TCQ"]
    spectrum_set = set("0 71 99 101 103 128 129 199 200 204 227 230 231 298 303 328 330 332 333".split(' '))
    for position, peptide in enumerate(peptides, start=1):
        masses = {str(entry[1]) for entry in linearspectrum(peptide)}
        if masses <= spectrum_set:
            print(str(position))

In [None]:
'''
    CYCLOPEPTIDESEQUENCING(Spectrum)
        Peptides ← a set containing only the empty peptide
        while Peptides is nonempty
            Peptides ← Expand(Peptides)
            for each peptide Peptide in Peptides
                if Mass(Peptide) = ParentMass(Spectrum)
                    if Cyclospectrum(Peptide) = Spectrum
                        output Peptide
                    remove Peptide from Peptides
                else if Peptide is not consistent with Spectrum
                    remove Peptide from Peptides
Sample Input:
     0 113 128 186 241 299 314 427
Sample Output:
     186-128-113 186-113-128 128-186-113 128-113-186 113-186-128 113-128-186
'''

# Experimental spectrum as a space-separated string.
# NOTE(review): the name `input` shadows the builtin of the same name for the
# rest of the session.
input = '0 101 103 113 128 128 128 128 129 147 163 214 231 248 256 256 257 257 266 275 276 359 361 376 377 379 384 385 385 394 404 480 487 489 505 507 513 513 522 524 532 608 616 618 627 633 635 641 650 652 660 736 744 746 755 755 761 763 779 781 788 864 874 883 883 884 889 891 892 907 909 992 993 1002 1011 1011 1012 1012 1020 1037 1054 1105 1121 1139 1140 1140 1140 1140 1155 1165 1167 1268'
# Distinct integer masses taken from the second whitespace-separated column
# of the mass table file.
with open('/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/integer_mass_table.txt') as f:
    masses = list(set([int(line.strip().split()[1]) for line in f]))

spectrum = [int(i) for i in input.strip().split()]
#print spectrum


def expand_list(peptides, masses):
    """Grow every candidate by one residue mass.

    A candidate is a pair ``(residue_masses, linear_spectrum)``. With no
    candidates, one single-residue candidate per mass is returned. Otherwise
    each candidate is extended with each mass ``m``; its spectrum gains ``m``
    itself plus ``m`` added to the mass of every suffix of the old residue
    list (the new fragments that end at the appended residue).
    """
    if not peptides:
        return [([m], [0, m]) for m in masses]

    grown = []
    for candidate in peptides:
        residues, linear = candidate[0], candidate[1]
        # Suffix masses do not depend on the appended mass; compute them once.
        suffix_masses = [sum(residues[start:]) for start in range(len(residues))]
        for m in masses:
            new_fragments = [tail + m for tail in suffix_masses]
            grown.append((residues + [m], linear + [m] + new_fragments))
    return grown


def is_consistent(linear, spectrum):
    """Return True when ``linear`` is a sub-multiset of ``spectrum``.

    Every mass in ``linear`` must occur in ``spectrum`` at least as many times
    as it occurs in ``linear``. An empty ``linear`` is always consistent.

    Both arguments are counted once up front instead of calling ``list.count``
    for every element, which made the check quadratic in the spectrum size.
    """
    from collections import Counter  # local import keeps this cell self-contained

    available = Counter(spectrum)
    return all(needed <= available[m] for m, needed in Counter(linear).items())

# Grow candidates one residue per round (at most 20 rounds), keeping only
# those whose linear spectrum is consistent with the experimental spectrum,
# and print the survivors of every round.
consistent = []

for i in range(20):
    consistent = list(filter(lambda cand: is_consistent(cand[1], spectrum),
                             expand_list(consistent, masses)))
    print("i: %s, len(consistent): %s" % (i, len(consistent)))
    if not consistent:
        # Nothing survived this round; stop searching.
        break
    print(*sorted('-'.join(map(str, c[0])) for c in consistent), sep=' ')

In [None]:
# Cyclopeptide Scoring Problem: Compute the score of a cyclic peptide against a spectrum
# To generalize the Cyclopeptide Sequencing Problem to handle noisy spectra, we need to relax the requirement that a candidate peptide’s theoretical spectrum must match the experimental spectrum exactly, and instead incorporate a scoring function that will select the peptide whose theoretical spectrum matches the given experimental spectrum the most closely.

# Integer mass of each of the 20 amino acids, keyed by one-letter code.
MassTable = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}


def cyclicSpectrum(peptide):
    """Return the sorted theoretical spectrum of the cyclic ``peptide``.

    The spectrum holds 0, the mass of every contiguous fragment, and the mass
    of every fragment that wraps around the end of the peptide. Residue masses
    come from ``MassTable``.
    """
    size = len(peptide)

    # running[k] is the mass of the first k residues.
    running = [0]
    for residue in peptide:
        running.append(running[-1] + MassTable[residue])
    whole = running[size]

    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            fragment = running[end] - running[start]
            masses.append(fragment)
            # A strictly interior fragment has a wrap-around complement.
            if start > 0 and end < size:
                masses.append(whole - fragment)
    masses.sort()
    return masses

import collections
def cyclic_scoring(peptide,expSpectrum):
    """Score a cyclic peptide against an experimental spectrum.

    The score is the number of masses shared between the theoretical cyclic
    spectrum of ``peptide`` and ``expSpectrum``, counted with multiplicity.

    The two remainder multisets the previous version built were never used
    and have been dropped.
    """
    theoSpec_multiset = collections.Counter(cyclicSpectrum(peptide))
    expSpec_multiset = collections.Counter(expSpectrum)
    # Counter intersection keeps, per mass, the smaller of the two counts.
    return sum((theoSpec_multiset & expSpec_multiset).values())

# Sample input for a quick check (uncomment to run this cell on its own):
#peptide = 'NQEL'
#expSpectrum = [0,99,113,114,128,227,257,299,355,356,370,371,484]
# NOTE(review): peptide and expSpectrum are not assigned in this cell;
# presumably they are set elsewhere before it runs -- confirm.
score = cyclic_scoring(peptide,expSpectrum)
print(score)

In [None]:
#Implement LeaderboardCyclopeptideSequencing
#output - LeaderPeptide after running LeaderboardCyclopeptideSequencing(Spectrum, N)
# One-letter codes of the 18 amino acids with pairwise distinct integer masses.
aminoacid = ['G', 'A', 'S', 'P', 'V', 'T', 'C', 'L', 'N', 'D', 'K', 'E', 'M', 'H', 'F', 'R', 'Y', 'W']
# Integer mass of each of those amino acids.
aminoacidMass = {
    'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101,
    'C': 103, 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'E': 129,
    'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186,
}


def expand(leaderboard):
    """Return every peptide of ``leaderboard`` extended by one residue.

    Each peptide is extended with each key of ``aminoacidMass`` in turn, so
    the result has ``18 * len(leaderboard)`` entries, grouped by source
    peptide.
    """
    return [peptide + residue
            for peptide in leaderboard
            for residue in aminoacidMass]

def mass(peptide):
    """Return the total mass of ``peptide``, summing the ``aminoacidMass``
    entry of each residue (0 for an empty peptide)."""
    return sum(aminoacidMass[residue] for residue in peptide)

def cyclicSpectrum(peptide):
    """Input: An amino acid string Peptide.
     Output: The cyclic spectrum of Peptide, sorted ascending.

     The spectrum holds 0, the mass of every contiguous fragment, and the mass
     of every fragment that wraps around the end of the peptide. Residue
     masses come from aminoacidMass."""
    # prefixMass[k] is the mass of the first k residues.
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aminoacidMass[residue])
    last = len(prefixMass) - 1
    peptideMass = prefixMass[last]

    cyclic_spectrum = [0]
    for i in range(last):
        for j in range(i + 1, last + 1):
            fragment = prefixMass[j] - prefixMass[i]
            cyclic_spectrum.append(fragment)
            # Interior fragments also contribute their wrap-around complement.
            if 0 < i and j < last:
                cyclic_spectrum.append(peptideMass - fragment)
    cyclic_spectrum.sort()
    return cyclic_spectrum

from collections import Counter
def score_peptide(peptide, spectrum):
    """Cyclopeptide Scoring Problem: score a cyclic peptide against a spectrum.
     Input: An amino acid string Peptide and a collection of integers Spectrum.
     Output: Score(Peptide, Spectrum), the number of masses the cyclic spectrum
     of Peptide shares with Spectrum, counted with multiplicity."""
    theoretical = Counter(cyclicSpectrum(peptide))
    experimental = Counter(spectrum)
    # Counter intersection keeps, per mass, the smaller of the two counts.
    shared = theoretical & experimental
    return sum(shared.values())

def linearSpectrum(peptide):
    """Input: An amino acid string Peptide.
     Output: The linear spectrum of Peptide, sorted ascending: 0 plus the mass
     of every contiguous fragment, using the masses in aminoacidMass."""
    # prefixMass[k] is the mass of the first k residues.
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aminoacidMass[residue])
    size = len(prefixMass) - 1

    fragments = [prefixMass[j] - prefixMass[i]
                 for i in range(size)
                 for j in range(i + 1, size + 1)]
    return sorted([0] + fragments)

def score_linear_peptide(peptide, spectrum):
    """Compute the score of a linear peptide with respect to a spectrum.
     Input: An amino acid string Peptide and a collection of integers Spectrum.
     Output: LinearScore(Peptide, Spectrum), the number of masses the linear
     spectrum of Peptide shares with Spectrum, counted with multiplicity."""
    theoretical = Counter(linearSpectrum(peptide))
    experimental = Counter(spectrum)
    # Counter intersection keeps, per mass, the smaller of the two counts.
    shared = theoretical & experimental
    return sum(shared.values())

def trim_leaderboard(leaderboard, spectrum, N):
    """Input: A collection of peptides Leaderboard, a collection of integers Spectrum, and an integer N.
     Output: The N highest-scoring linear peptides on Leaderboard with respect to Spectrum,
     including every peptide tied with the N-th one, ordered from highest to lowest score
     (ties ordered by peptide, descending).

     A non-positive N yields an empty list. Previously N <= 0 indexed the ranking with a
     negative position, which silently kept the whole leaderboard."""
    keep = int(N)
    if keep <= 0:
        return []

    ranked = sorted(
        ([score_linear_peptide(peptide, spectrum), peptide] for peptide in leaderboard),
        reverse=True,
    )
    if len(ranked) <= keep:
        return [peptide for _, peptide in ranked]

    # Everything scoring at least as well as the N-th entry survives ("with ties").
    cutoff = ranked[keep - 1][0]
    return [peptide for score, peptide in ranked if score >= cutoff]

def leaderboard_cyclopeptide_sequencing(spectrum, N):
    """ Input: An integer N and a collection of integers Spectrum.
     Output: LeaderPeptide after running LEADERBOARDCYCLOPEPTIDESEQUENCING(Spectrum, N)

     Each round expands every peptide on the leaderboard by one residue, drops
     peptides heavier than the parent mass (the largest mass in Spectrum),
     compares peptides of exactly the parent mass against the current leader by
     cyclic score, and trims the board to the top N (with ties) by linear score.

     The search starts from the empty peptide, so single-residue peptides are
     scored as candidates and the first round is trimmed like every other.
     Starting from the single residues and expanding immediately skipped both.
     An empty Spectrum returns '' instead of raising from max()."""
    if not spectrum:
        return ''

    parentmass = max(spectrum)
    leaderboard = ['']
    leaderpeptide = ''
    # Cache the leader's score instead of recomputing it for every comparison.
    leaderscore = score_peptide(leaderpeptide, spectrum)

    while leaderboard:
        survivors = []
        for peptide in expand(leaderboard):
            peptide_mass = mass(peptide)
            if peptide_mass > parentmass:
                # Too heavy: it can never reach the parent mass again.
                continue
            if peptide_mass == parentmass:
                candidate_score = score_peptide(peptide, spectrum)
                if candidate_score > leaderscore:
                    leaderpeptide, leaderscore = peptide, candidate_score
            survivors.append(peptide)
        leaderboard = trim_leaderboard(survivors, spectrum, N)
    return leaderpeptide

In [None]:
# Compute the convolution of a spectrum
# The list of elements in the convolution of Spectrum. If an element has multiplicity k, it should appear exactly k times; you may return the elements in any order.

def spectral_convolution(spectrum):
    """Return the convolution of ``spectrum``: every positive difference
    between two of its masses, repeated according to multiplicity.

    The masses are sorted first so that each unordered pair contributes its
    positive difference whatever order the input arrives in. Without the sort,
    pairs listed larger-mass-first were silently dropped. For an input that is
    already sorted the output order is unchanged.
    """
    ordered = sorted(spectrum)
    return [
        high - low
        for idx, low in enumerate(ordered)
        for high in ordered[idx + 1:]
        if high > low  # equal masses give a zero difference, which is excluded
    ]

# Sample input: spectrum = [0,137,186,323]
# Print the convolution of the current spectrum as space-separated masses.
convolution = spectral_convolution(spectrum)
print(*convolution, sep=' ')

In [None]:
# We now have the outline for a new cyclopeptide sequencing algorithm. Given an experimental spectrum, we first compute the convolution of an experimental spectrum. We then select the M most frequent elements between 57 and 200 in the convolution to form an extended alphabet of candidate amino acid masses. In order to be fair, we should include the top M elements of the convolution "with ties". Finally, we run the algorithm LeaderboardCyclopeptideSequencing, where the amino acid masses are restricted to this alphabet. We call this algorithm ConvolutionCyclopeptideSequencing.
#output - A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N (and ties).

from collections import Counter

def spectral_convolution(spectrum):
    """Return the convolution of ``spectrum`` restricted to candidate amino
    acid masses: every pairwise difference between 57 and 200 inclusive,
    repeated according to multiplicity.

    The upper bound is inclusive to match the stated range "between 57 and
    200"; the previous strict ``< 200`` test discarded a difference of
    exactly 200.
    """
    ordered = sorted(spectrum)
    return [
        high - low
        for idx, low in enumerate(ordered)
        for high in ordered[idx + 1:]
        if 57 <= high - low <= 200
    ]

def convolution_cyclopeptide_sequencing(spectrum,M,N):
    """Run the leaderboard search for ``spectrum`` after deriving the
    convolution alphabet.

    The alphabet is every mass of the restricted convolution whose
    multiplicity is at least that of the M-th most frequent mass (so ties with
    the M-th are included). An empty convolution or a non-positive M gives an
    empty alphabet instead of raising from ``min()`` on an empty list.

    Returns the peptide found by ``leaderboard_cyclopeptide_sequencing``.
    """
    counts = Counter(spectral_convolution(spectrum))
    top = counts.most_common(M)
    if top:
        # most_common() is ordered by decreasing count, so the last entry
        # carries the smallest multiplicity among the top M.
        threshold = top[-1][1]
        theoSpec = [k for k, v in counts.items() if v >= threshold]
    else:
        theoSpec = []
    # TODO: theoSpec is not consumed yet. leaderboard_cyclopeptide_sequencing
    # accepts only (spectrum, N), so the search below still runs over its own
    # fixed amino acid table rather than this alphabet.
    leadPeptide = leaderboard_cyclopeptide_sequencing(spectrum, N)
    return leadPeptide

In [None]:
def linear_spectrum(peptide):
    """Return the sorted linear spectrum of ``peptide``: 0 plus the mass of
    every contiguous fragment, using ``amino_acid_mass_table`` for the
    residue masses."""
    # prefix_mass[k] is the mass of the first k residues.
    prefix_mass = [0]
    for residue in peptide:
        prefix_mass.append(prefix_mass[-1] + amino_acid_mass_table[residue])
    size = len(prefix_mass) - 1

    fragments = [prefix_mass[end] - prefix_mass[start]
                 for start in range(size)
                 for end in range(start + 1, size + 1)]
    return sorted([0] + fragments)

def _linear_score(peptide, spectrum):
    """Return the linear score of ``peptide`` against ``spectrum``: the number
    of masses shared between the peptide's linear spectrum and ``spectrum``,
    counted with multiplicity.

    The shared count is taken from a multiset intersection. The previous
    version scanned and removed from a list copy for every mass, which was
    quadratic and required ``spectrum`` to be a list.
    """
    from collections import Counter  # local import keeps this cell self-contained

    theoretical = Counter(linear_spectrum(peptide))
    # Counter intersection keeps, per mass, the smaller of the two counts.
    return sum((theoretical & Counter(spectrum)).values())

def quiz4():
    """Print the Quiz 4 banner, then the linear score of "PEEP" against the quiz spectrum."""
    print("------ Quiz 4 ------")
    # The spectrum is written as space-separated text and parsed into ints before scoring.
    print(_linear_score("PEEP", [int(mass) for mass in "0 97 129 129 129 194 226 323 323 355 452".split(' ')]))