In [None]:
# The difference between the original copy and this code is that:
# (1) This code works with all amino acids, not just the ones whose individual mass
#     is contained in the experimental spectrum (because the experimental spectrum
#     may have missing or false masses).



In [None]:
# A function which takes in a peptide and
# returns the theoretical spectrum of that peptide,
# where "theoretical spectrum" is the collection of all of the masses of its subpeptides,
# in addition to the mass 0 and the mass of the entire peptide, with masses ordered from smallest to largest.

def cyclic_spectrum(peptide):
  """Return the sorted theoretical spectrum of a cyclic peptide.

  The spectrum contains 0, the mass of every contiguous subpeptide
  (including those that wrap around the end of the ring) and the mass of
  the whole peptide, in non-decreasing order.

  peptide -- sequence of one-letter amino acid codes; a code missing from
             the mass table raises KeyError.
  """
  residue_mass = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}

  # running[k] is the combined mass of the first k residues (running[0] == 0)
  running = [0]
  for residue in peptide:
    running.append(running[-1] + residue_mass[residue])

  whole_mass = max(running)
  size = len(peptide)
  masses = [0]
  for start in range(size):
    for stop in range(start + 1, size + 1):
      fragment = running[stop] - running[start]
      masses.append(fragment)
      # A fragment touching neither end of the linear string has a
      # complement that wraps around the ring; record that one too.
      if start > 0 and stop < size:
        masses.append(whole_mass - fragment)

  return sorted(masses)

In [None]:
# a function which takes in a peptide
# and spits out its linear spectrum
# where "linear spectrum" is a list of the integer masses of the subpeptides

def linear_spectrum(peptide):
  """Return the sorted spectrum of a linear peptide.

  The result holds 0 plus the integer mass of every contiguous subpeptide
  (the full peptide included), in non-decreasing order.

  peptide -- sequence of one-letter amino acid codes; a code missing from
             the mass table raises KeyError.
  """
  residue_mass = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}

  # running[k] is the combined mass of the first k residues (running[0] == 0)
  running = [0]
  for residue in peptide:
    running.append(running[-1] + residue_mass[residue])

  size = len(peptide)
  # every subpeptide mass is a difference of two prefix masses
  fragments = [running[stop] - running[start]
               for start in range(size)
               for stop in range(start + 1, size + 1)]

  return sorted([0] + fragments)
 

In [None]:
# Input: linear peptide, an experimental spectrum
# Output: The similarity score of the peptide and the exp spectrum

def linpeptide_scoring(lin_peptide, exp_spectrum):
  """Score a linear peptide against an experimental spectrum.

  The score is the number of masses shared by the peptide's linear
  spectrum and exp_spectrum, counted with multiplicity: a mass occurring
  a times in one and b times in the other contributes min(a, b).

  lin_peptide  -- peptide passed to linear_spectrum
  exp_spectrum -- iterable of integer masses
  Returns the integer score.
  """
  from collections import Counter

  theoretical = Counter(linear_spectrum(lin_peptide))
  experimental = Counter(exp_spectrum)
  # Counter intersection keeps the minimum multiplicity of each mass, which
  # avoids re-scanning both lists with list.count for every distinct mass.
  return sum((theoretical & experimental).values())

In [None]:
# Input: Cyclic peptide, experimental spectrum
# Output: How many matches of masses there are between cyclic peptide and exp_spectrum

def cyclopeptide_scoring(cyc_peptide, exp_spectrum):
  """Score a cyclic peptide against an experimental spectrum.

  The score is the number of masses shared by the peptide's cyclic
  spectrum and exp_spectrum, counted with multiplicity: a mass occurring
  a times in one and b times in the other contributes min(a, b).

  cyc_peptide  -- peptide passed to cyclic_spectrum
  exp_spectrum -- iterable of integer masses
  Returns the integer score.
  """
  from collections import Counter

  theoretical = Counter(cyclic_spectrum(cyc_peptide))
  experimental = Counter(exp_spectrum)
  # Counter intersection keeps the minimum multiplicity of each mass, which
  # avoids re-scanning both lists with list.count for every distinct mass.
  return sum((theoretical & experimental).values())

In [None]:
def trim(leaderboard, spectrum, N):
  """Keep the N highest-scoring peptides of a leaderboard, ties included.

  Every peptide is scored with linpeptide_scoring against spectrum. The
  peptides are ordered by decreasing score and the first N are kept,
  together with any further peptides whose score equals that of the
  N-th one.

  leaderboard -- list of peptides
  spectrum    -- experimental spectrum handed to linpeptide_scoring
  N           -- number of places to keep (before ties)
  Returns a new list of peptides in decreasing score order; an empty
  list when the leaderboard is empty or N is not positive.
  """
  if not leaderboard or N <= 0:
    return []

  ranked = [(linpeptide_scoring(peptide, spectrum), peptide) for peptide in leaderboard]
  # Stable ascending sort followed by a reversal: decreasing scores, with
  # equal-score peptides in the reverse of their input order.
  ranked.sort(key=lambda pair: pair[0])
  ranked.reverse()

  if N >= len(ranked):
    return [peptide for _, peptide in ranked]

  # The N-th place sits at index N - 1; extend the cut past every peptide
  # that ties with it.
  cutoff_score = ranked[N - 1][0]
  keep = N
  while keep < len(ranked) and ranked[keep][0] == cutoff_score:
    keep += 1

  return [peptide for _, peptide in ranked[:keep]]

In [None]:
# a function which takes in a peptide (as a string)
# and it spits out its mass

def mass(peptide):
  """Return the total integer mass of a peptide.

  peptide -- string (or other iterable) of one-letter amino acid codes.
  An empty peptide has mass 0; a code missing from the table raises
  KeyError.
  """
  # integer mass of each amino acid, keyed by its one-letter code
  residue_mass = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}
  return sum(residue_mass[residue] for residue in peptide)

In [None]:
# a function which checks if a list is a sublist of another list
def is_sublist(sub_list, list_):
    """Return True when every item of sub_list occurs in list_ at least as
    many times as it occurs in sub_list (order is ignored)."""
    return all(sub_list.count(item) <= list_.count(item) for item in sub_list)

In [None]:
# Input: a k-long peptide and a list of amino acids
# Output: the newly formed (k+1)-long peptides, formed by peptide + amino acd from list
def expand(peptide_list, amino_list):
  """Extend every peptide by one amino acid in every possible way.

  peptide_list -- iterable of peptide strings
  amino_list   -- iterable of amino acid strings to append
  Returns a list with peptide + amino for each peptide (outer order) and
  each amino acid (inner order).
  """
  # amino_list must not be reset here: clearing it made the function
  # return an empty list for every input.
  return [peptide + a for peptide in peptide_list for a in amino_list]

In [None]:
def leaderboard_cyclopep_seq(exp_spectrum, N):
  """Leaderboard search for a cyclic peptide matching an experimental spectrum.

  Candidate peptides are grown one amino acid at a time. Candidates whose
  mass equals the largest mass in exp_spectrum are scored with
  cyclopeptide_scoring; lighter ones are kept for the next round after
  being cut down with trim(..., N); heavier ones are dropped.

  exp_spectrum -- list of integer masses (must not be empty)
  N            -- leaderboard size passed to trim
  Prints the list of top-scoring peptides and the top score. Returns the
  first top-scoring peptide as its residue masses joined by '-', or an
  empty string when no candidate reaches the parent mass.
  Raises ValueError when exp_spectrum is empty.
  """
  if len(exp_spectrum) == 0:
    raise ValueError('exp_spectrum must contain at least one mass')

  # 'I' and 'Q' are absent -- presumably because they share their integer
  # masses with 'L' and 'K' and would only produce duplicate candidates.
  aminos = ['G', 'A', 'S', 'P', 'V', 'T', 'C', 'L', 'N', 'D', 'K', 'E', 'M', 'H', 'F', 'R', 'Y', 'W']
  parental_mass = max(exp_spectrum)
  leaders = {}
  max_score = 0
  candidate_peptides = ['']
  while candidate_peptides:
    leaderboard = [pept + amino for pept in candidate_peptides for amino in aminos]
    candidate_peptides = []
    for candidate in leaderboard:
      mass_cand = mass(candidate)
      if mass_cand == parental_mass:
        # score each full-mass candidate exactly once
        score = cyclopeptide_scoring(candidate, exp_spectrum)
        leaders[candidate] = score
        if score > max_score:
          max_score = score
      elif mass_cand < parental_mass:
        candidate_peptides.append(candidate)

    if candidate_peptides:
      candidate_peptides = trim(candidate_peptides, exp_spectrum, N)

  # all peptides that reach the best score, in the order they were found
  abs_leaders = [peptide for peptide, score in leaders.items() if score == max_score]
  print(abs_leaders)
  print(max_score)

  if not abs_leaders:
    # no candidate ever reached the parent mass
    return ''

  leader_peptide = abs_leaders[0]
  return '-'.join(str(mass(residue)) for residue in leader_peptide)


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This triggers an interactive authorization prompt when run.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Small example spectrum (integer masses) used to try out leaderboard_cyclopep_seq
spec = [0, 71, 113, 129, 147, 200, 218, 260, 313, 331, 347, 389, 460]

In [None]:
# Run the leaderboard search on the example spectrum with leaderboard size N = 10
leaderboard_cyclopep_seq(spec,10)

'147-71-129-113'

In [None]:
# for a test set check Rosalind
# Read-only mode is sufficient: the file is only parsed below, and 'r+'
# would needlessly require write permission on it.
f = open('/dir/file.txt', 'r')

In [None]:

with f as file:
    # first line: a single integer
    # (presumably the leaderboard size N -- confirm against the input format)
    number = int(file.readline().strip())
    # second line: the experimental spectrum as whitespace-separated integers;
    # split() without an argument tolerates repeated spaces and tabs
    exp_spec = file.readline().split()
exp_spec = list(map(int, exp_spec))


In [None]:
# Run the leaderboard search on the spectrum read from the input file.
# NOTE(review): the leaderboard size is hard-coded to 1000 here; the integer
# `number` read from the file is not used -- confirm this is intended.
leaderboard_cyclopep_seq(exp_spec, 1000)