In [None]:
# Task 1 and 2

#Task 1 - generate nodes (from prefixes and suffixes of the kd-mers)
#Task 2 - generate paired de Bruijn graph


# takes in a list of kd_mers and k - the length of each read
# outputs adjacency list

def debruin_kdmers(kd_mers,k):
  """Build the adjacency list of the paired de Bruijn graph of some (k,d)-mers.

  Every read pair contributes one edge, running from the pair of its
  (k-1)-character prefixes to the pair of its (k-1)-character suffixes.

  Parameters:
    kd_mers: collection of read pairs; element [0] and element [1] of each
             pair are the two reads.
    k:       length of each read.

  Returns:
    dict mapping each prefix node (a 2-tuple of strings) to the list of
    suffix nodes reachable from it, one list entry per read pair. Nodes
    that only ever occur as a suffix do not get a key of their own.
  """
  node_len = k - 1

  # first pass: register every prefix node with an empty successor list
  graph = {(pair[0][:node_len], pair[1][:node_len]): [] for pair in kd_mers}

  # second pass: attach the suffix node of each read pair to its prefix node
  for pair in kd_mers:
    first_read = pair[0]
    second_read = pair[1]
    source = (first_read[:node_len], second_read[:node_len])
    target = (first_read[1:k], second_read[1:k])
    graph[source].append(target)

  return graph
   
      

In [None]:
#Task 3 - find an Eulerian path

# takes in an adjacency list and returns an Eulerian path of the graph
def EulerianPath(adj):
    """Return an Eulerian path (or cycle) of a directed graph as a list of nodes.

    Parameters:
        adj: dict mapping each node to the list of its successor nodes
             (one list entry per edge). Nodes must be hashable. The dict
             is not modified.

    Returns:
        The visited nodes in path order, or None when ``adj`` is empty.
        The walk starts at the first node whose out-degree exceeds its
        in-degree by exactly one; if there is no such node (every node is
        balanced, i.e. the graph has an Eulerian cycle) it starts at the
        first node that has an outgoing edge. No check is made that every
        edge of the graph ends up on the returned path.
    """
    if len(adj) == 0:
        return  # empty graph

    # Work on a private copy: the traversal below consumes the edge lists,
    # and the caller's adjacency list must stay intact.
    remaining = {node: list(successors) for node, successors in adj.items()}

    # Count incoming edges once instead of re-scanning the edge list per node.
    in_degree = {}
    for successors in adj.values():
        for node in successors:
            in_degree[node] = in_degree.get(node, 0) + 1

    # Nodes that only appear as successors have no outgoing edges; give them
    # an empty list so the traversal can look them up.
    for node in in_degree:
        remaining.setdefault(node, [])

    # Prefer the node with one surplus outgoing edge (start of an open path).
    start = None
    for node, successors in remaining.items():
        if len(successors) - in_degree.get(node, 0) == 1:
            start = node
            break

    if start is None:
        # Balanced graph: any node with an outgoing edge can start the cycle.
        start = next(
            (node for node, successors in remaining.items() if successors),
            next(iter(remaining)),
        )

    # Hierholzer's algorithm with an explicit stack of the current trail.
    stack = [start]
    circuit = []
    while stack:
        current = stack[-1]
        if remaining[current]:
            # follow (and consume) an unused edge out of the current node
            stack.append(remaining[current].pop())
        else:
            # dead end: the node is final, emit it while back-tracking
            circuit.append(stack.pop())

    # nodes were emitted while back-tracking, i.e. in reverse path order
    circuit.reverse()
    return circuit

    

In [None]:
#Task 4 - merge and print genome

# a function which takes in a path, consisting of tuples
# builds a string from the [0] elements and a string from the [1] elements
# merges the overlapping strings
# returns the result

def PathToGenomePairs(path, k, d, n):
  """Spell the genome described by a path through a paired de Bruijn graph.

  Parameters:
    path: non-empty sequence of nodes; each node is a pair of strings
          (front (k-1)-mer, back (k-1)-mer).
    k:    length of each read.
    d:    gap between the two reads of a pair.
    n:    number of read pairs the path was built from.

  Returns:
    The front string followed by the part of the back string that is not
    already covered by the front string.

  Raises:
    ValueError: if ``path`` is empty or None, or if the front and back
                strings do not overlap (the reads do not span the gap), in
                which case the text between them cannot be determined.
  """
  if not path:
    raise ValueError("path must contain at least one node")

  # Spell each string from the first letter of every node but the last,
  # followed by the whole last node (k-1 letters).
  last_node = path[-1]
  front_string = ''.join(node[0][0] for node in path[:-1]) + last_node[0]
  back_string = ''.join(node[1][0] for node in path[:-1]) + last_node[1]

  # calculate how much of the second string is contained in the first string
  text_length = k + d + k + n - 1
  overlap = 2*len(front_string) - text_length

  # A negative overlap would be taken as a slice start counted from the end
  # of back_string and silently drop most of it, so reject it explicitly.
  if overlap < 0:
    raise ValueError(
      "front and back strings do not overlap (overlap = %d); "
      "the reads do not span the gap" % overlap
    )

  return front_string + back_string[overlap:]
  

In [None]:
# a function which takes in a set of paired (k,d)-mers and 
# spits out the reconstructed text which corresponds to these (k,d)-mers
def StringRecPairedReads(kd_mers, k, d):
  """Reconstruct the text that corresponds to a set of paired (k,d)-mers.

  Parameters:
    kd_mers: sized collection of read pairs.
    k:       length of each read.
    d:       gap between the two reads of a pair.

  Returns:
    Whatever PathToGenomePairs produces for the Eulerian path of the
    paired de Bruijn graph of ``kd_mers``.
  """
  # the number of read pairs is needed when the two strings are merged
  read_count = len(kd_mers)

  # Tasks 1 and 2 - nodes (prefixes and suffixes of the kd-mers) and the
  # paired de Bruijn graph built on them
  graph = debruin_kdmers(kd_mers, k)

  # Task 3 - Eulerian path through that graph
  path = EulerianPath(graph)

  # Task 4 - merge the path into the genome
  return PathToGenomePairs(path, k, d, read_count)


In [None]:
# Open the input file read-only: it is only ever read from, and asking for
# write access ('r+') would fail needlessly on a read-only file.
f = open('/dir/file.txt', 'r')

In [None]:
# Turn every line of the already opened file into a tuple of its
# '|'-separated fields (trailing newline removed). Leaving the with-block
# closes the file.
mylist = []
with f as file:
    mylist = list(tuple(row.rstrip('\n').split('|')) for row in file)
print(mylist)

[('50 200',), ('TTCGTGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAG', 'ACAAAGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGC'), ('TCGTGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGA', 'CAAAGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCA'), ('CGTGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGAC', 'AAAGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCAC'), ('GTGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACA', 'AAGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACT'), ('TGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAA', 'AGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTT'), ('GCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAA', 'GCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTTT'), ('CTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAG', 'CTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTTTT'), ('TACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGC', 'TGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTTTTA'), ('ACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCC', 'GGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTTTTAT

In [None]:
# Drop the first entry (the header line of the file) so that only the read
# pairs remain. NOTE: re-running this cell removes one more entry each time.
mylist = mylist[1:]

In [None]:
# Reconstruct the text from the read pairs with read length 50 and gap 200
# (presumably the two numbers of the input file's header line - confirm).
StringRecPairedReads(kd_mers=mylist, k=50, d=200)

'TTCGTGCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCCACACTATACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCGTCCTAGGCGAAAAATCCATCATACTTACAGAAAACGATGCCTTAACGTCCCTGTTATACGGCCCTGATACGGCCCTGACTGTGTCACCCTTATACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCTGGCACTTTTATGACGAGACAAAGCCTGTGTCACCCTTATGGCACTTTTATGTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCACGAGACAAAGCGGTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCGAGACATAAACCAATGTGTTAAGTCAGGTTTGAACAAGATATCGACTACGGCCCTGACTTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCGTGTCTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCACCCTTATGGCACTTTTATGTACTACGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCGGCCCTGACTGTGTCACCCTTATGGCACTTTTATGACGAGACAAAGCACGAGACAAAGCTCCTTGAGATCCACGTCTATTCCGGCGAGGCACATCGGTCCCTGTCTATTCTTATTCGTCCTATGTCTGAGTTGTCTACGTACTCACAATGTGCAACTGTAAGCTTCGGCTGGCAAACCGCCATGCGGGATAAGCTCCCTCAACTTCAGCCACTGGCCGACAGACGAATCTCCCCCTCCAGGCCATGCTCCTCGCTGATTGGCTGCTAGGCCTGGACGATTTGTACGGCCCTGACTGTGTCACCCTTATTACGGCCCTGACTGTGTCACCCTTATGGCAC