For a graph to be an **Eulerian graph**, for every node, the number of incoming edges (**indegree**) must equal the number of outgoing edges (**outdegree**)—that is, an Eulerian graph must be **balanced**. Furthermore, for a graph to be Eulerian, it must be **strongly connected**—that is, it must be possible to reach any node from any other node through a sequence of edges (there must be a **path**). 

**Euler's Theorem**: Every balanced, strongly connected directed graph is Eulerian.

In [9]:
# Read the adjacency-list file and split it into lines (trailing newlines
# dropped). The context manager closes the handle even if the read raises.
with open("eulerCycle.txt", "r") as f:
    cycle_input = f.read().rstrip("\n").split("\n")

def int_list(ls):
    """Return a new list holding ``int(item)`` for every item of ``ls``."""
    return list(map(int, ls))
# Each input line is split on ": " into a node and a space-separated list of
# successors; both sides are converted to integers.
cycle_input_dict = dict(
    (int(fields[0]), int_list(fields[1].split(" ")))
    for fields in (line.split(": ") for line in cycle_input)
)
print(cycle_input_dict)


{0: [18, 3, 46, 80], 1: [0, 130, 4], 10: [12], 100: [101, 386, 439], 101: [34, 669], 102: [100, 147, 612], 103: [117, 21], 104: [105], 105: [103, 626, 797], 106: [83, 995], 107: [108], 108: [106], 109: [111, 199, 252], 11: [10, 381], 110: [109, 914], 111: [323, 49], 112: [113, 170], 113: [114], 114: [50], 115: [116, 257], 116: [103], 117: [115, 228], 118: [241, 59], 119: [120, 149, 313], 12: [187, 67, 7, 91], 120: [118], 121: [123, 725], 122: [121, 203], 123: [7], 124: [126], 125: [24, 678], 126: [125, 267, 403], 127: [129], 128: [18, 837], 129: [128, 312], 13: [4], 130: [131, 225], 131: [132, 134], 132: [1], 133: [131], 134: [135, 320, 434], 135: [133, 691, 779], 136: [137, 421], 137: [138], 138: [297, 4, 463], 139: [28], 14: [15, 183, 37, 572], 140: [139, 218, 596, 631], 141: [140, 196, 555], 142: [143], 143: [144], 144: [509, 74], 145: [102, 163, 518], 146: [145], 147: [146, 306], 148: [119], 149: [150], 15: [13, 26], 150: [148], 151: [153, 955], 152: [151], 153: [55], 154: [156], 1

In [10]:
import random

def random_path(current_graph, current_node):
    """Walk ``current_graph`` at random from ``current_node``, consuming edges.

    At every step one successor of the current node is picked with
    ``random.choice`` and that edge is removed from ``current_graph``; a node
    is deleted from the dict as soon as its last outgoing edge has been used.
    The walk ends when it reaches a node that is not a key of the dict.

    ``current_graph`` is modified in place.

    Returns:
        A tuple ``(visited, current_graph)``: the nodes in visiting order,
        beginning with the start node, and the same dict object that was
        passed in (now without the edges that were walked).
    """
    visited = [current_node]
    node = current_node
    while node in current_graph:
        successors = current_graph[node]
        chosen = random.choice(successors)
        visited.append(chosen)
        if len(successors) > 1:
            successors.remove(chosen)
        else:
            # Last outgoing edge used: drop the node so the walk stops when
            # it comes back here.
            del current_graph[node]
        node = chosen
    return visited, current_graph
    
def eulerian_cycle(graph):
    """Return an Eulerian cycle of ``graph`` as a list of nodes.

    ``graph`` maps each node to the list of its successors. It is expected to
    be balanced and strongly connected; this is not checked.

    The cycle is grown in rounds: a random walk from a random start node gives
    an initial cycle; while unused edges remain, the cycle is rotated to start
    at one of its nodes that still has unused edges and is extended by another
    random walk from that node.

    The caller's ``graph`` is left unmodified.

    Returns:
        The visited nodes in order, with the first node repeated at the end,
        or ``[]`` for an empty graph.

    Raises:
        IndexError: from ``random.choice`` if edges remain but no node of the
            current cycle has an unused edge (the graph is not connected).
    """
    if not graph:
        return []

    # random_path removes edges from the successor lists in place, so the
    # lists themselves must be copied; a shallow dict copy would share them
    # with the caller's graph.
    remaining = {node: list(successors) for node, successors in graph.items()}

    # 1. form Cycle by randomly walking in Graph
    start = random.choice(list(remaining.keys()))
    cycle, remaining = random_path(remaining, start)

    def new_loop(ls, new_start):
        # Rotate the closed walk ``ls`` (first node == last node) so that it
        # starts and ends at ``new_start``.
        new_start_index = ls.index(new_start)
        return ls[new_start_index:] + ls[1:new_start_index + 1]

    # 2. while unused edges remain, splice in another walk
    while remaining:
        new_start = random.choice([node for node in cycle if node in remaining])
        rotated = new_loop(cycle, new_start)
        detour, _ = random_path(remaining, new_start)
        # detour[0] is new_start, which already ends ``rotated``.
        cycle = rotated + detour[1:]

    return cycle

In [11]:
# Re-read the adjacency-list file and split it into lines. The context
# manager closes the handle even if the read raises.
with open("eulerCycle.txt", "r") as f:
    cycle_input = f.read().rstrip("\n").split("\n")

def int_list(ls):
    """Convert each element of ``ls`` with ``int`` and return the results as a list."""
    return list(map(int, ls))
# Each input line is split on ": " into a node and a space-separated list of
# successors; both sides are converted to integers.
cycle_input_dict = dict(
    (int(fields[0]), int_list(fields[1].split(" ")))
    for fields in (line.split(": ") for line in cycle_input)
)
# Print the cycle as space-separated node labels.
print(" ".join(map(str, eulerian_cycle(cycle_input_dict))))

922 923 924 374 188 189 358 360 359 189 12 7 122 203 204 202 418 558 557 609 738 736 737 609 971 972 970 609 608 607 557 556 418 419 681 680 728 729 727 680 679 419 420 202 122 121 725 724 726 857 858 856 726 121 123 7 61 369 368 477 475 802 803 804 475 476 368 367 61 62 97 98 293 292 294 98 99 62 63 289 291 290 63 325 326 397 710 988 989 990 710 711 709 397 399 657 655 656 399 398 326 327 63 7 959 958 960 7 2 21 104 105 626 627 625 105 797 798 796 828 827 826 796 105 103 117 228 227 226 388 390 389 515 514 516 389 226 117 115 257 258 567 565 566 258 256 115 116 103 21 19 24 76 683 684 682 76 356 357 355 76 78 77 24 316 317 489 487 889 891 890 487 488 317 318 24 124 126 267 266 265 279 277 818 817 819 277 921 919 920 277 278 982 983 984 278 265 126 403 743 744 742 403 405 404 126 125 678 676 677 947 946 948 677 125 24 23 22 19 840 839 838 19 686 685 687 19 20 34 102 147 306 304 521 522 520 304 452 453 451 304 305 147 146 145 518 642 640 641 518 517 519 145 163 165 751 752 753 165 164 1

Eulerian Path: 
Input: Adjacency list of a directed graph that has an Eulerian path
Output: An Eulerian path in this graph.

In [24]:
def eulerian_path(graph, maxkey, minkey):
    """Cut an Eulerian cycle of ``graph`` into a path from ``maxkey`` to ``minkey``.

    NOTE(review): this assumes the caller has already added the closing edge
    ``minkey -> maxkey`` to ``graph``; the function does not add it itself.

    NOTE: a one-argument ``eulerian_path`` defined later in this file rebinds
    the name, so this version is unreachable once that definition has run.

    Args:
        graph: dict mapping each node to the list of its successors. It is
            not modified.
        maxkey: node the returned path should start at.
        minkey: node the returned path should end at.

    Returns:
        The cycle without its repeated last node, rotated so that it begins
        right after an edge ``minkey -> maxkey``. If no such edge is found in
        the cycle, the unrotated node list is returned.
    """
    # The cycle construction is exactly what eulerian_cycle does. It gets a
    # private copy of the successor lists so the caller's graph is untouched.
    cycle = eulerian_cycle(
        {node: list(successors) for node, successors in graph.items()}
    )

    # diff
    path = cycle[:-1]
    # Trace of the unrotated path (kept so the printed output is unchanged).
    print(path)

    # The closing edge may be the wrap-around edge last -> first, in which
    # case the path is already oriented correctly.
    if path and path[-1] == minkey and path[0] == maxkey:
        return path

    # Otherwise rotate so the path starts just after the closing edge. Both
    # endpoints of the edge are checked together: matching ``maxkey`` alone
    # would accept a path that still contains the closing edge.
    for i in range(len(path) - 1):
        if path[i] == minkey and path[i + 1] == maxkey:
            return path[i + 1:] + path[:i + 1]

    return path

def eulerian_path(graph):
    """Return an Eulerian path of ``graph`` as a list of nodes.

    ``graph`` maps each node to the list of its successors; nodes that only
    appear as successors are taken into account as well.

    Approach: compute ``outdegree - indegree`` for every node, add a temporary
    edge from the node with the smallest difference (the path's end) to the
    node with the largest difference (the path's start), find an Eulerian
    cycle of the augmented graph with ``eulerian_cycle``, and cut that cycle
    open at the temporary edge.

    The caller's ``graph`` is not modified; the temporary edge is added to a
    private copy.

    Returns:
        The nodes of the path in order. If every node is already balanced,
        the result of ``eulerian_cycle`` is returned unchanged.
    """
    # outdegree - indegree per node, in one pass over the edges.
    balance = {}
    for node, successors in graph.items():
        balance[node] = balance.get(node, 0) + len(successors)
        for dest in successors:
            balance[dest] = balance.get(dest, 0) - 1

    unbalanced = {node: diff for node, diff in balance.items() if diff != 0}

    # Work on a copy (including the successor lists) so neither the temporary
    # edge nor the edge-consuming walk leaks into the caller's graph.
    augmented = {node: list(successors) for node, successors in graph.items()}

    if not unbalanced:
        # Nothing to close: a balanced graph's Eulerian cycle is itself a
        # (closed) Eulerian path.
        return eulerian_cycle(augmented)

    maxkey = max(unbalanced, key=unbalanced.get)  # start: one extra outgoing edge
    minkey = min(unbalanced, key=unbalanced.get)  # end: one extra incoming edge
    augmented.setdefault(minkey, []).append(maxkey)

    # Drop the repeated final node of the cycle.
    path = eulerian_cycle(augmented)[:-1]

    # The temporary edge may be the wrap-around edge last -> first, in which
    # case the path is already oriented correctly.
    if path and path[-1] == minkey and path[0] == maxkey:
        return path

    # Otherwise rotate so the path starts right after the temporary edge.
    # Both endpoints are checked together: matching ``maxkey`` alone would
    # accept a path that still contains the temporary edge.
    for i in range(len(path) - 1):
        if path[i] == minkey and path[i + 1] == maxkey:
            return path[i + 1:] + path[:i + 1]

    return path

In [25]:
# Read the adjacency-list file and split it into lines. The context manager
# closes the handle even if the read raises.
with open("eulerPath.txt", "r") as f:
    path_input = f.read().rstrip("\n").split("\n")
# Each line is split on ": " into a node and a space-separated successor list.
path_input_dict = {int(line.split(": ")[0]): int_list(line.split(": ")[1].split(" ")) for line in path_input}
print(path_input_dict)


print(path_input_dict)
# Print the path as space-separated node labels.
print(" ".join(str(x) for x in eulerian_path(path_input_dict)))

{0: [2], 1: [3], 2: [1], 3: [0, 4], 6: [3, 7], 7: [8], 8: [9], 9: [6]}
{0: [2], 1: [3], 2: [1], 3: [0, 4], 6: [3, 7], 7: [8], 8: [9], 9: [6]}
6 7 8 9 6 3 0 2 1 3 4


In [31]:
def prefix(pattern):
    """Return ``pattern`` without its last element."""
    return pattern[0:-1]
def suffix(pattern):
    """Return ``pattern`` without its first element."""
    return pattern[1:len(pattern)]

def de_bruijn_from_kmers(patterns):
    """Build an adjacency list with one edge per pattern.

    Every pattern contributes an edge from ``prefix(pattern)`` to
    ``suffix(pattern)``.

    Returns:
        A dict mapping each prefix to the list of suffixes it points to, in
        input order; repeated patterns yield repeated entries.
    """
    graph = {}
    for kmer in patterns:
        source, target = prefix(kmer), suffix(kmer)
        if source not in graph:
            graph[source] = []
        graph[source].append(target)
    return graph

def max_overlap_index(s1, s2):
    """Return the shift of ``s2`` against ``s1`` with the best overlap score.

    For each shift ``i`` in ``range(len(s1))`` the tail ``s1[i:]`` is compared
    position by position with the head ``s2[:len(s2) - i]``; a matching
    position scores +1 and a mismatch scores -1. The shift with the highest
    total is returned, the smallest such shift winning ties.
    """

    def score(tail, head):
        # +1 per matching position, -1 per mismatch (penalize mismatches).
        return sum(1 if tail[k] == head[k] else -1 for k in range(len(tail)))

    def score_at(shift):
        return score(s1[shift:], s2[:len(s2) - shift])

    # max() keeps the first maximal element, so ties go to the smallest shift.
    return max(range(len(s1)), key=score_at)

def string_from_genome_path(path):
    """Spell the string described by a genome path.

    ``path`` is a sequence of strings in which every string overlaps the next
    one by all but one character (the last ``k - 1`` characters of one equal
    the first ``k - 1`` characters of the next).

    Returns:
        The first string followed by the last character of every subsequent
        string, or ``""`` for an empty path.
    """
    if not path:
        return ""
    # Every step along a genome path adds exactly one new character. Scoring
    # overlaps instead picks shift 0 for two identical consecutive strings
    # (e.g. "AAA" -> "AAA") and then appends nothing, dropping a character.
    return path[0] + "".join(node[-1:] for node in path[1:])

In [32]:
def string_reconstruction(patterns):
    """Reconstruct a string from ``patterns`` and print it.

    Chains three steps: ``de_bruijn_from_kmers`` builds a graph from the
    patterns, ``eulerian_path`` finds a path through it, and
    ``string_from_genome_path`` spells the path as text. The text is printed;
    nothing is returned.
    """
    graph = de_bruijn_from_kmers(patterns)
    print(string_from_genome_path(eulerian_path(graph)))

In [34]:
# Read the input file and split it into lines. The context manager closes the
# handle even if the read raises.
with open("stringRecon.txt", "r") as f:
    sr_input = f.read().rstrip("\n").split("\n")
# The space-separated patterns are taken from the second line of the file.
patterns = sr_input[1].split(" ")
#patterns = ["CTTA", "ACCA", "TACC", "GGCT", "GCTT", "TTAC"]
string_reconstruction(patterns)

AGAATCAAACAGATAAATGCTCCATAAGCAGCTTGATTCCTTGCGATCGTCCAGTAGTCCTACCTATTCTATACCCTCATACGAGAGTTACGGAACTGCGTGTGCTTGTTCTTTATAGCTATCAGTCCCGCGGCTTTAAACGGAAGAATATCAGGCCATCTGATGAACCCTACTACCACGCAAGCTCCTCTAGCATCGATTCAAGGAGATACTTAGAGGGTTTGTTTCTCTAGAGCCCGTTTATGCGCAGATACTACCCCTCAAGTACGAAATCTGGCAATGCGAGCGGTTAATTGATATAGCCGTCGAAAATGAGAATGCGAAACCTTCAAAGCTGCCGACCTTTTGGTGCTTACATCCTGAGTGCCATCATTTCGACTAGCTGAACACAGTGTGACCTATGTTGGGTACTAGCCGTCTTTCGATGCTACGTGAAGCTCCCGAGAGTGTAACAGATATCCTTTACCAATACATGCGCCAATGTGGTCTTACAGGGCAAGCGTACCGGAGCCCTGGCGCCTCCGTTGAGCTTTGCTCAAGAATGTGAGAAGAGGTAGGCCAGGAGCGAATTGTCCAGCGCAGCCAGTTGCACTCCGTGTAGCTGATGTGGAAAATGGCGAATCGCAATCTCACGGTGCACATGATACGATTAAGTAGTGCCTTCGACTTAGTTGTCTGGTACACGAGGTCAGCGGGCTGGAGCCGATCGGATCCATCGGCGATATATCGATCTCCACATTAGCATTTGATCGGCAAGAGCTAGCTCGATGTTAGGGGCAAATGGGTGGCACGATTGGATTGATTGTTACCGTGGAGGAGGTCCGATATTTCGGGATTTGTACGCGACTATTTCTCGCCTCATAACCCTAAATCAACGATCTGTAATCTCCAGCAGGACTCTTGCGTATGGATGGCGACGATAGGCTTCGTCCGGGCCCCGCCCGTCTTGTAACTAGAAGCCTGCAGCTCGCGCAATCCACAATCCGCAGCTGCACCTGAT