In [1]:
def create_graph_from_reads(reads):
    """Build an adjacency-list graph in which every read is one edge.

    A read contributes an edge from its prefix (the read without its last
    character) to its suffix (the read without its first character). A read
    that occurs several times yields several identical entries in the
    successor list of its prefix.

    Args:
        reads: strings to turn into edges. The collection is iterated
            twice, so a one-shot iterator would register nodes but no edges.

    Returns:
        dict mapping each prefix/suffix to the list of its successor nodes;
        nodes without outgoing edges map to an empty list.
    """
    adjacency = {}

    # Pass 1: register every node. The suffix is registered before the
    # prefix so that the dict's key order stays the same as before.
    for kmer in reads:
        adjacency.setdefault(kmer[1:], [])
        adjacency.setdefault(kmer[:-1], [])

    # Pass 2: add one prefix -> suffix edge per read.
    for kmer in reads:
        prefix, suffix = kmer[:-1], kmer[1:]
        adjacency[prefix].append(suffix)

    return adjacency

In [2]:
from collections import Counter

def get_inside_nodes(graph):
    """Return the nodes that have exactly one incoming and one outgoing edge.

    Args:
        graph: dict mapping each node to the list of its successor nodes
            (parallel edges appear as repeated entries).

    Returns:
        list of qualifying nodes, in the iteration order of ``graph``.
    """
    # In-degree: how many times each node appears as somebody's successor.
    in_degree = Counter(
        successor
        for successors in graph.values()
        for successor in successors
    )

    # Out-degree is simply the length of the node's own successor list.
    return [
        node
        for node, successors in graph.items()
        if len(successors) == 1 and in_degree[node] == 1
    ]

In [3]:
class Contig:
    """A path through the graph, kept as the ordered list of visited nodes."""

    def __init__(self, node1, node2):
        """Create a contig consisting of the single edge node1 -> node2."""
        self.nodes = [node1, node2]

    @property
    def start(self):
        """First node of the path."""
        return self.nodes[0]

    @property
    def end(self):
        """Last node of the path."""
        return self.nodes[-1]

    def __str__(self):
        """Spell the path: the first node minus its last character,
        followed by the last character of every node (first included)."""
        head = self.nodes[0][:-1]
        tail = ''.join(node[-1] for node in self.nodes)
        return head + tail

    def join(self, contig):
        """Append ``contig`` to this path in place.

        The first node of ``contig`` is dropped; the caller is expected to
        pass a contig whose start equals this contig's end.
        """
        self.nodes.extend(contig.nodes[1:])

In [4]:
# Path of the input file; the next cell opens it and reads it line by line.
# NOTE(review): presumably the Rosalind BA3K dataset (one read per line) - confirm.
filename = 'rosalind_ba3k.txt'

In [5]:
# Load the reads, one per line. Surrounding whitespace is stripped and blank
# lines are skipped: an empty read would otherwise add an empty-string node
# to the graph, which cannot be rendered when the contigs are printed.
with open(filename) as file:
    reads = [
        stripped
        for stripped in (line.strip() for line in file)
        if stripped
    ]

In [6]:
# graph: dict mapping each read prefix/suffix to the list of its successors.
graph = create_graph_from_reads(reads)
# inside_nodes: nodes of the graph with exactly one incoming and one outgoing edge.
inside_nodes = get_inside_nodes(graph)

In [7]:
# Start with one two-node contig per edge of the graph, in graph order.
contigs = [
    Contig(source, target)
    for source, successors in graph.items()
    for target in successors
]

In [8]:
# Merge the two contigs that meet at each 1-in/1-out node: the one ending at
# the node (left) absorbs the one starting at it (right), which is then
# dropped from the list.
for node in inside_nodes:
    left = None
    right = None
    for contig in contigs:
        if contig.start == node:
            right = contig
        if contig.end == node:
            left = contig

    # If one and the same contig both starts and ends at this node, the path
    # has closed into an isolated cycle (or is a self-loop). Joining it with
    # itself and then removing it would silently drop the cycle from the
    # result, so leave it in place.
    if left is not None and left is right:
        continue

    left.join(right)
    contigs.remove(right)

In [9]:
# Print every contig on one line, each followed by a single space.
for text in map(str, contigs):
    print(text, end=' ')

ACATAAGGTATCGGCTGGTA ACATAAGGTATCGGCTGGTA CACATAAGGTATCGGCTGGT CACATAAGGTATCGGCTGGT CTGAAAGACCGAAAGCTTGGATGCCACAGCTGCT ACTGAAAGACCGAAAGCTTG ACTGAAAGACCGAAAGCTTG TTAAGTTATGAATGGGGATTGATTCATCATTCCCACG CTTAAGTTATGAATGGGGAT CTTAAGTTATGAATGGGGAT AACACATCAGGTGAATCTCCCTTGTAATTGCAGC GAACACATCAGGTGAATCTC GAACACATCAGGTGAATCTC CTGAGGACAGCACCACCCAT CTGAGGACAGCACCACCCAT AATGGACACGAGTTCTTTTT AATGGACACGAGTTCTTTTT AAATGGACACGAGTTCTTTT AAATGGACACGAGTTCTTTT GCGACCACCGTTCGCTTGTT GCGACCACCGTTCGCTTGTT AGCGACCACCGTTCGCTTGT AGCGACCACCGTTCGCTTGT TGATTCATCATTCCCACGTTGTTTAGCCGCTTAGTCC GCGGCCATTCAACTAACGTTGTCTGAATAGGAAATG TGGCGGGGTTGCCGTGCTGT TGGCGGGGTTGCCGTGCTGT TTGGATGCCACAGCTGCTAC TTGGATGCCACAGCTGCTAC CTTGGATGCCACAGCTGCTA CTTGGATGCCACAGCTGCTA ATAACATACCATAAGGGCCTAAGAGTTATAGGCTCCA AATAACATACCATAAGGGCC AATAACATACCATAAGGGCC TGGATAAATTGTATTTTTGCAGCAACGCGCGGCC ATGGATAAATTGTATTTTTG ATGGATAAATTGTATTTTTG ATAACTCAATCCGATCGTGCATGCCGAGTAGTATTAG CATAACTCAATCCGATCGTG CATAACTCAATCCGATCGTG AGAATCGGGCTTGCGAGCCGACGAATGGCACGGT