This Rosalind-based assignment requires a single notebook, which I have provided under Files :: Jupyter Assignments.


This is a Rosalind-based assignment. Consider (and do) the 7 new problems. These are all from chapter 3 of the text.

problem8	Generate the k-mer Composition of a String

problem9	Reconstruct a String from its Genome Path

problem10	Construct the Overlap Graph of a Collection of k-mers

problem11	Construct the De Bruijn Graph of a String

problem12	Construct the De Bruijn Graph of a Collection of k-mers

problem13.py	Find an Eulerian Path in a Graph

problem14.py	Reconstruct a String from its k-mer Composition

# Problem 8 

In [None]:
class StringComposition:
    """
    Build the k-mer composition of a DNA sequence.

    Input: the parsed input lines (element 1 holds the sequence) and the desired k-mer length.
    Output: every k-mer of the sequence as a string, in lexicographic order.
    """
    def __init__(self, input_data, n):
        """
        Store the inputs.

        input_data: indexable collection whose element 1 is the sequence to decompose.
        n: length of each k-mer.
        """
        self.data = input_data  # parsed input; self.data[1] is the sequence
        self.kmer_size = n  # length of each k-mer
        self.kmers = []  # filled by compose()

    def compose(self):
        """
        Slide a window of self.kmer_size over the sequence and collect every k-mer.

        The list is rebuilt from scratch on every call, so calling compose() more than once
        does not duplicate entries, and it is sorted to honour the documented output order.
        A sequence shorter than the window yields an empty list.

        Returns: the sorted list of k-mers (also stored in self.kmers).
        Raises: ValueError if the k-mer length is not positive.
        """
        if self.kmer_size <= 0:
            raise ValueError("k-mer length must be a positive integer")
        sequence = self.data[1]
        window = self.kmer_size
        self.kmers = sorted(
            sequence[start:start + window]
            for start in range(len(sequence) - window + 1)
        )
        return self.kmers

    def writeFile(self, output_kmers, output_file):
        """
        Print the k-mers and write them to output_file, one per line.

        output_kmers: iterable of k-mer strings.
        output_file: path of the file to (over)write.
        """
        print(output_kmers)
        with open(output_file, 'w') as file:
            file.writelines(k + "\n" for k in output_kmers)

def main():
    """
    Read rosalind_ba3a.txt (k on the first line, the sequence on the second),
    build the k-mer composition and write it to rosalind_ba3a_output.txt.
    """
    with open("rosalind_ba3a.txt") as handle:
        lines = [raw.replace("\n", "") for raw in handle.readlines()]
    # The whole line list is handed over; the class reads the sequence from index 1.
    composer = StringComposition(lines, int(lines[0]))
    composer.writeFile(composer.compose(), "rosalind_ba3a_output.txt")


if __name__ == "__main__":
    main()

# Problem 9

In [None]:
class Genome:
    """
    Spell the string described by an ordered list of overlapping k-mers.

    Input: a list of k-mers in path order.
    Output: the first k-mer followed by the last character of every later k-mer.
    """
    def __init__(self, input_kmers):
        """
        Keep the k-mer list and start with an empty reconstructed string.
        """
        self.kmers = input_kmers
        self.genome = ""

    def genomeReconstruct(self):
        """
        Append the spelled path to self.genome: the whole first k-mer,
        then one trailing character per remaining k-mer.
        """
        pieces = [self.kmers[0]]
        pieces.extend(kmer[-1] for kmer in self.kmers[1:])
        self.genome += "".join(pieces)

    def writeFile(self):
        """
        Print the reconstructed string and save it to rosalind_ba3b_output.txt.
        """
        print(self.genome)
        with open("rosalind_ba3b_output.txt", 'w') as handle:
            handle.write(self.genome)
        
def main():
    """
    Read the ordered k-mers from rosalind_ba3b.txt (one per line),
    reconstruct the string they spell and write it out.
    """
    with open("rosalind_ba3b.txt") as handle:
        kmers = [raw.replace("\n", "") for raw in handle.readlines()]

    reconstructor = Genome(kmers)
    reconstructor.genomeReconstruct()
    reconstructor.writeFile()


if __name__ == "__main__":
    main()

# Problem 10

In [None]:
class OverlapGraph:
    """
    Build the overlap graph of a collection of k-mers as an adjacency list.

    input: a list of k-mers
    output: an edge p -> q for every pair where the suffix of p (all but its first
            character) equals the prefix of q (all but its last character)
    """
    def __init__(self, input_data):
        """
        Store the k-mers and create empty containers for the graph.
        """
        self.kmers = input_data
        self.out_dict = dict()  # source k-mer -> list of target k-mers
        self.directed_array = []  # every edge as a (source, target) tuple

    def constructGraph(self):
        """
        Determine the overlap among k-mers.

        An edge p -> q is recorded when p[1:] == q[:-1]. All outgoing edges of a
        node are kept, and a node may be the target of several edges. A k-mer is
        never paired with itself at the same list position, and empty entries
        (e.g. blank input lines) are ignored. The graph is rebuilt on each call.
        """
        self.out_dict = {}
        self.directed_array = []

        # Index k-mers by prefix so each suffix lookup is a single dict access
        # instead of a scan over the whole collection.
        by_prefix = {}
        for position, kmer in enumerate(self.kmers):
            if kmer:
                by_prefix.setdefault(kmer[:-1], []).append(position)

        for position, kmer in enumerate(self.kmers):
            if not kmer:
                continue
            for target_position in by_prefix.get(kmer[1:], ()):
                if target_position == position:
                    continue
                target = self.kmers[target_position]
                self.out_dict.setdefault(kmer, []).append(target)
                self.directed_array.append((kmer, target))

    def writeFile(self):
        """
        Print every edge as "source -> target" and write the same lines to
        rosalind_ba3c_output.txt, one edge per line.
        """
        with open('rosalind_ba3c_output.txt','w') as file:
            for key, values in self.out_dict.items():
                for value in values:
                    string_out = "{} -> {}\n".format(key, value)
                    file.write(string_out)
                    print(string_out)


def main():
    """
    Read the k-mers from rosalind_ba3c.txt (one per line),
    build their overlap graph and write it out.
    """
    with open("rosalind_ba3c.txt") as handle:
        kmers = [raw.replace("\n", "") for raw in handle.readlines()]

    builder = OverlapGraph(kmers)
    builder.constructGraph()
    builder.writeFile()


if __name__ == "__main__":
    main()

# Problem 11

In [11]:
class DeBrujin:
    """
    De Bruijn graph of a single sequence.

    Inputs: the sequence as a string and the k-mer length k.
    Outputs: an adjacency list mapping each (k-1)-mer to the (k-1)-mers that follow it.
    """
    def __init__(self, input_data, n):
        """
        Remember the sequence and k, and start with an empty adjacency list.
        """
        self.sequence = input_data
        self.k = n
        self.adjacency_list = dict()

    def constructGraph(self):
        """
        Walk every k-mer of the sequence and add an edge from its prefix
        (first k-1 characters) to its suffix (last k-1 characters).
        Targets are appended in order of occurrence.
        """
        node_len = self.k - 1
        last_start = len(self.sequence) - self.k
        start = 0
        while start <= last_start:
            source = self.sequence[start:start + node_len]
            target = self.sequence[start + 1:start + 1 + node_len]
            self.adjacency_list.setdefault(source, []).append(target)
            start += 1

    def writeFile(self):
        """
        Print the adjacency list with nodes in sorted order and write the
        same lines to rosalind_ba3d_output.txt.
        """
        with open('rosalind_ba3d_output.txt','w') as handle:
            for node in sorted(self.adjacency_list):
                targets = ",".join(self.adjacency_list[node])
                line = "{} -> {}\n".format(node, targets)
                print(line)
                handle.write(line)

def main():
    """
    Read k (first line) and the sequence (second line) from rosalind_ba3d.txt,
    build the De Bruijn graph and write it out.
    """
    with open("rosalind_ba3d.txt") as handle:
        lines = [raw.replace("\n","") for raw in handle.readlines()]
    k = int(lines[0])
    sequence = lines[1]

    builder = DeBrujin(sequence, k)
    builder.constructGraph()
    builder.writeFile()


if __name__ == '__main__':
    main()

AAAAACAGCTG -> AAAACAGCTGG

AAAAACTCCGG -> AAAACTCCGGA

AAAACAGCGTG -> AAACAGCGTGG

AAAACAGCTGG -> AAACAGCTGGC

AAAACTCCGGA -> AAACTCCGGAT

AAAAGTTGATA -> AAAGTTGATAG

AAAATTCATAA -> AAATTCATAAG

AAACAACCGGT -> AACAACCGGTA

AAACACCCCCA -> AACACCCCCAT

AAACAGCGTGG -> AACAGCGTGGC

AAACAGCTGGC -> AACAGCTGGCG

AAACATGTGTT -> AACATGTGTTT

AAACCAAGGGG -> AACCAAGGGGG

AAACCGCGCGT -> AACCGCGCGTA

AAACCGCTTAG -> AACCGCTTAGT

AAACGAACACC -> AACGAACACCA

AAACTCACCGA -> AACTCACCGAA

AAACTCCGGAT -> AACTCCGGATC

AAACTGCGATT -> AACTGCGATTC

AAACTTGCAAA -> AACTTGCAAAT

AAAGATACCTA -> AAGATACCTAC

AAAGGGTACGG -> AAGGGTACGGT

AAAGGGTCCTC -> AAGGGTCCTCG

AAAGGGTGAAA -> AAGGGTGAAAG

AAAGGTCTGTC -> AAGGTCTGTCG

AAAGTCCCGAA -> AAGTCCCGAAC

AAAGTGTTTAC -> AAGTGTTTACG

AAAGTTGATAG -> AAGTTGATAGG

AAATAGTGGGA -> AATAGTGGGAC

AAATCAAGCAC -> AATCAAGCACC

AAATCTGTCCA -> AATCTGTCCAC

AAATGGTAAAC -> AATGGTAAACC

AAATGTATCGA -> AATGTATCGAC

AAATTCATAAG -> AATTCATAAGA

AACAACCGGTA -> ACAACCGGTAA

AACACCACGAT -> ACACC

# Problem 12

In [17]:
class DeBruijn:
    """
    De Bruijn graph of a collection of k-mers.

    Inputs: a list of k-mers.
    Outputs: an adjacency list mapping each prefix to a comma-separated
             string of the suffixes that follow it.
    """
    def __init__(self, input_data):
        """
        Keep the k-mers and start with an empty adjacency mapping.
        """
        self.kmers = input_data
        self.final_dict = dict()

    def constructGraph(self):
        """
        For every k-mer add an edge from its prefix (all but the last character)
        to its suffix (all but the first character). Several targets of the same
        prefix are joined with commas in input order.
        """
        for kmer in self.kmers:
            prefix, suffix = kmer[:-1], kmer[1:]
            seen = self.final_dict.get(prefix)
            if seen is None:
                self.final_dict[prefix] = suffix
            else:
                self.final_dict[prefix] = seen + "," + suffix

    def writeFile(self):
        """
        Print each "prefix -> suffixes" line and write the same lines to
        rosalind_ba3e_output.txt.
        """
        with open('rosalind_ba3e_output.txt','w') as handle:
            for prefix, suffixes in self.final_dict.items():
                line = "{} -> {}\n".format(prefix, suffixes)
                print(line)
                handle.write(line)

def main():
    """
    Read the k-mers from rosalind_ba3e.txt (one per line),
    build their De Bruijn graph and write it out.
    """
    with open("rosalind_ba3e.txt") as handle:
        kmers = [raw.replace("\n", "") for raw in handle.readlines()]

    builder = DeBruijn(kmers)
    builder.constructGraph()
    builder.writeFile()


if __name__ == '__main__':
    main()

AGTCTGGGTTTGAGGCATG -> GTCTGGGTTTGAGGCATGA

AAACGCATTATGAACCCTG -> AACGCATTATGAACCCTGT

TAATATGAGTTGCTTTGGG -> AATATGAGTTGCTTTGGGT

ATACCTCGATACTCGGCTT -> TACCTCGATACTCGGCTTG

TAAAGAAACATCCTAGAAT -> AAAGAAACATCCTAGAATA

TCAAATAGCGCAATGAGCC -> CAAATAGCGCAATGAGCCT

AGAAAGACCTAGACCAATC -> GAAAGACCTAGACCAATCT

AGTAGAAATTTTGGTCGTG -> GTAGAAATTTTGGTCGTGA

AACTATTCTAAAGAAGCCC -> ACTATTCTAAAGAAGCCCC

GGAACAAGTAGCGTCTTAC -> GAACAAGTAGCGTCTTACG

ATGGGGTTGGTAACTATTC -> TGGGGTTGGTAACTATTCT

CCCTGTTTACACAGCTTGA -> CCTGTTTACACAGCTTGAT

ATCTTTTGGGTTCGTGTAT -> TCTTTTGGGTTCGTGTATT

AGAGCTCAGGTGACTACCA -> GAGCTCAGGTGACTACCAG

GGTCATTGCCCCTAAGTGG -> GTCATTGCCCCTAAGTGGC

CCGGCCCTGGTCTAGTTTA -> CGGCCCTGGTCTAGTTTAA

CAAATATAGGCCCTGTTTA -> AAATATAGGCCCTGTTTAA

ACGTTGGAAGAAAGACCTA -> CGTTGGAAGAAAGACCTAG

TCAAAACCCGACGGAATGG -> CAAAACCCGACGGAATGGA

TAGGCCACGCGACCACAGG -> AGGCCACGCGACCACAGGC

AATGGACACACGTGCACGA -> ATGGACACACGTGCACGAC

ATTCGCGATATCGACACCA -> TTCGCGATATCGACACCAC

GGACTCACCACCCATGGAG -> GACTCACCA

# Problem 13

In [7]:

class Eulerian:
    """
    Find an Eulerian path (a walk that uses every edge exactly once) in a
    directed graph given as an adjacency list.

    self.graph maps each node to the list of nodes it points to. getEulerPath
    consumes those lists while it walks the edges.
    """
    def __init__(self, input_graph):
        """
        Initialize the class variables.

        input_graph: dict mapping a node to the list of its successor nodes.
        """
        self.graph = input_graph
        self.eulerian_path = []

    def count_dict(self):
        """
        Count incoming and outgoing edges for every node.

        Nodes that only ever appear as an edge target are counted as well, so a
        graph in which sink nodes are not listed as keys does not raise KeyError.

        return: (in_dict, out_dict), each mapping node -> edge count.
        """
        in_dict = {node: 0 for node in self.graph}
        out_dict = {node: 0 for node in self.graph}

        for node, targets in self.graph.items():
            out_dict[node] = len(targets)
            for target in targets:
                in_dict[target] = in_dict.get(target, 0) + 1
                out_dict.setdefault(target, 0)
        return in_dict, out_dict

    def getStartNode(self, in_dict, out_dict):
        """
        Pick the node the Eulerian path must start from.

        That is the node with one more outgoing than incoming edge. When no such
        node exists (the graph is an Eulerian cycle) any node with an outgoing
        edge is a valid start, so the first one is used.

        return: starting node of the euler path.
        raises: ValueError if the graph has no nodes.
        """
        start_node = None
        fallback = None
        for key in self.graph:
            outgoing = out_dict.get(key, 0)
            if outgoing - in_dict.get(key, 0) == 1:
                start_node = key
            if fallback is None and outgoing > 0:
                fallback = key
        if start_node is not None:
            return start_node
        if fallback is not None:
            return fallback
        for key in self.graph:
            return key  # graph has nodes but no edges
        raise ValueError("cannot choose a start node: the graph is empty")

    def getEulerPath(self, v):
        """
        Walk the graph from v, always following the first unused edge, and
        backtrack from dead ends so that every reachable edge ends up in the path.

        An explicit stack replaces recursion, so paths with thousands of edges
        do not hit Python's recursion limit. Edges are removed from self.graph
        as they are used. The path found is placed in front of whatever
        self.eulerian_path already holds.

        input: starting node for the eulerian path.
        return: self.eulerian_path.
        """
        stack = [v]
        finished = []  # nodes in the order they run out of edges (reverse path)
        while stack:
            node = stack[-1]
            remaining = self.graph.get(node)
            if remaining:
                stack.append(remaining.pop(0))
            else:
                finished.append(stack.pop())
        finished.reverse()
        self.eulerian_path[:0] = finished
        return self.eulerian_path

    def writeToFile(self):
        """
        Print the eulerian path as "a->b->c" and write it to rosalind_ba3g_output.txt.
        """
        string_out = "->".join(str(node) for node in self.eulerian_path)
        print(string_out)
        with open("rosalind_ba3g_output.txt", 'w') as file:
            file.write(string_out)

def main():
    """
    Parse the adjacency list in rosalind_ba3g.txt (lines of the form
    "node -> a,b,c"), find an Eulerian path and write it out.
    """
    with open('rosalind_ba3g.txt', 'r') as handle:
        raw_lines = handle.readlines()

    adjacency = dict()
    for raw in raw_lines:
        parts = raw.replace("\n","").split(" -> ")
        source = parts[0]
        for target in parts[1].split(","):
            adjacency.setdefault(source, []).append(target)
            # Every target also becomes a key so that sink nodes exist in the graph.
            adjacency.setdefault(target, [])

    solver = Eulerian(adjacency)
    incoming, outgoing = solver.count_dict()
    first_node = solver.getStartNode(incoming, outgoing)
    solver.getEulerPath(first_node)
    solver.writeToFile()


if __name__ == '__main__':
    main()

2022->465->1877->1878->1876->465->381->379->479->480->478->379->531->530->2377->2379->2378->530->758->1357->1359->1358->758->757->2154->2152->2153->757->759->1470->1468->1469->2037->2036->2574->2572->2573->2036->2035->1469->759->962->1134->1133->1397->1911->1910->1909->1397->1398->1396->1133->1132->962->1216->1453->2438->2437->2439->2560->2561->2562->2439->1453->1455->1454->1216->1217->1218->962->963->1097->1098->1096->1530->1529->1528->1096->1846->1848->1847->1096->2565->2563->2564->1096->963->961->759->530->820->822->821->530->529->379->380->97->1192->2358->2357->2356->1192->1193->1410->2093->2092->2094->2329->2331->2330->2094->1410->1408->1409->1193->1194->97->153->152->1207->1209->1208->152->1527->1526->1525->152->365->550->551->1710->1708->1709->551->2041->2043->2042->551->552->816->1809->1834->1836->2233->2234->2235->1836->1835->1809->1807->1808->816->815->814->2107->2109->2108->814->552->365->366->364->152->151->1751->1750->1752->151->97->98->195->1378->1380->1379->2086->2087->2

# Problem 14

In [10]:
class Eulerian:
    """
    Reconstruct a string from its k-mer composition.

    The k-mers are turned into a De Bruijn graph whose nodes are (k-1)-mers and
    whose edges are the k-mers; an Eulerian path through that graph uses every
    k-mer exactly once and spells the reconstructed string.
    """
    def __init__(self, input_kmers):
        """
        Initialize the class variables.

        input_kmers: list of k-mer strings.
        """
        self.graph = dict()
        self.kmers = input_kmers
        self.eulerian_path = []
        self.sequence = ''

    def findEdges(self):
        """
        Build the De Bruijn graph of the k-mers in self.graph.

        Each k-mer contributes one edge from its prefix (all but the last
        character) to its suffix (all but the first character). Using k-mers as
        edges rather than as nodes is what makes an Eulerian path visit every
        k-mer exactly once, even when a (k-1)-mer occurs several times.
        Empty entries (e.g. blank input lines) are ignored and the graph is
        rebuilt on every call.
        """
        self.graph = {}
        for kmer in self.kmers:
            if not kmer:
                continue
            prefix, suffix = kmer[:-1], kmer[1:]
            self.graph.setdefault(prefix, []).append(suffix)
            # Register the suffix too so sink nodes exist as keys.
            self.graph.setdefault(suffix, [])

    def count_dict(self):
        """
        Count incoming and outgoing edges for every node of self.graph.

        return: (in_dict, out_dict), each mapping node -> edge count.
        """
        in_dict = {node: 0 for node in self.graph}
        out_dict = {node: 0 for node in self.graph}

        for node, targets in self.graph.items():
            out_dict[node] = len(targets)
            for target in targets:
                in_dict[target] = in_dict.get(target, 0) + 1
                out_dict.setdefault(target, 0)
        return in_dict, out_dict

    def getStartNode(self, in_dict, out_dict):
        """
        Pick the node the Eulerian path must start from.

        That is the node with one more outgoing than incoming edge. When no such
        node exists (the graph is an Eulerian cycle) any node with an outgoing
        edge is a valid start, so the first one is used.

        return: starting node of the euler path.
        raises: ValueError if the graph has no nodes.
        """
        start_node = None
        fallback = None
        for key in self.graph:
            outgoing = out_dict.get(key, 0)
            if outgoing - in_dict.get(key, 0) == 1:
                start_node = key
            if fallback is None and outgoing > 0:
                fallback = key
        if start_node is not None:
            return start_node
        if fallback is not None:
            return fallback
        for key in self.graph:
            return key  # graph has nodes but no edges
        raise ValueError("cannot choose a start node: the graph is empty")

    def getEulerPath(self, v):
        """
        Walk the graph from v, always following the first unused edge, and
        backtrack from dead ends so that every reachable edge ends up in the path.

        An explicit stack replaces recursion, so inputs with thousands of k-mers
        do not hit Python's recursion limit. Edges are removed from self.graph
        as they are used. The path found is placed in front of whatever
        self.eulerian_path already holds.

        input: starting node for the eulerian path.
        return: self.eulerian_path.
        """
        stack = [v]
        finished = []  # nodes in the order they run out of edges (reverse path)
        while stack:
            node = stack[-1]
            remaining = self.graph.get(node)
            if remaining:
                stack.append(remaining.pop(0))
            else:
                finished.append(stack.pop())
        finished.reverse()
        self.eulerian_path[:0] = finished
        return self.eulerian_path

    def spellSequence(self):
        """
        Spell the string described by self.eulerian_path: the first node in full,
        then the last character of every following node.

        return: the spelled string (empty if the path is empty).
        """
        if not self.eulerian_path:
            return ''
        tail = "".join(node[-1] for node in self.eulerian_path[1:])
        return self.eulerian_path[0] + tail

    def writeToFile(self):
        """
        Store the reconstructed string in self.sequence, write it to
        rosalind_ba3h_output.txt and print it.
        """
        self.sequence = self.spellSequence()

        with open("rosalind_ba3h_output.txt", 'w') as file:
            file.write(self.sequence)
            print(self.sequence)

def main():
    """
    Read rosalind_ba3h.txt (k on the first line, then one k-mer per line),
    reconstruct the string and write it out.
    """
    with open("rosalind_ba3h.txt", 'r') as handle:
        raw_lines = handle.readlines()

    # The first line holds k and is not a k-mer, so it is skipped.
    kmers = [raw.strip() for raw in raw_lines[1:]]

    solver = Eulerian(kmers)
    solver.findEdges()
    incoming, outgoing = solver.count_dict()
    first_node = solver.getStartNode(incoming, outgoing)
    solver.getEulerPath(first_node)
    solver.writeToFile()


if __name__ == '__main__':
    main()

GGAAAACTTAAGTGCATAAATCTAATTGCACTGAGCTTTCTAATCACCATACCCTTCGTCCAGACTTGCAGGTCGCGCCCCGTAAAATATAATTGAACGCGGCATGCATACGTAGATTGGAGCGCCATCGTGGCTAGGGTACATAAATCGGGAGATCCTGTCGGAGTATAGCACCATTGACGTCTCGTCGTGTGACATCGGGGAAGATTATTCGTGAACAGGGATCATGGTGACACCCACGGCGTTACAGAGGTGGATTATCCGTACTCAAAGTTTCGCTTCGTGTCCCAAAAGGGTGTGGCCGCGAGATGCCGCCCATGTGATCGGCATAAGCCTTCCTTACCACACTGAACGATGCTGACTGTGCCGGCCTAGATAATGAGGTTATCAACCATCGCCGCTTTTTCTGCGGAAATGTTGAAAGTTCCCCACGGGCCGATGACGGCATGCACTTCTATGCCAGATTACTATCGGGTCGTGGAAGTCGTTCTCCCCGGACCCGCTACGTAGCCATGATAGAATACCGGTGCGGGACTCTAAGGTGCCTCGCCGTTGCTTAAACCTTAGACGGTCTCAGCTTTTCGTCAGACTCTCACTGTTGGAGAGCTTCTTGCCATTGTTTGAAAGATGTAACGTCAAGTTGTCATGGTACGTCGCCCCATGATCCGTCCGGTTCGCACTCCCAGTTAAATAGATCAGAGCGTACGGGAGTCTTTTCCATCATAGTATAGGTTGACGCGTAAGCGTTAATTGGGGTATGTGACCAAACAACCCATGGTATTAAATGCTGAAACATCAACAGACCAACCCTCAATATGTCGAGGTATGTAACGGACTCAGGTGAATGCTCATTGTGGTTGGCTCACGAAGGCGAATGTTAAACTCGATTACATTCTTGCATTAGTCAACAGTCTTGAGCGAGCTGAGAGTCAGCCCAAGTTGCAAGGCACCGTATGTCGCCCTGTCGTGCTCCAAATGCCTAAGAACGCCCCCCTTCCCA