## Minimum Skew Problem: Find a position in a genome where the skew diagram attains a minimum.

    Input: A DNA string Genome.
    Output: All integer(s) i minimizing Skew_i(Genome) among all values of i (from 0 to |Genome|).


In [4]:
def SkewDiagram(seq):
    """Return the running G-C skew of ``seq`` as a list of ``len(seq) + 1`` ints.

    Entry ``i`` is the skew after the first ``i`` nucleotides: every G/g adds
    1, every C/c subtracts 1, and A/a/T/t leave the value unchanged.  Entry 0
    is therefore always 0.  Any other character raises ``KeyError``.
    """
    # Contribution of each nucleotide (either case) to the skew.
    contribution = {
        'A': 0, 'T': 0, 'G': 1,
        'a': 0, 't': 0, 'g': 1,
        'C': -1, 'c': -1,
    }
    running = 0
    diagram = [running]
    for base in seq:
        running += contribution[base]
        diagram.append(running)
    return diagram
    
def minSkew(seq):
    """Return every index at which the skew diagram of ``seq`` is minimal.

    Indices refer to positions in ``SkewDiagram(seq)``, so they lie between 0
    and ``len(seq)`` inclusive, and they are returned in increasing order.
    """
    diagram = SkewDiagram(seq)
    lowest = min(diagram)
    positions = []
    for index, value in enumerate(diagram):
        if value == lowest:
            positions.append(index)
    return positions

In [5]:
seq = "GATACACTTCCCGAGTAGGTACTG"

In [6]:
minSkew(seq)

[12]

## Hamming Distance and pattern matching

In [7]:
def HammingDistance(seq1, seq2):
    """Return the number of positions at which ``seq1`` and ``seq2`` differ.

    Args:
        seq1: First sequence (any sized iterable, typically a string).
        seq2: Second sequence of the same length.

    Returns:
        The count of indices ``i`` with ``seq1[i] != seq2[i]``.

    Raises:
        ValueError: If the two sequences have different lengths; the Hamming
            distance is only defined for equal-length sequences.
    """
    # Reject mismatched lengths explicitly rather than silently comparing
    # only a prefix or failing with an IndexError part-way through.
    if len(seq1) != len(seq2):
        raise ValueError(
            "HammingDistance requires sequences of equal length "
            f"(got {len(seq1)} and {len(seq2)})"
        )
    return sum(a != b for a, b in zip(seq1, seq2))
    
def ApproxPatternMatching(pattern, text, d):
    """Return the start positions of all approximate occurrences of ``pattern``.

    A position ``i`` is reported when the window ``text[i:i + len(pattern)]``
    differs from ``pattern`` in at most ``d`` places.  Positions are 0-based
    and listed in increasing order; the list is empty when ``text`` is shorter
    than ``pattern``.
    """
    width = len(pattern)
    last_start = len(text) - width
    return [
        start
        for start in range(last_start + 1)
        if HammingDistance(pattern, text[start:start + width]) <= d
    ]

def ApproxPatternCount(pattern, text, d):
    """Return how many windows of ``text`` match ``pattern`` with at most ``d`` mismatches.

    Every window ``text[i:i + len(pattern)]`` is compared against ``pattern``
    with ``HammingDistance``; overlapping occurrences are all counted.
    """
    width = len(pattern)
    windows = (
        text[start:start + width]
        for start in range(len(text) - width + 1)
    )
    return sum(1 for window in windows if HammingDistance(pattern, window) <= d)

In [2]:
seq1 = "CATCAGCAATACGATCATATGCGGATCCGCAGTGGCCGGTAGACACACGT"
seq2 = "CTACCCCGCTGCTCAATGACCGGGACTAAAGAGGCGAAGATTATGGTGTG"

In [3]:
HammingDistance(seq1,seq2)

38

## Approximate Pattern Matching Problem: Find all approximate occurrences of a pattern in a string.

    Input: Strings Pattern and Text along with an integer d.
    Output: All starting positions where Pattern appears as a substring of Text with at most d mismatches.


In [14]:
pattern = "TTCCTAGCC"
text = "AATCCTCGAAGGTGCGGGGTGCGGGGTAAGAGTAGCGAGGTTGGTTTCGGAAGCGATTCGCTCGATCGAGTAGTGCGTCGTAAACCAAAATTCTGGCAGTGGATGGGGCTCCTCGTTTATGTGAAAGATACGCGTACCACATGTCCAAGGTCAACTTTAACAATATTCAAGGTTTTGTGAGTCCAGTCGAGGAGTCAGACACGTTCCCTTGGGTAACCTCTGTACAGATCATGCGTGGGACTTTAATCTATATAATCCAGACGGTTGACGGCGGGGACCGGCGCCGGTGTCCTTAGGAGGTACTGCTAAACAGGTGTAAGCCTAGAAAGCTGGTCCTCATGTATTGCCAATAGCAGCCCAAACATCTAATCTTGGGTCGCTTCTGATAGAGGCCTGCCTCAGCCGAGCGGTGATCGAATCAGCATTTATCAGCCGGGATACTTGATTTCCCCTAACACGAGGTCTGCACAACATTGTCCCTGGTAGGTCGCCACGCTATAGATACCAAGAACACCCGATGCACATTAAAAGTTCATTGGATACGGCTAGGATCCCCTTTATCAAGCCAATATATCAATGCGCCTTGTGGACCTTTGAGGGTTTGGTCCACGGATTGCGCCCTTCGCAAGTAACAAGTTGCGTACCCGTCGGCTCCTACAGACGCATGGGAAGTGACACCACCGGCTGTTTACCAAACAATAGCACATGTGCGCTTGGATAAGTGAGGGCTGGGGTAGACGCGCATCCTTATCATCACTGCTGCATTATGGGAGAGACAGTATGTTTGTTTGGGATAGCCAGATGCACTCTTGCTACGTAGTGCGCTAAAAATATTACAATCACCACACTTCTCTTCGAAGGGATTTAGCCCTTTTTGCACTCGTAAGTCCACCACAGAGTATTTTCTTTTTGAGTTGGGAGAGAGAGTAGTTACGATATTATATATAGACAGAATCTTTGTCACAAAGATCGCAAGACCTGGGCATCCCCGCGGAATGGAGGGGCAAAGGGGGAGTTTGAAAAAGGAGTCGATCTACACGCAAAGGACGACGGCTCTGATTAAGCACAACGGCAGACGAAAGAGCCATGTCGAAATGGTCGGTTCCTCGTTAAGGGATAATAACCATCTGACCATCCCGAAAACCTGAAACTTGTCAACCGGCGGGGGATGCATATGTAAACTTTTTGTATTAATCGGGTCCGTGAGATCGCGTCAAACGCATGTCACTAAGTAGTGAGCAGTCCGACAGAAAGACGCTGTGTTAAACTATGAGTGGTGTTCCTGATCCTAAAGGTCTCATGATTTGACAAACCTTATCATACTCTACGCAGCCTGGCGAAAGGCAATTCAGTAAGTCCCGTCTTCTTGTATCAGACGGTGCCTTTTGACGGCATCATTCCGCACTTGTGACGGGGCCAGATTGACACGCCCCAACTCAAGGATTGGGTGAAGCGTTAATCCAGATGATCTGGACTTCGCCTCGTACAATTCACACTCACGACTGCAGACTTATATGCGGACTCATTAAAGCAGGCACTACCTCTCTACAGAAGACACGTGAGCGCTAGGCATGATCGATTAAACAAGACAAGCTCGCCGTAGACACGAGGGCTCCGTACTATTTACCACCGAACCCCGGTTAATGGCCATACGTTACCTCTAGCCTGCCCCTGTTGTATATTACTCATCACGCTTTTTGTACGTGCCCAATGTTACATGAGGCAGCTGTATCATTTAAAATGTGTTAAAGTGTCACGCCCTAGCCGTCGTCCACGCTGGCAGTAACCTCTTATTGCCGACAACCGATGCGTAAACTGTAGCACGATGTGCGGGAGCCGGTTAGCCCTCCCTCACAGGACGCCATACTTTACGGACAGCAGAACCCTCGGTACAATCGTCCTGGAGTTGAAAGACGACTACACTGAAGTGGTGCTGGGCTAAGTCTGTGGCTACGTCGCTCCCAGTGGGCGAGCATTCTGCGAAGTGTAAAA
TCGTCCGGCGAAATAGGAGAGGGAGCGCCTAAGGATTGACCAGATATAACAAAGACTGGTTCTTAGAGGCCGTGGCGCGATCTACCTTCGCAAAACTAAGCAATGCATGCTGATTGCCATGCAAATTAAAGGTAGGATTAATGCAAGGTCTCCCTGAACAAGCCGGGCTCGAAGGGGAGCAAGGCGGATATGGATTCAAGAACATACGAAGCCCCCAACACGTGACTGTATTCAAGATAGTCCTTATCTTGTAGCGCCTCTCTTTGTATACGATTTTGTGGGAATCCAAGGACCGGACACGTGGAATTTGTCCGCGACCTCGAGGCACCTGCCGCCACCTGTTTCATGATGTGTGCCGAGCGAGGCACACGTGTATCTGAAACCATCAAAAGGAGATGATTAGAACATAGGGTATTTTGCTCACGCACCCTGGGATTGCTTGGTGCACTCGTCGGATCACGGGGCTGACTGCCATATGTATTGTTAAGTACAAAGATGGGAGGATGAAGATACTTTGCCCAGTGCACGGGCGAGCGCACTCTCCGCCGTCCCTTCATCGCCGAGCGGTCTGTTTCCCGCAGGATGATGATGCAAAACGGGTATGACGCTTGCAGGAATCCTGCGACATAAGCTCTGTGCATAACCCATGAGTGACTCTTCCTGGTTCAAGTAGTACGTACAAGAGCGTGGTAGTATCCGAGTGGCGACAACGTATTATCAATTGAGCGTCGCCCAGCAAAAATCTGAAATCCGGGCATAGGCTATAGTCAACCCGTGGTCCATATGTAAGAACCCTCCCCCCCGAAACGACGTTCCCTGGGGCATTCGTCTCCACCGCGGCGGTCGGAATCGAGGGCGCTGAGCGGGGCGGCTCAGAGCTCGTAGGACCTTTATGGGCCATATACCGGTAAAACCTCGTCGTATACTGACTTATTTTGTAAGGGGGTCTTGTTTTGCCCGCCGTAAGCCTGAAGGTTCTGTGTTTTGTGTGCATAAGATACGAGGATACGCGACCGCGTTGGCCCTTTCACATGTTGTGGTAGGTCAAGATTTGACAGCTAGCCCTGAATTAGCGCCACGGATGCAGTAAGCGGGTACAGGTAACGGCGCCCAATACCTGAAACGTAACCTCTCTCGTGGAGATTTACCCTAAACAAGGATAATCATCGACGGCGAGGTCAGACGTCACCTGAAATGACCCTGCTATCGTCAGTGTGCAATTGAGTTGAATACTGAACCCTTTCCATATTGCGTGTATCTGCGCGGTGTCAAAGCTCACCTAACTACATGAACAGGTAATCCAGTTCCGTGTGTAGTTAGCCTCCCCCCGGAGGCCGTGTCTCAAGGTGTGTGATCGTGCTATCGGAGCCATGCTCATAAAATAGGAGTCCCCATGTAAGGATTAAGTGGCTTGAGTGAAATAATGTCGAGTCTACCCTGCACAGCTCTGCAATCCTCAGAAGACGGAGTGCGGTACGACATTACTTCTGTGTGTGACGCACCAATGCTGAGTGGCGTAAATGCAGTCAAGACGCACGGGTTCAGCGGTCACCTCATCATCCCCAGCGGCTTGTTGCTTTGCTTGGCAGAGCCTAGTGGACCATTCGCATGTTTAATAGTCACGAACTCGTACGAATGATCCGTGGAGTGTCATGAAAGGGCGGAGACAATATGAGTGATCGTGTTTGGTGGCGCCACTTCAGTATGTAGGGTCTGATGACAGGCATCGGGGAACCTCCTGGATTCTGTGGTGTTCGCCACACTCAGCTTTGTATACCTCCGGTCCCTGCCTGTGCATAAGTAAAACATAGTCCGACATAGGGAACCTTATAGCGGACGCTACGTATTCAAATCACAGGCCGCGACTTGTGCCAGAACCATTATTAAGAAATCTTTGTTGTCAGTCCACCGTCGCCATAATGAGGCAGACAGTGGACAGGTGTTGTAAGCCTCGGCCTTTAAGGTTCACTAGAAGTATGGAAGTCCCGAGCTGCTTACCA
ACACCGCGATTAAATGCTTGTCCTCCTCATAATATGAGGCAGAGCCCCGTTGATCTTGTGGATGTGACCTATCAAGGTCTTACCAAGATACAGTCCGAGATCTTATCTAACCGTCGATATTCGCCTAATAGGCAATTATCATCGTGATCAAAGATTCACAGGGACTGCAGAGCGGGTCATTAGCGGCTGCGAGAGGCTTACGTGCAAACGAAAGCGTACCCCACTGGCCAACCCAGTACTCCTACTGCCGGGGAGCGGTATTCAGCTTAGAGAGGGGATCAACCGAAGTACTCAAGGCACTCACACCATTCTAGGCGATTGCCGTTCTCTGAACGTAATTTGTCCTGACCCGATGGCCATGTCCCTGTTTGGGCGCATTGGTTTGAATGCGAACAATCTGCCATTCCATATTCCATGTCATCTAGGTCCGACAGGTTGAGACGCGGCGCAAGGAGGTTTGGAATGTAACTCGTGCATCGTATCAGAATTTGAATTCAGGGCAGCTGGTCTCACTCCTAAAGCGATATCTCTCTACGGAAACCACCTGCAATGAAACTCGAGGATCCGATTGTAAAGCCCCAGGCACATAAACCACTTCTGCTACTCAACCGCTCCAGTGGGTCCCCTCCGTCTGTCTGCTCTGGCAAAAATATTCACCTGGCGAGCACTTAGCCACTAGCAACATTCTCCAATATATGGGGTCAGGGCAGCAAAGCCCTAGTACGAGAACTCAAGGAGAGACGCGACAGCCGAGGAAGCGTTGCTATGAGAGTAGCTGGGCATTCACGGACACACATCCGCACTAATAATCCCGACGCCACATCGTCATGAGCTCCTGAGCGATGTTGGCACTCTAAATTGAGAACTGTTTACAACGGTTATGCTGCACCACGGAGACGCTAACCAAACCAAAACGCCGAGGCTGAACAATCCCGGACAAACTAAGCAAGAGGGCTGAGCAAGTTGGACCAGATATGCCATCCCGCATGGACGCGTACAATCTGCATGAGAGCGAGCGGGCAGACCCAAAAGAGGCGACGCCTGATTTTTGCCATGGTCCCACGGATGGGGCATAGGAGTTAGGCCAGCGATGTCGGGTGACAAATTTAAGTGACGTATTCGCCAACGTCTTTGTTTGACGTTAGGTTTTAACCTCAGAGTGCAGCGATGGACGGCGGTAATGAAAAAAATGGTGTAGCCACGTTCTTCGGCGACATGTGGCAGGATACTTAAGCGAGAGAATCGATACACTTCCTGCAATGCAGAGGATGGCTGTATTCTTATTTAGAACCGGATGGACGGACTATTAAACTAGTATAGGAATTCGCATCCGTTAATGGGCCACACCACTCTCCAAATGGAGTTGTTATTCAATGTACAGCGCGTATTAAGACGCCAATATTAGTGATCCCTCTATAGCATTATGCTGCTCATATCATCCCTGAACAGGTCATTATTATCTTTCGCTATAGTTGTAACTCCTTACCGAGTCCCTCAGGTAACAAGCAGGGTGCTCAGGGAGTTTCTCTTGACTGCGGCGGACTTCCTACACCTGGAAACCGTGAGAACAACTGACACTCTCTGGACCAATATTGAGTGTAATTCAGAGGCGTCAAGCGTGTCAGAGGGTTAGGAGCTAGAAGTTCACCTTCACGAGCTTCCAAACGGCCAAGAGCCAATTTATAACTGCATCCTGCCCGCCGCTCCCTGACGAGCATAGACCGAAAAACTAAACACTAAAGTACTGCACTTTTGGCCTGGTGCTTAAGAGACTGCCGCGATTCTTCTGGACTAGAGGTTATACCTGATTTGATTACGGAGTTCTGAGAGGTGCCACGTCAGCTGTGGATGCAGCCTCCCTTGAGATTCCAGCCGATAAGGTCGCTGTTAGTTTAGCAGGCTTTAAATGACCGTAGAATACTGTGAGTGGAACAAATCGCCCAGAATGTGACATGGTACCGAACCTATGGACGAACTCTAGGTGGTTCAACCTACGTCAG
TAGAGGTTGTGCGCCTTCGATCCTCACCTCCTCTTCACGTATCCACCGTTACCGTGTAAGCACATCTAAAAATCTAGCAAAAATATGTGCTCCAGCGAATCGTAAGCGCCGTGACACAAGTACTGACAGGCTGGGTGACCGAGGCTCTTTATTGCGCGGAACCCCCTGTCCTCAACAGCTGCACAGCTTTCGTGCCCTGACCTACTACATTAATAGGAAGTTGCTAACACGAGCGTGGTTAGCTTCGATTCGGGACACTACAGCTCCCTTCAGTGGTGCTGCCCTAGCATGTGAGTCCTGATAATTATTTCGATCTGGTAACGTCTTAAGCTTAATTAGCCGTCCGTTTAGTTCTAATATTGTGCCGTCAATAAGAGATTTCCTCCCCATACATCTTGCATTACAGGAGGTGGCAATCACTGCTGTTAAGTGTACCCTTTAGATGTGCTACCTTTTAACCGTAGGGATATGGAATTTCAAGCGTGATGGGGCAATAAACATGCTAGATAGCCGGTCCTTACAGAAAGCTCTTACTAGTGAGATCGCAGTACGTCGGCGTCTGGCGCCCAATACTCAGACTGACGAGAGCGCCAAGCGTCCTAGTTGGAGGTGAATAATATTTCGTCGAGTAATATTCGTATCGCTACCGCTTCGCGGCAAGTAGTGGCTGTCTGGATCTCCTCCACACGAACCCATCCAAGAGGCAAGGTACGGCGGCTGTATGTCTTAAGGATTAATAGGCTGAATGGTCATCGTGCCTCCGGCCTAGCCTAGGTCAGATAAAGGGCACTGCGAGAGTACGGTTATGGTGTTCCCATATTTACCCGCCCAGAAGCCAGGTCGGGTACGGAAGCGGGCGGTATCTGACCTAATTATATATGCATTAGAAAATTACTGGCCAGATGGGGAGCGATAGCAGATAGCGCCTCATGTGACTACTACTACGTGCGGAGGAATAATTATCTAGGTATAGGGCCATCCCCAAACAAAGGACAGTGCACGAGAACACGCCTGCTGGGGGGTTCCAGAATCGCTCCCATTCCCCGGAAGGAATTGATGGCTAAAGGATAATTGGATACTCGTTCTGCGCGCGGAACAACAAAATCATCATGTAGGACTCTGGGAGGGGCTCGGTAATTAAAACGACCGACTTCCTGTCATAAACTGTACCTCCTCGGCTTGTTCTGGGTGCGAGGCAAAAGTACGAGCGGAGGCCTTCAGTCGTAACTCAGCACTGTCCCAGTATGGACACAGCTGCGCAGAAAATCAGAATCTGTGAGTATTCTGGCGTTTGTCGGGGTCGGCCGATTCCTCTAGAAAAAAGACAAGGGATTAGGGCCCCCCCCACTCTTGGTGTGGCGAGGAGATCGTCTTGATATGCTGGACTACAAGCCTTCATTAACCAGAGAAATGCCTTTATGGGTCGAGTGATAACGCGCGCCGAACTGTGCTTTCGTGTGCGCACTACGTAACCTACAGCGCGAGAAATTTATTCGACTTTCTCCTCGTACCGACGCCATGAGCGCTGCAAAGTTAGACTACTTTATCGACAGCCAGGCTGTCCCAACATAGGGCTGAGAAGAGTCAGACATGTGACATTCTCGAGGCCGCGGATCCCAGTACCTTTGCGGGTGAAGAACAACGCCACTTCACTAAAGTCTTCGTGCACCACTTATCCGTTCAGCTACCCGCTGCACCTTCTTGATATAAGAACATCGCAATACTCTAGTTGTTGCCTTGTATACGTGAACTCACGGCACCTCTTAAAGGTCGAAAAGAATCCAACGAACACTGCAGATCACTTACCTTACTTTAGTGGAGAATTAGGTACAAATAGAAACTCCCAATAGGTACCGCGTCTCCTAGACGGAGTGATGATCAGGTAGCCTTAGCCTGGAACCGCGTAAGGCGTGGTTCAGAACACAGAGTCCGTCACGAGACACTAGGCGTTGCGACCCGGGGACTTTTCTCGGTTATTAAAATGTTCAATCCGGAGGGAG
CACCCCCGACTGCTCTGATGGGCCTGGTTGGTTTTTAAAAGGAGAACCGAAAATATCATCCCTGGAGATGTCATTTTCTCTTTTGCGCTATTCGCCAAAGGGGGGAGCTTCACCATACAAAATAACGACTGGAGTTGGCGCGTAGGTAACTGGCTACAGTACATGATTGCGTTAAGACAACCGTTCTATATGTGAATAGAGTGTGGGGCCGGTATCAAAAACCAAGGAAGCCACATATACGATGACACAGAGATTCTTCGTGGCCCTGGGAGAGAGGCTAAGCTTTGTATAGTGATGTTCAGTAGGCGGCACTGTTCAACGTCGTCAGAAACCTCCACCGCCAGGCCGAGACACAAAAACTGCGAACGTCGATCCCTGCAAACGCAAATTGGCATGCAAACTGTTATTCTCTCGGTACAGTTTGGTGGTGCAGCGTGGGCGGATTCTGTAGACATACTGCCGCCGATTCAATCTTGCCGCCCGACACAACGGTATGTGAATTCTATCAAGTGCACTTTGCGGAATCTCACTACATATTTCAAGTTGCTACCAGCGTCGCACCAGAATCGTCTTGTTACTAACGTCGTTGGTAGCTTTTAGAACCTGAGGGTACTATTGATATATAACCGCCAGGATGTAGATCTCGACGTTCATCCACGAGGTACATACGCTTTATGGGCCGCAGGTAGAGCACTTGGTATATCACGTGATAGGATTCTTTCACTGTTGGCTATGCTCCATTCCCCTCAGCTCTAAGAACTTACTAAAGAGCGCACGTGATGAAAGAAGGGACCCTAGTAAATATATTGGGTTTAACACCTCACTCCGGGGAGGAACCGTCGAGTCGACGGGAGTTTGACCGTATGAGGGGAGCAGCGCACGGTTACCATGACGTCTGACCGGTGATCCGTCGCAGGGTAGGATGCAACACACTGTGTGCATTGTCCTCGCTGCCAACCCGAACGAGCTACCGGGACGCTCCTTTGCGTTACCCCTCGGAGACGTTAGTCATCGTGTATAGGACTACAGCATTGGCCGGATCGAAATTAGTTGCCCGCGAAGTGAATGAGTCTATCAGCCGACCGGCATGTGGCGTTGCCTCGACGGCACTCAAATGCTGCGAGAGCCTGACATTCCGGCGGGGGCCCTAGGGGTAGCACGCGTGCACCAAGCTACACTCAGACAGCCCCCAGGCGGGAGAGCGACAGTGTGGTGGAACAAGTCATTGAGGACATGTCAGGCACAATTAAAGAAATCCTCTTGCCCCCAGTAAGTCCATCTGGACTTAGGGTCTCGGAATTTGTTTGGACGGTACGCAAATTCTCATGCCCTAGTAATCTTGCGCGCCCTGGCATCACCCGACGCCGCTTCGTTTCTACCCAGCGCTGAGGAGGCCGAGGTCCAGGGCGATGTCGTCAGGACCGAAGAGTTGCCGGGAACGTCGCAAGCGCCTATCAACTCTTATTTGCTGAGCGTCGTACCCGGGCTGGCTGTGGGACAAGCTATTTGAGCACCAGGATAGGCCGTCCCGCGGACAGAATGAGAGTAATAGCAGATCGCGGTCCGTTAAGTGCAACACTGGATAGACAGCGCATCCCTGTGAACAGGTTGGTTGGCGACGGACACAGCGTACTCGGCACCTCTGTGCCAGGGAGGAACCTTCGCACCACCGGCGGGCCGCTTATATTGCGAGTTGATAACTGGGCATAGGACTTATCCAAGTAGTTGCTCACTTTCCAACTGAATCGTGGCCGACCCACTTTGGCCGCGCGACCATGTACCCGAATAGACTAGATGACGCGCTTTCTGAGGATGCGAAGCATAATGGGAATTGGCTGGAGAGCCTCGTACCTAGGTTAGGATCATCAGGGGCACAGGGTCTGTTGGTTCGCGACTTTCGGGATGGTTTAAGACACAATTGCATACACGGGAGTCTAACAAAACCAGGTCTAATTATCTCGGTAGCCTTGTACGGAGGGGTACAGCCCCGAAGTCCGCCC
TGTAGATCGAGACACACCAAGGAGCCGGCGACAGTTAGTTTTCGAACAACCGTTAAATTCAGTTATGGCATCCTATTCCAGATGGACCTGTGGCAGTGCACAAGCCTCTTGCGAACGCTTTAAGCAGTGTGAGGACAGCGGAAGGGGCCGTTCATTTTGTCCTATGCGTTCAACCTATACCCGAAGGTTGTCCCCTGAGTATATCCCGGTTTTGAGAAGACGGAGACCAGACATTTGAAACACGTATTACTCCACCATTAGTTCGTACTCTCTATTCAGGAATGTCCGTATTGTCGGTAATAGGAGTTGGCTCCGCAGCAGTTTTTCTACGGATATGCCGGTCGTACTTGCATCTTAAGGGTAGGCTGGGCTCATTTTTTGCTGATTTAGTGTTAGGGATTGATAATTCCTAATGACGCTATACGAAAGTACAATGGGATGCGTCCTAGCCGGTTTTAAGCGTCAACGGAAAGGGACCCCTAGCTCCTTTACGGTGCTCCCAGCACGCATGGCCCTGTGGTATAATGCCCGCTGGAACTGACGACCCGAACTGGTCGCGGGAAACGCGCCGTGGTGTTTTACTAAAGCGCGCTATGTCTCGGTATAACCTAGAGACTGCTTCAACGACCTCGTCATTTCAAATCCTGGCAATTACTGATTTATAGGAGCGCGAATAAGGGGCGTCCCTGTTCCCAGCCGTAGAATCAGTATCGGGCTTTGCCCGCTCGAGTCCGCGGACAAATCATTTTGGACAGCTCCGTCTCGGTTTCAGCGCCCATACAAACTCCCATTGACACTTCGAGAGAGCGGTGGTTCAAGCGAGGAAAACAACACTACTGCTGGAAAAGTGGTCAATACAGGCACATAGCCTTGCCGCGCGATTGTCCATGCCCAAAAGGACGTATTGCCCGCGGGCACCGACCCTCCCCTTCTCATCACTGACTGTGTCCCTGTCTGCTTTCACCGCATTCCACGACATCCGCTTCCGTTTCTTGACCCTTCAATGTCGGGCGATTTCCTCCATCCCATTCAGACCCAAAACTCTCGGCTTCCCGCCCCGTACTAACTAAGGCCCTTCTGAACCTGTGGAGACAACTCCGTCTTAGGGTGGTTAATACAAGCTTCAACTGACTCGTGCTTATAGATTTTAAAGTATGCTATTGCATCGGCTACGACGTGACCTGGAAGCTTCGAGGAACTAAGTAAAATGCACGTAAGTACACGCGTTCGTGGCTCGGAGCTCGAGTTTACAGCGCTAACCGTCTTTCGGCTGCCGTGAAGTTCATATAGCAATGCTCGCTTCTGAGCTATCGGGAACTGTTGCTTGGCTATAAAATATAATGACACTACCCGCACAATATAGAACGTTAGCTAGAGGGACCTAACTTAGACGGCGCTACTGTATCCCGACCAAACTCCTCGTAAGCCCACTATCCGCTGACCCTCCGTCTCGATCGGAATTGAAGAGATCGGGGTCTGCGACATGCGGTAAGAAATAGTGATCTTGATTACCAAGCATTACGAACGTTCTGCTTGGTCTGTCCTACATCTTTGAATCGCTGGGGTGATATCCCGCCTGTCCGGACGATGTAAAGGCCAACGGCCTCTATACGTGGTAAGGGCTCCGTATGGGTTCGACGGGCTATGCGGTGTGTGAATGTGATAGGACCTTCGGAACTAGCATGGTGATATAGTTCAGGGACTCTCGGGTGGATCTGACGGAGAGTAACGCACCGATTTAACTAGTGGCGGCCTGTGGGCTATAGCAACGCGCAGGTCCTGTACTCAGGCGTCATTCTACGTAATACGTTTTGGGGGGCCGACACGTACCCTCAGAGTAACAGTGATCCCTCCCATTACGGGATTCTATTTCTGAATCACAAGATAACTTTTCCTTCGCACCTCGAATCTTCGCACTAAGCACAAGTTTTTGTTTAGCGCGAGACCGAAAGGGACGACTGCACCCGGAATTCCCTGTGGCTACCAATGACCGATTCCTT
CTCGCGCAAAGAGGATCTTGACCCAGCCCTCATCGTGGTCGGTAGATAGTGGCATGGGTTACACCGTTGGCTAATATATCATGCTGGCTAGCTAGCATGTTAAGTCACCTATCCCATTTATATTATGTTCCGACGCGGGATCCAGCAGGCATTCCGGTCAACGTACAGGATTATTGTTCTGAAAAAATAAGGGCTTGAGTTGTTGGCATCAGGCCAGAGCACCTAGTCATCGAGAAATCTTCCCGATTACCCGGATCAGTCTCACTGATTTGAGAGCCATAAGGTCAACCTGTAATGTGATCTGATGAAAGAGGATGCTAACTAGCCTCGTACCTATAGGCGCAAATGTAGTTTCGAGCACGTCCACTAGATTCCAGTCTAGTTAGATACCATTTTTGCTTGGTCTCATTGTACCGGCTTCTTACCGCGTCAGTCATCCGAGGACCCTAAAAGTCGAATTCGCAATCTCAGGTACCAGCACTCCACGTTAGATGGCCATATCGGGGAGTTGTCCCCACCCTTGGCTTTATTGGCACAGGCGATCCCCTGAACAGGTCAGCGTGGTATGTCGGTACTAGGCTATCATCGGGAGATTTCTACGTGCGCCTAATGGAACCAGTACACGACTCAAGTTAATACGACCCCATCGGACTTTTCTAGGCCCATTTAGTTGCCCCAGTTAATACGGAGCCAGCGACCCCCATGCAAGCGGTATTTCACAGTGTGCCCCGTTTTTCAATGAACGGGATTACTGAAAGTATGGTAATGGTTGGAGACGAGGAGTTGGCCACCGGGAACGTATTGTACGCTGTCTAACTTTGTTTTTAAGGTCGTTGAGCGGCTGACAGTACCTTGAAATAGTGGTAGACCAGTAAACGAATCGTATACGGCAACGGCCACCGATTGCTCCGCCGCTCTAGCCTCCTGACGGCTTCGCATGGGGGGTGTAGCCCGGTTAGTGTTGCGAAGATTTATTCTCATGGCGTTTCCCGGGTAAGGAGAGATCCAAAGGACTAGTGACTATTCTGAGTAGATTCTGAGGGGACCCATTGATTACACACCAAGCACAATTTAAGTGTATTCGACCCAAAGTGGGTGAGAAACACGTCGTTCCTTCGATTCAACTGTGAGGTCGTGGCTGCCGAGGTGATCACCCGGCGAATGGGGTGCCTCGGTTAAGCTTGAGATACTCCCCATGTTGTAGTGAGGGCCCACTTTAATCACGTCATGAAAACCCTTTAAATTATCGACGCCTCCCACCTAGCCTGCACGCGTCGCTACGGCCACAAGCTGAGAGCCGCTTCAGATAATCTCAAAATCTACACATGGATCTTAAGGAGAACGTTACACCGAACTGTCGCACAGTTTCGTCTTTATATATTGGATACGAAATCGAGTCGGTCCCTGAATGTATCCTCACGCCAAAACCTTACAAAGCGTTCCGCGTCTGTTGGGAGAGCGGAATCGAAAACGTACTAGCTAGGACACCTCCGAACAAATAGGTCGAGACAGCTTAGTAAACTCCATACGGCTTACTGATACATGTTCTATGGGAACTGCGGATCCGGTGTAAGCCCCTGCGAGACAGCAAACCGCACGTGGCGTACAAGAGGAGAGCTTTTCAGGATCTTACTGTATAGCAATGTTACCAGGATAAATGCTTCCGCTTGTCTGTCTGTTTTCGTCACCCGCACGAGCAGCTGGTTCGAGAGGCGTATCGTACTGATGGGCTATGGGGTCGGCTGCATAATACCGATACTTGGCTTGCGTCTTCTCGCCTCCAGACAATGAGACTCTAACATAACTCGCTAGATAGACAAATATGGTTGAGTGGGGTTACCTATCAAGGATTTCTCTTGGTCCGCGCATTGGTTATGTGCCGAGATTTCCGAGCCAATCAATTGAGTGACAAATAGCCGTAAACGCATATCCAACGTGGTTATTGTTCATCACCCCAGGCCCTAACAGATGCAGATAAGACTTCAACCGTAACAAAGCCA
GTAGATCATAATTATAGCTGTGAGAAACGCCCGTGCGCACGTCCTTGCGGAAGGTTCCCTGTGGATGTGTTAGTCTCCCTGCGTCACTGGATAAGAAAGGATGAACAGTAATGTTGCCCTTAGAGTAAGGGAGCCACTCGCAGACTGTTGCGAGCGCGATCCGTAAGCTTCGGTTACGCATTGCGGGGCCTTCAAACGACCCTCCTCAGTGAGCACACCGCAAAGTAGCGATTGCATACTCCTAGACACATTAAACATCGCAATCAATTGGTCGTCAATCCATTCTTTATCTAAAGACATAACATACGCCAGCTGTCATGCGTTGGCTGGAGAGTATTTGGTGCAAACCTAAAGGACTCGATTACGAGGCGTTCGCGTAAGACGATAAATATGGCGTCTAAAAGGCCAATTACGTTCGACGCAAAAGTATGTGATGAAACACTCTTCACGCATACTGTGAAAAAAAAACTTTTGGTCAAGCCAGATTTTAGAACGGAATTTCTGCATTGTTGGTTCACGGCGCGGTGTTATAACCAGTAGTCCCTCTCAATCTAGCAGGGGGAATTACGGAAGCACAGAGACTATCTCAATCCGCGTGGTTTGGGAGATACGATTTCCCAACGGTGACTTGGTTTGACTTGCGTGCAGTCCCCAGTTGGATACCCCATTTACTGTCTGTGAAATGAGGACCAAAGTTGAGCCCGTCTCAGCACAATGAGGTAGGCCAAGAACCGGTCCAATATGACTAGCACGTCCCACGGTCATCTTGGGCCACGGCTTTAAAAGAAATCAGCAACACACTATAAACGTTGGTTCAAGAATTCCCCTTCTAAGGAATGACTGAGTCACGTGCCTTCTCATTATCGACATACCTAGAGAGGCTGCAGCGGTAGGGACCGCCACCGGACTTGGAGCTATACAAGCGTGGAACGGCAGTCAATCCTTCAGTAACCTTGTAAGGTGACTGTAGGCTGACACGCTTCGTACGTACAACGATACCGTTTCTTGAATCTGATTCCATTATCGAGGTATTCAATTCCTGCTGGAATTTGGTTTCGACGACTGGCTCTCATCCGAATCGTGCCTATGACCTCCCCGTACTTTGACGCCCACCCGGTCTTAAGGATCAGGAGAATGATGGCTAGGAGCTAGATTCAATATAGACCGATGGTCCCAGCACAAGAGAAGCATTATCAATTATAGTATCAACGAACTCTCCACGTCATCACCAAAGGTTGGTCGAGCATCGCTTGAAGCGAGCACGGAACCGTGCAGAGAATCCTCCCGTGAATCTCTTATCTGTCCATAAAAGCGTGAAACCGTGGAAGCAAACGGAGCGTCCCATTTGCGTGGTGCTGATACGTAATCTACCAACCACTGCGTCCGATTTCGGCACTCCTCTCGGTTGTGATCAGATATGGGAAGAGCGCCAGGTGCAAACCAGTAGGGGCAATAACACACAGAACGGAGTCCCAGTATGCCTGATTGCTGCAATACCGCAGGATTACTGTTGGAGCCCGCGGCGGCCGTAATTGGGTTTACCCTGGTCACAACGCAATAGGGTACCTACAATGTTGCGCCTACTCATTGGGAGAGTGCATACCCACTACCGAGCACCGGAGGTTAGAATCGCACTCCCGGATTCATAATCTTTAGATGGACTTTATCTCATAGGTACTCTGTCTTAAGGCCCATGCCGTTAGGCCGAGCTGCGTTGGAGTGGAGAGTCTTGTAGATTCTAGAATACGCAACAAGCCCCCTCCTCGAACGTTCGTCGGTGCAGGGCTCATCGGCCGATATGCCCGGGGACACAGCTTTGGATCTCAGGGGCAATTCCATAATCCAGACGGCTGGAGCCTCCGGCTACTCGTACCACTTCTGACGCGATTGACGCTATGACAAAGACGGAAAATGCGGCCAGTGTTCACCATCTGCCGATTACAGGATCCAGAGCGTAACCGTGTTTGAGACGTCTATAGCCCGAGTCGTCAACATGAACA
ATTAACACACGGACGCGGCAGTACCTCGTCGATACAACCAAGAAGTTTAGTGAACGCCTGTGAACCTTCGTATTAAGTATTACGTAGGACACAGTAATCAGGGCAACCTTGCTGTCCCTTACGGAAGTCCGTCCTCACCTATACGGTTTTAAACGTGCTTGGTAGCCATTTACCCTCTACGTTCGGCGGGAAGGCGAAGCCGTTGCACTTAAGTCTATGAAGACAGCTCGTCTGTTCTGTCATTCGCCGTTCGTGAGCAGGTGTAAGATTAGCGGCTACCCAAGATGCCCAAAGATCAGCTATAGTTCCTGCAAGAGCGAGGCGTTACAGATCGGGTGCAAGGTTGGGATGTCACCCCGTTCTCTTTTCAGACGTATTTAAAAGCAAGCATTTCACGATGCCAAGAGCGTCTCGCGAAACAGGTCACTTCGTGAGTGGTTTTGTCAAGGAAACTAGCGGACCCCTGCAGTTTACGAGACGAGGCATCTTAGGACCTAAACCACGAGAAAGCGAGCGACCAGATACACAGCTCTAGCCTTTCATAAACTATTCTACGGGTCCCTTAGGGTTGCCTTAGGGATTTCGGTCCCCCAGACAGACGGCACTGGCAAACAGTTCTTTGCTGTGCGAATGTAAGCCTCACCCCCAAAGCGGTGAGCCCAGTGCAGACCAGAGATTAGATAGTCCCATACACCTCGGAGGAACTCGCTTCTCTAAAGAGGCTCTGGGTGATCCACACTGGCAGACCTACCAATCCGTATGACGAGTTAGTGTCGTGGATTATTGCCATGATGGGGTGGCCTTTTTAATGTGCATTCACTGCGTCTATGACTACGGATCAGTTCGTCACCAAACCAGGACAGTGTCCATTTTAGGAGGCGGTACATTAAGTGCCAAAGTACCCGATTTTTAGTCTATTCTGGCTGGATTCGAAATAGCGTAGCTTATTTCGGCCCATGTGTGCCACGCTCGTAAATATAAACGCCTAGCGCGCGGGTTATCGGGCCAGTTTCTATGGTGTAGGCTCGGACCTCAGTGATAAAAATTCCGTCAAATACCCGTGTCACTTAGACGTCTTGGCTAAAAAAATGGCGACTCGAAGCGGCAAGTAAATATGTGATTCTAGGACCCTTGCACCACTAGACCTGGGAAGTTGTCCCTGGTTTGCGGCTCATTGTCAAGTACCGTCCTTGGCCGAACGTTTGACTTTGTCTAGGCCCCTCCGGCGGCAGGCTAAACATAGTTTGGAGCCTGCATTCAAATCTGTGAGCGGCACCCGGAGGAGTGCGCGCCGATGCCGATACACGGTCGTAGCTGGCCGGCGGGCGCCTCCAGGTACTTGTGCATCGGCGACGTGATCAATAGCCCTCTGCTCGGAAGACCGTGACGTTGTGTTAATAAGTTCACGATTGTAAGACTCGTACTTGAGTGTATACGAGGGCGGACCCTGTCTAAACGGTCGGCACAGGGCGATGTCCTTCCTTGTGAAACGAGGTAAGTTGGCTAAAGCTGAATTTGCAAGTTCTGTAGTGGCCTCGATACGATTCTAAGCCCTACGTCATAGCATTTCTTGTCTTAACAGAACCCTCCCAAACTACACCTCCCGGCGAATAAGGAGTTAGCTTCGCGTTGGCCTCGTCAGAACGGAGTAGATCGAAGCAAGCTCGACGCGCGGGGAGGTGAGCACGTCTCTTTCACTAACTGAGTTTGTTCCAGACTGCTGAGCTTTAATCCGACTTCCCACCGTCACACGACTGGGTTTACCAACTATACCCACTATTAAGGCTAACAGAAGCTATATGACGCGTAGTCTCTGCCTCCCCACGCGGGGTGGTCGGCATAGGTCGTCCGGAACGCTAATCGAGTCAATACGAAACTTATAGGACGCTGTGGTAACAAGACATCCCTCTGCTCAGCTGTCCTGAAACATACTACTGTAGAGCCTATCGCACCATGCTTCGGTGGCCGTCGACCAACGAACACTTTCCTCAATGACTGGG
ACGTTCGGGGGACTACCGGTTGTGGCCGGAGGCGAACAGACCTCCTGCGGGCGGTGCAAAGGTCTATCGAATTGGAATAACGATGAGTATAGGACAAAATTCTCGTGTATCACAGCATAACCCTACACACTGGGACATCCTGATTCTAGCATTATTAAGAAAATGGCCGGGATTGCCGGAAAGAACATCCTCAGGTGTACAACCACCGGTAAGCCTTGCTACTAGATCATGCATCCGTCAGAGGGCTGTAGCCAGAAGCTGGACGGAAATTAAGTCTGTCGGTTCGGGACTATATCTGCGTTAAGGTTACCATGCTTGATTGGGATCCTGTCAGTCCCTATTCTTCATTCTGAATCGTGATAACGAGAAAGCGATTGCAAATTCTTTAACCAGCGCAGGGTAACTGTAACTAATTCTGGTATAACACCCGTCGACTTCGGCGTTTAGTGACGATGAGATACTCAATGGAGGGATGGCTAACCGTTATTGATTCATCGGGGGCATAACAATCTTCTAAAATCTGGAAACCACTTATTCAGGGTCGCGCCATCGGACCCCGAGCAGAAGAGATCATGTCACTTAAGAGTAGCTCTTAGGCACACTACATCCGAACGTGGCTTTATCTACGCACCGTCTCAAAGTTGGCAGACCCGCTATGTGCGCGAAATGGCGCTGCGGCACTGTCGTATGCATTTCTTGCGTCGTACTCACGTACATCGAATGCCCAGCATCAGCCGTCTTCTGATGCGGGTACTGCTTGCCATTTACAGACGATCCGTGCCATGATTGGATGATACACCTCGTGGAACCCTAAGCTCTCCGCGAAAGAAATTACTCAATCCCTGGTTCTAAGTTTCTGCGACAAAAATACTCTAACTGTTTTTCTCATAACATCCTCACCACAGCTCCGGCTAACAGTTCGATCTCAATCAGCTATGCGTTGCTTAAGACGAGTCGGGCACGCTCGATACCTAGTGCTCTAGCTGGAGCAAGTGTTGCGGTCTAGTGAAAATCTGGTCGTTAGATGGGATAACCATAAACGGAAGTATGAGCTCTACTCTTTAAAGAACGATGTATCGGTCAGCGCGTGAGGCCGATTCGCTGTTATACTCGTAACATACACGTCACAATCATCTGCGTGTTATGTGAGCAAGCGCCGGATGATTGTTGGACTAGTACGCCAGATGTGTGATCCCCGCCGTATAAATCTTTTTTCAACACATTCAGTGGGAGAACTGCGCTCGGTTACCCTACTAGGAGCACATTCCCCAAAATCCCTGCAGCCTTTTCATTTAGTCGTGCAGCCAAGGCGACTAATCGGTTAAAACCCATCCGGGAATGACATGAAACTTGCCACTTCGCGTGGTCAAGCCTTTCCAGAACGCCCTTGCATCTTCAGATGTGTAGCGATTAAATCATACTAGTCGACGGCACGCCGCTTTCAGATGTTACGGAGCGAGCAGGGACTTCGATCTTGACTCCATATGCGAGGCTTGCGGCCGTTCGTTTGCGCCAGTTCGGATTAATAACGCTCCCTCGTGTCCAAGAGCCTCTTGCATAAAAAATTCCTTTGTCTAGATTCCTAGCC"
d = 4

In [15]:
ls = ApproxPatternMatching(pattern, text, d)

In [16]:
# Strip the commas from the printed list: str(ls) gives "[a, b, c]", and
# splitting on ',' then re-joining yields "[a b c]" (brackets and spaces stay).
b = ''.join(str(ls).split(','))

## Implement ApproximatePatternCount.

    Input: Strings Pattern and Text as well as an integer d.
    Output: Count_d(Text, Pattern).

In [8]:
pattern = "TGT"
text = "CGTGACAGTGTATGGGCATCTTT"
d = 1

In [9]:
ApproxPatternCount(pattern, text, d)

8

## Solve the Frequent Words with Mismatches Problem.

    Input: A string Text as well as integers k and d. (You may assume k ≤ 12 and d ≤ 3.)
    Output: All most frequent k-mers with up to d mismatches in Text.


In [10]:
def ImmediateNeighbors(pattern):
    """Return the strings obtained by rewriting a single position of ``pattern``.

    Each position is replaced in turn by each of A, C, G and T.  The result
    therefore includes ``pattern`` itself when it contains at least one of
    those letters.  An empty pattern has no positions and yields an empty set.
    """
    alphabet = 'ACGT'
    return {
        pattern[:i] + base + pattern[i + 1:]
        for i in range(len(pattern))
        for base in alphabet
    }


def Neighbors(pattern, d):
    """Return the d-neighborhood of ``pattern``.

    The result is the set of strings reachable from ``pattern`` by at most
    ``d`` single-position substitutions drawn from A, C, G, T.  ``pattern``
    itself is always a member (it is at distance 0).

    Args:
        pattern: The string to mutate.
        d: Maximum number of substitutions; must be non-negative.

    Returns:
        A set of strings, each the same length as ``pattern``.

    Raises:
        ValueError: If ``d`` is negative.
    """
    if d < 0:
        raise ValueError("d must be non-negative")
    neighborhood = {pattern}
    # Only strings discovered in the previous round need expanding: anything
    # found earlier has already contributed all of its immediate neighbors.
    frontier = {pattern}
    for _ in range(d):
        expanded = set()
        for p in frontier:
            expanded |= ImmediateNeighbors(p)
        frontier = expanded - neighborhood
        if not frontier:
            # Neighborhood is saturated (e.g. d exceeds len(pattern)).
            break
        neighborhood |= frontier
    return neighborhood

def FrequentWordsWithMismatches(text, k, d):
    """Return the most frequent k-mers of ``text`` allowing up to ``d`` mismatches.

    Every length-``k`` window of ``text`` votes once for each string in its
    d-neighborhood (as produced by ``Neighbors``); the strings with the
    highest vote total are returned.

    Args:
        text: The string to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches.

    Returns:
        A list of all k-mers sharing the maximal count, in no particular
        order.  The list is empty when no k-mer was counted (for example
        when ``text`` is shorter than ``k``).
    """
    counts = dict()
    for i in range(len(text) - k + 1):
        for n in Neighbors(text[i:i + k], d):
            counts[n] = counts.get(n, 0) + 1
    # max() raises ValueError on an empty collection, so handle the
    # "no windows" case explicitly.
    if not counts:
        return []
    m = max(counts.values())
    return [t for t, v in counts.items() if v == m]

In [11]:
k = 4
d = 3
text = "ACGT"
ls = Neighbors(text,d)

In [13]:
len(ls)

175

## Frequent Words with Mismatches and Reverse Complements Problem: Find the most frequent k-mers (with mismatches and reverse complements) in a string.

    Input: A DNA string Text as well as integers k and d.
    Output: All k-mers Pattern maximizing the sum Count_d(Text, Pattern) + Count_d(Text, Pattern_rc) over all possible k-mers.


In [23]:
def ReverseComplement(seq):
    """Return the reverse complement of the DNA string ``seq``.

    Case is preserved (A<->T, C<->G, a<->t, c<->g).  If ``seq`` contains
    anything other than those eight letters, an error message is printed
    and ``None`` is returned instead of a string.
    """
    complement = {
        'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
        'a': 't', 't': 'a', 'c': 'g', 'g': 'c',
    }
    allowed = 'ATCGatcg'
    if any(base not in allowed for base in seq):
        print("Error: NOT a DNA sequence")
        return None
    paired = []
    for base in reversed(seq):
        paired.append(complement[base])
    return "".join(paired)

In [24]:
def FrequentWordsWMARC(text, k, d):
    """Frequent Words with Mismatches and Reverse Complements.

    Every length-``k`` window of ``text`` votes once for each string ``n`` in
    its d-neighborhood and once for the reverse complement of ``n``.  The
    strings with the highest vote total are returned.

    Args:
        text: The DNA string to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches.

    Returns:
        A list of all k-mers sharing the maximal count, in no particular
        order.  The list is empty when no k-mer was counted (for example
        when ``text`` is shorter than ``k``).
    """
    counts = dict()
    for i in range(len(text) - k + 1):
        for n in Neighbors(text[i:i + k], d):
            # A neighbor equal to its own reverse complement is deliberately
            # incremented twice: once as itself, once as its complement.
            # NOTE: ReverseComplement returns None for non-DNA input, so a
            # None key can appear if text contains other characters.
            nrc = ReverseComplement(n)
            counts[n] = counts.get(n, 0) + 1
            counts[nrc] = counts.get(nrc, 0) + 1
    # max() raises ValueError on an empty collection, so handle the
    # "no windows" case explicitly.
    if not counts:
        return []
    m = max(counts.values())
    return [t for t, v in counts.items() if v == m]

In [25]:
text = "GTGGGGGCGTGGGGGCCTTGGGGGACGACCTTGGCTTGTGGGGGGGCGTGGGGGCGGCGGCGACGGCGGCTTCTTCTTGGGGCCTTGTGGGCGGGACGGGGGTGGGCGACGGCCTTGTGGGCTTCTTCTTGACGGCGTGGGCGGCGTGGGCGGCTTGGCGGCGGGACGTGGGCCTTCTTGTGGGCCTTCTTGGCGTGGGCGGCGAC"
k = 6
d = 2

In [26]:
FrequentWordsWMARC(text,k,d)

['GGGGGG', 'CCCCCC']

## 

## Implement Neighbors to find the d-neighborhood of a string.

    Input: A string Pattern and an integer d.
    Output: The collection of strings Neighbors(Pattern, d). (You may return the strings in any order, but each line should contain only one string.)


In [27]:
pattern = "TGCAT"
d = 2
ls = Neighbors(pattern, d)

In [28]:
# Strip the commas from the printed set: str(ls) gives "{'x', 'y'}", and
# splitting on ',' then re-joining yields "{'x' 'y'}" (braces and quotes stay).
c = ''.join(str(ls).split(','))

In [29]:
c

"{'AGCAG' 'TGGAT' 'AGCAC' 'AGCAA' 'TGCAA' 'GGTAT' 'TCCAG' 'TGCCC' 'TTCCT' 'GCCAT' 'TTCAC' 'TTCGT' 'TTTAT' 'CGTAT' 'TGCTC' 'TTCAA' 'TACAG' 'TAGAT' 'TGGCT' 'TCAAT' 'TGAGT' 'TGTTT' 'TGCCG' 'AGCCT' 'TGCGT' 'GGCAA' 'ACCAT' 'TTCAT' 'CTCAT' 'GGGAT' 'GGCTT' 'TGCCA' 'TGCTA' 'CGCAC' 'TACAT' 'GGCAG' 'CGGAT' 'TGGGT' 'CGCAT' 'TTCAG' 'TGGAC' 'TGTAT' 'ATCAT' 'TGAAG' 'TGGAA' 'TGCGC' 'CGCAA' 'GGCAT' 'TGCGA' 'TTAAT' 'TGTCT' 'GGCAC' 'TACAC' 'TTGAT' 'TCGAT' 'AGCTT' 'TGTGT' 'TCCAT' 'GGCGT' 'TGCAT' 'TACAA' 'TCCAC' 'TACCT' 'TACGT' 'TGTAC' 'CACAT' 'AGGAT' 'TGCCT' 'CGCAG' 'AGCGT' 'TCCCT' 'CGAAT' 'TCCGT' 'TGTAG' 'TGGAG' 'TGACT' 'AGCAT' 'CGCTT' 'TAAAT' 'TATAT' 'GTCAT' 'AACAT' 'TGCTT' 'GGAAT' 'CGCGT' 'TCTAT' 'GGCCT' 'AGAAT' 'TTCTT' 'TGGTT' 'TGTAA' 'TGCGG' 'GACAT' 'CGCCT' 'TGCAC' 'TGCAG' 'TCCAA' 'CCCAT' 'TGAAT' 'AGTAT' 'TGAAC' 'TACTT' 'TGCTG' 'TGATT' 'TCCTT' 'TGAAA'}"

In [31]:
c[1]

"'"