In [1]:
import re
from collections import Counter

In [2]:
def find_reading_frames_k_mers(sequence):
    sequence = sequence.upper().replace("\n", "")
    translation_table = str.maketrans("ATCG", "TAGC")
    return {1: {3: Counter(re.findall(".{3}", sequence)).most_common(4), 
                6: Counter(re.findall(".{6}", sequence)).most_common(4), 
                9: Counter(re.findall(".{9}", sequence)).most_common(4)},
            2: {3: Counter(re.findall(".{3}", sequence[1:-2])).most_common(4), 
                6: Counter(re.findall(".{6}", sequence[1:-2])).most_common(4), 
                9: Counter(re.findall(".{9}", sequence[1:-2])).most_common(4)},
            3: {3: Counter(re.findall(".{3}", sequence[2:-1])).most_common(4), 
                6: Counter(re.findall(".{6}", sequence[2:-1])).most_common(4), 
                9: Counter(re.findall(".{9}", sequence[2:-1])).most_common(4)},
            4: {3: Counter(re.findall(".{3}", sequence.translate(translation_table))[::-1]).most_common(4), 
                6: Counter(re.findall(".{6}", sequence.translate(translation_table))[::-1]).most_common(4), 
                9: Counter(re.findall(".{9}", sequence.translate(translation_table))[::-1]).most_common(4)},
            5: {3: Counter(re.findall(".{3}", sequence[2:-1].translate(translation_table))[::-1]).most_common(4), 
                6: Counter(re.findall(".{6}", sequence[2:-1].translate(translation_table))[::-1]).most_common(4), 
                9: Counter(re.findall(".{9}", sequence[2:-1].translate(translation_table))[::-1]).most_common(4)},
            6: {3: Counter(re.findall(".{3}", sequence[1:-2].translate(translation_table))[::-1]).most_common(4), 
                6: Counter(re.findall(".{6}", sequence[1:-2].translate(translation_table))[::-1]).most_common(4), 
                9: Counter(re.findall(".{9}", sequence[1:-2].translate(translation_table))[::-1]).most_common(4)}}

In [3]:
def find_k_mers(sequence, k):
    k_mers = []
    n = len(sequence)
    for i in range(0, n-k+1):
        k_mers.append(sequence[i:i+k])
    return k_mers

In [4]:
def find_non_reading_frames_k_mers(sequence):
    sequence = sequence.upper().replace("\n", "")
    results = {1: {}}
    for j in [3, 6, 9]:
        results[1][j] = Counter(find_k_mers(sequence, j)).most_common(4)
    return results

In [5]:
def print_table(sequence, use_reading_frames=True):
    if use_reading_frames:
        for reading_frame, k_mers_dict in find_reading_frames_k_mers(sequence).items():
            print("{0}|3|{1}         {2}         {3}         {4}\n |6|{5}      {6}      {7}      {8}\n |9|{9}   {10}   {11}   {12}"
                  .format(reading_frame, 
                          k_mers_dict[3][0][0], k_mers_dict[3][1][0], k_mers_dict[3][2][0], k_mers_dict[3][3][0],
                          k_mers_dict[6][0][0], k_mers_dict[6][1][0], k_mers_dict[6][2][0], k_mers_dict[6][3][0],
                          k_mers_dict[9][0][0], k_mers_dict[9][1][0], k_mers_dict[9][2][0], k_mers_dict[9][3][0]))
            print(" " + "="*48)
    else:
        for reading_frame, k_mers_dict in find_non_reading_frames_k_mers(sequence).items():
            print("{0}|3|{1}         {2}         {3}         {4}\n |6|{5}      {6}      {7}      {8}\n |9|{9}   {10}   {11}   {12}"
                  .format(reading_frame, 
                          k_mers_dict[3][0][0], k_mers_dict[3][1][0], k_mers_dict[3][2][0], k_mers_dict[3][3][0],
                          k_mers_dict[6][0][0], k_mers_dict[6][1][0], k_mers_dict[6][2][0], k_mers_dict[6][3][0],
                          k_mers_dict[9][0][0], k_mers_dict[9][1][0], k_mers_dict[9][2][0], k_mers_dict[9][3][0]))
            print(" " + "="*48)

In [6]:
def main():
    sequence_vibrio_cholerae = """atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaac
    ctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgaccacggaaagatgatcaagagaggatgatttct
    tggccatatcgcaatgaatacttgtgacttgtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggatt
    acgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttaggatagacggtttttcatcactgactagccaaa
    gccttactctgcctgacatcgaccgtaaattgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaag
    atcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtttccttaaccctctattttttacggaagaat
    gatcaagctgctgctcttgatcatcgtttc"""
    
    sequence_thermotoga_petrophila = """aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttatt
    aactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaa
    ttgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggt
    aggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatg
    ggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgg
    gtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattat
    ttactactactaataatagcagtataattgatctgaaaagaggtggtaaaaaa"""
    
    vibrio_cholerae_header_1 = "K-Mers for Vibrio Cholerae, with reading frames:"
    print(vibrio_cholerae_header_1)
    print("-"*len(vibrio_cholerae_header_1))
    print_table(sequence_vibrio_cholerae, use_reading_frames=True)
    
    print()
    
    vibrio_cholerae_header_2 = "K-Mers for Vibrio Cholerae, without reading frames:"
    print(vibrio_cholerae_header_2)
    print("-"*len(vibrio_cholerae_header_2))
    print_table(sequence_vibrio_cholerae, use_reading_frames=False)
    
    print()
    
    thermotoga_petrophila_header_1 = "K-Mers for Thermotoga Petrophila, with reading frames:"
    print(thermotoga_petrophila_header_1)
    print("-"*len(thermotoga_petrophila_header_1))
    print_table(sequence_thermotoga_petrophila, use_reading_frames=True)
    
    print()
    
    thermotoga_petrophila_header_2 = "K-Mers for Thermotoga Petrophila, without reading frames:"
    print(thermotoga_petrophila_header_2)
    print("-"*len(thermotoga_petrophila_header_2))
    print_table(sequence_thermotoga_petrophila, use_reading_frames=False)

In [7]:
main()

K-Mers for Vibrio Cholerae, with reading frames:
------------------------------------------------
1|3|ATC         GAT         TCT         TTG
 |6|GATCAA      ATCAAT      CGTAAG      CTTCTA
 |9|ATCAATGAT   CAACGTAAG   CTTCTAAGC   ATGATCAAG
2|3|TGA         CTC         CTT         TCA
 |6|TGACAT      CTCTTG      TCAATG      ATCAAC
 |9|TCAATGATC   AACGTAAGC   TTCTAAGCA   TGATCAAGG
3|3|TGA         GAT         ATC         TTT
 |6|GACATC      TACTCT      TGGCCA      AATGAA
 |9|CAATGATCA   ACGTAAGCT   TCTAAGCAT   GATCAAGGT
4|3|TAG         CTA         AGA         AAC
 |6|CTAGTT      GCAAAG      CTAGTA      CGAGAA
 |9|GAACTAGTA   CGACGACGA      CTAGTT   CCTTCTTA 
5|3|ACT         AAA         CTA         TAG
 |6|TTACTT      ATGAGA      ACCGGT      CTGTAG
 |9|ACTAGTAGC   ACGACGAGA    CTAGTTCG   TTCTTA   
6|3|ACT         GAG         GAA         TAG
 |6|GAGAAC      ACTGTA      TAGTAG      GACGAC
 |9|AACTAGTAG   GACGACGAG     CTAGTTC   CTTCTTA  

K-Mers for Vibrio Cholerae, without reading frames:
---