# Generate dictionary of all possible k-mers of DNA

* A DNA sequence consists only of the nucleotides A, C, G and T
* A "sequence" is a particular ordered combination of those nucleotides, such as AGGCCCCTGCTAA
* A k-mer is a sequence of a given length "k". For example, a 6-mer is a sequence of 6 nucleotides, such as GCCTAA

This works only for k < 13: the number of possible k-mers grows as 4**k, and the 4**13 = 67,108,864 possible 13-mers become too many to fit in memory

In [3]:
import itertools

In [1]:
def ProduceListOfAllPossibleKMers(length):
    """Return every DNA k-mer of the given length as a list of strings.

    The k-mers are built from the alphabet 'ACGT' and appear in the order
    produced by ``itertools.product`` (all-'A' first, all-'T' last).
    """
    nucleotide_tuples = itertools.product('ACGT', repeat=length)
    # Each product item is a tuple of single letters; join it into one string.
    return list(map(''.join, nucleotide_tuples))

def KMerListToDict(k_mer_list):
    """Map each k-mer to its zero-based position in ``k_mer_list``.

    Args:
        k_mer_list: Iterable of k-mer strings (list, tuple, generator, ...).

    Returns:
        dict: ``{k_mer: index}``. If a k-mer occurs more than once, the
        index of its last occurrence is kept.
    """
    # enumerate() numbers the items while iterating, so any iterable works,
    # including ones that do not support len().
    return {k_mer: index for index, k_mer in enumerate(k_mer_list)}

def ProduceDictOfAllPossibleKMers(length):
    """Build a dictionary of all DNA k-mers of the given length.

    Generates the complete k-mer list with ProduceListOfAllPossibleKMers
    and converts it to a dictionary with KMerListToDict.
    """
    return KMerListToDict(ProduceListOfAllPossibleKMers(length))

In [4]:
# Example: build the dictionary of all possible 6-mers (hexamers).
hexamer_dict = ProduceDictOfAllPossibleKMers(6)

In [5]:
# Display the dictionary; the entries below are not listed in index order.
hexamer_dict

{'GAACGT': 2075,
 'CTTCTT': 2015,
 'CACCCT': 1111,
 'GAACGG': 2074,
 'GAACGC': 2073,
 'GAACGA': 2072,
 'CACCCA': 1108,
 'CTTCTA': 2012,
 'CACCCC': 1109,
 'CTTCTC': 2013,
 'CACCCG': 1110,
 'CTTCTG': 2014,
 'CGTGTG': 1774,
 'TAAGGT': 3115,
 'CGTGTC': 1773,
 'CGTGTA': 1772,
 'GGAAAT': 2563,
 'TAAGGG': 3114,
 'CGTGTT': 1775,
 'TAAGGC': 3113,
 'TAAGGA': 3112,
 'TCACTG': 3358,
 'GTCAAA': 2880,
 'CCCGCT': 1383,
 'GTCAAG': 2882,
 'CTGTCC': 1973,
 'TCAGAG': 3362,
 'CTGTCA': 1972,
 'CTGTCG': 1974,
 'GTATCT': 2871,
 'TCAGAA': 3360,
 'GTCAAT': 2883,
 'GTATCA': 2868,
 'GTATCC': 2869,
 'GTATCG': 2870,
 'CTGTCT': 1975,
 'GGTGTC': 2797,
 'GGTGTA': 2796,
 'GGTGTG': 2798,
 'TATCCT': 3287,
 'CCGGGG': 1450,
 'TTCTGT': 3963,
 'ATTCCT': 983,
 'CCGGGC': 1449,
 'CCGGGA': 1448,
 'TATCCA': 3284,
 'TATCCC': 3285,
 'GTTCTG': 3038,
 'TACACC': 3141,
 'TATCCG': 3286,
 'GGTGTT': 2799,
 'ATTCCG': 982,
 'TTCTGG': 3962,
 'CCGGGT': 1451,
 'ATTCCC': 981,
 'TTCTGC': 3961,
 'ATTCCA': 980,
 'GTTCTA': 3036,
 'TGCACT': 3655,
 

In [8]:
# Number of entries in the resulting dictionary (one per distinct 6-mer)
len(hexamer_dict)

4096

In [9]:
# Expected number of distinct 6-mers: 4 possible nucleotides at each of 6 positions
4**6

4096