# HashMap

In [6]:
!ls

drive  sample_data


In [0]:
import time
start = time.time()

In [0]:
end = time.time()
end-start

1.8186612129211426

In [18]:
# generate random kemer DNA sequence, l:length of kmer
import random
def random_kmer(l):
    return ''.join(random.choices("ATCG", k=l))


random_kmer(21)

'ACTTGAAACAACCACACTCCT'

In [0]:
#hashfonction: 
#https://www.biostars.org/p/198696/
#https://www.biostars.org/p/184993/
#example:
#GTCGAATC
#K = G.4^0 + T.4^1 + C.4^2 + G.4^3 + A.4^4 + A.4^5 + T.4^6 + C.4^7
#K = 2*4^0 + 3*4^1 + 1*4^2 + 2*4^3 + 0*4^4 + 0*4^5 + 3*4^6 + 1*4^7
#K = 28830


#get hashcode from a kmer of length = l
#turn each nucleotide into binary, aline all the 2bits to make l*2bits binary code
def dna2code(DNA, l): 
    return sum([('A','C','G','T').index(nt)<<(i*2) for i,nt in zip(range(l),DNA)])

#use hashcode return the original kmer
#turn hashcode into binary, >> &3 will take 2bits at a time, and return a necleotide
def code2dna(bits,l):  
    return ''.join([('A','C','G','T')[bits >> (i*2) &3] for i in range(l) ])

#get the complementary dna string of the kmer
def revcomp(bits, l): 

    return ''.join([('T','G','C','A')[bits >> (i*2) & 3] for i in range(l) ])

#unit test
l = 21
error = 0
for i in range(100000):
    dna = random_kmer(l)
    hashcode = dna2code(dna, l)
    dna_r = code2dna(hashcode, l)
    if dna != dna_r:
        error += 1
print("hashcode back to kmer error = ", error)


hashcode back to kmer error =  0


In [0]:
from random import randrange
from collections import MutableMapping

class HashMap(MutableMapping):
    
    #class item to store (key, value)
    class _Item:
        __slots__ = '_key', '_value'
        
        def __init__( self, k, v = None ):
            self._key = k
            self._value = v
        
        #equal items have the same key
        def __eq__( self, other ):
            return self._key == other._key
        
        def __ne__( self, other ):
            return not( self == other )
        
        def __str__( self ):
            return '<' + str ( self._key ) + ',' + str( self._value ) + '>'

        def key( self ):
            return self._key
        
        def value( self ):
            return self._value
        
    
    _AVAIL = object()
    
    def __init__( self, cap = 11):
        self._T = cap * [None]  #hashtable
        self._n = 0             #nb of elements in table
        self._size = cap        #siez of hashtable
        self._collisions = 0     #collision during probing 
        
        #choose parametres for MAD hash_function
        #nombre premier pour la compression MAD
        self._prime = 109345121
        # MAD h2(k) = (scale*k + shift) % prime % self._size 
        self._scale = 1 + randrange( self._prime - 1 )           #scale entre 1 et p-2
        #on trouve un entier multiplicateur entre 1 et p-2
        #qui n'est pas un multipe de p
        trouve = False
        while not trouve:
            self._scale = 1 + randrange( self._prime - 1 )
            if not ( self._scale % self._prime ) == 0:
                trouve = True
        self._shift = randrange( self._prime )         #shift entre 0 et p-1
    
    
    
    def __len__( self ):
        return self._n 
    
    def __contains__( self, k ):
        for j in self:
            if j == k:
                return True
        return False

    
    def _is_available( self, j ):
        return self._T[j] is None or self._T[j] is HashMap._AVAIL
    
    
    #linear probing
    def _find_slot1( self, j, k ):
        firstAvail = None
        step = 1
        while True:
            #if position j is None or _AVAIL
            if self._is_available( j ):
                if firstAvail is None:
                    firstAvail = j  
                if self._T[j] is None:
                    return ( False, firstAvail )
           
            elif k == self._T[j]._key:
                return ( True, j )
            #on avance circulairement dans la table
            #1 method: linear, step = 1,2,3,4,5
            self._collisions += 1
            j = (j + step) % len( self._T )
            step +=1
    
    
    #quadratic probing
    def _find_slot( self, j, k ):
        firstAvail = None
        step = 0
        while True:
            #if position j is None or _AVAIL
            if self._is_available( j ):
                if firstAvail is None:
                    firstAvail = j  
                if self._T[j] is None:
                    return ( False, firstAvail )
            #continue searching in probe table until find key or first availiable 
            elif k == self._T[j]._key:
                return ( True, j )
            #on avance circulairement dans la table
            #quadratic probing, probe step**2 everytime
            self._collisions += 1
            j = (j + step**2) % self._size
            step +=1
            
    
    #turn a dna kmer with length of l into a hashcode
    def _dna2code(self, DNA, l): 
        return sum([('A','C','G','T').index(nt)<<(i*2) for i,nt in zip(range(l),DNA)])

    
    #use hashcode return the original kmer
    #turn hashcode into binary, >> &3 will take 2bits at a time, and return a necleotide
    def _code2dna(self, bits,l):  
        return ''.join([('A','C','G','T')[bits >> (i*2) &3] for i in range(l) ])

    
    #division hash function
    def _hash_function ( self, dna ):
        # dna2code is used as hash_code
        k = self._dna2code(dna, len(dna))
        return k % self._size
    
    
    #MAD hash function
    def _hash_function1( self, dna ):
        # dna2code is used as hash_code
        k = dna2code(dna, len(dna)) 
        return( hash( k ) * self._scale + self._shift ) % self._prime % self._size
  
    
    def __setitem__( self, k, v ):
        j = self._hash_function( k )
        #self._bucket_setitem( j, k, v )
        found, s = self._find_slot( j, k )
        if not found:
            self._T[s] = self._Item( k, v )
            self._n += 1
        else:
            self._T[s]._value = v  
        if self._n > len( self._T ) * 0.75 :
            self._resize( 2 * len( self._T ) - 1 )



    def __getitem__( self, k ):
        j = self._hash_function( k )
        found, s = self._find_slot( j, k )
        if not found:
            raise KeyError
        else:
            return self._T[s]._value

    
    def __delitem__( self, k ):
        j = self._hash_function( k )
        found, s = self._find_slot( j, k )
        if found: # do not use value to check the condition,
                   # what if value itself is False
            value = self._T[s]._value
            self._T[s] = self._AVAIL
            self._n -= 1
            # need resize? minimal size = 
            if self._n < len(self._T) // 4:
                new_cap = max(self._size, (len(self._T)+1)/2)
                self._resize(new_cap)
        else:
            raise KeyError

    
    def __iter__( self ):
        for j in range( len( self._T ) ):
            if not self._is_available( j ):
                yield self._T[j]._key



    def _resize( self, c ):
        old = list( self.items() )
        self._T = c * [None]
        self._n = 0
        self._size = c
        for (k,v) in old:
            self[k] = v

    def is_empty( self ):
        return len( self ) == 0

    def __str__( self ):
        if self.is_empty():
            return "{}"
        pp = "{"
        for item in self.items():
            pp += str( item )
        pp += "}"
        pp += " size = "
        pp += str( len( self ) )
        return pp

    def get( self, k, d = None ):
        try:
            tmp = self[k]
            return tmp
        except KeyError:
            return d

    def setdefault( self, k, d = None ):
        try:
            tmp = self[k]
            return tmp
        except:
            self[k] = d
            return d


    
    def collisions(self):
        return self._collisions
    
    def load_factor( self ):
        return self._n / self._size

In [19]:
h = HashMap()
h['AA'] = 'AA'
h['AC'] = 'AC'
h['AT'] = 'AT'
h['AG'] = 'AG'
del h['AA']
s = HashMap()
s['AC'] = h
h['TG'] = 'TG'
h['TA'] = 'TA'
h['TC'] = 'TC'
h['TT'] = 'TT'
h['CA'] = 'CA'
for i in h:
    print(i)
h.collisions()

h.load_factor()
h['AG']
s['AC']['AG']
h['AG'] ='haha'
s['AC']['AG']
"CC" in s

TG
AT
CA
TA
AC
TT
TC
AG


False

In [20]:
#unit test
import random
import time
if __name__ == '__main__':

    print( "ProbeHashMap unit testing..." )

    M = HashMap(1000000)

    nb = 10000
    #random.seed( 131341 )

    #Insertion
    avant = time.time()
    for i in range( nb ):
        dna = random_kmer(21)
        M[dna] = dna
    apres = time.time()
    cols = M.collisions()
    print( "$$$$ collision times during probing $$$$", cols)
    print( "Insertion of", nb, "keys in ", apres-avant, "seconds." )

    
    
    #Access
    #random.seed( 131341)
    avant = time.time()
    for i in range( nb ):
        dna = random_kmer(21)
        try:
            x = M[dna]
        except KeyError:
            pass
    apres = time.time()
    print( "Access of", nb, "keys in ", apres-avant, "seconds.")
 
    
    #Delete
    #random.seed( 131341 )
    avant = time.time()
    nbdel = 0
    for i in range( nb ):
        dna = random_kmer(21)
        try:
            del M[dna]
        except KeyError:
            pass
    apres = time.time()
    print( "Delete ", nb, "keys in ", apres-avant, "seconds." )

    print( "End of testing." )

ProbeHashMap unit testing...
$$$$ collision times during probing $$$$ 101
Insertion of 10000 keys in  0.20509886741638184 seconds.
Access of 10000 keys in  0.1757185459136963 seconds.
Delete  10000 keys in  0.17936968803405762 seconds.
End of testing.


# Graph

In [22]:
##### use kmers to find edges
seq = 'ATGCGAGTCTCCACGTCAGTC'
l = len(seq)
k = 7
kmers = [seq[i:i+k] for i in range(l - k + 1)]

def edges(kmers_graph):
    for k in kmers_graph:
        for s in 'ATCG':
            successor = k[1:] + s
            if successor in kmers_graph:
                yield k, successor
                
for i in edges(kmers):
    print(i, "edge = ", i[1][-1])

('ATGCGAG', 'TGCGAGT') edge =  T
('TGCGAGT', 'GCGAGTC') edge =  C
('GCGAGTC', 'CGAGTCT') edge =  T
('CGAGTCT', 'GAGTCTC') edge =  C
('GAGTCTC', 'AGTCTCC') edge =  C
('AGTCTCC', 'GTCTCCA') edge =  A
('GTCTCCA', 'TCTCCAC') edge =  C
('TCTCCAC', 'CTCCACG') edge =  G
('CTCCACG', 'TCCACGT') edge =  T
('TCCACGT', 'CCACGTC') edge =  C
('CCACGTC', 'CACGTCA') edge =  A
('CACGTCA', 'ACGTCAG') edge =  G
('ACGTCAG', 'CGTCAGT') edge =  T
('CGTCAGT', 'GTCAGTC') edge =  C


In [0]:
class DeBrujinGraph:

    def __init__( self, nodes, k=21, c = 11 ):
        """nodes is iterable str"""
        self._k = k
        #create two hashMap to save outcoming and incoming edges
        self._outgoing = HashMap(c)
        self._incoming = HashMap(c)
        
        start = time.time()
        #add node as key into outgoing and incoming, create hashMap for each node
        for N in nodes:   
            if len(N) == k:
                self._outgoing[N] = HashMap()           
                self._incoming[N] = HashMap()
            else: raise ValueError ('kmers leagth need to be', k)
        
        end = time.time()
        print("finish creating outgoing and incoming hashMaps! used time:", end-start)
        
        #find possible successors for each node
        start = time.time()
        for node in nodes:
            for base in 'ATCG':
                successor = node[1:] + base
                if successor in nodes :
                    #print("possible successors of", node, ":", successor )
                    self._outgoing[node][successor] = successor[-1]
                    self._incoming[successor][node] = successor[-1]
        end = time.time()
        print("finish updating hashMaps for all nodes! used time:", end-start)
  
        #for node in self._outgoing :
         #   str = ''
          #  for succ in self._outgoing[node]:
           #     str += succ + ', '
            #print(node,'has succ:',str)
            
        #for node in self._incoming :
         #   str = ''
          #  for pred in self._outgoing[node]:
           #     str += pred + ', '
            #print(node,'has pred:',str)
        print("$$$$$$$$$$$$$ initialization is over!!! $$$$$$$$$$$$$$$$")
     
    
    def nodes( self ):
        return self._outgoing
    
    
    def __contains__( self, node ):
        """check if N:str exist as a node in graph"""
        return node in self._outgoing
    

    def __iter__( self ):
        """return iterable of nodes in the graph"""
        for node in self._outgoing:
            yield node
    
    
    def load_factor( self ):
        """the charge factor of hashtMap"""
        return self._outgoing.load_factor()

    
    def add( self, new_node):
        """add node N:str"""
        """this add will only find possible successors and predecessor of new node N """
        if len(new_node) == self._k:
            self._outgoing[new_node] = HashMap()  
            self._incoming[new_node] = HashMap()
            
            for base in 'ATCG':
                #find all successors of N
                successor = new_node[1:] + base
                print('possible succ :', successor)
                if successor in self.nodes() :
                    print("new_node:",new_node, "successor:", successor, 'successor in self.nodes():',successor in self.nodes())
                    self._outgoing[new_node][successor] = base
                    print("outgoing updated")
                    self._incoming[successor][new_node] = base
                    print("incoming updated")
                        
            #find all predecessors of N
                predecessor = base + new_node[:-1]
                print('possible pred :', predecessor)
                if predecessor in self.nodes() :
                    print("new_node:",new_node, "predecessor:", predecessor)
                    self._outgoing[predecessor][new_node] = base
                    self._incoming[new_node][predecessor] = base
            
        else: raise ValueError ('kmers leagth need to be', self._k)
        
        
    def remove( self, node ):
        """remove node N"""
        if node in self.nodes():
            successors = self.successors(node)  
            predecessors = self.predecessors(node)  
            
            #delet N from the incoming list of its successors
            #delet N from the outgoing list of its predecessors
            for succ in successors:
                del self._outgoing[node][succ]
                del self._incoming[succ][node]
            
            for pred in predecessors:
                del self._outgoing[pred][node]
                del self._incoming[node][pred]
            
            #delet N from hashMap of graph
            del self._outgoing[node] 
            del self._incoming[node]
            
        else: raise ValueError (N, 'does not exist in graph')
    
    
    def predecessors( self, N ):
        """return all predecessor of node N:str"""
        if N in self._incoming:
            return self._incoming[N]
        else:
            raise ValueError (N, 'is not in graph')
            
    
    def successors( self, N ):
        """return all successors of node N:str"""
        if N in self._outgoing:
            return self._outgoing[N]
        else:
            raise ValueError (N, 'is not in graph')
            

In [24]:
#test for graph 
seq = 'ACTGA'
k = 2
kmers = [seq[i:i+k] for i in range(len(seq) - k + 1)]

print(kmers)
print(seq)
graph = DeBrujinGraph(kmers, k=2)

for node in graph.nodes():
    for successor in graph.successors(node):
        print("successor of", node, ":", successor)
for node in graph.nodes():    
    for predecessor in graph.predecessors(node):
        print("predecessor of", node, ":", predecessor)

#test .nodes()
for node in graph.nodes(): print(node)

#test __contains__ 
print("!!!!!!!!!!!!test CC in graph")
print('CC' in graph)

#test __iter__
for node in graph: print(node)
#test load factor
print("load factor:", graph.load_factor())

print('************')

##########################################
print('\r\n','### add GA ###')
graph.add("GA")
for node in graph.nodes(): print(node)
for node in graph.nodes():
    for successor in graph.successors(node):
        print("successors of", node, ":", successor)
for node in graph.nodes():
    for predecessor in graph.predecessors(node):
        print("predecessor of", node, ":", predecessor)
print('!!!!!!!!!! done add !!!!!!!!!!!!!!')


print('\r\n','### remove GA ###')
graph.remove("GA")
for node in graph.nodes(): print(node)
for node in graph.nodes():
    for successor in graph.successors(node):
        print("successors of", node, ":", successor)
for node in graph.nodes():
    for predecessor in graph.predecessors(node):
        print("predecessor of", node, ":", predecessor)
print('!!!!!!!!!! done remove !!!!!!!!!!!!!!')

['AC', 'CT', 'TG', 'GA']
ACTGA
finish creating outgoing and incoming hashMaps! used time: 0.00010967254638671875
finish updating hashMaps for all nodes! used time: 7.843971252441406e-05
$$$$$$$$$$$$$ initialization is over!!! $$$$$$$$$$$$$$$$
successor of TG : GA
successor of CT : TG
successor of GA : AC
successor of AC : CT
predecessor of TG : CT
predecessor of CT : AC
predecessor of GA : TG
predecessor of AC : GA
TG
CT
GA
AC
!!!!!!!!!!!!test CC in graph
False
TG
CT
GA
AC
load factor: 0.36363636363636365
************

 ### add GA ###
possible succ : AA
possible pred : AG
possible succ : AT
possible pred : TG
new_node: GA predecessor: TG
possible succ : AC
new_node: GA successor: AC successor in self.nodes(): True
outgoing updated
incoming updated
possible pred : CG
possible succ : AG
possible pred : GG
TG
CT
GA
AC
successors of TG : GA
successors of CT : TG
successors of GA : AC
successors of AC : CT
predecessor of TG : CT
predecessor of CT : AC
predecessor of GA : TG
predecessor of 

#set google colab directory to drive, to choose file in google drive, use :  drive/My Drive/filename.xxx

In [25]:
from google.colab import drive
drive.mount('/content/drive')
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
drive  sample_data


In [26]:
import gzip
from itertools import islice
with gzip.open('drive/My Drive//GCF_000002985.6_WBcel235_rna.fna.gz', 'rt') as f:
    print(f.readline()) # read one line of the text
    print(f.readline())

>NM_001025782.2 Caenorhabditis elegans Phosphatidylinositol 3-kinase catalytic subunit type 3 (vps-34), partial mRNA

ATGCGAGTCTCCACGTCAGTCAACGGTGGTGTTGGGATTGTTTCGGCCTGTACCCGTTACTGTGTAGCTGATCCTGAACT



In [27]:
with gzip.open('drive/My Drive/reads.fastq.gz', 'rt') as f:
    print(f.readline())
    print(f.readline())
    print(f.readline())
    print(f.readline())

@SLFZSLQY <unknown description>

GTGAAGTTGAGAGGAGGAGATGAGATTACCTATGATTATATGGTTATTGCCATGGGCGTTCAGTTGAGATATGATATGATCAAAGGAGCAAAAGAGGCTC

+

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



In [0]:
def read_fastq(path):
    with gzip.open(path, 'rt') as f:
        for line in f:
            seqid, description = line[1:].rstrip().split(maxsplit=1)
            sequence = f.readline().rstrip()
            _ = f.readline()
            quality = f.readline().rstrip()
            yield seqid, description, sequence, quality

In [0]:
next(read_fastq('drive/My Drive/reads.fastq.gz'))

('SLFZSLQY',
 '<unknown description>',
 'GTGAAGTTGAGAGGAGGAGATGAGATTACCTATGATTATATGGTTATTGCCATGGGCGTTCAGTTGAGATATGATATGATCAAAGGAGCAAAAGAGGCTC',
 '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [29]:
%%time
seq_length =0
seq_nb = 0
seq_diff = 0
seq_set = set()
for _ in read_fastq('drive/My Drive/reads.fastq.gz'):
    seqid, description, sequence, quality = _
    seq_set.add(sequence)
    seq_nb += 1
print('seq_nb = ', seq_nb)
print('seq_diff', seq_diff)
print('set_set size:', len(seq_set))

seq_nb =  107845
seq_diff 0
set_set size: 106960
CPU times: user 490 ms, sys: 22.4 ms, total: 513 ms
Wall time: 516 ms


In [30]:
%%time
k = 21
l = 100 # the length of sequences in Fastq
kmers_fastq = set()
for seq in seq_set:
  [kmers_fastq.add(seq[i:i+k]) for i in range(l - k + 1)]

print("21kmers obtained from frastq file:",len(kmers_fastq))




21kmers obtained from frastq file: 5126513
CPU times: user 4.95 s, sys: 412 ms, total: 5.36 s
Wall time: 5.36 s


In [0]:
%%time
graph_fastq = DeBrujinGraph(kmers_fastq, k=21, c=6000000)

finish creating outgoing and incoming hashMaps! used time: 587.8496370315552
finish updating hashMaps for all nodes! used time: 319.0339255332947
$$$$$$$$$$$$$ initialization is over!!! $$$$$$$$$$$$$$$$
CPU times: user 15min, sys: 6.59 s, total: 15min 7s
Wall time: 15min 6s


# test Yeqiang: import fastq into list, use first 10000

In [0]:
def read_fastq(path):
    with gzip.open(path, 'rt') as f:
        for line in f:
            seqid, description = line[1:].rstrip().split(maxsplit=1)
            sequence = f.readline().rstrip()
            _ = f.readline()
            quality = f.readline().rstrip()
            yield seqid, description, sequence, quality

k = 21
kmers = []            
for _, _, seq, _ in read_fastq('drive/My Drive/reads.fastq.gz'):
    kmers.extend([seq[i:i+k] for i in range(len(seq) - k + 1)])
    


In [33]:
%%time
kmers_test = kmers[0:10000]

print(kmers_test[0:10])

graph_test = DeBrujinGraph(kmers_test, k=21, c=12000)

['GTGAAGTTGAGAGGAGGAGAT', 'TGAAGTTGAGAGGAGGAGATG', 'GAAGTTGAGAGGAGGAGATGA', 'AAGTTGAGAGGAGGAGATGAG', 'AGTTGAGAGGAGGAGATGAGA', 'GTTGAGAGGAGGAGATGAGAT', 'TTGAGAGGAGGAGATGAGATT', 'TGAGAGGAGGAGATGAGATTA', 'GAGAGGAGGAGATGAGATTAC', 'AGAGGAGGAGATGAGATTACC']
finish creating outgoing and incoming hashMaps! used time: 2.4782772064208984
finish updating hashMaps for all nodes! used time: 5.621918439865112
$$$$$$$$$$$$$ initialization is over!!! $$$$$$$$$$$$$$$$
CPU times: user 8.07 s, sys: 31.4 ms, total: 8.1 s
Wall time: 8.1 s


In [34]:
print("number of nodes:", len(graph_test.nodes()))
      
print("number of edges:", graph_test.edges_number())

number of nodes: 10000


AttributeError: ignored

#test of DFS

In [39]:
##### use kmers to find edges
seq = 'ATGCGAGTCTCCACGTCAGTC'
k = 7
kmers7 = [seq[i:i+k] for i in range(len(seq) - k + 1)]

def edges(kmers_graph):
    for k in kmers_graph:
        for s in 'ATCG':
            successor = k[1:] + s
            if successor in kmers_graph:
                yield k, successor
                
for i in edges(kmers7):
    print(i, "edge = ", i[1][-1])

('ATGCGAG', 'TGCGAGT') edge =  T
('TGCGAGT', 'GCGAGTC') edge =  C
('GCGAGTC', 'CGAGTCT') edge =  T
('CGAGTCT', 'GAGTCTC') edge =  C
('GAGTCTC', 'AGTCTCC') edge =  C
('AGTCTCC', 'GTCTCCA') edge =  A
('GTCTCCA', 'TCTCCAC') edge =  C
('TCTCCAC', 'CTCCACG') edge =  G
('CTCCACG', 'TCCACGT') edge =  T
('TCCACGT', 'CCACGTC') edge =  C
('CCACGTC', 'CACGTCA') edge =  A
('CACGTCA', 'ACGTCAG') edge =  G
('ACGTCAG', 'CGTCAGT') edge =  T
('CGTCAGT', 'GTCAGTC') edge =  C


In [40]:
graph_kmer7 = DeBrujinGraph(kmers7, k=7)
#for node in graph_kmer7.successors("ATGCGAG"): print(node)

discovered = HashMap()
start = "ATGCGAG"


def dfs(graph, start, visited):
  for node in graph.successors(start):
    if not node in visited:
      visited.append(node[-1])    
      dfs(graph, node, visited)  
    return visited

#test
visited = dfs(graph_kmer7,"ATGCGAG", ["ATGCGAG"])

dna = ""
for i in visited:
  dna += i

print(dna)

   


finish creating outgoing and incoming hashMaps! used time: 0.0012211799621582031
finish updating hashMaps for all nodes! used time: 0.0036890506744384766
$$$$$$$$$$$$$ initialization is over!!! $$$$$$$$$$$$$$$$
ATGCGAGTCTCCACGTCAGTC
