# HashMap

In [20]:
# generate a random k-mer DNA sequence; l: length of the k-mer
import random
def random_kmer(l):
    """Return a random DNA string of length ``l``.

    Each position is drawn independently (with replacement) from the four
    letters "ATCG" using ``random.choices``.
    """
    bases = random.choices("ATCG", k=l)
    return "".join(bases)


# Example: draw one random 21-mer (the value is shown as the cell output).
random_kmer(21)

'CAATCTAGCTTTAAGTGACCC'

In [21]:
#hash function:
#https://www.biostars.org/p/198696/
#https://www.biostars.org/p/184993/
#example:
#GTCGAATC
#K = G.4^0 + T.4^1 + C.4^2 + G.4^3 + A.4^4 + A.4^5 + T.4^6 + C.4^7
#K = 2*4^0 + 3*4^1 + 1*4^2 + 2*4^3 + 0*4^4 + 0*4^5 + 3*4^6 + 1*4^7
#K = 28830


#get the hash code of a k-mer of length l
#encode each nucleotide on 2 bits and align the 2-bit groups to form a 2*l-bit binary code
def dna2code(DNA, l):
    """Encode the first ``l`` nucleotides of ``DNA`` as an integer.

    Each nucleotide is mapped to a 2-bit value (A=0, C=1, G=2, T=3); the
    nucleotide at position ``i`` occupies bits ``2*i`` and ``2*i + 1``.
    If ``DNA`` is shorter than ``l`` only the available letters are encoded.

    Raises:
        ValueError: if a letter is not one of 'A', 'C', 'G', 'T'.
    """
    alphabet = ('A', 'C', 'G', 'T')
    code = 0
    for pos, base in zip(range(l), DNA):
        code += alphabet.index(base) << (pos * 2)
    return code

#recover the original k-mer from its hash code
#read the hash code 2 bits at a time (>> then & 3); each 2-bit group gives back one nucleotide
def code2dna(bits, l):
    """Decode an integer produced by ``dna2code`` back into a DNA string.

    Reads ``l`` consecutive 2-bit groups from ``bits``, lowest bits first,
    and maps each group to a letter (0=A, 1=C, 2=G, 3=T).
    """
    alphabet = "ACGT"
    letters = []
    for pos in range(l):
        two_bits = (bits >> (2 * pos)) & 3
        letters.append(alphabet[two_bits])
    return "".join(letters)

#get the complementary DNA string of the k-mer encoded in bits (each base is complemented in place; the order is not reversed)
def revcomp(bits, l):
    """Return the complement of the k-mer encoded in ``bits`` as a string.

    Reads ``l`` 2-bit groups from ``bits`` (lowest bits first) and maps each
    to the complementary base (A->T, C->G, G->C, T->A).

    NOTE(review): despite the name, the sequence is NOT reversed -- position
    ``i`` of the result is the complement of position ``i`` of the input.
    Confirm whether a true reverse complement was intended.
    """
    complement = "TGCA"
    letters = []
    for pos in range(l):
        two_bits = (bits >> (2 * pos)) & 3
        letters.append(complement[two_bits])
    return "".join(letters)

#unit test
# Round-trip check: encoding a random k-mer with dna2code and decoding the
# result with code2dna must give back the original string.
l = 21          # k-mer length used for the check
error = 0       # number of k-mers that did not survive the round trip
for i in range(100000):
    dna = random_kmer(l)
    hashcode = dna2code(dna, l)
    dna_r = code2dna(hashcode, l)
    if dna != dna_r:
        error += 1
# Expected output: an error count of 0.
print("hashcode back to kmer error = ", error)


hashcode back to kmer error =  0


In [105]:
from random import randrange

class HashMap:
    """Open-addressing hash map whose keys are DNA strings over A, C, G, T.

    Keys are turned into integers with ``dna2code`` and compressed to a table
    index. Collisions are resolved by probing other slots of the same table.
    Deleted entries are replaced by the ``_AVAIL`` sentinel so that probe
    chains running through them stay intact.

    The table is grown (to ``2 * size - 1``) whenever more than 75% of the
    slots hold live entries.
    """

    class _Item:
        """Key/value pair stored in one table slot."""
        __slots__ = '_key', '_value'

        def __init__( self, k, v = None ):
            self._key = k
            self._value = v

        def __eq__( self, other ):
            """Two items are equal when they have the same key."""
            if not isinstance( other, type( self ) ):
                # Let Python fall back to its default comparison instead of
                # failing on a missing ``_key`` attribute.
                return NotImplemented
            return self._key == other._key

        def __ne__( self, other ):
            return not( self == other )

        def __str__( self ):
            return '<' + str ( self._key ) + ',' + str( self._value ) + '>'

        def key( self ):
            return self._key

        def value( self ):
            return self._value


    def __init__( self, cap = 11 ):
        """Create an empty map with ``cap`` slots.

        Raises:
            ValueError: if ``cap`` is smaller than 1 (an empty table cannot
                be indexed).
        """
        if cap < 1:
            raise ValueError( 'cap must be at least 1' )
        self._T = cap * [None]   # hash table
        self._n = 0              # number of live entries in the table
        self._size = cap         # number of slots in the table
        self._collisions = 0     # probe steps taken so far

        # Parameters of the MAD compression used by _hash_function1:
        #   h(k) = ((a * k + b) % p) % size
        self._p = 109345121                      # prime modulus
        # Scale drawn in [1, p-1]; such a value is never a multiple of p,
        # so no retry loop is needed.
        self._a = 1 + randrange( self._p - 1 )
        self._b = randrange( self._p )           # shift drawn in [0, p-1]


    def __len__( self ):
        """Number of live entries."""
        return self._n


    # Sentinel marking a slot whose entry was deleted.
    _AVAIL = object()

    def _is_available( self, j ):
        """True when slot ``j`` is empty or holds the deletion sentinel."""
        return self._T[j] is None or self._T[j] is HashMap._AVAIL


    def _probe( self, j, k, increment ):
        """Search for key ``k`` starting at slot ``j``.

        ``increment(step)`` gives the distance added to the current slot
        after the ``step``-th probe (step starts at 1).

        At most ``len(self._T)`` slots are examined, so the search always
        terminates, even when no empty slot lies on the probe sequence.
        Insertion and lookup use the same bound, hence a key that was stored
        is always found again.

        Returns:
            ``(True, slot)`` if the key is stored in ``slot``;
            ``(False, slot)`` if the key is absent and ``slot`` is the first
            reusable slot met (``None`` when none was met).
        """
        size = len( self._T )
        firstAvail = None
        for step in range( 1, size + 1 ):
            if self._is_available( j ):
                if firstAvail is None:
                    firstAvail = j
                if self._T[j] is None:
                    # A never-used slot ends the probe chain: key is absent.
                    return ( False, firstAvail )
            elif k == self._T[j]._key:
                return ( True, j )
            # Move on circularly through the table.
            self._collisions += 1
            j = ( j + increment( step ) ) % size
        return ( False, firstAvail )


    def _find_slot1( self, j, k ):
        """Probe with steps 1, 2, 3, ... (cumulative offsets 1, 3, 6, ...)."""
        return self._probe( j, k, lambda step: step )


    def _find_slot( self, j, k ):
        """Probe with doubling steps 1, 2, 4, ... (cumulative offsets 2**s - 1).

        This is the probing scheme used by the map operations.
        """
        size = len( self._T )
        # pow(..., size) gives the same slot as 2**(step-1) without building
        # very large integers on long probe chains.
        return self._probe( j, k, lambda step: pow( 2, step - 1, size ) )


    @staticmethod
    def dna2code(DNA, l):
        """Encode the first ``l`` letters of ``DNA`` on 2 bits each (A=0, C=1, G=2, T=3).

        Raises ValueError for a letter outside 'A', 'C', 'G', 'T'.
        """
        return sum(('A','C','G','T').index(nt)<<(i*2) for i,nt in zip(range(l),DNA))


    @staticmethod
    def code2dna(bits,l):
        """Decode ``l`` 2-bit groups of ``bits`` (lowest first) back to a DNA string."""
        return ''.join(('A','C','G','T')[bits >> (i*2) &3] for i in range(l))


    def _hash_function ( self, dna ):
        """Division hash: the 2-bit code of ``dna`` modulo the table size."""
        k = self.dna2code(dna, len(dna))
        return k % self._size


    def _hash_function1 ( self, dna ):
        """MAD hash: ((a * code + b) % p) % size."""
        k = self.dna2code(dna, len(dna))
        return( hash( k ) * self._a + self._b ) % self._p % self._size


    def _grown_capacity( self ):
        """Capacity used when the table grows; always larger than the current one."""
        return max( 2 * len( self._T ) - 1, len( self._T ) + 1 )


    def __setitem__( self, k, v ):
        """Store ``v`` under key ``k``, replacing any previous value."""
        j = self._hash_function( k )
        found, s = self._find_slot( j, k )

        # The key is absent and no reusable slot lies on its probe sequence:
        # grow the table (which also drops deletion sentinels) and retry.
        while not found and s is None:
            self._resize( self._grown_capacity() )
            j = self._hash_function( k )
            found, s = self._find_slot( j, k )

        if not found:
            self._T[s] = self._Item( k, v )
            self._n += 1
        else:
            self._T[s]._value = v

        if self._n > len( self._T ) * 0.75:
            self._resize( self._grown_capacity() )


    def __getitem__( self, k ):
        """Return the value stored under ``k``.

        Raises:
            KeyError: if ``k`` is not in the map.
        """
        j = self._hash_function( k )
        found, s = self._find_slot( j, k )
        if not found:
            raise KeyError( k )
        # Read the slot where the key was found, not the home slot j.
        return self._T[s]._value


    def __contains__( self, k ):
        """True when ``k`` is a key of the map."""
        j = self._hash_function( k )
        found, _ = self._find_slot( j, k )
        return found


    def __delitem__( self, k ):
        """Remove key ``k``.

        Raises:
            KeyError: if ``k`` is not in the map.
        """
        j = self._hash_function( k )
        found, s = self._find_slot( j, k )
        if not found:
            raise KeyError( k )

        value = self._T[s]._value
        # Leave a sentinel so probe chains through this slot keep working.
        self._T[s] = HashMap._AVAIL
        self._n -= 1
        return value


    def __iter__( self ):
        """Yield the raw content of every slot, in table order.

        Slots may be ``None`` (never used), the deletion sentinel, or an
        ``_Item``. Use ``keys()`` to get only the stored keys.
        """
        for slot in self._T:
            yield slot


    def _resize( self, c ):
        """Rebuild the table with ``c`` slots and re-insert every live entry."""
        old = self._T
        self._T = c * [None]
        self._n = 0
        self._size = c
        for slot in old:
            # Skip empty slots and deletion sentinels (the sentinel is truthy
            # but has no key/value).
            if slot is not None and slot is not HashMap._AVAIL:
                self[slot._key] = slot._value

    def keys(self):
        """Yield the keys of all live entries, in table order."""
        for slot in self._T:
            if slot is not None and slot is not HashMap._AVAIL:
                yield slot.key()


    def collisions(self):
        """Total number of probe steps taken since the map was created."""
        return self._collisions

    def load_factor( self ):
        """Live entries divided by the number of slots."""
        return self._n / self._size

In [108]:
# Smoke test: insert four 2-mers, delete one, then inspect the table.
h = HashMap()
h['AA'] = 'AA'
h['AC'] = 'AC'
h['AT'] = 'AT'
h['AG'] = 'AG'
del h['AA']
#h['TG'] = 'TG'
#h['TA'] = 'TA'
#h['TC'] = 'TC'
#h['TT'] = 'TT'
#h['CA'] = 'CA'
# Iterating the map walks the raw table slots: unused slots print as None and
# the slot of the deleted key prints as the sentinel object.
for i in h:
    print(i)
# Not the last expression of the cell, so this value is not displayed.
h.collisions()
# keys() yields only the keys of the entries still stored.
for i in h.keys(): print(i)
h.load_factor()

<object object at 0x000001A093CABF30>
<AT,AT>
None
None
<AC,AC>
None
None
None
<AG,AG>
None
None
AT
AC
AG


0.2727272727272727

In [264]:
#unit test
import random
import time
if __name__ == '__main__':

    print( "ProbeHashMap unit testing..." )

    M = HashMap()

    nb = 10000
    # Draw the keys once so that the access and delete phases work on the
    # keys that were actually inserted. Fresh random 21-mers would almost
    # never be present, so those phases would only measure misses.
    kmers = [random_kmer(21) for _ in range( nb )]

    #Insertion
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    avant = time.perf_counter()
    for dna in kmers:
        M[dna] = dna
    apres = time.perf_counter()
    cols = M.collisions()
    print( "$$$$ collision times during probing $$$$", cols)
    print( "Insertion of", nb, "keys in ", apres-avant, "seconds." )



    #Access
    avant = time.perf_counter()
    for dna in kmers:
        try:
            x = M[dna]
        except KeyError:
            pass
    apres = time.perf_counter()
    print( "Access of", nb, "keys in ", apres-avant, "seconds.")


    #Delete
    avant = time.perf_counter()
    for dna in kmers:
        try:
            del M[dna]
        except KeyError:
            # A k-mer drawn twice is already gone on its second deletion.
            pass
    apres = time.perf_counter()
    print( "Delete ", nb, "keys in ", apres-avant, "seconds." )

    print( "End of testing." )

ProbeHashMap unit testing...
$$$$ collision times during probing $$$$ 18960
Insertion of 10000 keys in  0.750889778137207 seconds.
Access of 10000 keys in  0.32099008560180664 seconds.
Delete  10000 keys in  0.3667323589324951 seconds.
End of testing.


# Graph

In [82]:
class DeBrujinGraph:
    """Graph whose nodes are k-mers (DNA strings of a fixed length ``k``).

    Nodes are kept as keys of two ``HashMap`` instances, one meant for
    outgoing and one for incoming edges. Edge queries are not implemented
    yet; every node is stored with the value ``None``.
    """

    def __init__( self, nodes, k=21 ):
        """Build the graph from ``nodes``, an iterable of strings of length ``k``.

        Raises:
            ValueError: if a node does not have length ``k``.
        """
        self._k = k
        # Kept for backward compatibility only; membership is answered from
        # the hash maps, so a one-shot iterator passed here is not re-read.
        self._nodes = nodes

        # Two hash maps to hold outgoing and incoming edges.
        self._outgoing = HashMap()
        self._incoming = HashMap()
        for N in nodes:
            self.add( N )


    def __contains__( self, N ):
        """Check whether N:str is currently a node of the graph.

        The stored keys are scanned, so the answer reflects later ``add`` and
        ``remove`` calls. The scan is linear in the number of slots.
        """
        return any( N == node for node in self._outgoing.keys() )


    def __iter__( self ):
        """Return an iterator over the nodes of the graph."""
        return self._outgoing.keys()

    def load_factor( self ):
        """Load factor of the underlying hash map."""
        return self._outgoing.load_factor()


    def add( self, N):
        """Add node N:str.

        Raises:
            ValueError: if ``N`` does not have the graph's k-mer length.
        """
        if len(N) == self._k:
            self._outgoing[N] = None
            self._incoming[N] = None
        else: raise ValueError ('kmers leagth need to be', self._k)


    def remove( self, N ):
        """Remove node N from both maps (the maps raise KeyError if it is absent)."""
        del self._outgoing[N]
        del self._incoming[N]


    def nodes( self ):
        """Return an iterator over the nodes of the graph."""
        return self._outgoing.keys()

    def predecessors( self, N ):
        """Return all predecessors of node N:str.

        Not implemented yet: currently returns None.
        """
        pass

    def successors( self, N ):
        """Return all successors of node N:str.

        Not implemented yet: currently returns None.
        """
        pass
        

In [84]:
# Smoke test: build a small graph of 2-mers, list its nodes, test membership
# and display the load factor of the underlying hash map.
nodes = ["AA", 'AC', 'AT', 'AG', 'TA']
graph=DeBrujinGraph(nodes, k=2)
# Nodes come out in hash-table order, not in insertion order.
for i in graph:
    print(i)
print('AC' in graph)
graph.load_factor()

AT
TA
AA
AC
AG
True


0.45454545454545453

str