In [1]:
dna = "AAAGGG"

In [2]:
dna[5]

'G'

In [3]:
dna[5] = "S"

TypeError: 'str' object does not support item assignment

In [5]:
import random

In [10]:
random.randint(0, 2)

0

In [13]:
random.choice(["A", "T", "G", "C"])

'C'

In [17]:
random.choice("ATCG")

'T'

In [None]:
# We want to introduce a SNP (single-nucleotide polymorphism) into the DNA.

In [19]:
dna = "AAAGTGGTCGCGGACG"
# list(dna)

In [20]:
# naive approach:
def make_snp_loop(dna):
    """Return a copy of ``dna`` with one randomly chosen position overwritten.

    The sequence is expanded into a list of characters, one uniformly chosen
    position receives a base drawn uniformly from "ATCG", and the characters
    are joined back into a string. The replacement is drawn from all four
    letters, so it may coincide with the base already at that position.

    Parameters
    ----------
    dna : str
        Sequence to mutate.

    Returns
    -------
    str
        A new string of the same length as ``dna``; an empty string when
        ``dna`` is empty.
    """
    dna_list = list(dna)
    # An empty sequence has no position to mutate; without this guard the
    # random index draw below would raise ValueError on an empty range.
    if not dna_list:
        return ''
    mutation_site = random.randrange(len(dna_list))
    dna_list[mutation_site] = random.choice("ATCG")
    return ''.join(dna_list)

In [21]:
# semi-naive approach:
def make_snp_fast_loop(dna):
    """Return ``dna`` with one randomly chosen position overwritten, via slicing.

    A position is chosen uniformly at random and the result is assembled as
    ``prefix + new_base + suffix``, where ``new_base`` is drawn uniformly from
    "ACGT". The replacement may coincide with the base already at that
    position, because all four letters are candidates.

    Parameters
    ----------
    dna : str
        Sequence to mutate.

    Returns
    -------
    str
        A new string of the same length as ``dna``; an empty input is
        returned unchanged.
    """
    # An empty sequence has no position to mutate; without this guard the
    # random index draw below would raise ValueError on an empty range.
    if not dna:
        return dna
    i = random.randrange(len(dna))
    return dna[:i] + random.choice("ACGT") + dna[i + 1:]

In [23]:
print(dna)
print(make_snp_loop(dna))

AAAGTGGTCGCGGACG
AAAGTGGACGCGGACG


In [25]:
print(dna)
print(make_snp_fast_loop(dna))

AAAGTGGTCGCGGACG
AAAGTGGTCGCGGTCG


In [None]:
# Let's imagine we have a huge DNA string.

In [26]:
def generate_string_v1(N, alphabet="ACGT"):
    """Build a random string of ``N`` characters drawn from ``alphabet``.

    Each character is picked independently with ``random.choice``.

    Parameters
    ----------
    N : int
        Number of characters to generate; values <= 0 give an empty string.
    alphabet : sequence of str, optional
        Symbols to draw from (default "ACGT").

    Returns
    -------
    str
        The generated string.
    """
    letters = []
    for _ in range(N):
        letters.append(random.choice(alphabet))
    return "".join(letters)

In [27]:
long_dna = generate_string_v1(1_000_000)

In [29]:
# long_dna

In [33]:
%%time
nmutations = 10_000
for i in range(nmutations):
    dna = make_snp_loop(long_dna)

KeyboardInterrupt: 

In [31]:
%%time
nmutations = 10_000
for i in range(nmutations):
    dna = make_snp_fast_loop(long_dna)

CPU times: user 1.78 s, sys: 0 ns, total: 1.78 s
Wall time: 1.78 s


In [None]:
# %%timeit
# nmutations = 10_000
# for i in range(nmutations):
#     dna = make_snp_loop(long_dna)

In [32]:
%%timeit
nmutations = 10_000
for i in range(nmutations):
    dna = make_snp_fast_loop(long_dna)

1.98 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
'''
Let's create __all the mutation sites__ 
and __all the new bases at these sites__ 
at once (in vectorized form, without a "for" loop).
''' 

In [None]:
'''
A NUMPY ARRAY consists of a "raw" memory buffer that is interpreted as an N-dimensional array through "views". 

VIEWS, in the numpy sense, are just different ways of slicing and dicing the same memory buffer 
# without making a copy.

MANIPULATIONS like basic indexing (slicing) and reshaping usually just return another view of the 
# unchanged original memory buffer; indexing with an integer array returns a copy instead.
'''

In [34]:
import numpy as np

In [None]:
# 1

In [35]:
# Positions to mutate: 3 random integers from [0, 3) -- np.random.randint
# excludes the upper bound.
new_bases_i = np.random.randint(0, 3, size=3)
# Replacement bases: 3 random letters drawn from "ACGT".
new_bases_c = np.random.choice(list("ACGT"), size=3)

In [36]:
new_bases_i

array([2, 1, 2])

In [37]:
new_bases_c

array(['T', 'C', 'T'], dtype='<U1')

In [None]:
# 2

In [38]:
dna = "AAAGGG"

In [39]:
dna

'AAAGGG'

In [40]:
# dtype='c' splits the string into an array with one single-byte character
# per element, which (unlike a str) supports item assignment.
dna = np.array(dna, dtype='c')

In [41]:
dna

array([b'A', b'A', b'A', b'G', b'G', b'G'], dtype='|S1')

In [42]:
print(type(dna[0]))

<class 'numpy.bytes_'>


In [None]:
# 3

In [43]:
dna[new_bases_i]

array([b'A', b'A', b'A'], dtype='|S1')

In [44]:
dna[new_bases_i] = new_bases_c

In [45]:
dna

array([b'A', b'C', b'T', b'G', b'G', b'G'], dtype='|S1')

In [46]:
def make_snp_vector(dna, N):
    """Overwrite ``N`` randomly chosen positions of ``dna`` in one vectorized step.

    All mutation sites and all replacement bases are generated at once with
    numpy and written into a byte array with a single indexed assignment,
    with no Python-level loop.

    Sites are drawn independently, so the same position can be selected more
    than once, and a replacement base may equal the base already present.
    Fewer than ``N`` positions may therefore differ from the input.

    Parameters
    ----------
    dna : str
        ASCII sequence to mutate.
    N : int
        Number of (site, base) pairs to draw.

    Returns
    -------
    str
        A new string of the same length as ``dna``; an empty input is
        returned unchanged.
    """
    # Nothing to mutate; also avoids asking randint for an empty range.
    if not dna:
        return dna

    # One byte per base. frombuffer yields a read-only view of the encoded
    # bytes, so copy it to get a writable array.
    dna_np = np.frombuffer(dna.encode("ascii"), dtype="S1").copy()

    # Create mutation sites. The upper bound of np.random.randint is
    # exclusive, so it must be len(dna_np) (not len - 1) for the last
    # position to be reachable.
    mutation_sites = np.random.randint(0, len(dna_np), size=N)

    # Create new bases (as single bytes, matching the array's dtype).
    bases = np.array([b"A", b"C", b"G", b"T"])
    new_bases_c = np.random.choice(bases, size=N)

    dna_np[mutation_sites] = new_bases_c

    # Dump the raw buffer in one call instead of joining element by element.
    return dna_np.tobytes().decode("ascii")

In [47]:
a = make_snp_vector("AAAGGG", 3)

In [48]:
a

'TAGTGG'

In [49]:
len(long_dna)

1000000

In [50]:
%%timeit
nmutations = 10_000
make_snp_vector(long_dna, nmutations)

188 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
