In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import itertools
import pandas as pd
from scipy.optimize import fmin_l_bfgs_b
from scipy.optimize import check_grad
from scipy.special import logsumexp
from Bio import SeqIO
import random

%load_ext cython

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

In [3]:
#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import get_directive_defaults
directive_defaults = get_directive_defaults()
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

### Import fasta files

In [4]:
def parse_fasta(file_name):
    input_seq_iterator = SeqIO.parse(file_name, "fasta")
    return [str(record.seq) for record in input_seq_iterator]

In [5]:
set_size = 20
bg = parse_fasta('HNRNPA0_1_TGTCGA40NCCGA_AAG_1.fasta.tmp')
plus = parse_fasta('HNRNPA0_4_TGTCGA40NCCGA_AAG_4.fasta.tmp')

bg = random.sample(bg, set_size)
plus = random.sample(plus, set_size)

In [6]:
bg   = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in bg]
plus = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in plus]

### cython

In [67]:
%%cython -a -f -I . --compile-args=-DCYTHON_TRACE=1 


cimport cython
import numpy as np
import itertools
from libc.math cimport exp,pow


cdef int l = 3 #l_A=l_B=3 nucleotides
cdef int l_p = 3 #persistence length is 3 nucleotides 
cdef double cpi = np.pi

cpdef generate_kmer_inx():
    cdef dict vals = {'A':0,'C':1,'G':2,'T':3}
    cdef dict kmer_inx = {}
    
    for p in list(itertools.product(vals.keys(), repeat=l)):
        inx = 0
        for j,base in enumerate(p):
            inx += (4**j)*vals[base] 
        kmer_inx[''.join(p)] = inx
    return kmer_inx

kmer_inx = generate_kmer_inx()


cpdef seq2int_cy(str sequence):
    cdef int L = len(sequence)
    kmer_array = np.zeros(L, dtype=int)
    
    cdef i
    for i in range(l-1,L):
        kmer = sequence[i-l+1:i+1]
        kmer_array[i] = kmer_inx[kmer]
    return kmer_array        


cpdef void assign_za_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double D):
    if i == l-1:
        za[i] = cab * exp(-Ea[x[i]])
        return
    za[i] = zb[i-l] * cab * exp(-Ea[x[i]])
    
cpdef void assign_zb_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double D):
    cdef double z = zb[i-1]
    cdef int j
    
    if i == l-1:
        z += cab*exp(-Eb[x[i]])  
    else:
        for j in range(0,i-l+1):
            z += za[j]*cb_cy(i-j-l, cab, D)*exp(-Eb[x[i]])
            z += zb[j]*cab*np.exp(-Eb[x[i]])      
    zb[i] = z 


@cython.cdivision(True)
cpdef double cb_cy(int d, double cab, double D):
    
    if d < 0:
        return 0
    cdef double sig = 1 / (3*(d+1)*l_p)
    cdef double gaussian = exp(-pow(D,2) / (2 * pow(sig, 2.))) / pow(2*cpi*pow(sig,2),3/2)
    return (cab + gaussian)


cpdef void assign_za_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double D):
    identical = (inx == x[i])
    
    if i == l-1:
        za_Ea_derivatives[inx,i] = -identical*cab*exp(-Ea[x[i]])
        za_Eb_derivatives[inx,i] = 0
        return
    
    za_Ea_derivatives[inx,i] = cab*zb_Ea_derivatives[inx,i-l]*exp(-Ea[x[i]]) - cab*zb[i-l]*identical*exp(-Ea[x[i]])
    za_Eb_derivatives[inx,i] = cab*zb_Eb_derivatives[inx,i-l]*exp(-Ea[x[i]])

    

@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False)    
def assign_zb_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double D):
    cdef int identical = (inx == x[i])
    
    cdef double der_b = zb_Eb_derivatives[inx,i-1]
    cdef double der_a = zb_Ea_derivatives[inx,i-1]
    cdef int j
    
    if i == l-1:
        der_b += -cab*identical*exp(-Eb[x[i]])
        der_a += 0
        
    else:
        for j in range(0,i-l+1):
            der_b += cb_cy(i-j-l, cab, D) * ((za_Eb_derivatives[inx,j]*exp(-Eb[x[i]]) - za[j]*exp(-Eb[x[i]])*identical))
            der_b += cab * (zb_Eb_derivatives[inx,j]*exp(-Eb[x[i]]) - zb[j]*exp(-Eb[x[i]])*identical)
            
            der_a += cb_cy(i-j-l, cab, D) * za_Ea_derivatives[inx,j]*exp(-Eb[x[i]])
            der_a += cab * zb_Ea_derivatives[inx,j]*exp(-Eb[x[i]]) 

    
    zb_Ea_derivatives[inx,i] = der_a
    zb_Eb_derivatives[inx,i] = der_b

cdef extern from "assign_zb_E_derivatives.c":
    pass
    
cdef extern from "assign_zb_E_derivatives.h":
    void assign_zb_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double D)
    
    
cpdef double cb_D_derivative_cy(int d, double D):
    if d < 0:
        return 0
    cdef double sig = 1 / (3*(d+1)*l_p)
    cdef double der = -(D * exp((-np.power(D,2))/(2*np.power(sig,2)))) / (np.power(2*cpi, 3/2)*np.power(sig,5))
    return der

cpdef void assign_za_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, double cab, double D):
    if i == l-1:
        za_D_derivatives[i] = 0
        return
    za_D_derivatives[i] = zb_D_derivatives[i-l]*cab*exp(-Ea[x[i]])
    
cpdef void assign_zb_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, double cab, double D):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_D_derivatives[j]*cb_cy(i-l-j, cab, D) + za[j]*cb_D_derivative_cy(i-l-j, D)
            der += zb_D_derivatives[j]*cab
    der *= exp(-Eb[x[i]])
    der += zb_D_derivatives[i-1]
    
    zb_D_derivatives[i] = der
    
    
cpdef void assign_za_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, double cab, double D):
    if i == l-1:
        za_cab_derivatives[i] = exp(-Ea[x[i]])
        return
    za_cab_derivatives[i] = exp(-Ea[x[i]])*(zb_cab_derivatives[i-l]*cab + zb[i-l])
    
cpdef void assign_zb_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, double cab, double D):
    cdef double der = 0
    cdef int j
    
    if i == l-1:
        der += 1
    else:
        for j in range(0,i-l+1):
            der += za_cab_derivatives[j]*cb_cy(i-l-j, cab, D) + za[j]
            der += zb_cab_derivatives[j]*cab + zb[j]
    der *= exp(-Eb[x[i]])
    der += zb_cab_derivatives[i-1]
    
    zb_cab_derivatives[i] = der
    
    
def DP_Z_cy(double[:] args, long[:] x):
    
    cdef int L = len(x)

    cdef double[:] Ea = args[0:len(kmer_inx)]
    cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
    cdef double cab = args[-2]
    cdef D = args[-1]
    
    #initialization of statistical weigths
    cdef double[:] za = np.zeros(L)
    cdef double[:] zb = np.zeros(L)

    cdef int i
    for i in range(0,l-1):
        zb[i] = 1 

    #initialization of derivatives
    cdef double[:,::1] za_Ea_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Ea_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:,::1] za_Eb_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Eb_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:] za_D_derivatives = np.zeros(L)
    cdef double[:] zb_D_derivatives = np.zeros(L)

    cdef double[:] za_cab_derivatives = np.zeros(L)
    cdef double[:] zb_cab_derivatives = np.zeros(L)


    cdef int inx
    #dynamic programming calculation of z and derivatives 
    for i in range(l-1,L):
        #calculate statistical weights
        assign_za_cy(x, i, za, zb, Ea, Eb, cab, D)
        assign_zb_cy(x, i, za, zb, Ea, Eb, cab, D)
        
        #calculate derivatives
        for inx in range(len(kmer_inx)):
            assign_za_E_derivatives_cy(x, i, inx, za, zb, za_Ea_derivatives, zb_Ea_derivatives, za_Eb_derivatives, zb_Eb_derivatives, Ea, Eb, cab, D)
            #assign_zb_E_derivatives_cy(x, i, inx, za, zb, za_Ea_derivatives, zb_Ea_derivatives, za_Eb_derivatives, zb_Eb_derivatives, Ea, Eb, cab, D)
            assign_zb_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, D)
        
        #print(za_Ea_derivatives[:,i])
        #print(za_Eb_derivatives[:,i])
        
        assign_za_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, D)
        assign_zb_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, D)
        
        assign_za_cab_derivative_cy(x, i, za, zb, za_cab_derivatives, zb_cab_derivatives, Ea, Eb, cab, D)
        assign_zb_cab_derivative_cy(x, i, za, zb, za_cab_derivatives, zb_cab_derivatives, Ea, Eb, cab, D)
        
    cdef double Z_x = zb[L-1] + np.sum(za)
    #print(Z_x)
    #derivative of Z(x)
    d_Ea = zb_Ea_derivatives[:,L-1] + np.sum(za_Ea_derivatives, axis=1)
    d_Eb = zb_Eb_derivatives[:,L-1] + np.sum(za_Eb_derivatives, axis=1)
    d_D = zb_D_derivatives[L-1] + np.sum(za_D_derivatives)
    d_cab = zb_cab_derivatives[L-1] + np.sum(za_cab_derivatives)
    
    
    cdef double[:] gradient = np.concatenate([q.ravel() for q in [d_Ea, d_Eb, np.array([d_cab, d_D])]])
    #print(gradient)
    
    return Z_x, gradient



In [9]:
Ea = np.zeros(len(kmer_inx)) + 1
Eb = np.zeros(len(kmer_inx)) + 1
D = cab = 1

In [10]:
parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([cab, D])]])

#check_grad(lambda input_vector: nLL(input_vector, plus, bg)[0], lambda input_vector: nLL(input_vector, plus, bg)[1], x0 = parameters)
#check_grad(lambda input_vector: DP_Z(input_vector, plus[0])[0], lambda input_vector: DP_Z(input_vector, plus[0])[1], x0 = parameters)

In [11]:
param2 = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([cab, D])]])

In [65]:
#inline
profile = line_profiler.LineProfiler(DP_Z_cy)
profile.runcall(DP_Z_cy, np.exp(param2), seq2int_cy(plus[0]))
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.033383 s
File: /home/salma/.cache/ipython/cython/_cython_magic_57cee2e791a5832d62ba45f5f9b63375.pyx
Function: DP_Z_cy at line 172

Line #      Hits         Time  Per Hit   % Time  Line Contents
   172                                           def DP_Z_cy(double[:] args, long[:] x):
   173                                               
   174         1          3.0      3.0      0.0      cdef int L = len(x)
   175                                           
   176         1          4.0      4.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   177         1          3.0      3.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   178         1          2.0      2.0      0.0      cdef double cab = args[-2]
   179         1          2.0      2.0      0.0      cdef D = args[-1]
   180                                               
   181                                               #initialization of statistical weigths
   1

In [57]:
#Print profiling statistics using the `line_profiler` API
profile = line_profiler.LineProfiler(DP_Z_cy)
profile.runcall(DP_Z_cy, np.exp(param2), seq2int_cy(plus[0]))
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.030982 s
File: /home/salma/.cache/ipython/cython/_cython_magic_99293c4f78b501cc8bebaf6976f29289.pyx
Function: DP_Z_cy at line 172

Line #      Hits         Time  Per Hit   % Time  Line Contents
   172                                           def DP_Z_cy(double[:] args, long[:] x):
   173                                               
   174         1          4.0      4.0      0.0      cdef int L = len(x)
   175                                           
   176         1         12.0     12.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   177         1          2.0      2.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   178         1          1.0      1.0      0.0      cdef double cab = args[-2]
   179         1          2.0      2.0      0.0      cdef D = args[-1]
   180                                               
   181                                               #initialization of statistical weigths
   1

In [37]:
#Print profiling statistics using the `line_profiler` API
profile = line_profiler.LineProfiler(assign_zb_E_derivatives_cy)
profile.runcall(DP_Z_cy, np.exp(param2), seq2int_cy(plus[0]))
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.12387 s
File: /home/salma/.cache/ipython/cython/_cython_magic_33d072b888632d378d07bb8bcda0b485.pyx
Function: assign_zb_E_derivatives_cy at line 84

Line #      Hits         Time  Per Hit   % Time  Line Contents
    84                                           def assign_zb_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
    85                                                                            double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
    86                                                                            double[:] Ea, double[:] Eb, double cab, double D):
    87      2432       1286.0      0.5      1.0      cdef int identical = (inx == x[i])
    88                                               
    89      2432       1100.0      0.5      0.9      cdef double der_b = zb_Eb_derivatives[inx,i-1]
    90      2432       1111.0      0.5

In [87]:
%timeit DP_Z_cy(np.exp(param2), seq2int_cy(plus[0]))

1.48 s ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [66]:
%timeit DP_Z(np.exp(param2), plus[0])

1.94 s ± 31.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
type(plus[0])

str

In [75]:
parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([cab, D])]])
x_opt, fx, info = fmin_l_bfgs_b(DP_Z, x0=parameters, args=[plus[0]], callback=callb, pgtol=1e-20)

7539884.035454134
9.416012357764327
9.405013748886486
9.361096113646028
9.18665069023803
8.508236292237408
6.089788123427523
-0.22201271916948845
-1.107832744421442
-1.3001014577356516
-1.3142096354777362
-1.3145487179956938
-1.314559678822158
-1.314626884830774
-1.3147643868170071
-1.3151626997916672
-1.3161662573831763
-1.3188311722116592
-1.3257900675540903
-1.344235211810069
-1.3929866215567117
-1.498407976432927
-1.6180887336001923
-1.9576430518754833
-3.512805695427795
82.58494887338637
-3.9731261065631966
0.5700804095392666
-4.35534873311788
-4.987109878790998
6109242511316433.0
184208.02822043016
26.977923564693924
-5.4293071102447055
1.5515986114695095e+45
273200726346355.38
211685.9762822214
105.50580280081091
-4.923342817984762
-5.434823661783471
-5.471256299635311
-5.336761325058902
-5.490902339497879
-5.5427619941125315
-5.595420884524325
21.2466445387088
-5.650093021363549
-5.652038912460226
-5.676692862182054
-3.994548518522829
-5.68380484291897
-5.698326324433951
-5.732

  
  import sys
  import sys
  
  
  del sys.path[0]
  del sys.path[0]
  app.launch_new_instance()
  app.launch_new_instance()
  del sys.path[0]
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


nan


  import sys
  
  del sys.path[0]
  
  
  app.launch_new_instance()
  if __name__ == '__main__':


nan
nan
nan
nan
nan
nan


  """
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  """
  if __name__ == '__main__':
  


nan
nan


  """
  # Remove the CWD from sys.path while we load stuff.


nan


  This is separate from the ipykernel package so we can avoid doing imports until


nan
nan
nan
nan
nan


In [127]:
DP_Z(np.exp(param2*3), plus[0])

(1.00002820013304,
 array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -3.80061444e-08,  0.00000000e+00, -3.80054814e-08,  0.00000000e+00,
        -3.80057471e-08,  0.00000000e+00, -7.60115651e-08,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -3.80060591e-08,
         0.00000000e+00,  0.00000000e+00, -3.80057760e-08, -3.80061010e-08,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -3.80054770e-08, -1.14016571e-07,  0.00000000e+00,
        -3.80062368e-08,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -7.60116257e-08, -3.80054756e-08,
        -3.80058064e-08,  0.00000000e+00, -7.60109989e-08, -3.80062859e-08,
        -3.80054742e-08, -7.60113758e-08, -1.52022066e-07, -7.60111722e-08,
        -3.80054944e-08,  0.00000000e+00,  0.00000000e+00, -3.80056460e-08,
         0.00000000e+00, -7.60116763e-08,  0.00000000e+00,  0.0000000

In [138]:
p_array = []
def callb(x):
    print('next iteration')
    p_array.append(x)

In [99]:
parameters = np.random.rand(len(parameters))

In [128]:
nLL(param2*3, plus[:5], bg[:5])

[1.0000282 1.0000282 1.0000282 1.0000282 1.0000282]


(11.512925464970223,
 array([ -0.05413761,  -0.10827534,  -0.10827579,  -0.08120614,
         -0.10827599,  -0.05413756,  -0.10827469,  -0.02706907,
         -0.18948208,  -0.08120599,  -0.24362006,  -0.081207  ,
         -0.13534374,  -0.16241328,  -0.18948116,  -0.35189444,
         -0.08120693,  -0.08120651,  -0.13534422,  -0.10827541,
         -0.05413776,  -0.08120664,  -0.05413762,  -0.10827535,
         -0.05413748,  -0.13534407,  -0.18948111,  -0.08120654,
         -0.10827597,  -0.13534447,  -0.10827477,  -0.1894812 ,
         -0.10827507,  -0.02706878,  -0.24361992,  -0.568445  ,
         -0.08120677,  -0.02706876,  -0.18948143,  -0.21654994,
         -0.40603289,  -0.16241266,  -0.48723701,  -0.27068719,
         -0.08120589,  -0.0812066 ,  -0.13534372,  -0.3248253 ,
         -0.1082755 ,  -0.08120687,  -0.13534428,  -0.08120598,
         -0.13534362,  -0.13534463,  -0.10827572,  -0.18948235,
         -0.29775635,  -0.08120598,  -0.40603101,  -0.24361835,
         -0.0812066

In [151]:
parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([cab, D])]])
parameters = 5*np.random.rand(len(parameters))
x_opt, fx, info = fmin_l_bfgs_b(nLL, x0=param2*2.5, args=(plus[0:30], bg[:30]), callback=callb)

[12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396 12



[3.99682391e+04 1.36481713e+00 1.20430812e+04 1.41883901e+00
 1.28439033e+00 1.57032498e+00 1.28540499e+00 1.11452533e+00
 2.04613842e+04 4.35378884e+04 1.22244552e+00 1.39622007e+00
 1.70248699e+00 9.86929069e+07 1.56228650e+00 2.44749172e+11
 3.55369795e+04 1.17367055e+00 3.35413700e+04 1.72560147e+03]
[1.11441936e+00 1.14090144e+00 1.27890052e+00 4.72452343e+07
 1.10487545e+00 1.07436344e+00 1.04972105e+00 1.00000000e+00
 1.12695770e+00 1.20635515e+00 1.10698372e+00 1.18233477e+00
 1.34803072e+00 1.05816604e+00 1.08447590e+00 5.01156135e+04
 1.00000000e+00 1.04358693e+00 1.18429475e+00 1.10222788e+00]
LL part 1: 	-97.509351
final LL: 	-56.558117
[ 1.16590478e-013  7.68475280e-036  9.15673431e-060  2.41581850e-093
 -1.44184402e-006  4.46671000e-029  2.35782090e-037  4.87162832e-124
  4.58887004e-019  3.24510224e-020  1.10751485e-210  9.55879766e-238
  9.02359943e-035  1.02025692e-186  1.92262586e-012 -2.12968016e-002
  8.62971113e-023  8.88790021e-070  4.66162621e-076  1.20756179e-09

In [152]:
info

{'grad': array([-2.86300181e-047, -4.93719643e-216, -0.00000000e+000,
        -0.00000000e+000,  9.15617609e-008, -2.97109924e-153,
        -8.08751861e-230, -0.00000000e+000, -6.08391709e-076,
        -1.37164361e-122, -0.00000000e+000, -0.00000000e+000,
        -9.35180882e-203, -0.00000000e+000, -4.46923296e-044,
        -1.30934826e-003, -5.33351132e-102, -0.00000000e+000,
        -0.00000000e+000, -0.00000000e+000, -0.00000000e+000,
        -0.00000000e+000, -0.00000000e+000, -5.39485003e-096,
        -0.00000000e+000, -0.00000000e+000, -4.67272827e-242,
        -0.00000000e+000, -0.00000000e+000, -0.00000000e+000,
        -0.00000000e+000, -8.84694897e-071, -0.00000000e+000,
        -4.36538910e-091, -0.00000000e+000, -6.18221137e-004,
        -0.00000000e+000,  2.20032972e-003, -0.00000000e+000,
        -0.00000000e+000, -7.20425961e-061, -0.00000000e+000,
        -0.00000000e+000, -0.00000000e+000, -4.14481469e-068,
        -0.00000000e+000, -0.00000000e+000, -0.00000000e+000,


In [153]:
np.exp(x_opt)[0:64]

array([1.25897335e+02, 5.16628156e+02, 1.15637826e+03, 2.45116049e+03,
       5.42501097e+00, 3.71618183e+02, 5.48541001e+02, 3.93462318e+03,
       1.92763993e+02, 3.00599562e+02, 9.89434461e+03, 1.20427942e+04,
       4.86074907e+02, 7.98364390e+03, 1.18403554e+02, 1.59572865e+01,
       2.52960015e+02, 1.49572434e+03, 1.70857014e+03, 2.48672718e+03,
       8.23218920e+02, 1.84368040e+03, 1.54596268e+03, 2.37628505e+02,
       3.72804122e+03, 6.25021071e+03, 5.76755193e+02, 3.56097494e+03,
       2.61317568e+03, 1.99703410e+03, 1.13083785e+03, 1.81282663e+02,
       6.71458165e+03, 2.26069006e+02, 9.54697250e+03, 1.47500801e+01,
       9.90332138e+03, 1.45982585e+01, 4.97593155e+03, 1.75078346e+03,
       1.57933762e+02, 1.63112454e+03, 7.69634619e+03, 3.29183830e+04,
       1.74556796e+02, 5.33022700e+03, 7.18811966e+04, 1.19517569e+04,
       6.61565033e+02, 1.09364893e+03, 1.57197679e+02, 5.33926951e+01,
       1.52531532e+03, 2.32463516e+03, 3.71736246e+03, 1.15403829e+03,
      

In [154]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

In [155]:
pd.Series(core1).sort_values(ascending=True)

ACA        5.425011
CCG       14.598258
TAG       14.750080
TTA       15.957287
TAT       53.392695
           ...     
ACG     9903.321378
TTG    11951.756912
TGA    12042.794205
TGG    32918.383039
GTG    71881.196634
Length: 64, dtype: float64

In [156]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

ACA    2.395342e-85
GTA    1.746732e+01
TTA    1.812445e+01
GAT    7.050780e+02
TGA    1.917073e+03
           ...     
AGG    1.479785e+43
TAG    6.643186e+44
GTT    3.551008e+45
GGT    4.372147e+52
CAG    9.317606e+56
Length: 64, dtype: float64

In [22]:
parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([cab, D])]])
X_p = ['ACCCTGG', 'ACGTTAG']
X_bg = ['ACCCTGG', 'ACCTTGG', 'ACCGGAA']

In [31]:
DP_Z(parameters, seq)

(2724.970354545334,
 array([-4.19253876e+01,  0.00000000e+00,  0.00000000e+00, -4.00341270e+01,
        -4.91977715e+01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -1.44404408e+02,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -3.87190212e+01,
        -4.47994075e+01,  0.00000000e+00, -2.19150415e+02,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -1.59224782e+02,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -1.43650713e+02,
         0.00000000e+00, -4.38407472e+01, -3.68179906e+01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -5.72625244e+01,  0.00000000e+00,  0.000000

In [53]:
parameters.shape

(130,)

In [44]:
seq='AAA'

In [33]:
14000*30000

420000000

In [60]:
Eb

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [23]:
DP_Z(seq, (Ea, Eb, cab, D))

za[2]=  56040947502.27
za[3]=  0.37
za[4]=  0.37
za[5]=  0.37
za[6]=  0.37
za[7]=  0.37
za[8]=  0.37
za[9]=  7584317503.57
za[10]=  15168635006.96
za[11]=  22752952510.53
za[12]=  30337270014.29
za[13]=  40711702002.98
za[14]=  56666362961.41
za[15]=  80991367374.47
za[16]=  117503255484.55
za[17]=  171044993291.61
za[18]=  248512398310.61
za[19]=  359880565329.05
za[20]=  519985494032.59


  


(5981674997388.705,
 (array([-3.31091497,  0.        ,  0.        , -0.73575888,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         -0.73575888,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        , -0.36787944,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         -0.36787944,  0.        ,  0.        , -1.47151776]),
  array([            inf,  9.

In [24]:
za

array([0.00000000e+00, 0.00000000e+00, 5.60409475e+10, 3.67879441e-01,
       3.67879441e-01, 3.67879441e-01, 3.67879441e-01, 3.67879441e-01,
       3.67879441e-01, 7.58431750e+09, 1.51686350e+10, 2.27529525e+10,
       3.03372700e+10, 4.07117020e+10, 5.66663630e+10, 8.09913674e+10,
       1.17503255e+11, 1.71044993e+11, 2.48512398e+11, 3.59880565e+11,
       5.19985494e+11])

In [27]:
Eb

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [100]:
a = np.zeros((len(kmer_inx),L))
len(a[:][L-1])

20

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [114]:
np.sum(a, axis=1)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
def DP_der(x, Ea, Eb, cab, D):

### Calculating log-likelihood

# drafts

In [35]:
x_opt, fx, info = fmin_l_bfgs_b(obj_fun, x0=np.array([12]), args=(12,))

In [31]:
x_opt

array([3.60448e+155])

In [32]:
fx

array([-inf])

In [37]:
from scipy.optimize import check_grad

In [43]:
check_grad(lambda x: obj_fun(x, 12)[0],lambda x: obj_fun(x, 12)[1], np.array([12]))

array([0.])