## Learning bi-partite motifs based on a thermodynamic approach
### Implements the dynamic programming and the gradient descent

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import itertools
import pandas as pd
from scipy.optimize import fmin_l_bfgs_b
from scipy.optimize import check_grad
from scipy.special import logsumexp
from Bio import SeqIO
import random

%load_ext cython

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

In [2]:
#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import get_directive_defaults
directive_defaults = get_directive_defaults()
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

### cython

In [20]:
%%cython -f -I . --compile-args=-DCYTHON_TRACE=1 


cimport cython
import numpy as np
import itertools
from libc.math cimport exp,pow


cdef int l = 3 #l_A=l_B=3 nucleotides
cdef int l_p = 3 #persistence length is 3 nucleotides 
cdef double cpi = np.pi

cpdef generate_kmer_inx():
    cdef dict vals = {'A':0,'C':1,'G':2,'T':3}
    cdef dict kmer_inx = {}
    
    for p in list(itertools.product(vals.keys(), repeat=l)):
        inx = 0
        for j,base in enumerate(p):
            inx += (4**j)*vals[base] 
        kmer_inx[''.join(p)] = inx
    return kmer_inx

kmer_inx = generate_kmer_inx()
inx_kmer = {y:x for x,y in kmer_inx.items()}

cpdef seq2int_cy(str sequence):
    cdef int L = len(sequence)
    kmer_array = np.zeros(L, dtype=int)
    
    cdef i
    for i in range(l-1,L):
        kmer = sequence[i-l+1:i+1]
        kmer_array[i] = kmer_inx[kmer]
    return kmer_array        


cpdef void assign_za_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    za[i] = (zb[i-l] + np.sum(za[:i-l+1])) * cab * exp(-Ea[x[i]])
    
cpdef void assign_zb_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    cdef double z = zb[i-1]
    cdef int j

    for j in range(0,i-l+1):
        z += za[j]*cb_c(i-j-l, sf, D, sig)*exp(-Eb[x[i]])
    z += zb[i-l]*cab*np.exp(-Eb[x[i]])      
    zb[i] = z 


cpdef void assign_za_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    identical = (inx == x[i])
    
    za_Ea_derivatives[inx,i] = cab*(zb_Ea_derivatives[inx,i-l] + np.sum(za_Ea_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]]) - cab*(zb[i-l] + np.sum(za[:i-l+1]))*identical*exp(-Ea[x[i]])
    za_Eb_derivatives[inx,i] = cab*(zb_Eb_derivatives[inx,i-l]+ np.sum(za_Eb_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]])


cdef extern from "src_helper.c":
    pass
    
cdef extern from "src_helper.h":
    cdef void assign_zb_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef void assign_za_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef double cb_c(int, double, double, double)
    cdef double cb_D_derivative_c(int, double, double, double);
    cdef double cb_sig_derivative_c(int, double, double, double);
    cdef double cb_sf_derivative_c(int, double, double, double);
    
    
cpdef void assign_za_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    za_D_derivatives[i] = (zb_D_derivatives[i-l]+np.sum(za_D_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])
    

cpdef void assign_za_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    za_sig_derivatives[i] = (zb_sig_derivatives[i-l]+np.sum(za_sig_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_za_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    za_sf_derivatives[i] = (zb_sf_derivatives[i-l]+np.sum(za_sf_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_zb_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_D_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_D_derivative_c(i-l-j, sf, D, sig)
    der += zb_D_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_D_derivatives[i-1]
    
    zb_D_derivatives[i] = der
    
cpdef void assign_zb_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_sig_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sig_derivative_c(i-l-j, sf, D, sig)
    der += zb_sig_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sig_derivatives[i-1]
    
    zb_sig_derivatives[i] = der


cpdef void assign_zb_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    for j in range(0,i-l+1):
        der += za_sf_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sf_derivative_c(i-l-j, sf, D, sig)
    der += zb_sf_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sf_derivatives[i-1]
    
    zb_sf_derivatives[i] = der

    
    
cpdef void assign_za_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    za_cab_derivatives[i] = exp(-Ea[x[i]])*((zb_cab_derivatives[i-l]+ np.sum(za_cab_derivatives[:i-l+1]))*cab + zb[i-l] + np.sum(za[:i-l+1]))
    
cpdef void assign_zb_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_cab_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]
        der += zb_cab_derivatives[j]*cab + zb[j]
    der *= exp(-Eb[x[i]])
    der += zb_cab_derivatives[i-1]
    
    zb_cab_derivatives[i] = der
    
    
def DP_Z_cy(double[:] args, long[:] x):
    
    cdef int L = len(x)
    cdef double cab = 1.0

    cdef double[:] Ea = args[0:len(kmer_inx)]
    cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
    cdef double sf = args[-3]
    cdef double D = args[-2]
    cdef double sig = args[-1]
    
    #initialization of statistical weigths
    cdef double[:] za = np.zeros(L)
    cdef double[:] zb = np.zeros(L)

    cdef int i
    for i in range(0,l):
        zb[i] = 1 

    #initialization of derivatives
    cdef double[:,::1] za_Ea_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Ea_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:,::1] za_Eb_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Eb_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:] za_sf_derivatives = np.zeros(L)
    cdef double[:] zb_sf_derivatives = np.zeros(L)
    
    cdef double[:] za_D_derivatives = np.zeros(L)
    cdef double[:] zb_D_derivatives = np.zeros(L)

    cdef double[:] za_sig_derivatives = np.zeros(L)
    cdef double[:] zb_sig_derivatives = np.zeros(L)


    cdef int inx
    #dynamic programming calculation of z and derivatives 
    for i in range(l,L):
        #calculate statistical weights
        assign_za_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        assign_zb_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        
        #calculate derivatives
        for inx in range(len(kmer_inx)):
            assign_za_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
            assign_zb_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
        
        
        assign_za_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        
    #print(np.asarray(za))
    #print(np.asarray(zb))
    
    Z_x = zb[L-1] + np.sum(za)
    
    #derivative of Z(x)
    d_Ea = zb_Ea_derivatives[:,L-1] + np.sum(za_Ea_derivatives, axis=1)
    d_Eb = zb_Eb_derivatives[:,L-1] + np.sum(za_Eb_derivatives, axis=1)
    
    d_sf = zb_sf_derivatives[L-1] + np.sum(za_sf_derivatives)
    d_D = zb_D_derivatives[L-1] + np.sum(za_D_derivatives)
    d_sig = zb_sig_derivatives[L-1] + np.sum(za_sig_derivatives)
    
    
    gradient = np.concatenate([q.ravel() for q in [d_Ea, d_Eb, np.array([d_sf, d_D, d_sig])]])
    
    return Z_x, gradient



### implementation of the LL object

In [16]:
class nLL:
    def __init__(self, seqs_p, seqs_bg):
        
        self.N_p = len(seqs_p)
        self.N_bg = len(seqs_bg)

        #calculate background probabilities:

        #include positive sequences in bg sequences if not there
        X_bg_t = list(set(seqs_p + seqs_bg))  #number of unique sequences
        
        counts = np.zeros(len(X_bg_t))
        for i, x in enumerate(X_bg_t):
            counts[i] = seqs_bg.count(x)
            
        counts = counts + 1 #pseudocount to make sure 
        counts = counts/np.sum(counts)

        p_bg = dict(zip(X_bg_t, counts))

        self.pbg_xp = np.array([p_bg[x] for x in seqs_p])
        self.pbg_xbg = np.array([p_bg[xbg] for xbg in seqs_bg])
        
        #add a padding nucleotide to the beginning to make the calculations stable, binding starts at
        #position i=l so the padded nucleotide has no effect.
        self.X_p = [seq2int_cy('A' + x) for x in seqs_p] 
        self.X_bg = [seq2int_cy('A' + x) for x in seqs_bg]
        
        
    def __call__(self, parameters):
        
        n_pos = 3
        #exp parameters to make sure they are positive
        args = parameters.copy()
        args[-n_pos:] = np.exp(args[-n_pos:])
        #print(args[-10:])
    
        #implement LL and derivatives   
        z_x = np.zeros(self.N_p)
        d_z_x = np.zeros((2*len(kmer_inx)+ n_pos, self.N_p))

        z_xbg = np.zeros(self.N_bg)
        d_z_xbg = np.zeros((2*len(kmer_inx)+ n_pos, self.N_bg))


        for i, xp in enumerate(self.X_p):
            z_x[i], d_z_x[:,i] = DP_Z_cy(args, xp)


        for i, xbg in enumerate(self.X_bg):
            z_xbg[i], d_z_xbg[:,i] = DP_Z_cy(args, xbg)
        
        ll = np.sum(np.log(self.pbg_xp) + np.log(np.ones(self.N_p) - (np.ones(self.N_p)/z_x)))
        ll -= self.N_p * logsumexp( np.log(self.pbg_xbg) + np.log(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg)) )
        
        dll = np.sum(d_z_x/(z_x*(z_x-1)), axis=1)
        dll -= self.N_p * ( np.sum((self.pbg_xbg * d_z_xbg)/(z_xbg*z_xbg), axis=1 ) / np.sum(self.pbg_xbg*(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg))))

        #exp modify dLL
        dll[-n_pos:] = dll[-n_pos:]*args[-n_pos:]

        #regularize
        if True:
            comp = -3
            reg = 1e-6 
            ll -= np.power(args[comp],2)*reg
            dll[comp] -= 2*reg*args[comp]

        return -ll, -dll 

### Import fasta files

In [6]:
def parse_fasta(file_name):
    input_seq_iterator = SeqIO.parse(file_name, "fasta")
    return [str(record.seq) for record in input_seq_iterator]

In [7]:
def swap_cores (args):
    core1_args = args[:len(kmer_inx)]
    core2_args = args[len(kmer_inx):len(kmer_inx)*2]
    return np.concatenate([x.ravel() for x in [core2_args, core1_args, np.array([args[-2], args[-1]])]])

In [8]:
if False:
    set_size = 100
    bg = parse_fasta('HNRNPA0_1_TGTCGA40NCCGA_AAG_1.fasta.tmp')
    plus = parse_fasta('HNRNPA0_4_TGTCGA40NCCGA_AAG_4.fasta.tmp')

    bg = random.sample(bg, set_size)
    plus = random.sample(plus, set_size)
    
    bg   = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in bg]
    plus = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in plus]
    
else:
    bg = parse_fasta('negatives_toy.fasta')
    plus = parse_fasta('positives_toy.fasta')

In [9]:
p_array = []
def callb(x):
    print('next iteration')
    p_array.append(x)

In [11]:
Ea = np.zeros(len(kmer_inx)) + 10
Eb = np.zeros(len(kmer_inx)) + 10
sf = np.log(100)
D = np.log(3)
sig = np.log(3)

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])

In [22]:
nll_obj = nLL(plus[:20],bg[:20])

In [24]:
%lprun -T lprof0 -f DP_Z_cy DP_Z_cy(parameters, seq2int_cy(plus[0]))


*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

Total time: 0.044824 s
File: /home/salma/.cache/ipython/cython/_cython_magic_e2ebc85239078311a48cd1f94450c1fd.pyx
Function: DP_Z_cy at line 158

Line #      Hits         Time  Per Hit   % Time  Line Contents
   158                                           def DP_Z_cy(double[:] args, long[:] x):
   159                                               
   160         1          8.0      8.0      0.0      cdef int L = len(x)
   161         1          4.0      4.0      0.0      cdef double cab = 1.0
   162                                           
   163         1         13.0     13.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   164         1          3.0      3.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   165         1          3.0      3.0      0.0      cdef double sf = args[-3]
   166         1          3.0      3.0      0.0      cdef double D = args[-2]
   167         1          2.0      2.0      0.0      cdef double sig 

In [12]:
%lprun -T lprof0 -f DP_Z_cy DP_Z_cy(parameters, seq2int_cy(plus[0]))


*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

Total time: 0.046286 s
File: /home/salma/.cache/ipython/cython/_cython_magic_17c95ea7c9a3ee48200f609a94c3d19f.pyx
Function: DP_Z_cy at line 191

Line #      Hits         Time  Per Hit   % Time  Line Contents
   191                                           def DP_Z_cy(double[:] args, long[:] x):
   192                                               
   193         1          5.0      5.0      0.0      cdef int L = len(x)
   194         1          3.0      3.0      0.0      cdef double cab = 1.0
   195                                           
   196         1         13.0     13.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   197         1          4.0      4.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   198         1          3.0      3.0      0.0      cdef double sf = args[-3]
   199         1          2.0      2.0      0.0      cdef double D = args[-2]
   200         1          2.0      2.0      0.0      cdef double sig 

In [14]:
%lprun -T lprof0 -f DP_Z_cy DP_Z_cy(parameters, seq2int_cy(plus[0]))


*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

Total time: 0.251515 s
File: /home/salma/.cache/ipython/cython/_cython_magic_4657f4f0d0abadfe7840ea32594c0d58.pyx
Function: DP_Z_cy at line 188

Line #      Hits         Time  Per Hit   % Time  Line Contents
   188                                           def DP_Z_cy(double[:] args, long[:] x):
   189                                               
   190         1          4.0      4.0      0.0      cdef int L = len(x)
   191         1         10.0     10.0      0.0      cdef double cab = 1.0
   192                                           
   193         1          4.0      4.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   194         1          3.0      3.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   195         1          2.0      2.0      0.0      cdef double sf = args[-3]
   196         1          2.0      2.0      0.0      cdef double D = args[-2]
   197         1          2.0      2.0      0.0      cdef double sig 

In [21]:
param = parameters
intrep = seq2int_cy(plus[20])

for comp in [0,127,128,129,130]:
    epsilon = 1e-5
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(param, intrep)[1][comp])

component 0
0.0
0.0
component 127
-0.00013661644038265308
-0.00013661643089069622
component 128
4.9238391142125686e-08
4.9221867283524847e-08
component 129
9.86766224286839e-08
9.87268101479782e-08
component 130
-6.508127370352668e-08
-6.508425866948812e-08


In [23]:
param = np.log(parameters)
param = parameters
for comp in [0, 127,128,129,130]:
    epsilon = 1e-7
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = nll_obj(x_fwd)[0]
    fx_rev = nll_obj(x_rev)[0]
    
    print('component %d\t, 1=%f, 2=%f'%(comp, fx_fwd, nll_obj(param)[0]))
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(nll_obj(param)[1][comp])

component 0	, 1=73.787589, 2=73.787589
0.05244743306320743
0.05244675990695298
component 127	, 1=73.787589, 2=73.787589
0.6708569344482385
0.6708735725800907
component 128	, 1=73.787589, 2=73.787589
0.020000143763354572
0.00020000000000004343
component 129	, 1=73.787589, 2=73.787589
-3.552713678800501e-07
9.107298248878239e-18
component 130	, 1=73.787589, 2=73.787589
2.842170943040401e-07
-1.3010426069826055e-17


### optimization

In [33]:
nll_obj = nLL(plus[:50],bg[:50])

In [34]:
nll_obj(parameters)

[ 10.  10.  10.  10.  10.  10.  10. 100.   3.   3.]


(230.26850929940463,
 array([-5.27271502e-02, -3.96211494e-02, -1.44626075e-01, -2.62563374e-02,
         2.04473770e-04, -1.05285738e-01, -3.02493595e-01,  1.29853990e-02,
        -1.84064927e-01, -1.31368294e-01, -1.31364440e-01,  1.31424130e-02,
        -1.31652508e-01, -6.57835969e-02, -1.18322460e-01,  1.31429269e-01,
        -1.05075769e-01, -1.18244380e-01, -6.57575341e-02, -1.31562143e-01,
        -3.94793933e-02,  2.10145364e-01, -6.57486474e-02,  7.88001297e-02,
        -1.84335228e-01, -1.32911550e-02, -9.20614062e-02, -1.29581127e-02,
        -1.44686349e-01, -1.57875218e-01, -2.64476639e-02,  3.28875324e-01,
        -5.26497101e-02, -7.89104339e-02, -7.87167571e-02, -1.44765727e-01,
        -2.10430712e-01, -1.70977820e-01, -1.31434130e-01,  7.88079780e-02,
        -5.24901103e-02, -3.41953712e-01,  2.62463015e-02,  2.23701632e-01,
        -1.83968737e-01, -1.05146801e-01,  3.94758438e-02,  4.07791664e-01,
        -1.18371215e-01, -1.31491646e-01, -1.05071781e-01,  1.31415

In [35]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=parameters, callback=callb)

[ 10.  10.  10.  10.  10.  10.  10. 100.   3.   3.]
[ 9.98672275  9.97350059 10.02206653  9.92930857  9.84087484  9.92480472
  9.4296879  99.99327943  3.          3.        ]
[ 9.93361376  9.86750295 10.11033266  9.64654286  9.20437419  9.62402358
  7.14843951 99.96640168  3.          3.        ]
[ 9.72117778  9.44351241 10.46339716  8.51548     6.65837158  8.42089905
 -1.97655406 99.85896291  3.          3.        ]
next iteration
[  9.69586576   9.2635615   10.46140421   7.94794417   4.82162656
   7.82329112   0.93569699 104.24285468   2.99229675   2.90117079]
[  9.59461771   8.54375787  10.4534324    5.67780083  -2.52535356
   5.43285941  12.5847012  123.78934766   2.96168106   2.53735633]
[  9.6761463    9.12336972  10.45985159   7.50580213   3.39069984
   7.35772124   3.20450283 107.79109585   2.98630921   2.82643906]
[  9.68899553   9.21471886  10.46086328   7.79390243   4.32309346
   7.66108715   1.72614613 105.46559965   2.99020934   2.87491269]
next iteration
[ 9.29188785e+00 



[  9.58326505   7.4942963   10.12639695   2.17660819 -14.63431118
   1.76898896  44.08691306 236.12773063   2.93680839   1.48063101]
[  9.66828742   8.87776117  10.39535553   6.69371312   0.61014323
   6.50707542  10.02282082 123.50030818   2.97967444   2.52454355]
[  9.6874614    9.18975588  10.45601024   7.71239663   4.04802537
   7.575594     2.34079227 106.70623094   2.9894276    2.84736576]
[  9.68848321   9.20638257  10.45924263   7.76668389   4.23123554
   7.63253705   1.93140483 105.87829113   2.98994826   2.86568398]
next iteration
[  9.70511413   9.24744987  10.43553352   7.8797908    4.61836693
   7.75645075   2.24459772 123.18557025   3.05073841   2.54181547]
next iteration
[9.92335937e+00 9.74920878e+00 1.01360656e+01 9.23279253e+00
 9.59522183e+00 9.27164977e+00 6.43335465e+00 1.26369575e+04
 4.72353740e+00 6.33966473e-02]
[  9.71796298   9.27699013  10.41790282   7.95944664   4.91137138
   7.84565569   2.49120414 161.79278299   3.13027734   2.04533927]
[  9.72928388   9.

In [36]:
info

{'grad': array([-3.55226834e-02, -4.55455168e-02, -6.28983648e-02, -4.39573758e-02,
        -6.71505348e-02, -7.09834189e-02, -3.84740143e-02, -1.12660675e-01,
        -2.96560843e-02, -3.81223169e-02, -6.82263723e-02, -1.22145047e-01,
        -4.23412296e-02, -4.12157102e-02, -3.52418134e-02, -2.84102677e-01,
        -6.60510670e-02, -4.67674534e-02, -4.57688825e-02, -5.79782876e-02,
        -3.25702328e-02, -6.87648981e-02, -8.70746755e-02, -7.75628868e-02,
        -3.49900102e-02, -1.04719329e-01, -5.30466122e-02, -6.05017554e-02,
        -4.62648752e-02, -2.42495983e-02, -4.65048330e-02, -2.32951520e-01,
        -3.28419442e-02, -5.54197362e-02, -5.57546996e-02, -3.68996160e-02,
        -3.43664175e-02, -3.35458671e-02, -5.24807428e-02, -9.44157141e-02,
        -9.04641157e-02, -3.80593964e-02, -1.05945519e-02, -6.91492667e-02,
        -2.89486617e-02, -5.25587665e-02, -3.84599617e-02, -2.19286658e+00,
        -5.01809660e-02, -5.93137081e-02, -2.79398452e-02, -2.11490288e-01,
    

In [37]:
np.exp(x_opt[-3:])

array([1.61235495e+03, 3.88910255e+00, 3.27218686e-01])

In [38]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = x_opt[i]

pd.Series(core1).sort_values(ascending=True)

TTT     4.575371
TTG     6.978363
CTT     7.310049
TTC     8.120488
GTT     8.597464
         ...    
AGA    10.817140
AGC    10.823344
ACG    10.932757
GCA    11.369056
CGG    11.548851
Length: 64, dtype: float64

In [39]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = x_opt[i+64]
pd.Series(core2).sort_values(ascending=True)

TTT     4.570920
TTG     6.882138
CTT     7.382377
TTC     8.131902
GTT     8.597951
         ...    
AGA    10.818225
AGC    10.823157
ACG    10.933075
GCA    11.369471
CGG    11.549135
Length: 64, dtype: float64

In [150]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=swap_cores(x_opt), callback=callb, pgtol=1e-6)

[16.67470408 15.90506218 15.40663549 16.65659094 15.37226262 17.57691134
 18.71241176 15.26813967  0.1109494  12.18249396]
[1.00049347 1.00158774 1.00191142 1.00104375 1.0011884 ]
[1.00036705 1.00108433 1.0007672  1.000221   1.00044074]
LL part 1: 	-2692.932507
LL part 1+2: 	-1157.083439
final LL: 	-1157.817550
[ 3.14916388e-02  1.14712603e-01  3.86449691e-01 -5.30209290e-02
 -5.47356707e-01 -9.00305482e-02 -8.20593600e-04 -2.97083249e+00
 -1.16208575e-02  0.00000000e+00]


[16.67664332 15.91180111 15.42863752 16.65333    15.34122263 17.57106865
 18.71235506 15.10155447  0.11094464 12.18249396]
[1.00367846 1.03367406 1.04881998 1.04999608 1.03876285]
[1.00031698 1.04702534 1.02999541 1.00019554 1.01977034]
LL part 1: 	-2124.225046
LL part 1+2: 	-1257.274054
final LL: 	-1258.009818
[-0.44872529 -1.10902925 -2.03071211 -0.41545092 -1.93731208 -0.28037227
 -0.06505667 -4.48391213  1.17257189  0.        ]


[16.67513866 15.90657217 15.41156353 16.65586009 15.36530093 17.57560179
 18.712399

In [151]:
info

{'grad': array([ 4.25035451e-04, -6.66331167e-03,  5.91623252e-04,  3.84238633e-04,
         4.23758515e-04,  4.24073351e-05,  3.86692959e-04,  3.87300639e-04,
         4.04712869e-04,  4.05248002e-04,  1.23800643e-03,  1.35259609e-03,
         4.83486950e-04,  4.60462314e-04,  3.38799483e-04,  8.36429012e-04,
         4.48109885e-04,  4.63599984e-04,  5.16712942e-04,  4.29845175e-04,
         2.59687636e-04, -4.21601759e-04,  4.33498156e-04,  2.64493714e-04,
         2.97510054e-04,  3.68259523e-04,  4.48981639e-04,  8.28952190e-04,
         3.25727334e-04,  3.13447837e-04,  4.34578312e-04,  3.61050889e-04,
         2.92269055e-04,  4.24710325e-04,  5.82334889e-04,  7.20973130e-04,
         3.65241616e-04,  3.78229397e-04,  5.87795164e-04,  4.93249480e-04,
         4.72402124e-04,  4.07938996e-04,  4.21739776e-04,  7.16801648e-04,
        -1.63260416e-01,  3.36408903e-04,  1.92325791e-01,  3.22293958e-04,
         5.28064355e-04,  1.03602740e-03,  1.14848670e-03,  1.87847456e-03,
    

In [13]:
np.exp(x_opt[-4:])

array([4.77134003e+00, 7.31678021e+03, 5.98350414e+00, 1.82418325e-02])

In [14]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

pd.Series(core1).sort_values(ascending=True)

TTT      5.660539
AAA      8.399421
AGA      9.168449
CTA      9.809132
ATA     10.137845
          ...    
TCT     25.512313
GAT     49.844746
CTT    147.737640
TTG    304.650022
GTT    775.787288
Length: 64, dtype: float64

In [15]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

TTT       4.771340
GCG       6.881357
CGC       8.018314
AAA       9.203048
ATA       9.421745
          ...     
TCT      25.495074
GAT      50.030763
CTT     155.782047
TTG     210.852680
GTT    1408.126797
Length: 64, dtype: float64

### debug mode

In [18]:
def set_kmer_arg (args, kmer, domain, value):
    inx = kmer_inx[kmer]
    inx = inx + (domain-1)*len(kmer_inx)
    args_m = args
    args_m[inx] = value
    return args_m

In [38]:
Ea = np.zeros(len(kmer_inx)) + 8
Eb = np.zeros(len(kmer_inx)) + 8
sf=10000
D=3
sig=1

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])


parameters = set_kmer_arg(parameters, 'TTT', 1, 0.1)
parameters = set_kmer_arg(parameters, 'TTT', 2, 0.1)
#parameters = set_kmer_arg(parameters, 'AAA', 1, 3)
#parameters = set_kmer_arg(parameters, 'AAA', 2, 3)

100