## Learning bi-partite motifs based on a thermodynamic approach
### Implements the dynamic programming and the gradient descent

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import itertools
import pandas as pd
from scipy.optimize import fmin_l_bfgs_b
from scipy.optimize import check_grad
from scipy.special import logsumexp
from Bio import SeqIO
import random

%load_ext cython

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

In [2]:
#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import get_directive_defaults
directive_defaults = get_directive_defaults()
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

### cython

In [88]:
%%cython -f -I . --compile-args=-DCYTHON_TRACE=1 


cimport cython
import numpy as np
import itertools
from libc.math cimport exp,pow


cdef int l = 3 #l_A=l_B=3 nucleotides
cdef int l_p = 3 #persistence length is 3 nucleotides 
cdef double cpi = np.pi

cpdef generate_kmer_inx():
    cdef dict vals = {'A':0,'C':1,'G':2,'T':3}
    cdef dict kmer_inx = {}
    
    for p in list(itertools.product(vals.keys(), repeat=l)):
        inx = 0
        for j,base in enumerate(p):
            inx += (4**j)*vals[base] 
        kmer_inx[''.join(p)] = inx
    return kmer_inx

kmer_inx = generate_kmer_inx()
inx_kmer = {y:x for x,y in kmer_inx.items()}

cpdef seq2int_cy(str sequence):
    cdef int L = len(sequence)
    kmer_array = np.zeros(L, dtype=int)
    
    cdef i
    for i in range(l-1,L):
        kmer = sequence[i-l+1:i+1]
        kmer_array[i] = kmer_inx[kmer]
    return kmer_array        


cpdef void assign_za_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    if i == l-1:
        za[i] = cab * exp(-Ea[x[i]])
        return
    za[i] = (zb[i-l] + np.sum(za[:i-l+1])) * cab * exp(-Ea[x[i]])
    
cpdef void assign_zb_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    cdef double z = zb[i-1]
    cdef int j
    
    if i == l-1:
        z += cab*exp(-Eb[x[i]])  
    else:
        for j in range(0,i-l+1):
            z += za[j]*cb_c(i-j-l, sf, D, sig)*exp(-Eb[x[i]])
        z += zb[i-l]*cab*np.exp(-Eb[x[i]])      
    zb[i] = z 


cpdef void assign_za_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    identical = (inx == x[i])
    
    if i == l-1:
        za_Ea_derivatives[inx,i] = -identical*cab*exp(-Ea[x[i]])
        za_Eb_derivatives[inx,i] = 0
        return
    
    za_Ea_derivatives[inx,i] = cab*(zb_Ea_derivatives[inx,i-l] + np.sum(za_Ea_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]]) - cab*(zb[i-l] + np.sum(za[:i-l+1]))*identical*exp(-Ea[x[i]])
    za_Eb_derivatives[inx,i] = cab*(zb_Eb_derivatives[inx,i-l]+ np.sum(za_Eb_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]])


cdef extern from "assign_zb_E_derivatives.c":
    pass
    
cdef extern from "assign_zb_E_derivatives.h":
    cdef void assign_zb_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef double cb_c(int, double, double, double)
    cdef double cb_D_derivative_c(int, double, double, double);
    cdef double cb_sig_derivative_c(int, double, double, double);
    cdef double cb_sf_derivative_c(int, double, double, double);
    
    
cpdef void assign_za_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    if i == l-1:
        za_D_derivatives[i] = 0
        return
    za_D_derivatives[i] = (zb_D_derivatives[i-l]+np.sum(za_D_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])
    

cpdef void assign_za_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    if i == l-1:
        za_sig_derivatives[i] = 0
        return
    za_sig_derivatives[i] = (zb_sig_derivatives[i-l]+np.sum(za_sig_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_za_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    if i == l-1:
        za_sf_derivatives[i] = 0
        return
    za_sf_derivatives[i] = (zb_sf_derivatives[i-l]+np.sum(za_sf_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_zb_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_D_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_D_derivative_c(i-l-j, sf, D, sig)
        der += zb_D_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_D_derivatives[i-1]
    
    zb_D_derivatives[i] = der
    
cpdef void assign_zb_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_sig_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sig_derivative_c(i-l-j, sf, D, sig)
        der += zb_sig_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sig_derivatives[i-1]
    
    zb_sig_derivatives[i] = der


cpdef void assign_zb_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_sf_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sf_derivative_c(i-l-j, sf, D, sig)
        der += zb_sf_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sf_derivatives[i-1]
    
    zb_sf_derivatives[i] = der

    
    
cpdef void assign_za_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    if i == l-1:
        za_cab_derivatives[i] = exp(-Ea[x[i]])
        return
    za_cab_derivatives[i] = exp(-Ea[x[i]])*((zb_cab_derivatives[i-l]+ np.sum(za_cab_derivatives[:i-l+1]))*cab + zb[i-l] + np.sum(za[:i-l+1]))
    
cpdef void assign_zb_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    
    if i == l-1:
        der += 1
    else:
        for j in range(0,i-l+1):
            der += za_cab_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]
            der += zb_cab_derivatives[j]*cab + zb[j]
    der *= exp(-Eb[x[i]])
    der += zb_cab_derivatives[i-1]
    
    zb_cab_derivatives[i] = der
    
    
def DP_Z_cy(double[:] args, long[:] x):
    
    cdef int L = len(x)
    cdef double cab = 1.0

    cdef double[:] Ea = args[0:len(kmer_inx)]
    cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
    cdef double sf = args[-3]
    cdef double D = args[-2]
    cdef double sig = args[-1]
    
    #initialization of statistical weigths
    cdef double[:] za = np.zeros(L)
    cdef double[:] zb = np.zeros(L)

    cdef int i
    for i in range(0,l-1):
        zb[i] = 1 

    #initialization of derivatives
    cdef double[:,::1] za_Ea_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Ea_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:,::1] za_Eb_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Eb_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:] za_sf_derivatives = np.zeros(L)
    cdef double[:] zb_sf_derivatives = np.zeros(L)
    
    cdef double[:] za_D_derivatives = np.zeros(L)
    cdef double[:] zb_D_derivatives = np.zeros(L)

    cdef double[:] za_sig_derivatives = np.zeros(L)
    cdef double[:] zb_sig_derivatives = np.zeros(L)


    cdef int inx
    #dynamic programming calculation of z and derivatives 
    for i in range(l-1,L):
        #calculate statistical weights
        assign_za_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        assign_zb_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        
        #calculate derivatives
        for inx in range(len(kmer_inx)):
            assign_za_E_derivatives_cy(x, i, inx, za, zb, za_Ea_derivatives, zb_Ea_derivatives, za_Eb_derivatives, zb_Eb_derivatives, Ea, Eb, cab, sf, D, sig)
            assign_zb_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
        
        
        assign_za_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        
    #print(np.asarray(za))
    #print(np.asarray(zb))
    
    Z_x = zb[L-1] + np.sum(za)
    
    #derivative of Z(x)
    d_Ea = zb_Ea_derivatives[:,L-1] + np.sum(za_Ea_derivatives, axis=1)
    d_Eb = zb_Eb_derivatives[:,L-1] + np.sum(za_Eb_derivatives, axis=1)
    
    d_sf = zb_sf_derivatives[L-1] + np.sum(za_sf_derivatives)
    d_D = zb_D_derivatives[L-1] + np.sum(za_D_derivatives)
    d_sig = zb_sig_derivatives[L-1] + np.sum(za_sig_derivatives)
    
    
    gradient = np.concatenate([q.ravel() for q in [d_Ea, d_Eb, np.array([d_sf, d_D, d_sig])]])
    
    return Z_x, gradient



### implementation of the LL object

In [47]:
class nLL:
    def __init__(self, seqs_p, seqs_bg):
        
        self.N_p = len(seqs_p)
        self.N_bg = len(seqs_bg)

        #calculate background probabilities:

        #include positive sequences in bg sequences if not there
        X_bg_t = list(set(seqs_p + seqs_bg))  #number of unique sequences
        
        counts = np.zeros(len(X_bg_t))
        for i, x in enumerate(X_bg_t):
            counts[i] = seqs_bg.count(x)
            
        counts = counts + 1 #pseudocount to make sure 
        counts = counts/np.sum(counts)

        p_bg = dict(zip(X_bg_t, counts))

        self.pbg_xp = np.array([p_bg[x] for x in seqs_p])
        self.pbg_xbg = np.array([p_bg[xbg] for xbg in seqs_bg])
        
        self.X_p = [seq2int_cy(x) for x in seqs_p]
        self.X_bg = [seq2int_cy(x) for x in seqs_bg]
        
        
    def __call__(self, args):
        
        #exp parameters to make sure they are positive
        args = np.exp(args)
        print(args[-10:])
    
        #implement LL and derivatives   
        z_x = np.zeros(self.N_p)
        d_z_x = np.zeros((2*len(kmer_inx)+3, self.N_p))

        z_xbg = np.zeros(self.N_bg)
        d_z_xbg = np.zeros((2*len(kmer_inx)+3, self.N_bg))


        for i, xp in enumerate(self.X_p):
            z_x[i], d_z_x[:,i] = DP_Z_cy(args, xp)
        print(z_x[:5])

        for i, xbg in enumerate(self.X_bg):
            z_xbg[i], d_z_xbg[:,i] = DP_Z_cy(args, xbg)
        print(z_xbg[:5])

        ll = np.sum(np.log(self.pbg_xp) + np.log(np.ones(self.N_p) - (np.ones(self.N_p)/z_x)))
        print("LL part 1: \t%f"%ll)
        ll -= self.N_p * logsumexp( np.log(self.pbg_xbg) + np.log(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg)) )
        print("LL part 1+2: \t%f"%ll)
        
        dll = np.sum(d_z_x/(z_x*(z_x - 1)), axis=1) 
        dll -= self.N_p * ( np.sum((self.pbg_xbg * d_z_xbg)/(z_xbg*z_xbg), axis=1 ) / np.sum(self.pbg_xbg*(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg))))
        #exp modify dLL
        dll = dll*args

        #regularize
        reg = 1e-5 
        ll -= np.sum(np.power(args[:-3],2)*reg)
        dll[:-3] -= 2*reg*args[:-3]

        print("final LL: \t%f"%ll)
        print(dll[-10:])
        print("============\n\n")
        return -ll, -dll 

In [76]:
DP_Z_cy(parameters, intrep)

1.0
[0.00000000e+00 0.00000000e+00 9.04837418e-01 3.35462628e-04
 3.35462628e-04 9.42540904e-04 9.42765974e-04 9.42991045e-04
 9.98600281e-04 1.24564239e-03 4.45756629e+00 1.89997231e-03
 1.95634080e-03 1.94089764e-02]
[ 1.          1.          1.90483742  1.90517288  1.90550834  2.0703346
  2.80581398  4.01803599  4.7544024   4.92118885 52.48919592 52.49454097
 52.49974114 53.32980028]


(58.721212745059695,
 array([-5.96603208e+00,  0.00000000e+00,  0.00000000e+00, -7.38949579e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -1.68915764e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -3.32849042e-03,  0.00000000e+00,  0.00000000e+00,  0.00000

In [89]:
for comp in range(len(parameters)):
    epsilon = 1e-9
    x_fwd = parameters.copy()
    x_fwd[comp] += epsilon
    x_rev = parameters.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(parameters, intrep)[1][comp])

component 0
-5.96601523739082
-5.96603208474583
component 1
0.0
0.0
component 2
0.0
0.0
component 3
-0.7389537870494678
-0.7389495789804362
component 4
0.0
0.0
component 5
0.0
0.0
component 6
0.0
0.0
component 7
0.0
0.0
component 8
0.0
0.0
component 9
0.0
0.0
component 10
0.0
0.0
component 11
0.0
0.0
component 12
0.0
0.0
component 13
0.0
0.0
component 14
0.0
0.0
component 15
-0.16891732457224862
-0.16891576365870753
component 16
0.0
0.0
component 17
0.0
0.0
component 18
0.0
0.0
component 19
0.0
0.0
component 20
0.0
0.0
component 21
0.0
0.0
component 22
0.0
0.0
component 23
0.0
0.0
component 24
0.0
0.0
component 25
0.0
0.0
component 26
0.0
0.0
component 27
0.0
0.0
component 28
0.0
0.0
component 29
0.0
0.0
component 30
0.0
0.0
component 31
0.0
0.0
component 32
0.0
0.0
component 33
0.0
0.0
component 34
0.0
0.0
component 35
0.0
0.0
component 36
0.0
0.0
component 37
0.0
0.0
component 38
0.0
0.0
component 39
0.0
0.0
component 40
0.0
0.0
component 41
0.0
0.0
component 42
0.0
0.0
component 43


In [80]:
for comp in range(len(parameters)):
    epsilon = 1e-9
    x_fwd = parameters.copy()
    x_fwd[comp] += epsilon
    x_rev = parameters.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(parameters, intrep)[1][comp])

component 0
-5.96601523739082
-5.96603208474583
component 1
0.0
0.0
component 2
0.0
0.0
component 3
-0.7389537870494678
-0.7389495789804362
component 4
0.0
0.0
component 5
0.0
0.0
component 6
0.0
0.0
component 7
0.0
0.0
component 8
0.0
0.0
component 9
0.0
0.0
component 10
0.0
0.0
component 11
0.0
0.0
component 12
0.0
0.0
component 13
0.0
0.0
component 14
0.0
0.0
component 15
-0.16891732457224862
-0.16891576365870753
component 16
0.0
0.0
component 17
0.0
0.0
component 18
0.0
0.0
component 19
0.0
0.0
component 20
0.0
0.0
component 21
0.0
0.0
component 22
0.0
0.0
component 23
0.0
0.0
component 24
0.0
0.0
component 25
0.0
0.0
component 26
0.0
0.0
component 27
0.0
0.0
component 28
0.0
0.0
component 29
0.0
0.0
component 30
0.0
0.0
component 31
0.0
0.0
component 32
0.0
0.0
component 33
0.0
0.0
component 34
0.0
0.0
component 35
0.0
0.0
component 36
0.0
0.0
component 37
0.0
0.0
component 38
0.0
0.0
component 39
0.0
0.0
component 40
0.0
0.0
component 41
0.0
0.0
component 42
0.0
0.0
component 43


In [78]:
nll_obj = nLL(['TTTAAAAATTTAAAAAAAAAAAAAAAA']*10,['AAAAAAAAAAAAAAAAAAAAAAAAAAA']*10)
#x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=np.log(parameters), callback=callb)
#check_grad(lambda input_vector: nll_obj(input_vector)[0], lambda input_vector: nll_obj(input_vector)[1], x0 = np.log(parameters))
intrep = seq2int_cy('TTTAAAAATTTAAA')
check_grad(lambda input_vector: DP_Z_cy(input_vector, intrep)[0], lambda input_vector: DP_Z_cy(input_vector, intrep)[1], x0 = parameters)

1.0
[0.00000000e+00 0.00000000e+00 9.04837418e-01 3.35462628e-04
 3.35462628e-04 9.42540904e-04 9.42765974e-04 9.42991045e-04
 9.98600281e-04 1.24564239e-03 4.45756629e+00 1.89997231e-03
 1.95634080e-03 1.94089764e-02]
[ 1.          1.          1.90483742  1.90517288  1.90550834  2.0703346
  2.80581398  4.01803599  4.7544024   4.92118885 52.48919592 52.49454097
 52.49974114 53.32980028]
1.0
[0.00000000e+00 0.00000000e+00 9.04837418e-01 3.35462628e-04
 3.35462628e-04 9.42540904e-04 9.42765974e-04 9.42991045e-04
 9.98600281e-04 1.24564239e-03 4.45756629e+00 1.89997231e-03
 1.95634080e-03 1.94089764e-02]
[ 1.          1.          1.90483742  1.90517288  1.90550834  2.0703346
  2.80581398  4.01803599  4.7544024   4.92118885 52.48919592 52.49454097
 52.49974114 53.32980028]
1.0
[0.00000000e+00 0.00000000e+00 9.04837418e-01 3.35462628e-04
 3.35462628e-04 9.42540890e-04 9.42765960e-04 9.42991031e-04
 9.98600281e-04 1.24564239e-03 4.45756629e+00 1.89997231e-03
 1.95634080e-03 1.94089760e-02]
[

51.13546359379765

### Import fasta files

In [9]:
def parse_fasta(file_name):
    input_seq_iterator = SeqIO.parse(file_name, "fasta")
    return [str(record.seq) for record in input_seq_iterator]

In [10]:
def swap_cores (args):
    core1_args = args[:len(kmer_inx)]
    core2_args = args[len(kmer_inx):len(kmer_inx)*2]
    return np.concatenate([x.ravel() for x in [core2_args, core1_args, np.array([args[-2], args[-1]])]])

In [11]:
if False:
    set_size = 100
    bg = parse_fasta('HNRNPA0_1_TGTCGA40NCCGA_AAG_1.fasta.tmp')
    plus = parse_fasta('HNRNPA0_4_TGTCGA40NCCGA_AAG_4.fasta.tmp')

    bg = random.sample(bg, set_size)
    plus = random.sample(plus, set_size)
    
    bg   = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in bg]
    plus = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in plus]
    
else:
    bg = parse_fasta('negatives_toy.fasta')
    plus = parse_fasta('positives_toy.fasta')

In [12]:
Ea = np.zeros(len(kmer_inx)) + 1
Eb = np.zeros(len(kmer_inx)) + 1
sf=10000
D=3
sig=1

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])

In [324]:
p = 'TTGTGGTGTTAGTGTTAGTGTTAGGTTAGATCATTTGTCT'
DP_Z_cy(np.exp(param2), seq2int_cy(p))

<MemoryView of 'ndarray' object>
<MemoryView of 'ndarray' object>


(55.53573882331846,
 array([  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,  -2.35129631,
         -2.36383122,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,  -9.45685511,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
         -2.35908573,   0.        ,  -2.40397879,   0.        ,
          0.        ,   0.        ,   0.        ,  -9.45569485,
          0.        ,   0.        ,   0.        ,   0.        ,
         -2.3637359 ,   0.        ,   0.        ,  -2.38007508,
          0.        ,   0.        ,  -9.4407371 ,  -5.13057607,
          0.        ,  -2.33646005,  -2.36254966,   0.        ,
          0.        ,   0.        ,   0.        ,  -2.67120619,
         -4.72739389,   0.        ,  -4.75550881, -11.85723375,
         -2.31671574

In [17]:
p_array = []
def callb(x):
    print('next iteration')
    p_array.append(x)

### debug mode

In [18]:
def set_kmer_arg (args, kmer, domain, value):
    inx = kmer_inx[kmer]
    inx = inx + (domain-1)*len(kmer_inx)
    args_m = args
    args_m[inx] = value
    return args_m

In [38]:
Ea = np.zeros(len(kmer_inx)) + 8
Eb = np.zeros(len(kmer_inx)) + 8
sf=10000
D=3
sig=1

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])


parameters = set_kmer_arg(parameters, 'TTT', 1, 0.1)
parameters = set_kmer_arg(parameters, 'TTT', 2, 0.1)
#parameters = set_kmer_arg(parameters, 'AAA', 1, 3)
#parameters = set_kmer_arg(parameters, 'AAA', 2, 3)

In [29]:
plus[0]

'TTTAAAAATTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'

In [40]:
len(parameters)

131

### optimization

In [159]:
nll_obj = nLL(plus,bg)

In [160]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=param2*2.5, callback=callb, pgtol=1e-6)

[12.18249396 12.18249396 12.18249396 12.18249396 12.18249396 12.18249396
 12.18249396 12.18249396 12.18249396 12.18249396]
[1.04656568 1.04656568 1.04656568 1.04656568 1.04656568]
[1.04656568 1.04656568 1.04656568 1.04656568 1.04656568]
LL part 1: 	-1694.251541
LL part 1+2: 	-2067.291406
final LL: 	-2067.481375
[-2.43649879e-04 -2.43649879e-04 -2.43649879e-04 -2.43649879e-04
 -1.02293654e+02 -2.43649879e-04 -2.43649879e-04 -1.00896187e+02
  8.65618259e-14  0.00000000e+00]


[12.18248859 12.18248859 12.18248859 12.18248859 10.12424217 12.18248859
 12.18248859 10.1498713  12.18249396 12.18249396]
[1.0223853  1.1116833  1.0635911  1.11380084 1.06946167]
[1.001326 1.001326 1.001326 1.001326 1.001326]
LL part 1: 	-1655.925050
LL part 1+2: 	-1326.063207
final LL: 	-1326.258622
[-2.43649772e-04 -2.43649772e-04 -2.43649772e-04 -2.43649772e-04
 -3.53264111e+02 -2.43649772e-04 -2.43649772e-04 -3.41694101e+02
 -1.31764938e+01  0.00000000e+00]


next iteration
[12.18244594 12.18244594 12.18244594 



[  4457.31822512  39723.34864748 150944.27930316 258357.31790884
   1020.79212559]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[1.21814941e+01 1.21814941e+01 1.21814941e+01 1.21814941e+01
 5.23857614e-10 1.21814941e+01 1.21814941e+01 6.26820475e-10
 9.94476600e-02 1.21824940e+01]
[ 6.72099242 36.22612137 29.44148811 53.53132281 15.51768093]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[1.21785084e+01 1.21785084e+01 1.21785084e+01 1.21785084e+01
 1.12143523e-40 1.21785084e+01 1.21785084e+01 2.25064614e-40
 4.30801040e-08 1.21824940e+01]
[1.00000223 1.00001012 1.00000602 1.00001054 1.00000629]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[1.21665727e+001 1.21665727e+001 1.21665727e+001 1.21665727e+001
 2.35514654e-163 1.21665727e+001 1.21665727e+001 3.74079056e-162
 1.51706937e-033 1.21824940



[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[1.21189472e+001 1.21189472e+001 1.21189472e+001 1.21189472e+001
 0.00000000e+000 1.21189472e+001 1.21189472e+001 0.00000000e+000
 2.33302951e-135 1.21824940e+001]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[11.93030198 11.93030198 11.93030198 11.93030198  0.         11.93030198
 11.93030198  0.          0.         12.18249396]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[11.20463236 11.20463236 11.20463236 11.20463236  0.         11.20463236
 11.20463236  0.          0.         12.18249396]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
LL part 1: 	-inf
LL part 1+2: 	nan
final LL: 	nan
[nan nan nan nan nan nan nan nan nan nan]


[ 8.71730635  8.71730635  8.71730635  8.71730635  0.          8.71730635
  8.7

In [138]:
info

{'grad': array([-5.73132208e-02, -1.48192664e-02, -2.42624338e-02, -4.01594228e-03,
        -8.83977210e-03, -6.18784205e-03, -1.50715923e-02, -1.39540418e-01,
        -1.25599982e-02, -1.41777851e-02, -1.70532880e-02, -5.07777626e-02,
        -3.29505468e-03, -9.20764627e-03, -1.22333706e-02, -1.00348780e-02,
        -1.03515169e-02, -4.37883660e-02, -4.11222803e-02, -3.78385409e-03,
        -1.26331718e-02, -7.20158923e-03, -8.39085519e-03, -6.39315131e-03,
        -5.78574659e-03, -3.01334535e-02, -7.10651155e-02, -1.40072871e-02,
        -2.35820504e-02, -1.94812717e-02, -5.31155055e-02, -1.26712635e-02,
        -6.17278326e-03, -5.61468187e-03, -4.16588604e-02, -9.42807162e-02,
        -2.08817177e-02, -3.95012651e-03, -5.23931173e-02, -1.57740848e-01,
        -5.54548980e-03, -3.49173205e-02, -9.39691468e-02,  2.04503572e-01,
         2.72558908e+00, -1.77413176e-02, -8.32375215e-01, -3.14051097e-03,
        -3.74134652e-02, -7.83788667e-03, -9.52592627e-02, -9.63656943e-03,
    

In [139]:
np.exp(x_opt)

array([13.62434413, 16.17833624, 15.51623646, 18.07160195, 17.19380988,
       17.46652039, 16.22047811, 11.58017905, 11.08443739, 16.33998695,
       16.31720277, 14.34912359, 18.30955393, 16.96309512, 16.60322971,
       16.88166264, 16.81768955, 14.52548222, 14.77471025, 18.40896997,
       16.58091322, 17.02931335, 17.18523058, 17.50568042, 17.54770894,
       15.30009582, 13.5213115 , 16.3442903 , 15.76836075, 15.94910944,
       14.21808502, 16.44762376, 17.71564556, 17.83229201, 14.75088055,
       13.16897773, 15.79329589, 18.23020295, 14.25130503, 11.60955227,
       17.8851755 , 15.04489659, 13.36595656, 10.13468818,  9.56143208,
       16.20864404,  9.29000086, 18.56364232, 14.44405744, 17.19499306,
       12.02112234, 17.05156007, 16.46589852, 14.63662606, 14.11965807,
       18.01331079, 16.67470408, 15.90506218, 15.40663549, 16.65659094,
       15.37226262, 17.57691134, 18.71241176, 15.26813967, 21.37705825,
       12.36227914, 29.69525752, 20.81604837, 21.76687965, 18.14

In [142]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

pd.Series(core1).sort_values(ascending=True)

GTG     9.290001
ATG     9.561432
TGG    10.134688
AGA    11.084437
TCA    11.580179
         ...    
CCG    18.230203
ATA    18.309554
TAC    18.408970
TTG    18.563642
GTT    18.712412
Length: 64, dtype: float64

In [143]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

ATG     9.793199
TTT    11.045775
GCT    11.534288
GTG    11.657797
CCC    11.962178
         ...    
GAT    57.855526
GGA    62.401687
TGT    65.391878
TGA    68.228791
TAT    95.083585
Length: 64, dtype: float64

In [150]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=swap_cores(x_opt), callback=callb, pgtol=1e-6)

[16.67470408 15.90506218 15.40663549 16.65659094 15.37226262 17.57691134
 18.71241176 15.26813967  0.1109494  12.18249396]
[1.00049347 1.00158774 1.00191142 1.00104375 1.0011884 ]
[1.00036705 1.00108433 1.0007672  1.000221   1.00044074]
LL part 1: 	-2692.932507
LL part 1+2: 	-1157.083439
final LL: 	-1157.817550
[ 3.14916388e-02  1.14712603e-01  3.86449691e-01 -5.30209290e-02
 -5.47356707e-01 -9.00305482e-02 -8.20593600e-04 -2.97083249e+00
 -1.16208575e-02  0.00000000e+00]


[16.67664332 15.91180111 15.42863752 16.65333    15.34122263 17.57106865
 18.71235506 15.10155447  0.11094464 12.18249396]
[1.00367846 1.03367406 1.04881998 1.04999608 1.03876285]
[1.00031698 1.04702534 1.02999541 1.00019554 1.01977034]
LL part 1: 	-2124.225046
LL part 1+2: 	-1257.274054
final LL: 	-1258.009818
[-0.44872529 -1.10902925 -2.03071211 -0.41545092 -1.93731208 -0.28037227
 -0.06505667 -4.48391213  1.17257189  0.        ]


[16.67513866 15.90657217 15.41156353 16.65586009 15.36530093 17.57560179
 18.712399

In [151]:
info

{'grad': array([ 4.25035451e-04, -6.66331167e-03,  5.91623252e-04,  3.84238633e-04,
         4.23758515e-04,  4.24073351e-05,  3.86692959e-04,  3.87300639e-04,
         4.04712869e-04,  4.05248002e-04,  1.23800643e-03,  1.35259609e-03,
         4.83486950e-04,  4.60462314e-04,  3.38799483e-04,  8.36429012e-04,
         4.48109885e-04,  4.63599984e-04,  5.16712942e-04,  4.29845175e-04,
         2.59687636e-04, -4.21601759e-04,  4.33498156e-04,  2.64493714e-04,
         2.97510054e-04,  3.68259523e-04,  4.48981639e-04,  8.28952190e-04,
         3.25727334e-04,  3.13447837e-04,  4.34578312e-04,  3.61050889e-04,
         2.92269055e-04,  4.24710325e-04,  5.82334889e-04,  7.20973130e-04,
         3.65241616e-04,  3.78229397e-04,  5.87795164e-04,  4.93249480e-04,
         4.72402124e-04,  4.07938996e-04,  4.21739776e-04,  7.16801648e-04,
        -1.63260416e-01,  3.36408903e-04,  1.92325791e-01,  3.22293958e-04,
         5.28064355e-04,  1.03602740e-03,  1.14848670e-03,  1.87847456e-03,
    

In [154]:
np.exp(x_opt[-2:])

array([ 0.05994846, 12.18249396])

In [152]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

pd.Series(core1).sort_values(ascending=True)

ATG     5.212093
GTG     5.758114
GCT    12.667188
CAA    14.274350
TTT    15.249153
         ...    
GAT    57.424335
GGA    61.900322
TGT    64.841495
TGA    67.629805
TAT    93.923728
Length: 64, dtype: float64

In [153]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

ATG     7.736923
TTT     8.789405
GCT     9.380505
CCC     9.612922
CAA    10.099961
         ...    
TAC    19.663105
GAT    20.707656
TAG    20.964750
AAA    26.136718
GTG    26.638079
Length: 64, dtype: float64

100