## Learning bi-partite motifs based on a thermodynamic approach
### Implements the dynamic programming and the gradient descent

In [70]:
import os
import numpy as np
from matplotlib import pyplot as plt
import itertools
import pandas as pd
from scipy.optimize import fmin_l_bfgs_b
from scipy.optimize import check_grad
from scipy.special import logsumexp
from Bio import SeqIO
import random
import multiprocessing
import multiprocessing as mp
import ctypes
%load_ext cython

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

The cython extension is already loaded. To reload it, use:
  %reload_ext cython
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [2]:
#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import get_directive_defaults
directive_defaults = get_directive_defaults()
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

### cython

In [3]:
%%cython -f -I . --compile-args=-DCYTHON_TRACE=1 


cimport cython
import numpy as np
import itertools
from libc.math cimport exp,pow


cdef int l = 3 #l_A=l_B=3 nucleotides
cdef int l_p = 3 #persistence length is 3 nucleotides 
cdef double cpi = np.pi

cpdef generate_kmer_inx():
    cdef dict vals = {'A':0,'C':1,'G':2,'T':3}
    cdef dict kmer_inx = {}
    
    for p in list(itertools.product(vals.keys(), repeat=l)):
        inx = 0
        for j,base in enumerate(p):
            inx += (4**j)*vals[base] 
        kmer_inx[''.join(p)] = inx
    return kmer_inx

kmer_inx = generate_kmer_inx()
inx_kmer = {y:x for x,y in kmer_inx.items()}

cpdef seq2int_cy(str sequence):
    cdef int L = len(sequence)
    kmer_array = np.zeros(L, dtype=int)
    
    cdef i
    for i in range(l-1,L):
        kmer = sequence[i-l+1:i+1]
        kmer_array[i] = kmer_inx[kmer]
    return kmer_array        


cpdef void assign_za_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    za[i] = (zb[i-l] + np.sum(za[:i-l+1])) * cab * exp(-Ea[x[i]])
    
cpdef void assign_zb_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    cdef double z = zb[i-1]
    cdef int j

    for j in range(0,i-l+1):
        z += za[j]*cb_c(i-j-l, sf, D, sig)*exp(-Eb[x[i]])
    z += zb[i-l]*cab*np.exp(-Eb[x[i]])      
    zb[i] = z 


cpdef void assign_za_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    identical = (inx == x[i])
    
    za_Ea_derivatives[inx,i] = cab*(zb_Ea_derivatives[inx,i-l] + np.sum(za_Ea_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]]) - cab*(zb[i-l] + np.sum(za[:i-l+1]))*identical*exp(-Ea[x[i]])
    za_Eb_derivatives[inx,i] = cab*(zb_Eb_derivatives[inx,i-l]+ np.sum(za_Eb_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]])


cdef extern from "src_helper.c":
    pass
    
cdef extern from "src_helper.h":
    cdef void assign_zb_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef void assign_za_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef double cb_c(int, double, double, double)
    cdef double cb_D_derivative_c(int, double, double, double);
    cdef double cb_sig_derivative_c(int, double, double, double);
    cdef double cb_sf_derivative_c(int, double, double, double);
    
    
cpdef void assign_za_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    za_D_derivatives[i] = (zb_D_derivatives[i-l]+np.sum(za_D_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])
    

cpdef void assign_za_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    za_sig_derivatives[i] = (zb_sig_derivatives[i-l]+np.sum(za_sig_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_za_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    za_sf_derivatives[i] = (zb_sf_derivatives[i-l]+np.sum(za_sf_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_zb_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_D_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_D_derivative_c(i-l-j, sf, D, sig)
    der += zb_D_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_D_derivatives[i-1]
    
    zb_D_derivatives[i] = der
    
cpdef void assign_zb_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_sig_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sig_derivative_c(i-l-j, sf, D, sig)
    der += zb_sig_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sig_derivatives[i-1]
    
    zb_sig_derivatives[i] = der


cpdef void assign_zb_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    for j in range(0,i-l+1):
        der += za_sf_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sf_derivative_c(i-l-j, sf, D, sig)
    der += zb_sf_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sf_derivatives[i-1]
    
    zb_sf_derivatives[i] = der

    
    
cpdef void assign_za_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    za_cab_derivatives[i] = exp(-Ea[x[i]])*((zb_cab_derivatives[i-l]+ np.sum(za_cab_derivatives[:i-l+1]))*cab + zb[i-l] + np.sum(za[:i-l+1]))
    
cpdef void assign_zb_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j

    for j in range(0,i-l+1):
        der += za_cab_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]
        der += zb_cab_derivatives[j]*cab + zb[j]
    der *= exp(-Eb[x[i]])
    der += zb_cab_derivatives[i-1]
    
    zb_cab_derivatives[i] = der
    
    
def DP_Z_cy(double[:] args, long[:] x):
    
    cdef int L = len(x)
    cdef double cab = 1.0

    cdef double[:] Ea = args[0:len(kmer_inx)]
    cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
    cdef double sf = args[-3]
    cdef double D = args[-2]
    cdef double sig = args[-1]
    
    #initialization of statistical weigths
    cdef double[:] za = np.zeros(L)
    cdef double[:] zb = np.zeros(L)

    cdef int i
    for i in range(0,l):
        zb[i] = 1 

    #initialization of derivatives
    cdef double[:,::1] za_Ea_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Ea_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:,::1] za_Eb_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Eb_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:] za_sf_derivatives = np.zeros(L)
    cdef double[:] zb_sf_derivatives = np.zeros(L)
    
    cdef double[:] za_D_derivatives = np.zeros(L)
    cdef double[:] zb_D_derivatives = np.zeros(L)

    cdef double[:] za_sig_derivatives = np.zeros(L)
    cdef double[:] zb_sig_derivatives = np.zeros(L)


    cdef int inx
    #dynamic programming calculation of z and derivatives 
    for i in range(l,L):
        #calculate statistical weights
        assign_za_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        assign_zb_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        
        #calculate derivatives
        for inx in range(len(kmer_inx)):
            assign_za_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
            assign_zb_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
        
        
        assign_za_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)

    
    Z_x = zb[L-1] + np.sum(za)
    
    #derivative of Z(x)
    d_Ea = zb_Ea_derivatives[:,L-1] + np.sum(za_Ea_derivatives, axis=1)
    d_Eb = zb_Eb_derivatives[:,L-1] + np.sum(za_Eb_derivatives, axis=1)
    
    d_sf = zb_sf_derivatives[L-1] + np.sum(za_sf_derivatives)
    d_D = zb_D_derivatives[L-1] + np.sum(za_D_derivatives)
    d_sig = zb_sig_derivatives[L-1] + np.sum(za_sig_derivatives)
    
    
    gradient = np.concatenate([q.ravel() for q in [d_Ea, d_Eb, np.array([d_sf, d_D, d_sig])]])
    
    return Z_x, gradient



### implementation of the LL object

In [108]:
class nLL:
    def __init__(self, seqs_p, seqs_bg):
        
        self.N_p = len(seqs_p)
        self.N_bg = len(seqs_bg)

        #calculate background probabilities:

        #include positive sequences in bg sequences if not there
        X_bg_t = list(set(seqs_p + seqs_bg))  #number of unique sequences
        
        counts = np.zeros(len(X_bg_t))
        for i, x in enumerate(X_bg_t):
            counts[i] = seqs_bg.count(x)
            
        counts = counts + 1 #pseudocount to make sure 
        counts = counts/np.sum(counts)

        p_bg = dict(zip(X_bg_t, counts))

        self.pbg_xp = np.array([p_bg[x] for x in seqs_p])
        self.pbg_xbg = np.array([p_bg[xbg] for xbg in seqs_bg])
        
        #add a padding nucleotide to the beginning to make the calculations stable, binding starts at
        #position i=l so the padded nucleotide has no effect.
        self.X_p = [seq2int_cy('A' + x) for x in seqs_p] 
        self.X_bg = [seq2int_cy('A' + x) for x in seqs_bg]

        
    def assign_z_p(self, tup):
            i, args = tup
            d_z_x_np = np.frombuffer(dz.get_obj(), dtype=np.float64).reshape(-1, self.N_p)
            z[i], d_z_x_np[:,i] = DP_Z_cy(args, self.X_p[i])
            
    def assign_z_bg(self, tup):
            i, args = tup
            d_z_xbg_np = np.frombuffer(dz.get_obj(), dtype=np.float64).reshape(-1, self.N_bg)
            z[i], d_z_xbg_np[:,i] = DP_Z_cy(args, self.X_bg[i])

          

        
    def __call__(self, parameters):
        
        #number of positive variables (stacked at the end)
        n_pos = 3
        
        #exp parameters to make sure they are positive
        args = parameters.copy()
        args[-n_pos:] = np.exp(args[-n_pos:])
    
    
        #define weights and derivatives as a multiprocessing array
        z_x = mp.Array(ctypes.c_double, self.N_p)
        d_z_x = mp.Array(ctypes.c_double, (2*len(kmer_inx)+ n_pos)*self.N_p)

        z_xbg = mp.Array(ctypes.c_double, self.N_bg)
        d_z_xbg = mp.Array(ctypes.c_double, (2*len(kmer_inx)+ n_pos)*self.N_bg) 
        
        #parallelizing
        with multiprocessing.Pool(initializer=init, initargs=(z_x,d_z_x), processes=8) as pool:
            pool.map(self.assign_z_p, [(i, args) for i in range(len(self.X_p))])
        with multiprocessing.Pool(initializer=init, initargs=(z_xbg, d_z_xbg), processes=8)  as pool:
            pool.map(self.assign_z_bg, [(i, args) for i in range(len(self.X_bg))])
        
        #= convert to np array ======
        d_z_x = np.frombuffer(d_z_x.get_obj(), dtype=np.float64).reshape(-1, self.N_p)
        d_z_xbg = np.frombuffer(d_z_xbg.get_obj(), dtype=np.float64).reshape(-1, self.N_bg)
        z_x = np.frombuffer(z_x.get_obj(), dtype=np.float64)
        z_xbg = np.frombuffer(z_xbg.get_obj(), dtype=np.float64)
        #============================
        
        #calculate log likelihood of model given arg parameters
        ll = np.sum(np.log(self.pbg_xp) + np.log(np.ones(self.N_p) - (np.ones(self.N_p)/z_x)))
        ll -= self.N_p * logsumexp( np.log(self.pbg_xbg) + np.log(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg)) )
        
        #calculate partial derivatives of model given arg parameters
        dll = np.sum(d_z_x/(z_x*(z_x-1)), axis=1)
        dll -= self.N_p * ( np.sum((self.pbg_xbg * d_z_xbg)/(z_xbg*z_xbg), axis=1 ) / np.sum(self.pbg_xbg*(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg))))

        #exp modify dLL for positive elements
        dll[-n_pos:] = dll[-n_pos:]*args[-n_pos:]

        #regularize some parameters
        if True:
            comp = -3
            reg = 1e-6 
            ll -= np.power(args[comp],2)*reg
            dll[comp] -= 2*reg*args[comp]
        #print("final ll:\t%f"%ll)
        return -ll, -dll 

#make the arrays global to all processes
def init(z_array, dz_array):
    global z
    z = z_array    
    global dz
    dz = dz_array

In [111]:
nll_obj = nLL(plus[:],bg[:])

In [112]:
%%timeit
nll_obj(parameters)

2.8 s ± 98.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [114]:
14.4/2.8

5.142857142857143

In [113]:
nll_obj(parameters)

(11923.776281172548,
 array([-1.66432846, -0.7504278 , -0.46526762, -0.91652181,  1.0775965 ,
         0.36521704,  1.1508437 ,  0.26494546,  0.19421479, -2.23689731,
        -0.41180329, -1.11258778, -1.16359732, -0.20154932, -7.52626024,
         3.6382629 , -9.97326102, -0.33553561, -0.12723398, -0.05891434,
        -0.17750136, -0.34185463, -0.51621316, -0.26502416,  3.13370956,
        -0.64685342, -2.81504438, -0.06425341, -0.47673245, -1.5396405 ,
        -1.54800309,  0.36357865, -1.65011999,  5.64776344, -6.52232789,
        -0.82177948, -1.02199147, -0.95196026, -0.24487099, -0.49608936,
         1.2917448 , -2.07978095, -0.21500654, -8.53646402, -0.74087075,
        -0.9580317 , -3.02107251,  1.02129394, -1.75096918, -1.57177815,
        -0.45626894, -0.50845231, -0.16564203, -6.69376256, -1.34291179,
        -2.58066867,  0.3872987 , -3.12303035, -0.63805507, -2.8021936 ,
         3.1728725 ,  3.85977448,  0.37304372, 49.56234451, -6.81422543,
        -3.00216273, -0.195455

### Import fasta files

In [6]:
def parse_fasta(file_name):
    input_seq_iterator = SeqIO.parse(file_name, "fasta")
    return [str(record.seq) for record in input_seq_iterator]

In [7]:
def swap_cores (args):
    core1_args = args[:len(kmer_inx)]
    core2_args = args[len(kmer_inx):len(kmer_inx)*2]
    return np.concatenate([x.ravel() for x in [core2_args, core1_args, np.array([args[-2], args[-1]])]])

In [8]:
if False:
    set_size = 100
    bg = parse_fasta('HNRNPA0_1_TGTCGA40NCCGA_AAG_1.fasta.tmp')
    plus = parse_fasta('HNRNPA0_4_TGTCGA40NCCGA_AAG_4.fasta.tmp')

    bg = random.sample(bg, set_size)
    plus = random.sample(plus, set_size)
    
    bg   = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in bg]
    plus = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in plus]
    
else:
    bg = parse_fasta('negatives_toy.fasta')
    plus = parse_fasta('positives_toy.fasta')

In [9]:
p_array = []
def callb(x):
    core1 = {}
    for i in range(len(kmer_inx)):
        core1[inx_kmer[i]] = x[i]

    print(pd.Series(core1).sort_values(ascending=True)[:2]) 
    
    core2 = {}
    for i in range(len(kmer_inx)):
        core2[inx_kmer[i]] = x_opt[i+64]
    pd.Series(core2).sort_values(ascending=True)
    print(pd.Series(core2).sort_values(ascending=True)[:2])
    
    print('next iteration ======')
    p_array.append(x)

In [104]:
np.random.seed(0)
Ea = np.random.uniform(low=8, high=12, size=len(kmer_inx))
Eb = np.random.uniform(low=8, high=12, size=len(kmer_inx))
sf = np.log(100)
D = np.log(3)
sig = np.log(3)

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])

In [18]:
%%timeit
nll_obj(parameters)

172 ms ± 4.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
%%timeit
nll_obj(parameters)

175 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%lprun -T lprof0 -f nll_obj nll_obj(parameters)

  profile = LineProfiler(*funcs)



*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

In [24]:
#%lprun -T lprof0 -f DP_Z_cy DP_Z_cy(parameters, seq2int_cy(plus[0]))


*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

Total time: 0.044824 s
File: /home/salma/.cache/ipython/cython/_cython_magic_e2ebc85239078311a48cd1f94450c1fd.pyx
Function: DP_Z_cy at line 158

Line #      Hits         Time  Per Hit   % Time  Line Contents
   158                                           def DP_Z_cy(double[:] args, long[:] x):
   159                                               
   160         1          8.0      8.0      0.0      cdef int L = len(x)
   161         1          4.0      4.0      0.0      cdef double cab = 1.0
   162                                           
   163         1         13.0     13.0      0.0      cdef double[:] Ea = args[0:len(kmer_inx)]
   164         1          3.0      3.0      0.0      cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
   165         1          3.0      3.0      0.0      cdef double sf = args[-3]
   166         1          3.0      3.0      0.0      cdef double D = args[-2]
   167         1          2.0      2.0      0.0      cdef double sig 

In [21]:
param = parameters
intrep = seq2int_cy(plus[20])

for comp in [0,127,128,129,130]:
    epsilon = 1e-5
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(param, intrep)[1][comp])

component 0
0.0
0.0
component 127
-0.00013661644038265308
-0.00013661643089069622
component 128
4.9238391142125686e-08
4.9221867283524847e-08
component 129
9.86766224286839e-08
9.87268101479782e-08
component 130
-6.508127370352668e-08
-6.508425866948812e-08


In [23]:
param = np.log(parameters)
param = parameters
for comp in [0, 127,128,129,130]:
    epsilon = 1e-7
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = nll_obj(x_fwd)[0]
    fx_rev = nll_obj(x_rev)[0]
    
    print('component %d\t, 1=%f, 2=%f'%(comp, fx_fwd, nll_obj(param)[0]))
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(nll_obj(param)[1][comp])

component 0	, 1=73.787589, 2=73.787589
0.05244743306320743
0.05244675990695298
component 127	, 1=73.787589, 2=73.787589
0.6708569344482385
0.6708735725800907
component 128	, 1=73.787589, 2=73.787589
0.020000143763354572
0.00020000000000004343
component 129	, 1=73.787589, 2=73.787589
-3.552713678800501e-07
9.107298248878239e-18
component 130	, 1=73.787589, 2=73.787589
2.842170943040401e-07
-1.3010426069826055e-17


### optimization

In [143]:
nll_obj = nLL(plus[:],bg[:])

In [136]:
nll_obj(x_opt)

(10762.132756694307,
 array([-2.43838993e-188, -1.43548797e-009, -4.36820392e-007,
        -8.79197854e-012, -0.00000000e+000, -0.00000000e+000,
        -0.00000000e+000, -0.00000000e+000, -0.00000000e+000,
        -5.17918883e-025, -5.23631474e-007, -1.14425774e-015,
        -1.39215471e-011,  4.69984208e-017, -5.11358298e-010,
        -1.60404660e-012, -5.27760648e-006, -2.68120169e-006,
        -3.23605253e-080, -2.36217542e-072, -6.82879528e-219,
        -0.00000000e+000, -0.00000000e+000, -4.19366858e-007,
        -0.00000000e+000, -6.12813106e-008,  4.80864058e-005,
         8.68714346e-006, -6.51063363e-028, -1.53136644e-019,
        -4.14529667e-020, -8.55977640e-010,  4.31776434e-009,
        -0.00000000e+000, -6.33438616e-006, -1.27008259e-234,
        -9.98289744e-018, -0.00000000e+000, -0.00000000e+000,
        -1.89043326e-009, -0.00000000e+000, -3.31379332e-160,
        -0.00000000e+000, -6.30121288e-006, -0.00000000e+000,
        -3.64920765e-006, -1.15266654e-005, -1.24

In [None]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=parameters, callback=callb, factr=1e-5, iprint=5)

final ll:	-12029.782352
final ll:	-11947.724499
final ll:	-11263.221551
final ll:	-12080.285799
final ll:	-11303.367192
final ll:	-11239.488776
TTT    3.960656
ACA    7.534279
dtype: float64
CAG    15.934364
ATT    19.855461
dtype: float64
final ll:	-11220.152125
TTT    4.050875
ACA    7.060870
dtype: float64
CAG    15.934364
ATT    19.855461
dtype: float64
final ll:	-11874.414236
final ll:	-11212.952036
TTT    3.917881
ACA    6.875318
dtype: float64
CAG    15.934364
ATT    19.855461
dtype: float64
final ll:	-11196.728916
TTT    3.476117
ACA    6.405373
dtype: float64
CAG    15.934364
ATT    19.855461
dtype: float64
final ll:	-11188.232978
TTT    3.183166
ACA    5.713168
dtype: float64
CAG    15.934364
ATT    19.855461
dtype: float64


In [132]:
info

{'grad': array([-2.43838993e-188, -1.43548797e-009, -4.36820392e-007,
        -8.79197854e-012, -0.00000000e+000, -0.00000000e+000,
        -0.00000000e+000, -0.00000000e+000, -0.00000000e+000,
        -5.17918883e-025, -5.23631474e-007, -1.14425774e-015,
        -1.39215471e-011,  4.69984208e-017, -5.11358298e-010,
        -1.60404660e-012, -5.27760648e-006, -2.68120169e-006,
        -3.23605253e-080, -2.36217542e-072, -6.82879528e-219,
        -0.00000000e+000, -0.00000000e+000, -4.19366858e-007,
        -0.00000000e+000, -6.12813106e-008,  4.80864058e-005,
         8.68714346e-006, -6.51063363e-028, -1.53136644e-019,
        -4.14529667e-020, -8.55977640e-010,  4.31776434e-009,
        -0.00000000e+000, -6.33438616e-006, -1.27008259e-234,
        -9.98289744e-018, -0.00000000e+000, -0.00000000e+000,
        -1.89043326e-009, -0.00000000e+000, -3.31379332e-160,
        -0.00000000e+000, -6.30121288e-006, -0.00000000e+000,
        -3.64920765e-006, -1.15266654e-005, -1.24270603e-007,


In [133]:
np.exp(x_opt[-3:])

array([  0.        , 274.69876301,   1.50312961])

In [134]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = x_opt[i]

pd.Series(core1).sort_values(ascending=True)

TTT       15.745735
GGC       20.736311
TGC       24.264608
ATT       26.623853
GTG       29.507769
           ...     
CAT    20582.429169
CCC    20587.266985
AGG    21012.819731
AGC    34125.421585
GCA    67359.040698
Length: 64, dtype: float64

In [135]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = x_opt[i+64]
pd.Series(core2).sort_values(ascending=True)

CAG       15.934364
ATT       19.855461
GGG       20.591120
GGC       21.319339
TGC       21.378634
           ...     
CCA    21764.177260
CCG    21948.297360
AGG    25356.048772
AGT    27048.038192
CGG    40098.271539
Length: 64, dtype: float64

### debug mode

In [18]:
def set_kmer_arg (args, kmer, domain, value):
    inx = kmer_inx[kmer]
    inx = inx + (domain-1)*len(kmer_inx)
    args_m = args
    args_m[inx] = value
    return args_m

In [38]:
Ea = np.zeros(len(kmer_inx)) + 8
Eb = np.zeros(len(kmer_inx)) + 8
sf=10000
D=3
sig=1

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])


parameters = set_kmer_arg(parameters, 'TTT', 1, 0.1)
parameters = set_kmer_arg(parameters, 'TTT', 2, 0.1)
#parameters = set_kmer_arg(parameters, 'AAA', 1, 3)
#parameters = set_kmer_arg(parameters, 'AAA', 2, 3)

100

In [85]:
from multiprocessing.pool import Pool

val = mp.Value('i')
def initializer(value):
    # The identity function
    global value

def count_bits(i):
    value = value + i

with Pool(initializer=initializer, initargs=(val,)) as pool:
    pool.map(count_bits, [1, 2, 3, 4, 1, 2, 3, 4])
    

SyntaxError: name 'value' is parameter and global (<ipython-input-85-ae802292acc9>, line 9)

In [82]:
int_ls = [1, 2, 3, 4, 1, 2, 3, 4]
parallel_bit_counter(int_ls)

TypeError: __init__() got an unexpected keyword argument 'expect_initret'