In [1]:
import numpy as np
import warnings
from munch import Munch
import itertools
from mv_Viterbi import mv_Viterbi
from cst_aggregate import cst_aggregate

### Create the HMM

In [4]:
# Hidden (genomic region) states and the nucleotides they emit.
hidden_states = ['pro','ex1','ex2','int','dis','enh']
emit_states = ['A','T', 'C', 'G']
hidden_size, emit_size = len(hidden_states), len(emit_states)

# Row i is the distribution over the next hidden state given hidden_states[i].
hmm_mat = np.array([
    [.6,.1,.1,.1,.1,0], #promoter
    [0,.4,.2,.2,.1,.1], #exon1
    [.0,.1,.6,.1,.1,.1], #exon2
    [.2,.1,.1,.5,0,.1], #intron
    [0,1/3, 1/3, 0,1/3,0], #disease
    [0,.25,.25,.25,0,.25] #enhancer
])

# Row i is the distribution over emitted symbols (A, T, C, G) for hidden_states[i].
emit_mat = np.array([
    [.1,.1,.4,.4], #CG rich promoter
    [.2,.2,.5,.1], #Exon 1 favors C
    [.5,.1,.2,.2], #Exon 2 favors A
    [.25,.25,.25,.25], #Intron
    [.4,.1,.4,.1], #Disease favors AC
    [.4,.4,.1,.1] #AT rich enhancer
])

# Initial distribution over hidden states (same order as hidden_states).
init_vec = np.array(
    [.2,0,0,.8,0,0]
)

# Fail fast if a table is edited into something that is not a valid distribution.
assert hmm_mat.shape == (hidden_size, hidden_size)
assert emit_mat.shape == (hidden_size, emit_size)
assert init_vec.shape == (hidden_size,)
assert np.allclose(hmm_mat.sum(axis=1), 1.0)
assert np.allclose(emit_mat.sum(axis=1), 1.0)
assert np.allclose(init_vec.sum(), 1.0)

# Dictionary views of the tables, keyed by state names. All values are plain
# Python floats (not numpy scalars) so the three dictionaries are consistent.
hmm_transition = {
    (src, dst): hmm_mat[i, j].item()
    for i, src in enumerate(hidden_states)
    for j, dst in enumerate(hidden_states)
}

hmm_emit = {
    (state, symbol): emit_mat[i, j].item()
    for i, state in enumerate(hidden_states)
    for j, symbol in enumerate(emit_states)
}

hmm_startprob = dict(zip(hidden_states, init_vec.tolist()))

# Bundle the state lists and the three probability dictionaries into one object.
hmm = Munch(
    states=hidden_states,
    emits=emit_states,
    tprob=hmm_transition,
    eprob=hmm_emit,
    initprob=hmm_startprob,
)

### Stay >= 5

In [3]:
def update_fun(r, k, r_past, min_stay=5):
    '''
    Transition indicator for the "stay at least `min_stay` steps" constraint.

    The auxiliary state is a pair (state, count) taken from
    hidden_states x [1, ..., min_stay]; count is the length of the current
    run of `state`, saturated at `min_stay`.

    r        : candidate new auxiliary state, a tuple (state, count)
    k        : hidden state at the current step
    r_past   : previous auxiliary state, a tuple (state, count)
    min_stay : minimum run length required before a state may be left

    Returns True iff r is the auxiliary state that follows r_past when the
    chain moves to k, and that move does not leave a state too early.
    '''
    prev, count = r_past
    if k == prev:
        # Saturate so the counter never leaves [1, min_stay]; a longer stay
        # remains representable as count == min_stay.
        new_count = min(count + 1, min_stay)
    else:
        new_count = 1

    # Leaving `prev` is only allowed once it has been held for min_stay steps.
    consistency = (k == prev) or (count >= min_stay)

    return (r == (k, new_count)) and consistency

def init_fun(k, r):
    '''
    Indicator for the initial auxiliary state: starting in hidden state k,
    the only admissible r is (k, 1), i.e. a run of length one.
    '''
    first_aux = (k, 1)
    return r == first_aux
    
# def eval_fun(r, sat):
#     '''
#     Constraint is a boolean emissions of the final auxillary state. In this case, is just m1^T: ie. tau_a >= tau_b for all time.
#     '''
#     return int(r[1] == sat) 

### Promoter Must Occur in First 30

In [6]:
def update_fun(r, k, r_past):
    '''
    Transition indicator for the "promoter has occurred" flag.

    r      : candidate new auxiliary state (Boolean)
    k      : hidden state at the current step
    r_past : previous auxiliary state (Boolean)

    Returns True iff r equals the updated flag, which is set once 'pro' has
    been visited and stays set afterwards.

    NOTE(review): only occurrence is tracked here; nothing in this function
    enforces the "first 30" window from the section title - confirm where
    that bound is applied.
    '''
    # Always defined, so non-'pro' steps no longer hit an unbound local.
    occur = (k == 'pro')

    return r == (r_past or occur)

def init_fun(k, r):
    '''
    Indicator for the initial flag: r must be True exactly when the first
    hidden state k is 'pro'.
    '''
    started_in_pro = (k == 'pro')
    return r == started_in_pro

def eval_fun(r, sat):
    '''
    Constraint emission: True when the final auxiliary state r agrees with
    the requested truth value sat.
    '''
    agrees = (r == sat)
    return agrees

#### Visit Dis Exactly Once 

In [7]:
def update_fun(r, k, r_past):
    '''
    Transition indicator for the 'dis' visit counter.

    The auxiliary state takes values in [0, 1, 2]: the number of steps spent
    in 'dis' so far, saturated at 2 (meaning "more than once").

    r      : candidate new auxiliary state
    k      : hidden state at the current step
    r_past : previous auxiliary state

    Returns True iff r is the counter value that follows r_past after
    visiting k.
    '''
    if k == 'dis':
        # Count from the previous value and cap at 2 to stay in the state space.
        count = min(r_past + 1, 2)
    else:
        # Any other state leaves the counter unchanged.
        count = r_past

    return r == count

def init_fun(k, r):
    '''
    Indicator for the initial counter: 1 if the first hidden state k is
    'dis', otherwise 0.
    '''
    initial_count = 1 if k == 'dis' else 0
    return r == initial_count

def eval_fun(r, sat):
    '''
    Constraint emission: True when the final counter r is exactly 1.

    NOTE(review): sat is accepted but not used, so the result does not
    depend on the requested truth value - confirm this is intended.
    '''
    required_visits = 1  # must be exactly 1
    return r == required_visits

### Promoter < Disease < Enhancer

In [None]:
def update_fun(r, k, r_past):
    '''
    Transition indicator for the ordering constraint pro < dis < enh.

    The auxiliary state is a triple of Booleans
    (pro occurred, dis occurred, enh occurred).

    r      : candidate new auxiliary state
    k      : hidden state at the current step
    r_past : previous auxiliary state

    Returns True iff r is the updated triple and the move to k respects the
    ordering: 'dis' needs an earlier 'pro', and 'enh' needs an earlier 'dis'.
    '''
    occur_pro, occur_dis, occur_enh = r_past
    consist = True

    pro_new = (k == 'pro' or occur_pro)
    dis_new = (k == 'dis' or occur_dis)
    # Track the enhancer itself (previously this tested 'pro' by mistake).
    enh_new = (k == 'enh' or occur_enh)

    if k == 'dis':
        consist = occur_pro

    if k == 'enh':
        consist = occur_dis

    return (r == (pro_new, dis_new, enh_new)) and consist

def init_fun(k, r):
    '''
    Indicator for the initial auxiliary state of the ordering constraint.

    k : first hidden state
    r : candidate auxiliary triple (pro occurred, dis occurred, enh occurred)

    A sequence that starts in 'dis' or 'enh' has no earlier 'pro' / 'dis',
    so it is rejected here for the same reason update_fun rejects such a
    move later in the sequence. Otherwise r must record only whether the
    first state is 'pro'.
    '''
    if k in ('dis', 'enh'):
        return False

    return r == (k == 'pro', False, False)


### Inference when the Constraint is Satisfied

Here, we constrain $C=1$: $a$ must happen before $c$. As predicted, when encountering an initial sequence of $C$'s, our model chooses $b$, since $c$ is not allowed and $b$ has a higher chance of emitting $C$. Provided the initial number of $C$'s is at most 2, we'll see this behavior. We can increase the admissible length of the run of $b$'s by decreasing the emission probabilities $a,A$ and $c,C$ if we want.

In [13]:
# Observed emission sequence, one symbol per time step.
obs = list('CACAC')

In [14]:
# Constrained decoding with the precedence constraint conditioned to be true.
# NOTE(review): `precedence_cst` is not defined anywhere in the visible part of
# this notebook, and the recorded output uses states a/b/c rather than the
# states of the `hmm` built above - confirm this cell still runs on a fresh kernel.
opt_aug, opt_state = mv_Viterbi(obs, hmm, precedence_cst, sat = True)

In [15]:
opt_state

['b', 'a', 'c', 'a', 'c']

### Inference when the Constraint is NOT Satisfied

Now, we observe $C = 0$: the constraint is not satisfied. Its logical negation is just that $c$ happens before $a$, and the inference situation is symmetric. We see that encountering a small initial sequence of $A$'s makes us choose $b$ for the same reasons as above.

In [8]:
# Observed emission sequence, one symbol per time step.
obs = list('AACACAC')

In [9]:
# Constrained decoding with the precedence constraint conditioned to be false.
# NOTE(review): `precedence_cst` is not defined anywhere in the visible part of
# this notebook - confirm it is created in an earlier cell.
opt_aug, opt_state = mv_Viterbi(obs, hmm, precedence_cst, sat = False)

In [10]:
opt_state

['b', 'b', 'c', 'a', 'c', 'a', 'c']

# Occurrence Constraint

Now, we create another constraint class that enforces that state $b$ must be visited at some point. This is equivalent to replacing just one of $a$ or $c$ in the unconstrained MAP with $b$, at any time point.

In [11]:
def update_fun2(r, k, r_past):
    '''
    Transition indicator for the "b has occurred" flag.

    The auxiliary state is a 1-tuple (m1,), where m1 is set once 'b' has
    been visited. Returns 1 if r is the updated auxiliary state, else 0.
    '''
    # Visiting 'b' sets the flag; otherwise carry the previous value forward.
    seen_b = True if k == 'b' else r_past[0]
    expected = (seen_b,)

    return 1 if r == expected else 0

def init_fun2(k, r):
    '''
    Indicator for the initial auxiliary state: returns 1 if r is (m1,) with
    m1 true exactly when the first hidden state k is 'b', else 0.
    '''
    return int(r == (k == 'b',))
    
def cst_fun2(r, sat):
    '''
    Constraint emission of the final auxiliary state: returns 1 when the
    "b occurred" flag r[0] equals the requested truth value sat, else 0.
    '''
    flag_matches = (r[0] == sat)

    return 1 if flag_matches else 0

In [12]:
# Package the occurrence constraint: its name, auxiliary-state size and the
# three indicator functions defined for it.
occurence_cst = Munch(
    name='b must occur',
    aux_size=1,
    update_fun=update_fun2,
    init_fun=init_fun2,
    cst_fun=cst_fun2,
)

In [13]:
# Observed emission sequence, one symbol per time step.
obs = list('CCACAC')

In [14]:
# Constrained decoding with the occurrence constraint conditioned to be true.
# NOTE(review): the recorded output uses states a/b/c, which are not states of
# the `hmm` built at the top of the visible notebook - confirm which model
# this cell is meant to run against.
opt_aug, opt_state = mv_Viterbi(obs, hmm, occurence_cst, sat = True)

In [15]:
opt_state

['b', 'c', 'a', 'c', 'a', 'c']

## Occurrence Constraint is False

If we condition on the constraint being false, this is equivalent to "$b$ is never visited". Since unconstrained inference will never return $b$, setting the constraint to be False will give the same answer as unconstrained inference.

In [17]:
# Observed emission sequence, one symbol per time step.
obs = list('CCACAC')

In [18]:
# Constrained decoding with the occurrence constraint conditioned to be false;
# the last line displays the decoded hidden-state sequence.
opt_aug, opt_state = mv_Viterbi(obs, hmm, occurence_cst, sat = False)
opt_state

['c', 'c', 'a', 'c', 'a', 'c']

# Conditioning on Multiple Constraints and Their Values

Now, we'll introduce both the precedence constraint "$a$ happens before $c$" and the occurrence constraint "$b$ must happen at some point" into our model. Again, these are modeled as binary emissions, so we can play with their truth configurations.

In [34]:
# Combine the two constraints into a single constraint object and display the
# names it carries.
# NOTE(review): `precedence_cst` is not defined anywhere in the visible part of
# this notebook - confirm it is created in an earlier cell.
cst_list = [precedence_cst,occurence_cst]
combined_cst = cst_aggregate(cst_list)
combined_cst.name

['a occurs before c', 'b must occur']

# Both True

First, we assume both constraints are true. Note that the observation sequence below is chosen so that the precedence constraint already makes $b$ appear first, so the occurrence constraint is satisfied automatically. Therefore, the answer should be the same as just conditioning on the precedence constraint.

In [25]:
# Observed emission sequence, one symbol per time step.
obs = list('CCACAC')

In [26]:
# Constrained decoding under the combined constraint with both truth values True.
# NOTE(review): presumably the entries of `sat` follow the order of `cst_list`
# (precedence, occurrence) - confirm against mv_Viterbi / cst_aggregate.
opt_aug, opt_state = mv_Viterbi(obs, hmm, combined_cst, sat = (True,True))

In [27]:
opt_state

['b', 'b', 'a', 'c', 'a', 'c']

### Precedence True, Occurrence False

Now here's an interesting scenario. The occurrence constraint being unsatisfied is equivalent to $b$ never occurring. Now, when the precedence constraint kicks in, we can only choose $a$ or $c$. This means that any initial sequence of $C$ emissions is forced to return $a$, as opposed to $b$ if we were just enforcing the precedence constraint by itself.

In [32]:
# Observed emission sequence, one symbol per time step.
obs = list('CCACAC')

In [33]:
# Constrained decoding under the combined constraint with sat = (True, False);
# the last line displays the decoded hidden-state sequence.
# NOTE(review): presumably the entries of `sat` follow the order of `cst_list`
# (precedence, occurrence) - confirm against mv_Viterbi / cst_aggregate.
opt_aug, opt_state = mv_Viterbi(obs, hmm, combined_cst, sat = (True,False))
opt_state

['a', 'c', 'a', 'c', 'a', 'c']