# Part d

In [1]:
import itertools
import warnings
import time

# Our numerical workhorses
import numpy as np
from numpy import trapz
import pandas as pd
import scipy.stats as st
import scipy.special

# The MCMC Hammer
import emcee

# BE/Bi 103 utilities
import bebi103

# Import plotting tools
import matplotlib.pyplot as plt
import seaborn as sns
import corner

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks.
# The same rc overrides (line width, label/title sizes, axes background color)
# are handed to both the plotting context and the style below.
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Ignore every FutureWarning raised from here on
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def log_likelihood(tau, t, m):
    """
    Summed log likelihood of the observations `t` for a model with `m`
    characteristic times `tau`.

    For every observation t_i the likelihood is

        sum_j  tau_j**(m-2) / prod_{k != j}(tau_j - tau_k) * exp(-t_i / tau_j)

    and the function returns the sum over i of the log of that quantity.

    Parameters
    ----------
    tau : array_like
        Characteristic times; expected to hold `m` distinct values.
        Repeated values make a denominator zero and yield inf/nan.
    t : array_like
        Observed times (a NumPy array or Pandas Series both work).
    m : int
        Number of terms in the model.

    Returns
    -------
    float
        Sum of the per-observation log likelihoods.
    """
    # Work in floating point so that tau**(m-2) is defined for m == 1 even
    # when integer taus are supplied.
    tau = np.asarray(tau, dtype=float)
    t = np.atleast_1d(np.asarray(t, dtype=float))

    # The prefactor of each exponential depends only on tau, not on t,
    # so it is computed once per j instead of once per data point.
    scaling_factor = np.empty(m)
    for j in range(m):
        den_elem = tau[j] - tau
        # skip the k == j element (which is zero) when taking the product
        den_prod = np.prod(den_elem[:j]) * np.prod(den_elem[j+1:])
        scaling_factor[j] = tau[j]**(m-2) / den_prod

    # exponents[i, j] = -t_i / tau_j
    exponents = -t[:, np.newaxis] / tau[:m]

    # Some prefactors are negative, so the sum is done with signed weights
    # via the `b` argument. logsumexp lives in scipy.special; the old
    # scipy.misc alias is gone from current SciPy.
    log_like = scipy.special.logsumexp(exponents, axis=1, b=scaling_factor)

    # return the sum of the log likelihoods
    return np.sum(log_like)

def log_prior(tau, m, tau_min, tau_max):
    """
    Log prior over the `m` characteristic times in `tau`.

    Returns -inf unless the taus are strictly increasing (by more than
    1e-6) and lie inside [tau_min, tau_max]. Otherwise every tau_i
    contributes a factor tau_i * log(upper_i / lower_i), where the limits
    are the neighbouring taus (or tau_min / tau_max at the two ends), and
    the negative log of the product of those factors is returned.
    """
    # the taus must be sorted from least to greatest
    if any(not (tau[k] + 1e-6 < tau[k + 1]) for k in range(m - 1)):
        return -np.inf

    # the smallest and largest tau must stay inside the allowed window
    if tau[0] < tau_min:
        return -np.inf
    if tau[-1] > tau_max:
        return -np.inf

    # one normalization factor per tau, bounded by its neighbours
    factors = []
    for k in range(m):
        lower = tau_min if k == 0 else tau[k - 1]
        upper = tau_max if k == m - 1 else tau[k + 1]
        factors.append(tau[k] * np.log(upper / lower))

    return -np.log(np.prod(factors))
        
def tau_start(m, n_walkers, n_temps):
    """
    Generates starting points for each tau.

    Draws every tau from an exponential distribution with scale 300 and
    sorts the `m` values of each walker in increasing order. The prior
    used in this notebook returns -inf for unordered taus, so unsorted
    draws would start most walkers at zero posterior probability.

    Parameters
    ----------
    m : int
        Number of taus per walker.
    n_walkers : int
        Number of walkers per temperature.
    n_temps : int
        Number of temperatures.

    Returns
    -------
    ndarray, shape (n_temps, n_walkers, m)
        Starting positions, increasing along the last axis.
    """
    p = np.random.exponential(300, (n_temps, n_walkers, m))

    # order tau_1 < tau_2 < ... within each walker
    return np.sort(p, axis=-1)

In [3]:
def sample_ptmcmc(data, m, model, tau_min=1, tau_max=1800, n_temps=20, n_walkers=100, n_burn=100, 
                  n_steps=500, threads=None):
    """
    Sample posterior using PTMCMC.

    Parameters
    ----------
    data : array_like
        Observed times, forwarded to `log_likelihood`.
    m : int
        Number of taus in the model.
    model : str
        Label such as 'm = 3'. Kept so existing calls keep working, but no
        longer consulted: the column names are derived from `m`, so the
        two cannot disagree and `m` is not capped at 5.
    tau_min, tau_max : float
        Bounds forwarded to `log_prior`.
    n_temps, n_walkers : int
        Shape of the starting-point array built by `tau_start`.
    n_burn, n_steps : int
        Burn-in and sampling step counts forwarded to the sampler.
    threads : int or None
        Forwarded unchanged to `bebi103.run_pt_emcee`.

    Returns
    -------
    Whatever `bebi103.run_pt_emcee` returns with `return_lnZ=True`; the
    calls in this notebook unpack it as (DataFrame, lnZ, dlnZ).
    """
    # arguments for likelihood
    loglargs = (data, m)

    # arguments for prior
    logpargs = (m, tau_min, tau_max)

    # starting points for the parameters
    p0 = tau_start(m, n_walkers, n_temps)

    # column headings for outputted DataFrames: tau_1 ... tau_m
    columns = ['tau_{}'.format(i) for i in range(1, m + 1)]

    return bebi103.run_pt_emcee(
            log_likelihood, log_prior, n_burn, n_steps,
            n_temps=n_temps, p0=p0, loglargs=loglargs, logpargs=logpargs, 
            threads=threads, columns=columns, return_lnZ=True)

In [4]:
def log_posterior(p, t, m=3, tau_min=1, tau_max=1800):
    """
    Calculates the log posterior using our best model.

    Parameters
    ----------
    p : array_like
        The `m` characteristic times (tau_1, ..., tau_m).
    t : array_like
        Observed times.
    m : int
        Number of taus in the model (3 for our best model).
    tau_min, tau_max : float
        Bounds on the taus.

    Returns
    -------
    float
        log likelihood + log prior, or -inf when `p` is not strictly
        increasing or falls outside [tau_min, tau_max].
    """
    # log_prior already rejects unordered or out-of-bounds taus with -inf.
    # Evaluate it first and bail out, so the likelihood is never computed
    # for invalid taus (equal taus divide by zero there and give nan).
    lp = log_prior(p, m, tau_min, tau_max)
    if not np.isfinite(lp):
        return -np.inf

    return log_likelihood(p, t, m) + lp

def sample_mcmc(data, n_walkers=100, n_burn=100, 
                  n_steps=500, threads=None):
    """
    Sample posterior using MCMC. Written for use
    with our best model.

    Parameters
    ----------
    data : array_like
        Observed times, passed to `log_posterior` as its `t` argument.
    n_walkers : int
        Number of walkers in the ensemble.
    n_burn, n_steps : int
        Burn-in and sampling step counts forwarded to the sampler.
    threads : int or None
        Forwarded to `bebi103.run_ensemble_emcee` (previously ignored in
        favour of a hard-coded 3).

    Returns
    -------
    Whatever `bebi103.run_ensemble_emcee` returns for these arguments.
    """
    columns = ['tau_1', 'tau_2', 'tau_3']

    # Start the walkers in a small ball around (10, 20, 30). Identical
    # starting points would leave the ensemble with no spread to build
    # proposals from. A jitter of at most 1 keeps every walker ordered
    # tau_1 < tau_2 < tau_3 and inside the prior bounds.
    center = np.array([10.0, 20.0, 30.0])
    p0 = center + np.random.uniform(-1.0, 1.0, (n_walkers, len(center)))

    args = (data,)

    return bebi103.run_ensemble_emcee(log_posterior, n_burn=n_burn, 
                                      n_steps=n_steps, n_walkers=n_walkers, 
                                      p0=p0, columns=columns, 
                                      args=args, threads=threads)

In [4]:
# Load the data set; lines starting with '#' in the CSV are skipped as comments.
# Each concentration is its own column (the later cells drop NaNs per column).
df = pd.read_csv("./data/gardner_hw6/gardner_mt_catastrophe_only_tubulin.csv", comment="#")
df.head()

Unnamed: 0,12 uM,7 uM,9 uM,10 uM,14 uM
0,25.0,35,25,50,60
1,40.0,45,40,60,75
2,40.0,50,40,60,75
3,45.429,50,45,75,85
4,50.0,55,50,75,115


In [None]:
# Run the three-tau PTMCMC once per concentration, timing each run.
# The sampler settings are identical for every column, so they are written
# once here instead of being repeated per concentration.
ptmcmc_runs = {}
for ordinal, conc in (('first', '7 uM'), ('second', '9 uM'),
                      ('third', '10 uM'), ('fourth', '14 uM')):
    start = time.time()
    ptmcmc_runs[conc] = sample_ptmcmc(df[conc].dropna(), 3, 'm = 3',
                                      n_temps=20, n_walkers=100,
                                      n_burn=1000, n_steps=5000, threads=3)
    print('The ' + ordinal + ' PTMCMC took ', time.time()-start, ' seconds.')

# Unpack into the per-concentration names used by the later cells
df_7uM, lnZ_7uM, dlnZ_7uM = ptmcmc_runs['7 uM']
df_9uM, lnZ_9uM, dlnZ_9uM = ptmcmc_runs['9 uM']
df_10uM, lnZ_10uM, dlnZ_10uM = ptmcmc_runs['10 uM']
df_14uM, lnZ_14uM, dlnZ_14uM = ptmcmc_runs['14 uM']

In [None]:
# Keep the beta_ind == 0 samples whose log likelihood is not -inf
# (beta_ind == 0 is presumably the cold, beta = 1 chain -- TODO confirm)
df7uM_map = df_7uM[(df_7uM.beta_ind==0) & (df_7uM.lnlike != -np.inf)]

# Summarize each tau by the median of its samples
tau_map_7uM = np.array([df7uM_map['tau_1'].median(), df7uM_map['tau_2'].median(),
                     df7uM_map['tau_3'].median()])

# Evaluate the model on a 1-unit grid, normalize it to unit area, and
# accumulate it into a CDF. Every name here refers to the 7 uM result
# computed in this cell, not to variables left over from other cells.
# NOTE(review): post_plot is defined outside this section. Its third
# argument is left at 1 as in the original call; if that argument is m,
# it should probably be 3 for this three-tau fit -- confirm.
post7uM = post_plot(tau_map_7uM, np.arange(1800), 1, 1, 1800)
post7uM_area = trapz(post7uM, dx=1)
post7uM_norm = post7uM / post7uM_area
post7uM_cdf = np.cumsum(post7uM_norm)

df7_dropna = df['7 uM'].dropna()

# Build ECDF
y = np.arange(len(df7_dropna)) / len(df7_dropna)
x = np.sort(df7_dropna.values)

# Plot the model CDF (red line) over the ECDF of the 7 uM data (black dots)
plt.plot(np.arange(1800), post7uM_cdf, color='r')
plt.plot(x, y, '.', color='k')