In [71]:
import numpy as np
import pandas as pd

import bebi103

import altair as alt
import altair_catplot as altcat

import bokeh.io
import bokeh.plotting
from bokeh.layouts import row, column
from bokeh.models import Range1d
import bokeh.application
import bokeh.application.handlers
bokeh.io.output_notebook()

# Simple File Processing 

I had special names for all of my genotypes that I used to record data. For the purposes of plotting and for others understanding the data I renamed everything and made new dataframes.  


Implicit in all strains except N2 is syIs231 (which is hs:lin-3c). 

Implicit in all rescue strains except N2 and "+" is affl-2(sy975). 


**You may need to alter folders for files depending on your setup**

# Making Plots


First we load in our data.

In [114]:
# load data 

df_mut = pd.read_csv('data/mutant_lin3c.csv')
df_rescue = pd.read_csv('data/rescue_lin3c.csv')

df_mut['Genotype'].unique()

array(['N2', '+', 'hsf-1(sy1198)', 'hsf-1(sy441)', 'affl-2(sy975)',
       'affl-1(sy1220) affl-2(sy975)', 'affl-1(sy1202)'], dtype=object)

We define an order for the mutant strains and plot the results. 

In [91]:
order = [('N2', 'Before'),
 ('N2', 'After'),
 ('+', 'Before'),
 ('+', 'After'),
 ('hsf-1(sy1198)', 'Before'),
 ('hsf-1(sy1198)', 'After'),
 ('hsf-1(sy441)', 'Before'),
 ('hsf-1(sy441)', 'After'),
 ('affl-2(sy975)', 'Before'),
 ('affl-2(sy975)', 'After'),
 ('affl-1(sy1202)', 'Before'),
 ('affl-1(sy1202)', 'After'),
 ('affl-1(sy1220) affl-2(sy975)', 'Before'),
 ('affl-1(sy1220) affl-2(sy975)', 'After')]

In [92]:

p = bebi103.viz.jitter(df_mut,
                       ['Genotype', 'Heat Shock'],
                       'Fraction Pumping',
                       horizontal=False,
                       plot_width=600,
                       plot_height = 300,
                       #palette=pp[::2],
                       line_color='black',
                       size = 7,
                     
                       
                       y_axis_label=None,
                       color_column = 'Heat Shock',
                       order = order,
                       palette = [bokeh.palettes.Colorblind[3][1], 
                                  bokeh.palettes.Colorblind[3][0]],
                       
                       
                      # color = pp[0],
                     
                       x_axis_label = 'Genotype',
                      )
p.xgrid.grid_line_color = None

p.y_range = Range1d(-.05,1.1)
p.yaxis.axis_label_text_font_size = '8pt'
p.yaxis.axis_line_width = 2
p.xaxis.axis_line_width = 2

p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks

p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks

bokeh.io.show(p)

We define an order for the AFFL-2 rescue strains and plot the results. 

In [117]:
order1 = [
 ('N2', 'Before'),
 ('N2', 'After'),

 ('+', 'Before'),
 ('+', 'After'),
  ('affl-2(sy975)', 'Before'),
 ('affl-2(sy975)', 'After'),
 ('AFFL-2::GFP', 'Before'),
 ('AFFL-2::GFP', 'After'),
  ('AFFL-2 Del::GFP', 'Before'),
 ('AFFL-2 Del::GFP', 'After'),
('AFFL-2 Del+NLS::GFP', 'Before'),
 ('AFFL-2 Del+NLS::GFP', 'After'),
 
 ('AFFL-2 FUSLC+NLS::GFP', 'Before'),
 ('AFFL-2 FUSLC+NLS::GFP', 'After'),
 ('AFFL-2 FUSLC*+NLS::GFP', 'Before'),
 ('AFFL-2 FUSLC*+NLS::GFP', 'After')]

In [118]:

p = bebi103.viz.jitter(df_rescue,
                       ['Genotype', 'Heat Shock'],
                       'Fraction Pumping',
                       horizontal=False,
                       plot_width=600,
                       plot_height = 300,
                       #palette=pp[::2],
                       
                       color_column = 'Heat Shock',
                       line_color = 'black',
                       size = 7,
                       order = order1,
                       palette = [bokeh.palettes.Colorblind[3][1], 
                                  bokeh.palettes.Colorblind[3][0]],
                       
                       
                      # color = pp[0],
                       #show_legend = True,
                     
                       x_axis_label = 'Genotype',
                      )
p.xgrid.grid_line_color = None

p.y_range = Range1d(-.05,1.1)
p.yaxis.axis_label_text_font_size = '8pt'
p.yaxis.axis_line_width = 2
p.xaxis.axis_line_width = 2

p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks

p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks

bokeh.io.show(p)

As you can see, the labels are very messy. Therefore, we used powerpoint to create our own custom labels. 

# Bayesian analysis 

**Note: excpet for N2, implicit in each strains background is syIs231, which contains the hs:lin-3 transgene**

**"+" = hs:lin-3 in an N2 background**


Our statistical model is as follows: 

Likelihood:

$N_{pump} \sim \text{Binomial}(N, \theta)$

Where $\theta$ is the probability of pumping, $N$ is the number of worms. 

Our prior is:

$\theta \sim \text{Beta}(\alpha, \beta)$.

Where $\alpha$ and $\beta$ depend on each strain. Below are the priors for each the mutant strains and each condition. 


For Before HS:

$\theta_{before} \sim \text{Beta}(10,1)$


For N2 and the follow mutants:
affl-2(sy975hsf-1(sy1198), hsf-1(sy44), affl-2(sy975) affl-1 (sy1220):

$\theta_{after} \sim \text{Beta}(10,1)$


For "+", affl-2(sy1220)

$\theta_{after} \sim \text{Beta}(1,10)$


We write our model in the Stan program below. 

In [95]:
model_code = """
data {
  // Measured data
  int N;
  int n;
  
  // Prior parameters for theta
  real alpha_theta;
  real beta_theta;
}


parameters {
  real<lower=0, upper=1> theta;
}

model {
  theta ~ beta(alpha_theta, beta_theta);
  n ~ binomial(N, theta);
}
"""

sm = bebi103.stan.StanModel(model_code=model_code)

Using cached StanModel.


In [96]:
def model_df(df):
    '''Formats dataframe for use in stan model.'''
    df['N'] = df['Yes'].values + df['No'].values
    df['n'] = df['Yes'].values
    grouped_df = df.groupby(['Genotype', 'Heat Shock']).sum().reset_index()
    return grouped_df


In [97]:
# now create dictionaries for model parameters 
def get_params(df, sleeping = [], somewhat_sleeping = [], somewhat_not_sleeping = []):
    '''Creates dictionaries of stan model parameters for different strains'''
    params = {}
    for genotype in df['Genotype'].unique():
        df_gene = df.loc[df['Genotype'] == genotype, :].reset_index()
        if genotype in sleeping:
            params[genotype] = [df_gene['N'].values[0], df_gene['n'].values[0], 1, 10]
        elif genotype in somewhat_sleeping:
            params[genotype] = [df_gene['N'].values[0], df_gene['n'].values[0], 2, 4]
        elif genotype in somewhat_not_sleeping:
            params[genotype] = [df_gene['N'].values[0], df_gene['n'].values[0], 8.5, 3]
        else:
            params[genotype] = [df_gene['N'].values[0], df_gene['n'].values[0], 10, 1]
    return params
    

def model_run(params_list, model):
    '''Runs stan model and prints diagnostics'''
    N, n, theta_a, theta_b = params_list
    info_dict = {'N': N, 'n': n, 'alpha_theta': theta_a, 'beta_theta': theta_b}
    samples = model.sampling(data=info_dict)
    print(bebi103.stan.check_all_diagnostics(samples))
    return samples  

def plot_params(samples_dic):
    '''Plots Posterior Samples from Stan Models for Different strains '''
    ecdfs = []
    hists = []
    for genotype in samples_dic:
        Title = 'Strain {}, Posterior Samples'.format(genotype)
        samples = samples_dic[genotype]
        hists.append(bebi103.viz.histogram(samples, title=Title, plot_height = 150, plot_width = 250))
        ecdfs.append(bebi103.viz.ecdf(samples, title=Title, plot_height = 150, plot_width = 250))
    for ecdf in ecdfs[1:]:
        ecdf.x_range, ecdf.y_range = ecdfs[0].x_range, ecdfs[0].y_range
    
    for hist in hists[1:]:
        hist.x_range, hist.y_range = hists[0].x_range, hists[0].y_range
    
    bokeh.io.show(column(row(*hists), row(*ecdfs)))

First we look at the data for our mutants. 

In [98]:
# create dataframe with parameters for stan model
df_mut_model = model_df(df_mut)
print(df_mut_model['Genotype'].unique())
# extract data from after and before heat shock 
df_mut_no_HS = df_mut_model.loc[df_mut_model['Heat Shock'] == 'Before', :].reset_index()

df_mut_HS = df_mut_model.loc[df_mut_model['Heat Shock'] == 'After', :].reset_index()



# get dictionary of parameters for each strain
sleeping = ['+',  'affl-1(sy1202)']


mut_HS_params = get_params(df_mut_HS, sleeping = sleeping)

mut_no_HS_params = get_params(df_mut_no_HS)

['+' 'N2' 'affl-1(sy1202)' 'affl-1(sy1220) affl-2(sy975)' 'affl-2(sy975)'
 'hsf-1(sy1198)' 'hsf-1(sy441)']


For each genotype we run the model for our data before and after heat shock to obtain samples for the parameter $\theta_{before}$ and $\theta_{after}$ which denote the probability of a worm pumping before and after the 15 minute heat shock. We then subtract our samples for $\theta_{before}$ and  $\theta_{after}$ to obtain samples for $$\theta_{diff} = \theta_{before} - \theta_{after}.$$ 

In [99]:
samples_before = {}
samples_after = {}
samples_diff = {}
for genotype in df_mut_model['Genotype'].unique():
    print('Genotype: ', genotype)
    print('Before Heat Shock')
    samples_b = model_run(mut_no_HS_params[genotype], sm)
    samples_before[genotype] = samples_b['theta']
    print('After Heat Shock')
    samples_a = model_run(mut_HS_params[genotype], sm)
    samples_after[genotype] = samples_a['theta']
    samples_diff[genotype] = np.abs(samples_b['theta'] - samples_a['theta'])
    print('')

Genotype:  +
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0

Genotype:  N2
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations 

All of our diagnostics look great, so we will plot the posterior distributions and find credible regions  for $\theta_{before}, \theta_{after},$ and $\theta_{diff}$ 

First we plot the posterior distributions for $\theta_{before}$ 

In [100]:
plot_params(samples_before)

We see that the posterior distributions are all shifted to the right towards $\theta = 1$. Furthermore, the shape and location of all distributions are similar, which indicates that all strains display similar pumping behavior prior to heat shock. 

Next we plot the posterior distributions for $\theta_{after}$ 

In [101]:
plot_params(samples_after)

Now we see that the location of the posterior distributions differ between strains. 

We will next plot the posterior distributions for $\theta_{diff}$. 


In [102]:
plot_params(samples_diff)

Again we see that the location of the posterior distributions differ between strains. 

Next, we display the median of the samples along with a 95% confidence interval. 

In [103]:
conditions = ['Before HS', 'After HS', 'Diff']
percs, strPercs = [0.025, 0.5, 0.975], ['2point5', 'Median', '97point5']

cred_reg_dfs = {}


for genotype in df_mut_model['Genotype'].unique():
    all_samples = {'Before HS': samples_before[genotype], 'After HS': samples_after[genotype], 
                   'Diff': samples_diff[genotype]}
  
    vals = np.array([np.percentile(all_samples[cond], percs) for cond in conditions])
    df = pd.DataFrame(vals, columns = strPercs)
    df['Condition'] = conditions
    cred_reg_dfs[genotype] = df

In [104]:
conditions = ['Before HS', 'After HS', 'Diff']
percs, strPercs = [0.025, 0.5, 0.975], ['2point5', 'Median', '97point5']

for genotype in df_mut_model['Genotype'].unique():
    
    # create dictionary with all samples from different conditions 
    all_samples = {'Before HS': samples_before[genotype], 'After HS': samples_after[genotype], 
                   'Diff': samples_diff[genotype]}
    
    # Print results
    print('Genotype: ', genotype)
    print('-' * 20)
    for cond in conditions:
        vals = all_samples[cond]
        vals_per = np.percentile(all_samples[cond], percs)
        print("\t{}: Median = {:.3f}, 95% Credible Region = [{:.3f}, {:.3f}]"\
              .format(cond, vals_per[1], vals_per[0], vals_per[2]))
    print()

Genotype:  +
--------------------
	Before HS: Median = 0.942, 95% Credible Region = [0.927, 0.951]
	After HS: Median = 0.000, 95% Credible Region = [0.000, 0.000]
	Diff: Median = 0.927, 95% Credible Region = [0.902, 0.934]

Genotype:  N2
--------------------
	Before HS: Median = 0.918, 95% Credible Region = [0.886, 0.923]
	After HS: Median = 0.912, 95% Credible Region = [0.891, 0.921]
	Diff: Median = 0.000, 95% Credible Region = [0.000, 0.000]

Genotype:  affl-1(sy1202)
--------------------
	Before HS: Median = 0.925, 95% Credible Region = [0.893, 0.932]
	After HS: Median = 0.000, 95% Credible Region = [0.000, 0.000]
	Diff: Median = 0.910, 95% Credible Region = [0.876, 0.919]

Genotype:  affl-1(sy1220) affl-2(sy975)
--------------------
	Before HS: Median = 0.931, 95% Credible Region = [0.911, 0.937]
	After HS: Median = 0.919, 95% Credible Region = [0.888, 0.926]
	Diff: Median = 0.000, 95% Credible Region = [0.000, 0.000]

Genotype:  affl-2(sy975)
--------------------
	Before HS: Media

# Rescue Experiments 

We repeat the above analysis for the rescue experiment strains. 

Recall that our statistical model is as follows:


$N_{pump} \sim \text{Binomial}(N, \theta)$

$\theta \sim \text{Beta}(\alpha, \beta)$.

Where $\alpha$ and $\beta$ depend on each strain. Below are the priors for each the mutant strains and each condition. 


For Before HS of all strains:

$\theta_{before} \sim \text{Beta}(10,1)$


For N2, affl-2(sy975):

$\theta_{after} \sim \text{Beta}(10,1)$


For syIs231, SUP-45::GFP

$\theta_{after} \sim \text{Beta}(1,10)$

In [105]:
# create dataframe with parameters for stan model
df_resc_model = model_df(df_rescue)
print(df_resc_model['Genotype'].unique())

# extract data from after and before heat shock 
df_resc_HS = df_resc_model.loc[df_resc_model['Heat Shock'] == 'After', :].reset_index()

df_resc_no_HS = df_resc_model.loc[df_resc_model['Heat Shock'] == 'Before', :].reset_index()

# get dictionary of parameters for each strain

sleeping_rescue = ['+', 'AFFL-2::GFP']
somewhat_sleeping_rescue = ['AFFL-2 Del+NLS::GFP','AFFL-2 Del::GFP', 
                            'AFFL-2 FUSC*+NLS::GFP', 'AFFL-2 FUSC+NLS::GFP']

resc_HS_params = get_params(df_resc_HS, sleeping = sleeping_rescue,
                            somewhat_sleeping = somewhat_sleeping_rescue)

resc_no_HS_params = get_params(df_resc_no_HS)

['+' 'AFFL-2 Del+NLS::GFP' 'AFFL-2 Del::GFP' 'AFFL-2 FUSLC+NLS::GFP'
 'AFFL-2::GFP' 'AFFL02 FUSLC*+NLS::GFP' 'N2' 'affl-2(sy975)']


Next we obtains samples for $\theta_{before}, \theta_{after} \text{ and } \theta_{diff}$ for each strian. 

In [106]:
samples_before = {}
samples_after = {}
samples_diff = {}
for genotype in df_resc_model['Genotype'].unique():
    print('Genotype: ', genotype)
    print('Before Heat Shock')
    samples_b = model_run(resc_no_HS_params[genotype], sm)
    samples_before[genotype] = samples_b['theta']
    print('After Heat Shock')
    samples_a = model_run(resc_HS_params[genotype], sm)
    samples_after[genotype] = samples_a['theta']
    samples_diff[genotype] = np.abs(samples_b['theta'] - samples_a['theta'])
    print('')

Genotype:  +
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock




n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
1.0 of 4000 (0.025%) iterations ended with a divergence.
  Try running with larger adapt_delta to remove divergences.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
4

Genotype:  AFFL-2 Del+NLS::GFP
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0

Genotype:  AFFL-2 Del::GFP
Before Heat Shock




n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
1.0 of 4000 (0.025%) iterations ended with a divergence.
  Try running with larger adapt_delta to remove divergences.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
4
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0

Genotype:  AFFL-2 FUSLC+NLS::GFP
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable



n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
2.0 of 4000 (0.05%) iterations ended with a divergence.
  Try running with larger adapt_delta to remove divergences.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
4
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0

Genotype:  AFFL02 FUSLC*+NLS::GFP
Before Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0.0 of 4000 (0.0%) iterations ended with a divergence.
0 of 4000 (0.0%) iterations saturated the maximum tree depth of 10.
E-BFMI indicated no pathological behavior.
0
After Heat Shock
n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable

All of our diagnostics look great, so we will plot the posterior distributions and find credible regions  for $\theta_{before}, \theta_{after},$ and $\theta_{diff}$ 

First we plot the posterior distributions for $\theta_{before}$ 

In [107]:
plot_params(samples_before)

We see that the posterior distributions are all shifted to the right towards $\theta = 1$. Furthermore, the shape and location of all distributions are similar, which indicates that all strains display similar pumping behavior prior to heat shock. 

Next we plot the posterior distributions for $\theta_{after}$ 

In [108]:
plot_params(samples_after)

Now we see that the location of the posterior distributions differ between strains. 

We will next plot the posterior distributions for $\theta_{diff}$. 


In [109]:
plot_params(samples_diff)

Again we see that the location of the posterior distributions differ between strains. 

Next, we display the median of the samples along with a 95% confidence interval. 

In [110]:
conditions = ['Before HS', 'After HS', 'Diff']
percs, strPercs = [0.025, 0.5, 0.975], ['2point5', 'Median', '97point5']

for genotype in df_resc_model['Genotype'].unique():
    
    # create dictionary with all samples from different conditions 
    all_samples = {'Before HS': samples_before[genotype], 'After HS': samples_after[genotype], 
                   'Diff': samples_diff[genotype]}
    
    # Print results
    print('Genotype: ', genotype)
    print('-' * 20)
    for cond in conditions:
        vals = all_samples[cond]
        vals_per = np.percentile(all_samples[cond], percs)
        print("\t{}: Median = {:.3f}, 95% Credible Region = [{:.3f}, {:.3f}]"\
              .format(cond, vals_per[1], vals_per[0], vals_per[2]))
    print()

Genotype:  +
--------------------
	Before HS: Median = 0.946, 95% Credible Region = [0.909, 0.951]
	After HS: Median = 0.000, 95% Credible Region = [0.000, 0.000]
	Diff: Median = 0.928, 95% Credible Region = [0.899, 0.938]

Genotype:  AFFL-2 Del+NLS::GFP
--------------------
	Before HS: Median = 0.951, 95% Credible Region = [0.915, 0.956]
	After HS: Median = 0.100, 95% Credible Region = [0.077, 0.109]
	Diff: Median = 0.693, 95% Credible Region = [0.662, 0.702]

Genotype:  AFFL-2 Del::GFP
--------------------
	Before HS: Median = 0.937, 95% Credible Region = [0.907, 0.947]
	After HS: Median = 0.455, 95% Credible Region = [0.429, 0.466]
	Diff: Median = 0.268, 95% Credible Region = [0.220, 0.276]

Genotype:  AFFL-2 FUSLC+NLS::GFP
--------------------
	Before HS: Median = 0.946, 95% Credible Region = [0.935, 0.953]
	After HS: Median = 0.161, 95% Credible Region = [0.147, 0.175]
	Diff: Median = 0.598, 95% Credible Region = [0.544, 0.608]

Genotype:  AFFL-2::GFP
--------------------
	Before 

In [66]:
%load_ext watermark
%watermark -v -p numpy,bokeh,bebi103,pandas,altair,jupyterlab

CPython 3.7.3
IPython 7.5.0

numpy 1.16.4
bokeh 1.2.0
bebi103 0.0.41
pandas 0.24.2
altair 2.4.1
jupyterlab 0.34.12
