# HSF-1 Data Analysis 

In [1]:
import numpy as np
import pandas as pd

import numba
import bebi103
import itertools as it

import bokeh.io
from bokeh.io import export_svgs
import bokeh.plotting
from bokeh.layouts import row, column
import bokeh.application
import bokeh.application.handlers

pp = bokeh.palettes.Set2[4]

bokeh.io.output_notebook()

Features requiring DataShader will not work and you will get exceptions.
  Features requiring DataShader will not work and you will get exceptions.""")


First we load our data from CSVs into pandas dataframes. 

In [3]:
# Load in DataFrames

# intensities of nuclei before HS
df_ints_nuc= pd.read_csv('../../data/full_int_noHS.csv', comment='#')

# intensities of granules after HS
df_ints_gran = pd.read_csv('../../data/full_df_int_dog.csv', comment='#')

# number of granules per nulci after HS 
df_nums = pd.read_csv('../../data/full_df_nums_dog.csv', comment='#')


Below are some helper functions we used to perform our analysis 

In [4]:
@numba.jit(nopython=True)
def draw_bs_sample(data):
    """
    Draw a bootstrap sample from a 1D data set.
    """
    return np.random.choice(data, size=len(data))


@numba.jit(nopython=True)
def draw_bs_reps_mean(data, size=1):
    """
    Draw boostrap replicates of the mean from 1D data set.
    """
    out = np.zeros(size)
    for i in range(size):
        out[i] = np.mean(draw_bs_sample(data))
    return out


@numba.jit(nopython=True)
def draw_perm_sample(x, y):
    """Generate a permutation sample."""
    concat_data = np.concatenate((x, y))
    np.random.shuffle(concat_data)
    return concat_data[:len(x)], concat_data[len(x):]

@numba.jit(nopython=True)
def draw_perm_reps(x, y, stat_fun, size=1):
    """
    Generate array of permuation replicates.
    """
    return np.array([stat_fun(*draw_perm_sample(x, y)) for _ in range(size)])


@numba.jit(nopython=True)
def draw_perm_reps_diff_mean(x, y, size=1):
    """
    Generate array of permuation replicates.
    """
    out = np.empty(size)
    for i in range(size):
        x_perm, y_perm = draw_perm_sample(x, y)
        out[i] = np.mean(x_perm) - np.mean(y_perm)
    return out


## Analysis of Nuclear HSF-1::GFP Before Heat Shock 

First we plot our original data as an ECDF

In [6]:
p = bebi103.viz.ecdf_collection(data = df_ints_nuc, plot_width = 600,
                                         # title = 'Nuclear HSF-1::GFP Intensity Before Heat Shock',
                                          cats = ['genotype'], 
                                          val ='mean intensity (a.u.)',
                               x_axis_label='mean intensity (a.u.)', palette = pp)
bokeh.io.show(p)

In [9]:
order = ['WT', 'D28', '1202single', '1220double']                 
p1 = bebi103.viz.jitter(data = df_ints_nuc,
                                          
                                          cats = ['genotype'], 
                                          val ='mean intensity (a.u.)',
                               y_axis_label='Mean intensity (a.u.)', palette = pp,
                        order = order,
                     
                       )

bebi103.viz.box(data=df_ints_nuc, 
                    cats=['genotype'],
                    val='mean intensity (a.u.)',
                    y_axis_label=None,
                order = order, 
                    box_kwargs={'fill_alpha': 0.2}, palette = pp, p= p1, display_outliers = False)
                  

bokeh.io.show(p1)
p1.output_backend = "svg"
export_svgs(p1, filename="hsf1_nuc_intensity.svg")

['hsf1_nuc_intensity.svg']

In [7]:
# Draw replicates

for genotype in df_ints_nuc['genotype'].unique():
    genotype_ints = df_ints_nuc.loc[df_ints_nuc['genotype'] == genotype, 'mean intensity (a.u.)'].values
   
    bs_reps_mean = draw_bs_reps_mean( genotype_ints, size=10000)
    # 95% confidence intervals
    mean_genotype_conf_int = np.percentile(bs_reps_mean, [2.5, 97.5])
    print("Genotype: ", genotype)
    print("""
    Mean nuclei intensity before HS 95% conf int :   [{0:.4f}, {1:.4f}]
    """.format(*(tuple(mean_genotype_conf_int))))

Genotype:  WT

    Mean nuclei intensity before HS 95% conf int :   [0.0047, 0.0053]
    
Genotype:  D28

    Mean nuclei intensity before HS 95% conf int :   [0.0042, 0.0045]
    
Genotype:  1220double

    Mean nuclei intensity before HS 95% conf int :   [0.0046, 0.0051]
    
Genotype:  1202single

    Mean nuclei intensity before HS 95% conf int :   [0.0043, 0.0048]
    


## Permutation test

In [8]:

for g1, g2 in it.combinations(df_ints_nuc['genotype'].unique(), 2):
    g1_ints = df_ints_nuc.loc[df_ints_nuc['genotype'] == g1, 
                              'mean intensity (a.u.)'].values
    g2_ints = df_ints_nuc.loc[df_ints_nuc['genotype'] == g2, 
                              'mean intensity (a.u.)'].values
  
    diff_mean = np.mean(g1_ints) - np.mean(g2_ints)

    # Draw replicates
    perm_reps = draw_perm_reps_diff_mean(g1_ints, g2_ints, size=10**5)
 
    # Compute p-value
    p_val = np.sum(np.abs(perm_reps) >= np.abs(diff_mean)) / len(perm_reps)
    
    print("Genotype 1: ", g1)
    print("Genotype 2: ", g2)
    print("""
    p value :   {0:.5f}
    """.format(p_val))

Genotype 1:  WT
Genotype 2:  D28

    p value :   0.00009
    
Genotype 1:  WT
Genotype 2:  1220double

    p value :   0.39324
    
Genotype 1:  WT
Genotype 2:  1202single

    p value :   0.03024
    
Genotype 1:  D28
Genotype 2:  1220double

    p value :   0.00076
    
Genotype 1:  D28
Genotype 2:  1202single

    p value :   0.15837
    
Genotype 1:  1220double
Genotype 2:  1202single

    p value :   0.11504
    


## Analysis of Granule HSF-1::GFP After Heat Shock 

First we plot our original data as an ECDF

In [10]:
p = bebi103.viz.ecdf_collection(data = df_ints_gran, plot_width = 600,
                                          title = 'Granule HSF-1::GFP Intensity After Heat Shock',
                                          cats = ['genotype'], 
                                          val ='mean intensity (a.u.)',
                               x_axis_label='mean intensity (a.u.)', palette = pp)
bokeh.io.show(p)

In [11]:
p1 = bebi103.viz.jitter(data = df_ints_gran,
                                          
                                          cats = ['genotype'], 
                                          val ='mean intensity (a.u.)',
                               y_axis_label='Mean intensity (a.u.)', palette = pp,
                        alpha = .2,
                        order = order,
                     
                       )

bebi103.viz.box(data=df_ints_gran, 
                    cats=['genotype'],
                    val='mean intensity (a.u.)',
                    y_axis_label=None,
                order = order,
                    box_kwargs={'fill_alpha': 0.2}, palette = pp, p= p1,
               display_outliers = False)
                  

bokeh.io.show(p1)
p1.output_backend = "svg"
export_svgs(p1, filename="hsf1_gran_intensity.svg")

['hsf1_gran_intensity.svg']

Now I construct 95 % confidence intervals for the mean of the mean intensity of the granules after to heat shock for each strain. 

In [12]:
# Draw replicates

for genotype in df_ints_gran['genotype'].unique():
    genotype_ints = df_ints_gran.loc[df_ints_gran['genotype'] == genotype, 'mean intensity (a.u.)'].values
   
    bs_reps_mean = draw_bs_reps_mean( genotype_ints, size=10000)
    # 95% confidence intervals
    mean_genotype_conf_int = np.percentile(bs_reps_mean, [2.5, 97.5])
    print("Genotype: ", genotype)
    print("""
    Mean nuclei intensity before HS 95% conf int :   [{0:.4f}, {1:.4f}]
    """.format(*(tuple(mean_genotype_conf_int))))

Genotype:  1202single

    Mean nuclei intensity before HS 95% conf int :   [0.0154, 0.0164]
    
Genotype:  WT

    Mean nuclei intensity before HS 95% conf int :   [0.0138, 0.0146]
    
Genotype:  D28

    Mean nuclei intensity before HS 95% conf int :   [0.0120, 0.0127]
    
Genotype:  1220double

    Mean nuclei intensity before HS 95% conf int :   [0.0140, 0.0146]
    


## Permutation test

In [13]:

for g1, g2 in it.combinations(df_ints_gran['genotype'].unique(), 2):
    g1_ints = df_ints_gran.loc[df_ints_gran['genotype'] == g1, 
                              'mean intensity (a.u.)'].values
    g2_ints = df_ints_gran.loc[df_ints_gran['genotype'] == g2, 
                              'mean intensity (a.u.)'].values
  
    diff_mean = np.mean(g1_ints) - np.mean(g2_ints)

    # Draw replicates
    perm_reps = draw_perm_reps_diff_mean(g1_ints, g2_ints, size=10**5)
 
    # Compute p-value
    p_val = np.sum(np.abs(perm_reps) >= np.abs(diff_mean)) / len(perm_reps)
    
    print("Genotype 1: ", g1)
    print("Genotype 2: ", g2)
    print("""
    p value :   {0:.5f}
    """.format(p_val))

Genotype 1:  1202single
Genotype 2:  WT

    p value :   0.00000
    
Genotype 1:  1202single
Genotype 2:  D28

    p value :   0.00000
    
Genotype 1:  1202single
Genotype 2:  1220double

    p value :   0.00000
    
Genotype 1:  WT
Genotype 2:  D28

    p value :   0.00000
    
Genotype 1:  WT
Genotype 2:  1220double

    p value :   0.67430
    
Genotype 1:  D28
Genotype 2:  1220double

    p value :   0.00000
    


# Analysis of number of granules per nuclei after heat shock

First we plot our original data as an ECDF

In [14]:
p = bebi103.viz.ecdf_collection(data = df_nums, plot_width = 600,
                                          title = 'Number of HSF-1::GFP Granules After Heat Shock',
                                          cats = ['genotype'], 
                                          val ='num',
                               x_axis_label='number of granules', palette = pp)
bokeh.io.show(p)

We see that there are some cells with greater than 40 granules. By visual inspection I did not see any cells with such a high density of granules, so I do not think that these are representative of the distribution. They could have been due to blurring due to the worm twitching, which causes there to be duplicates of the granules. Thus I removed data points with greater than 40 granules (there were only three of these).

In [15]:
df_nums_filt = df_nums.loc[df_nums['num'] < 40, :]

p = bebi103.viz.ecdf_collection(data = df_nums_filt, plot_width = 600,
                                          title = 'Number of HSF-1::GFP Granules After Heat Shock',
                                          cats = ['genotype'], 
                                          val ='num',
                               x_axis_label='number of granules', palette = pp)
bokeh.io.show(p)

I know plot hte box and jitter plots of the data.

In [16]:
                   
p1 = bebi103.viz.jitter(data = df_nums_filt,
                                          
                                          cats = ['genotype'], 
                                          val ='num',
                               y_axis_label='Granules per nucleus', palette = pp,
                        order = order,
                     
                       )

bebi103.viz.box(data=df_nums_filt, 
                    cats=['genotype'],
                    val='num',
                    y_axis_label=None,
                order = order,
                    box_kwargs={'fill_alpha': 0.2}, palette = pp, p= p1, 
               display_outliers = False)
                  
p1.output_backend = "svg"
export_svgs(p1, filename="hsf1_nuc_num.svg")
bokeh.io.show(p1)

Now I construct 95 % confidence intervals for the mean number of granules per nucleus for each strain.

In [70]:
# Draw replicates

for genotype in df_nums['genotype'].unique():
    genotype_ints = df_nums.loc[df_nums['genotype'] == genotype, 'num'].values
    
    bs_reps_mean = draw_bs_reps_mean( genotype_ints, size=10000)
    # 95% confidence intervals
    mean_genotype_conf_int = np.percentile(bs_reps_mean, [2.5, 97.5])
    print("Genotype: ", genotype)
    print("""
    Mean number of granules/nucleus after HS 95% conf int :   [{0:.4f}, {1:.4f}]
    """.format(*(tuple(mean_genotype_conf_int))))

Genotype:  1220double

    Mean number of granules/nucleus after HS 95% conf int :   [6.8092, 7.5924]
    
Genotype:  D28

    Mean number of granules/nucleus after HS 95% conf int :   [6.3869, 7.9869]
    
Genotype:  WT

    Mean number of granules/nucleus after HS 95% conf int :   [7.1757, 8.2658]
    
Genotype:  1202single

    Mean number of granules/nucleus after HS 95% conf int :   [8.4363, 9.9112]
    


## Permutation test

In [26]:

for g1, g2 in it.combinations(df_nums['genotype'].unique(), 2):
    g1_ints = df_nums.loc[df_nums['genotype'] == g1, 
                              'mean intensity (a.u.)'].values
    g2_ints = df_nums.loc[df_nums['genotype'] == g2, 
                              'mean intensity (a.u.)'].values
  
    diff_mean = np.mean(g1_ints) - np.mean(g2_ints)

    # Draw replicates
    perm_reps = draw_perm_reps_diff_mean(g1_ints, g2_ints, size=10**5)
 
    # Compute p-value
    p_val = np.sum(np.abs(perm_reps) >= np.abs(diff_mean)) / len(perm_reps)
    
    print("Genotype 1: ", g1)
    print("Genotype 2: ", g2)
    print("""
    p value :   {0:.5f}
    """.format(p_val))

Genotype 1:  1220double
Genotype 2:  D28

    p value :   0.01030
    
Genotype 1:  1220double
Genotype 2:  WT

    p value :   0.84131
    
Genotype 1:  1220double
Genotype 2:  1202single

    p value :   0.00070
    
Genotype 1:  D28
Genotype 2:  WT

    p value :   0.03519
    
Genotype 1:  D28
Genotype 2:  1202single

    p value :   0.00000
    
Genotype 1:  WT
Genotype 2:  1202single

    p value :   0.00118
    


In [42]:
%load_ext watermark
%watermark -v -p numpy,numba,bokeh,bebi103,pandas,jupyterlab

CPython 3.7.1
IPython 7.2.0

numpy 1.15.4
numba 0.41.0
bokeh 1.0.2
bebi103 0.0.41
pandas 0.23.4
jupyterlab 0.35.3
