In [3]:
import warnings

# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.optimize

# Numba for speed
import numba
import corner
# Import plotting tools
import matplotlib.pyplot as plt
import seaborn as sns
import numdifftools as ndt
import scipy.stats as st
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}

# Plot styling shared by every figure in this notebook (JB's favorite
# Seaborn settings); the same rc dict is applied to context and style.
rc = {
    'lines.linewidth': 2,
    'axes.labelsize': 18,
    'axes.titlesize': 18,
    'axes.facecolor': 'DFDFE5',
}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Keep FutureWarning messages out of the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

# Parts a and b: read the data (lines starting with '#' are comments in the
# CSV), keep only the wild type and mutant rows, and take NaNs to be zero.
df = (
    pd.read_csv('./data/mean_rest_bouts.csv', comment='#')
    .loc[lambda frame: frame['genotype'].isin(['wt', 'mut'])]
    .fillna(0)
)

In [3]:
# Preview the first five rows of the filtered data frame
df.head(n=5)

Unnamed: 0,fish,genotype,mean_rest_bout_length
2,FISH11,mut,2.255556
3,FISH12,mut,1.529412
4,FISH13,mut,2.373626
5,FISH14,wt,2.352941
7,FISH18,wt,2.111111



# Part A

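The sample mean is the best estimate of $\mu$. The unbiased estimator of the variance is
\begin{align}
\hat{\sigma}^2 = \frac{n}{n-1}\,s^2,
\end{align}
where $s^2 = \frac{1}{n}\sum_i (x_i - \bar{x})^2$ is the plug-in estimate of the variance.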

In [26]:
def get_estimates(data, column="mean_rest_bout_length"):
    """
    Estimate the mean and standard deviation of each genotype.

    Parameters
    ----------
    data : pandas.DataFrame
        Tidy data frame with a "genotype" column holding the labels "wt"
        and "mut", plus the numeric measurement column named by `column`.
    column : str, default "mean_rest_bout_length"
        Name of the numeric column to summarize.

    Returns
    -------
    mu_wt, mu_mut, sigma_wt, sigma_mut : float
        Sample means of the wild type and mutant measurements, followed by
        the square roots of their unbiased (ddof=1) variance estimates.
        Note the order: both means first, then both sigmas.
    """
    # Select the measurement column explicitly. Reducing the whole frame
    # would also try to average the string columns ("fish", "genotype"),
    # which raises a TypeError in pandas 2.0 and later.
    wt_values = data.loc[data["genotype"] == "wt", column]
    mut_values = data.loc[data["genotype"] == "mut", column]

    # Sample means are the best estimates of mu
    mu_wt = float(wt_values.mean())
    mu_mut = float(mut_values.mean())

    # std(ddof=1) is the square root of the unbiased variance estimator
    sigma_wt = float(wt_values.std(ddof=1))
    sigma_mut = float(mut_values.std(ddof=1))

    return mu_wt, mu_mut, sigma_wt, sigma_mut

#part a

# Point estimates for both genotypes (means first, then sigmas)
muWT, muMUT, sigmaWT, sigmaMUT = get_estimates(df)

# Format fields: 0 = µ(wt), 1 = µ(mut), 2 = σ(wt), 3 = σ(mut)
print("""The best estimates for µ and σ are:
                µ           σ
             -----------------
    WT       {0:.4f}     {2:.4f}
    Mutant   {1:.4f}     {3:.4f}
    """.format(*map(float, (muWT, muMUT, sigmaWT, sigmaMUT))))

The best estimates for µ and σ are:
                µ           σ
             -----------------
    WT       2.2094     0.5229
    Mutant   1.7271     0.8051
    


The frequentist analysis estimates the difference of the means, defined as:
\begin{align}
\delta \equiv \mu_{wt} - \mu_{mut}
\end{align}

# Part B

In [24]:
def get_frequentist_estimate(mu_wt, sigma_wt, mu_mut, sigma_mut, data, trials=10000, seed=None):
    """
    Frequentist estimate of the difference of the means, mu_wt - mu_mut.

    The spread of the estimate is obtained by simulation: for each trial a
    Gaussian sample is drawn for each genotype (same size as the observed
    group, using the supplied mean and sigma) and the difference of the two
    sample means is recorded.

    Parameters
    ----------
    mu_wt, sigma_wt, mu_mut, sigma_mut : float or length-1 array-like
        Estimates from get_estimates (part a). Note the argument order here
        (mu, sigma per genotype) differs from the order get_estimates
        returns them in (both mus, then both sigmas).
    data : pandas.DataFrame
        Tidy data frame with a "genotype" column; only the number of "wt"
        and "mut" rows is used.
    trials : int, default 10000
        Number of simulated experiments.
    seed : int or None, default None
        If given, a dedicated generator seeded with this value is used so
        the result is reproducible. If None, numpy's global random state is
        used.

    Returns
    -------
    difference : same type as ``mu_wt - mu_mut``
        Point estimate of the difference of the means.
    half_width : float
        1.96 standard deviations of the simulated differences, i.e. the
        half-width of an approximate 95% confidence interval.
    """
    # Group sizes set the size of each simulated sample
    n_wt = int((data["genotype"] == "wt").sum())
    n_mut = int((data["genotype"] == "mut").sum())

    # Accept plain scalars as well as length-1 arrays/Series
    loc_wt, scale_wt, loc_mut, scale_mut = (
        np.asarray(value, dtype=float).item()
        for value in (mu_wt, sigma_wt, mu_mut, sigma_mut)
    )

    # Use the function's own arguments for the simulation (not notebook
    # globals), so the result depends only on what the caller passed in.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One row per trial; averaging along axis 1 gives each trial's sample mean
    wt_means = rng.normal(loc_wt, scale_wt, size=(trials, n_wt)).mean(axis=1)
    mut_means = rng.normal(loc_mut, scale_mut, size=(trials, n_mut)).mean(axis=1)
    simulated_differences = wt_means - mut_means

    return mu_wt - mu_mut, float(1.96 * np.std(simulated_differences))

# Point estimate of the difference and the half-width returned alongside it
frequentist_estimate, confidence_interval = get_frequentist_estimate(
    muWT, sigmaWT, muMUT, sigmaMUT, df, trials=10000
)

print(
    """The frequentist estimate of the difference of the mean is:
      µ(wt)-µ(mut) = {0:.4f} ± {1:.4f}""".format(
        float(frequentist_estimate), confidence_interval
    ),
    "minutes",
)

The frequentist estimate of the difference of the mean is:
      µ(wt)-µ(mut) = 0.4823 ± 0.4136 minutes


# Part C:

In [37]:
def get_bayesian_estimate(data, p0=None):
    """
    Bayesian estimate of the difference of the means, mu_wt - mu_mut.

    The most probable parameter values are found by numerically minimizing
    the negative log posterior. The posterior is then approximated as
    Gaussian around that point, with covariance equal to minus the inverse
    of the Hessian of the log posterior.

    Parameters
    ----------
    data : pandas.DataFrame
        Tidy data frame with a "genotype" column holding the labels "wt"
        and "mut" and a numeric "mean_rest_bout_length" column.
    p0 : array-like of length 4 or None, default None
        Initial guess (mu_wt, mu_mut, sigma_wt, sigma_mut) for the
        optimizer. If None, the sample means and standard deviations of the
        two groups are used, so the start is sensible for data on any scale.

    Returns
    -------
    bayesian_estimate_diff : float
        Most probable value of mu_wt - mu_mut.
    credible_interval : float
        1.96 posterior standard deviations of the difference under the
        Gaussian approximation (half-width of an approximate 95% interval).
    """
    # Work with plain float arrays so the optimizer and the Hessian code
    # never have to deal with pandas index alignment.
    column = "mean_rest_bout_length"
    wt = np.asarray(data.loc[data["genotype"] == "wt", column], dtype=float)
    mut = np.asarray(data.loc[data["genotype"] == "mut", column], dtype=float)

    def log_post(p, x_wt, x_mut):
        """
        Returns the log of the posterior consisting of the product of Gaussians.
        p[0] = mu_wt
        p[1] = mu_mut
        p[2] = sigma_wt
        p[3] = sigma_mut
        """
        mu_wt, mu_mut, sigma_wt, sigma_mut = p

        # sigma must be strictly positive: sigma == 0 would give log(0)
        # and an undefined Gaussian density.
        if (sigma_wt <= 0) or (sigma_mut <= 0):
            return -np.inf

        # Gaussian log likelihood of each sample; the extra -log(sigma)
        # term multiplies the likelihood by a 1/sigma prior factor.
        log_post_wt = st.norm.logpdf(x_wt, mu_wt, sigma_wt).sum() - np.log(sigma_wt)
        log_post_mut = st.norm.logpdf(x_mut, mu_mut, sigma_mut).sum() - np.log(sigma_mut)

        return log_post_wt + log_post_mut

    def negative_log_post(p, x_wt, x_mut):
        """Objective for the minimizer: minus the log posterior."""
        return -log_post(p, x_wt, x_mut)

    def initial_scale(x):
        """Sample standard deviation, or 1.0 if it is not strictly positive."""
        spread = np.std(x)
        return spread if spread > 0 else 1.0

    if p0 is None:
        p0 = [wt.mean(), mut.mean(), initial_scale(wt), initial_scale(mut)]
    p0 = np.asarray(p0, dtype=float)

    # Find the most probable parameters
    res = scipy.optimize.minimize(negative_log_post, p0, args=(wt, mut))
    if not res.success:
        # Report, but do not abort: the optimizer can flag precision loss
        # even when it has stopped at a usable optimum.
        warnings.warn(
            "MAP optimization reported failure: {}".format(res.message),
            RuntimeWarning,
        )

    # Gaussian approximation: covariance is minus the inverse Hessian
    hes = ndt.Hessian(log_post)(res.x, wt, mut)
    cov = -np.linalg.inv(hes)

    bayesian_estimate_diff = res.x[0] - res.x[1]

    # Var(mu_wt - mu_mut) = Var(mu_wt) + Var(mu_mut) - 2 Cov(mu_wt, mu_mut)
    var_diff = cov[0, 0] + cov[1, 1] - 2 * cov[0, 1]
    credible_interval = 1.96 * np.sqrt(var_diff)

    return bayesian_estimate_diff, credible_interval

# Most probable difference of the means and the half-width returned with it
bayesian_estimate, credible_interval = get_bayesian_estimate(df)

print(
    """The bayesian estimate of the difference of the mean is:
    µ(wt)-µ(mut) = {0:.4f} ± {1:.4f}""".format(
        bayesian_estimate, credible_interval
    ),
    "minutes.",
)

The bayesian estimate of the difference of the mean is:
    µ(wt)-µ(mut) = 0.4823 ± 0.3978 minutes.


# Part D