**d)** First we import our work horses and load a `DataFrame`

In [2]:
import numpy as np
import pandas as pd
import scipy.optimize
import numba
import matplotlib.pyplot as plt
import seaborn as sns

# Magic function to make matplotlib inline
%matplotlib inline

# import Justin's settings
rc = {'lines.linewidth': 2,
      'axes.labelsize': 18,
      'axes.titlesize': 18,
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

df = pd.read_csv('./data/mean_rest_bouts.csv', comment = '#')

In [3]:
# Since there are two fish whose rest bouts in a minute interveral
# are undetectable we will use .fillna to set them to zero

wt = df[df['genotype'] == 'wt'].fillna(0)

mut = df[df['genotype'] == 'mut'].fillna(0)

# we say that mu is our mean and sigma^2 is n / (n - 1) of our sample variance
# compute the mean and the variance for our wt and mutant data sets 

wt_mean = wt['mean_rest_bout_length'].mean()
wt_var = wt['mean_rest_bout_length'].var(ddof=1)

mut_mean = mut['mean_rest_bout_length'].mean()
mut_var = mut['mean_rest_bout_length'].var(ddof=1)

print ("""
           µ         σ          n
        ---------------------------
WT      {0:.4f}     {2:.4f}      {4:d}
Venus   {1:.4f}     {3:.4f}       {5:d}
""".format(wt_mean, mut_mean, np.sqrt(wt_var), np.sqrt(mut_var), 
           len(wt['mean_rest_bout_length']), len(mut['mean_rest_bout_length'])))


           µ         σ          n
        ---------------------------
WT      2.2094     0.5229      17
Venus   1.7271     0.8051       22



In [4]:
# we want to make an array of the mean rest bout lengths
wt = wt['mean_rest_bout_length'].values

mut = mut['mean_rest_bout_length'].values


In [30]:
# our null hypothesis is that the mutant and the wild type are drawn from the same
# Gaussian distribution in other words we want to test if the mutant mean is the same
# as the wt mean

# We first combine all of the measurements and then distrbute them to a set of 17 and 
# a set of 22

def final_p_value(w, m, trials, greater_or_less='greater', return_abs=False):
    
    def distribute_samples(w, m, some_func, trials, args=()):
        '''
        combine our measurements for rest bout and redistribute them to two groups
        '''
    
        # first we create an empty array to store our samples
        samples = np.empty(trials)
    
        # concatenate all the measurements
        measurements = np.concatenate((w, m))
    
        # create sets for all trials 
        for i in range(trials):
            measurements = np.random.permutation(measurements)
            samples[i] = some_func(measurements[:len(w)], measurements[len(w):], *args)
     
        return samples


    # now I want to define the p-value
    def p_value(samples, actual, greater_or_less='greater'):
        '''
        compute the p value (the probability our measurement is 
        greather than or less than the actual value
        '''
    
        if greater_or_less == 'greater':
            return np.sum(samples >= actual) / len(samples)
        else:
            return np.sum(samples <= actual) / len(samples)

    # now we can run our test and see if we will get the actual difference in our means
    # this will be my some_func in my distribute_samples function
    # now we can run our a Welch's t-test
    # The T statistic is written out in tutorial 6b
    def T_stat(w, m, return_abs=False):
        '''
        Calculate the absolute difference of the means
        '''
        w_s = w.var(ddof=1)
        m_s = m.var(ddof=1)
    
        sd = ((len(w)-1) * w_s**2 + (len(m) - 1) * m_s**2) / (len(w) + len(m) - 2)
    
        diff = w.mean() - m.mean() - np.sqrt(w.var()) + np.sqrt(m.var()) 
        pool_variance = sd * np.sqrt(1/(len(w)) + 1/(len(m)))
            
        if return_abs:
            return np.abs(diff) / pool_variance   
        return diff / pool_variance


    # let's retrieve all our random sample sets of two
    samples = distribute_samples(w, m, T_stat, trials, args = (True,))

    # calculate p value
    p_value_student_t = p_value(samples, T_stat(w, m))

    return p_value_student_t

p_value = final_p_value(wt, mut, 100000)
print("Welch's t test p value =", p_value)

Welch's t test p value = 0.02074


**e)** We want to randomly generate a set of 17 mean sleep bout lengths for wild type fish from a guassian distritubtion using the mean, $\mu_w$, and variance, $\sigma^2_w$. We can do this by defining a function that results in an array of means. 

In [23]:
def gaussian(mu, sigma, n, args=()):
    '''
    define the gaussian distribution
    '''
    vals = np.random.normal(mu, sigma, n)
    
    return vals

wt_set = gaussian(2.2, 0.5, 17)
mut_set = gaussian(1.7, 0.8, 22)


In [29]:
p_val = final_p_value(wt_set, mut_set, 100000)
print('p value = ',p_val)

p value =  2e-05
