# Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [1]:
from __future__ import print_function, division

%matplotlib inline

import numpy as np

import random

import thinkstats2
import thinkplot


## Hypothesis testing

## Exercises

**Exercise:** As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use `thinkstats2.SampleRows` to select a random subset of the rows in a DataFrame.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [2]:
# So we are going to test in the below class by taking the difference of the means between two groups.
# The mean pregnancy lengths of the first babies and the other.

In [3]:
class DiffMeansPermute(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = abs(group1.mean() - group2.mean())
        return test_stat
    
    def MakeModel(self):
        group1, group2 = self.data
        self.n, self.m = len(group1), len(group2)
        self.pool = np.hstack((group1, group2))

    def RunModel(self):
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

In [4]:
# The below class is to measure the standard deviation between the 2 groups

In [5]:
class DiffStdPermute(DiffMeansPermute):

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = group1.std() - group2.std()
        return test_stat

We use the chi-squared statistic to test the statistical significance of the pregnancy lengths of first babies and the others especially in the range between 35 and 43 weeks

In [6]:
class DiffChisquaredPermute(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        firsts, others = data
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
        return stat
    
    def ChiSquared(self, lengths):
        hist = thinkstats2.Hist(lengths)
        observed = np.array(hist.Freqs(self.values))
        expected = self.expected_probs * len(lengths)
        stat = sum((observed - expected)**2 / expected)
        return stat
    
    def MakeModel(self):
        firsts, others = self.data
        self.n = len(firsts)
        self.pool = np.hstack((firsts, others))

        pmf = thinkstats2.Pmf(self.pool)
        self.values = range(35, 44)
        self.expected_probs = np.array(pmf.Probs(self.values))
    
    def RunModel(self):
        xs, ys = self.data
        xs = np.random.permutation(xs)
        return xs, ys

In [7]:
# The below class is to calculate the correlation

In [8]:
class DiffCorrelationPermute(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        xs, ys = data
        test_stat = abs(thinkstats2.Corr(xs, ys))
        return test_stat
    
    
    def RunModel(self):
        xs, ys = self.data
        xs = np.random.permutation(xs)
        return xs, ys

In [9]:
# importing the NSFG data
import first
## Populating 3 dataframes from the NSFG file
live,first_birth,other_birth = first.MakeFrames()

The below function run_sample is used to run the different tests for different statistics for the same sample records.

In [10]:
def run_sample(sample, iters=1000):
    
    n = len(sample)
    firsts = sample[sample.birthord == 1]
    others = sample[sample.birthord != 1]
    
    ## Statitics is the difference in the mean pregnancy lengths for first and other births   
    data = firsts.prglngth.values, others.prglngth.values
    Null_ht_mean = DiffMeansPermute(data)
    p_mean = Null_ht_mean.PValue(iters = iters)

    ## Difference in the chi-squared lengths of pregnancies for first and other births   
    data = firsts.prglngth.values, others.prglngth.values
    Null_ht_chisq = DiffChisquaredPermute(data)
    p_chisq = Null_ht_chisq.PValue(iters = iters)

    
    
    ## Statitics is the difference in the standard deviation
    ## Null hypothesis is that the std is higher for first babies
    data = firsts.prglngth.values, others.prglngth.values
    Null_ht_sd = DiffStdPermute(data)
    p_std = Null_ht_sd.PValue(iters = iters)

    ## Null hypothesis is that there is no correlation between mother's weight and child's birth weight
    ## Test STatistics used here is Pearson's correlation
    ## We will be doing a two sided test
    live_sample = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    data = live_sample.agepreg.values, live_sample.totalwgt_lb.values
    Null_ht_corr = DiffCorrelationPermute(data)
    p_corr = Null_ht_corr.PValue(iters = iters)    
  
    
    print ('%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\t' % (n,p_mean, p_chisq, p_std,p_corr ))
    # print(Null_ht_corr.actual, Null_ht_corr.MaxTestStat())
    # print(Null_ht_chisq.actual, Null_ht_chisq.MaxTestStat())

In [11]:
# looping through the sample
print('==============================================')
print('sample\tpmean\tpchi-sq\tpstd\tpcorr')
print('==============================================')
n = len(live)
for _ in range(7):
    sample = thinkstats2.SampleRows(live, n)
    run_sample(sample)
    n //= 2

print('==============================================')

sample	pmean	pchi-sq	pstd	pcorr
9148	0.17	1.00	0.10	0.00	
4574	0.11	1.00	0.25	0.00	
2287	0.86	1.00	0.04	0.00	
1143	0.51	1.00	0.78	0.00	
571	0.04	1.00	0.94	0.00	
285	0.84	1.00	0.15	0.00	
142	0.65	1.00	0.45	0.00	


In [12]:

# sample ---> sample size for each of the 1000 iterations
# pmean  ---> difference in the mean pregnancy lengths
# pchi-sq --> difference in the chi-squared lengths of pregnancies
# pstd -----> difference in the standard deviations of the pregnancy lengths
# pcorr ----> correlation between mother's weight and child's birth weight

# Correlation Test:
# The reported p-value is 0, which means that in 1000 trials we didn't see a correlation, under the null hypothesis, 
# that exceeded the observed correlation. That means that the p-value is probably smaller than  1/1000 ,
# but it is not actually 0.

# Chi-Squared Test:
# The chi-squared p-value stays 1 for the entire population or the samples because the actual value is > 1
# We conclude here that the observed chi-squared statistic is unlikely under the null hypothesis. 
# So the result is statistically significant.

# Mean and Standard Deviation:
# As expected the mean and the standard deviation are behaving erratic. 