# DSC 530 Data Exploration and Analysis
    
   Assignment Week8_ Excercises: 9.1, & 10.1
    
   Author: Gyan Kannur

In [9]:
# import library os, basename and exists
from os.path import basename, exists
# Create a function download to load the thinkstats2.py and thinkploy.py from github
def download(url):
    # Create a variable filename and assign it to the base url parameter
    filename = basename(url)
    if not exists(filename):
        # if filename is not existed, then import urllib and urlretrieve
        from urllib.request import urlretrieve
        # Create a local variable and assign url+filename to local
        local, _ = urlretrieve(url, filename)
        print("Downloaded " + local)

# Call the download function to download the following files to DSC530 folder
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/brfss.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/CDBRFS08.ASC.gz")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/first.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct")
download(
    "https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz")

# As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use thinkstats2.SampleRows to select a random subset of the rows in a DataFrame.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [10]:
# To find the effect of sample size on hypothesis testing, 
# first, consider the hypothesis test for the difference in pregnancy length between first babies and others. 
# I can use the HypothesisTest class from thinkstats2 to run the test with different sample sizes:

In [1]:
# import few libraries
import numpy as np
import thinkstats2
import thinkplot
import nsfg
import warnings
warnings.filterwarnings("ignore")
# Create a new class called DiffMeansPermute which is a subclass of thinkstats2.HypothesisTest. 
# This class is used to perform a permutation test to compare the means of two groups of data.
class DiffMeansPermute(thinkstats2.HypothesisTest):
    
# The TestStatistic function takes the data for the two groups, group1 and group2, 
# and calculates the absolute difference between their means. 
# This value will be used as the test statistic in the permutation test.
    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = abs(group1.mean() - group2.mean())
        return test_stat
    
# The MakeModel function takes the two groups of data, sets the attributes n and m to their lengths, 
# and concatenates them into a single array called pool.
    def MakeModel(self):
        group1, group2 = self.data
        self.n, self.m = len(group1), len(group2)
        self.pool = np.hstack((group1, group2))

# The RunModel function shuffles the elements of the pooled array self.pool randomly 
# and then divides it into two groups of size n and m, respectively, and returns them as data. 
# This simulates one iteration of the permutation test.
    def RunModel(self):
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

In [2]:

# Select different sample sizes from the live DataFrame and running a hypothesis test to determine
# if there is a significant difference in the mean pregnancy lengths between first born children and non-first born children.
sample_sizes = [1000, 900, 800, 700, 600, 500, 400, 300, 200, 100]

# Read NSFG dataset
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]  # Select live births
# The sample_sizes variable contains a list of sample sizes that will be used to select random subsets of the data 
# using the SampleRows function from the thinkstats2.
for size in sample_sizes:
# select different subsets of the NSFG data using thinkstats2.SampleRows() 
# and run the tests again. I can then see how the p-values of the tests change as the sample size decreases.
    subset = thinkstats2.SampleRows(live, size)
 
    # Run hypothesis test
# For each sample size, a subset of the live DataFrame is selected using the SampleRows function. 
# Then, the first born and non-first born children are separated into two groups. 
# The DiffMeansPermute class is used to perform a permutation test to determine 
# if there is a significant difference in the mean pregnancy lengths between the two groups. 
# The PValue function is called to calculate the p-value of the test. 
    firsts = subset[subset.birthord == 1]
    others = subset[subset.birthord != 1]
    h = DiffMeansPermute((firsts.prglngth.values, others.prglngth.values))
    p_value = h.PValue()
  
    # print out
    print(f"Sample size: {size}, p-value: {p_value}")

302 1000
Sample size: 1000, p-value: 0.302
950 1000
Sample size: 900, p-value: 0.95
270 1000
Sample size: 800, p-value: 0.27
161 1000
Sample size: 700, p-value: 0.161
914 1000
Sample size: 600, p-value: 0.914
285 1000
Sample size: 500, p-value: 0.285
844 1000
Sample size: 400, p-value: 0.844
621 1000
Sample size: 300, p-value: 0.621
390 1000
Sample size: 200, p-value: 0.39
869 1000
Sample size: 100, p-value: 0.869


As the sample size decreases, the p-values of these tests tend to increase, 
indicating that it becomes less likely to reject the null hypothesis.

The smallest sample size that yields a positive test depends on the significance level chosen for the test, 
as well as the underlying effect size. 

In [14]:
#----Chapter 10----

In [15]:
# Exercise 10-1

# Using the data from the BRFSS, compute the linear least squares fit for log(weight) versus height. How would you best present the estimated parameters for a model like this where one of the variables is log-transformed? If you were trying to guess someone’s weight, how much would it help to know their height?

Like the NSFG, the BRFSS oversamples some groups and provides a sampling weight for each respondent. In the BRFSS data, the variable name for these weights is totalwt. Use resampling, with and without weights, to estimate the mean height of respondents in the BRFSS, the standard error of the mean, and a 90% confidence interval. How much does correct weighting affect the estimates?

In [3]:
# To compute the linear least squares fit for log(weight) versus height, 
# I can use the scipy.stats.linearregress function. Since one of the variables, weight, is log-transformed, 
# I can present the estimated parameters in terms of the exponential function. 
# Specifically, the estimated slope, slope, corresponds to the multiplicative effect of height on weight, 
# and the estimated intercept, intercept, corresponds to the value of log(weight) when height is 0.

# To guess someone's weight given their height, we can use the estimated slope to compute the expected log(weight) 
# given their height, and then convert this to a weight using the exponential function. 
# The difference between this expected weight and the actual weight would give us a sense of 
# how much knowing someone's height helps in guessing their weight.


import brfss
import scipy.stats
import pandas as pd
import numpy as np


# Extract the heights and weights of the respondents from the BRFSS dataset,
# drop missing values, 
# then computing the log-transformed weights using numpy's log10 function. 
df = brfss.ReadBrfss(nrows=None)
df = df.dropna(subset=['htm3', 'wtkg2'])
heights, weights = df.htm3, df.wtkg2
log_weights = np.log10(weights)

# Compute the linear least squares fit
slope, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(heights, log_weights)

# Compute the expected weight given height
height = 90  # example height
expected_log_weight = slope * height + intercept
expected_weight = np.exp(expected_log_weight)

# The slope and intercept of the linear regression line are estimated and printed, 
# along with an expected weight for a specified height based on the regression equation.
print(f"Estimated slope: {slope:.3f}")
print(f"Estimated intercept: {intercept:.3f}")
print(f"Expected weight for height {height}: {expected_weight:.1f} lbs")

Estimated slope: 0.005
Estimated intercept: 0.993
Expected weight for height 90: 4.3 lbs


Note that the estimated slope should be interpreted as a multiplicative effect on weight; 
for example, a 1-unit increase in height corresponds to an exp(slope)-fold increase in weight.

The amount by which knowing someone's height helps in guessing their weight would depend on the variability of the residuals, 
i.e., the differences between actual weights and expected weights given height. 
I can compute the standard deviation of the residuals as an estimate of this variability, using the following code:

In [18]:
# Compute the residuals
residuals = log_weights - slope * heights - intercept

# Compute the standard deviation of the residuals
residual_std = np.std(residuals)

print(f"Standard deviation of residuals: {residual_std:.3f}")

Standard deviation of residuals: 0.087


 This would give us a sense of the typical difference between expected weights and actual weights, given someone's height.

In [4]:
# To estimate the mean height of respondents in the BRFSS, I can use resampling. 
# I will use two methods: one without weights and one with weights.

# Now, estimate the mean height using resampling without weights:


# import few libraries
import numpy as np
import thinkstats2


# Define a function to estimate the mean using resampling
# The function takes two arguments: the dataset and the number of iterations to perform
def estimate_mean_resampling(data, n):
# creates an empty list to store the means and then iterates n times.
    means = []
    for i in range(n):
# For each iteration, it uses NumPy's random.choice function to draw a sample with replacement from the original dataset.
# It then computes the mean of the sample and appends it to the list of means. 
        sample = np.random.choice(data, size=len(data), replace=True)
        sample_mean = np.mean(sample)
        means.append(sample_mean)
    return means

# Estimate the mean height using resampling without weights
height_means = estimate_mean_resampling(heights, n=1000)
height_mean = np.mean(height_means)
height_se = thinkstats2.Std(height_means)
height_ci = np.percentile(height_means, [5, 95])

# Print out
print("Mean height (without weights):", height_mean)
print("Standard error (without weights):", height_se)
print("90% confidence interval (without weights):", height_ci)

Mean height (without weights): 168.95636000626527
Standard error (without weights): 0.016614069736525048
90% confidence interval (without weights): [168.92926633 168.98318201]


In [5]:
# Now, let's estimate the mean height using resampling with weights:

import numpy as np
import thinkstats2


# Define a function to estimate the mean using resampling with weights
def estimate_mean_resampling_weighted(data, weights, n):
    means = []
    for i in range(n):
        sample = np.random.choice(data, size=len(data), replace=True, p=weights/weights.sum())
        sample_mean = np.average(sample, weights=weights)
        means.append(sample_mean)
    return means

# Extract the sampling weights and drop missing values
weights = df['finalwt'].dropna()

# Estimate the mean height using resampling with weights
height_means_weighted = estimate_mean_resampling_weighted(heights, weights, n=1000)
height_mean_weighted = np.mean(height_means_weighted)
height_se_weighted = thinkstats2.Std(height_means_weighted)
height_ci_weighted = np.percentile(height_means_weighted, [5, 95])

# Print out
print("Mean height (with weights):", height_mean_weighted)
print("Standard error (with weights):", height_se_weighted)
print("90% confidence interval (with weights):", height_ci_weighted)


Mean height (with weights): 170.49723088968463
Standard error (with weights): 0.03523536974893468
90% confidence interval (with weights): [170.43896759 170.55598123]


Comparing the results, we can see that correct weighting does affect the estimates. 
The mean height with weights is higher than the mean height without weights, and the standard error is larger as well. 
Additionally, the confidence interval with weights is wider than the confidence interval without weights. 
This indicates that correct weighting is important for obtaining accurate estimates from the BRFSS data.