In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, t
from statsmodels.stats.proportion import proportions_ztest

### Read data

In [2]:
# Load the Stack Overflow survey data. The path uses forward slashes so it
# resolves on Windows, macOS and Linux alike (a backslash path only works on Windows).
stack_overflow = pd.read_feather("../data/stack_overflow.feather")

<img src="../data/comparing_two_props.jpg" width="800" height="250">

In [3]:
# Significance level: the p-value is compared against this threshold to decide
# whether to reject the null hypothesis.
alpha = 0.05

<img src="../data/two_sample_prop_z_score.jpg" width="600" height="400">

In [4]:
# Within each age category, compute the share of each "hobbyist" answer
# (normalize=True turns the per-group counts into proportions).
p_hats = (
    stack_overflow
    .groupby("age_cat")["hobbyist"]
    .value_counts(normalize=True)
)
print(p_hats)

age_cat      hobbyist
At least 30  Yes         0.773333
             No          0.226667
Under 30     Yes         0.843105
             No          0.156895
Name: hobbyist, dtype: float64


In [5]:
# Number of non-missing "hobbyist" answers in each age category,
# i.e. the sample size of each group.
n = (
    stack_overflow
    .groupby("age_cat")["hobbyist"]
    .count()
)
print(n)

age_cat
At least 30    1050
Under 30       1211
Name: hobbyist, dtype: int64


In [6]:
# Manual two-sample z-test for a difference in the proportion of hobbyists
# between the "At least 30" and "Under 30" age categories.

# Sample proportions of "Yes" answers and the sample size of each group
p_hat_at_least_30 = p_hats[("At least 30", "Yes")]
p_hat_under_30 = p_hats[("Under 30", "Yes")]
n_at_least_30 = n["At least 30"]
n_under_30 = n["Under 30"]

# Calculate the pooled estimate of the population proportion
# (the two sample proportions weighted by their sample sizes)
p_hat = (n_at_least_30 * p_hat_at_least_30 + n_under_30 * p_hat_under_30) / (n_at_least_30 + n_under_30)

# Calculate the standard error of the difference, using the pooled proportion
# for both groups
std_error = np.sqrt(
    p_hat * (1 - p_hat) / n_at_least_30 + p_hat * (1 - p_hat) / n_under_30
)

# Calculate the z-score
z_score = (p_hat_at_least_30 - p_hat_under_30) / std_error
print("z-score is:{}".format(z_score))

# Two-tailed p-value: twice the upper-tail probability beyond |z|.
# norm.sf (the survival function, 1 - cdf) is evaluated directly in the tail,
# so it stays accurate where `1 - norm.cdf(z)` would round to 0 for large z.
p_value = 2 * norm.sf(abs(z_score))
print("The two-tailed p-value is:{}".format(p_value))

# Decision rule: reject H0 when the p-value does not exceed the significance level
if p_value <= alpha:
    print("p_value <= alpha. Reject H0 in favour of HA")
else:
    print("Failed to reject H0")

z-score is:-4.223691463320559
The two-tailed p-value is:2.403330142685068e-05
p_value <= alpha. Reject H0 in favour of HA


This tiny p-value leads us to suspect that the proportion of hobbyist users differs between those under 30 and those who are at least 30.

### Proportion tests using proportions_ztest()

In [7]:
# Count the "Yes"/"No" hobbyist answers within each age category.
# The counts get their own name instead of rebinding `n`: `n` already holds the
# per-group sample sizes read by the manual z-test cell, and overwriting it with
# this differently shaped Series would break that cell when it is re-run.
hobbyist_counts = stack_overflow.groupby("age_cat")["hobbyist"].value_counts()

# Create an array of the "Yes" counts for each group
n_hobbyists = np.array([
    hobbyist_counts[("At least 30", "Yes")],
    hobbyist_counts[("Under 30", "Yes")],
])

# Create an array of the total number of rows in each group
n_rows = np.array([
    hobbyist_counts[("At least 30", "Yes")] + hobbyist_counts[("At least 30", "No")],
    hobbyist_counts[("Under 30", "Yes")] + hobbyist_counts[("Under 30", "No")],
])

# Run a z-test on the two proportions; this overwrites the z_score and p_value
# computed by hand so the two approaches can be compared
z_score, p_value = proportions_ztest(count=n_hobbyists, nobs=n_rows,
                                     alternative="two-sided")

print("z-score is:{}".format(z_score))
print("The two-tailed p-value is:{}".format(p_value))

z-score is:-4.223691463320559
The two-tailed p-value is:2.403330142685068e-05


In [8]:
# Report the test decision: H0 is rejected when the p-value does not exceed alpha.
print(
    "p_value <= alpha. Reject H0 in favour of HA"
    if p_value <= alpha
    else "Failed to reject H0"
)

p_value <= alpha. Reject H0 in favour of HA
