In [6]:
import pandas as pd
import numpy as np
from scipy.stats import norm, t

### Read data

In [7]:
# Load the Stack Overflow survey sample from the data directory next to this
# notebook's folder. Forward slashes resolve on Windows, macOS and Linux alike,
# whereas a backslash-separated path is only a valid relative path on Windows.
stack_overflow = pd.read_feather("../data/stack_overflow.feather")

In [8]:
# Relative frequency of each age category (normalize=True yields proportions
# instead of raw counts).
stack_overflow.loc[:, "age_cat"].value_counts(normalize=True)

Under 30       0.535604
At least 30    0.464396
Name: age_cat, dtype: float64

### Hypotheses

* H0: Proportion of Stack Overflow users under thirty = 0.5
* HA: Proportion of Stack Overflow users under thirty != 0.5

alpha = 0.1

<img src="../data/one_sample_prop_1.jpg" alt="One-sample proportion test illustration 1" width="600" height="400">

<img src="../data/one_sample_prop_2.jpg" alt="One-sample proportion test illustration 2" width="800" height="300">

In [9]:
# Test set-up: significance level and the proportion stated by H0
alpha = 0.1
p_0 = 0.50

# Sample size and observed share of respondents labelled "Under 30"
n = stack_overflow.shape[0]
p_hat = stack_overflow["age_cat"].eq("Under 30").mean()

# z = (p_hat - p_0) / standard error, with the standard error evaluated at p_0
numerator = p_hat - p_0
denominator = np.sqrt(p_0 * (1 - p_0) / n)
z_score = numerator / denominator
z_score

3.385911440783663

In [10]:
# Left-tailed alternative ("less than"): the p-value is the area under the
# standard normal curve to the left of the observed z-score.
p_value = norm.cdf(z_score)

left_tail_report = "Left-tailed p-value is:{:.2f}".format(p_value)
print(left_tail_report)

Left-tailed p-value is:1.00


In [11]:
# Right-tailed alternative ("greater than"): the p-value is the area under the
# standard normal curve to the right of the observed z-score.
# norm.sf is the survival function (1 - cdf) evaluated directly, which avoids
# the rounding loss of computing 1 - norm.cdf(z) when z is far in the tail.
p_value = norm.sf(z_score)
print("Right-tailed  p-value is:{:.2f}".format(p_value))

Right-tailed  p-value is:0.00


In [12]:
# Two-tailed alternative ("not equal to"): the p-value is the probability of a
# z-score at least as extreme as the observed one in either direction, i.e.
# twice the upper-tail area beyond |z|.
# abs() keeps the formula valid when z_score is negative (without it a negative
# z would yield a value above 1); norm.sf(x) is 1 - cdf(x) computed directly,
# without the cancellation error of the explicit subtraction.
p_value = 2 * norm.sf(abs(z_score))

# Print the p-value
print("The p-value is:{:.2f}".format(p_value))

# Decision rule: reject H0 when the p-value does not exceed alpha
if p_value <= alpha:
    print("p_value <= alpha. Reject H0 in favour of HA")
else:
    print("Failed to reject H0")

The p-value is:0.00
p_value <= alpha. Reject H0 in favour of HA
