In [8]:
import math
from scipy.stats import binom, norm

# parameters
p = 0.03          # probability of failure
n_per_hour = 1200 # requests per hour
hours = 8
threshold = 350

# total number of trials
N = n_per_hour * hours

# exact binomial tail P(X >= threshold)
prob_binom = 1 - binom.cdf(threshold-1, N, p)

# normal approximation with continuity correction
mu = N * p
sigma = math.sqrt(N * p * (1 - p))
z = (threshold - 0.5 - mu) / sigma
prob_normal = 1 - norm.cdf(z)

# print results and differences
print("N =", N)
print("Exact binomial P(X ≥ {}):".format(threshold), prob_binom)
print("Normal approximation (with continuity correction):", prob_normal)
print("Absolute difference:", abs(prob_normal - prob_binom))
print("Relative difference:", (prob_normal - prob_binom) / prob_binom)


N = 9600
Exact binomial P(X ≥ 350): 0.00017722086985161312
Normal approximation (with continuity correction): 0.00011682955237424242
Absolute difference: 6.03913174773707e-05
Relative difference: -0.34076865511345417


In [5]:
import math

p = 0.03
n = 1200 * 8
threshold = 350

mu = n * p
sigma = math.sqrt(n * p * (1 - p))

# continuity correction: P(X ≥ 350) ≈ P(Z ≥ (349.5 - μ)/σ)
z = (threshold - 0.5 - mu) / sigma
prob = 0.5 * math.erfc(z / math.sqrt(2))

print("Approximate probability:", prob)


Approximate probability: 0.00011682955237420686


In [2]:
import kagglehub
import pandas as pd
import numpy as np
from math import sqrt
from scipy.stats import norm

path = kagglehub.dataset_download("stackoverflow/stack-overflow-2023-developers-survey")

import os
for file in os.listdir(path):
    print(file)

df = pd.read_csv(path + "/survey_results_public.csv")  # file name may differ
df.head()


emp = df['Employment'].dropna()

unemployed_mask = emp.str.startswith("Not employed")
unemployed_looking_mask = emp.eq("Not employed, but looking for work")

n_unemployed = unemployed_mask.sum()
n_unemployed_looking = unemployed_looking_mask.sum()

prop_unemployed_looking = n_unemployed_looking / n_unemployed if n_unemployed > 0 else np.nan

print("Proportion of unemployed developers who are looking for work:",
      prop_unemployed_looking)  # This answers your text question


# ----------------------------------------------------
# 2) Proportion of developers with > 20 years of experience
#    Point estimate + 95% CI + precision stuff
# ----------------------------------------------------
# Column is usually 'YearsCodePro' with values like:
# 'Less than 1 year', '1', '2', ..., '50', 'More than 50 years'

def parse_years(x):
    if pd.isna(x):
        return np.nan
    if x == "Less than 1 year":
        return 0.5
    if x == "More than 50 years":
        return 55.0
    try:
        return float(x)
    except ValueError:
        return np.nan

years_raw = df['WorkExp'].dropna()
years_num = years_raw.apply(parse_years)

# Keep only valid numeric
years_num = years_num.dropna()

n_total = len(years_num)
more_than_20 = years_num > 20
count_more_than_20 = more_than_20.sum()

p_hat = count_more_than_20 / n_total  # point estimate

print("Answer 1 Question 3 (point estimate, p̂ for > 20 years):", p_hat)

# ----------------------------------------------------
# 3) 95% confidence interval for that proportion
# ----------------------------------------------------
z = norm.ppf(0.975)  # 1.96-ish
se = sqrt(p_hat * (1 - p_hat) / n_total)
margin_of_error = z * se

ci_lower = p_hat - margin_of_error
ci_upper = p_hat + margin_of_error

print("Answer 2 Question 3 (95% CI lower bound):", ci_lower)
print("Answer 3 Question 3 (95% CI upper bound):", ci_upper)

# ----------------------------------------------------
# 4) Test if ≥ 3% of Stack Overflow users are unemployed
# ----------------------------------------------------
# Use same Employment column, this time unconditional unemployment proportion
emp_all = df['Employment'].dropna()
n_emp = len(emp_all)

unemployed_any = emp_all.str.startswith("Not employed")
count_unemployed = unemployed_any.sum()
p_hat_unemp = count_unemployed / n_emp

p0 = 0.03  # claimed minimum proportion
# One-sided test H0: p = 0.03, H1: p > 0.03
se0 = sqrt(p0 * (1 - p0) / n_emp)
z_test = (p_hat_unemp - p0) / se0
p_value = 1 - norm.cdf(z_test)  # one-sided tail

print("Unemployment point estimate:", p_hat_unemp)
print("z statistic for test p >= 0.03:", z_test)
print("One-sided p-value:", p_value)

# Decision at alpha = 0.05 (adjust if needed)
supports_claim = (p_hat_unemp >= p0) and (p_value < 0.05)

print("Answer 4 Question 3 (does data support ≥ 3% unemployed?):",
      "Yes" if supports_claim else "No")

# ----------------------------------------------------
# 5) Precision checks for the > 20 years proportion
# ----------------------------------------------------
# Margin of error (absolute)
print("Answer 5 Question 3 (margin of error, absolute):", margin_of_error)

# Standard error
print("Answer 6 Question 3 (standard error):", se)

# Relative error (point estimate): SE / p̂
rel_error_point = se / p_hat
print("Answer 7 Question 3 (relative error, point estimate):", rel_error_point)

# Relative error (interval estimate): ME / p̂
rel_error_interval = margin_of_error / p_hat
print("Relative error (interval estimate):", rel_error_interval)

README_2023.txt
so_survey_2023.pdf
survey_results_public.csv
survey_results_schema.csv
Proportion of unemployed developers who are looking for work: 0.5819466605881012
Answer 1 Question 3 (point estimate, p̂ for > 20 years): 0.15271116822322678
Answer 2 Question 3 (95% CI lower bound): 0.14933393851021035
Answer 3 Question 3 (95% CI upper bound): 0.1560883979362432
Unemployment point estimate: 0.04991012309722633
z statistic for test p >= 0.03: 34.60325453355236
One-sided p-value: 0.0
Answer 4 Question 3 (does data support ≥ 3% unemployed?): Yes
Answer 5 Question 3 (margin of error, absolute): 0.003377229713016433
Answer 6 Question 3 (standard error): 0.0017231080467067714
Answer 7 Question 3 (relative error, point estimate): 0.011283444863626505
Relative error (interval estimate): 0.02211514555425141


In [3]:
import kagglehub
import pandas as pd
import numpy as np
from scipy.stats import norm
from math import sqrt, ceil

# Download the latest version
path = kagglehub.dataset_download("dellakoovakkattu/bellabeatdailyactivity")

import os
for file in os.listdir(path):
    print(file)

df = pd.read_csv(path + "/dailyActivity_merged.csv")

# ----------------------------------------------------
# Use the "VeryActiveMinutes" variable
# ----------------------------------------------------
x = df["VeryActiveMinutes"].dropna()

n = len(x)
x_bar = x.mean()
s = x.std(ddof=1)

# ----------------------------------------------------
# Standard error
# ----------------------------------------------------
se = s / sqrt(n)

# ----------------------------------------------------
# 99% confidence interval for the mean
# ----------------------------------------------------
z = norm.ppf(0.995)  # two-sided 99%
margin = z * se

ci_lower = x_bar - margin
ci_upper = x_bar + margin

# ----------------------------------------------------
# Relative error (percentage)
# ----------------------------------------------------
relative_error_percent = (margin / x_bar) * 100

# ----------------------------------------------------
# Required sample size for 3% precision (interval estimate)
# margin ≤ 0.03 * mean
# n ≥ (z * s / (0.03 * mean))^2
# ----------------------------------------------------
desired_precision = 0.03 * x_bar
n_required = ceil((z * s / desired_precision) ** 2)

# ----------------------------------------------------
# Print results in the exact format you need
# ----------------------------------------------------
print("Answer 1 Question 4 (mean):", x_bar)
print("Answer 2 Question 4 (standard error):", se)
print("Answer 3 Question 4 (CI lower):", ci_lower)
print("Answer 4 Question 4 (CI upper):", ci_upper)
print("Answer 5 Question 4 (margin of error):", margin)
print("Answer 6 Question 4 (relative error %):", relative_error_percent)

# Interpretation category:
#   Typically:
#     <5%  = high precision
#     5–10% = moderate precision
#     >10% = low precision
if relative_error_percent < 5:
    precision_label = "high-precision estimate"
elif relative_error_percent < 10:
    precision_label = "moderately precise estimate"
else:
    precision_label = "low-precision estimate"

print("Answer 7 Question 4:", precision_label)
print("Answer 8 Question 4 (required n for 3% precision):", n_required)

dailyActivity_merged.csv
Answer 1 Question 4 (mean): 21.164893617021278
Answer 2 Question 4 (standard error): 1.071279354397138
Answer 3 Question 4 (CI lower): 18.405460863678183
Answer 4 Question 4 (CI upper): 23.924326370364373
Answer 5 Question 4 (margin of error): 2.7594327533430953
Answer 6 Question 4 (relative error %): 13.037782297775871
Answer 7 Question 4: low-precision estimate
Answer 8 Question 4 (required n for 3% precision): 17754
