In [2]:
# Scenario 1 -> 7+ Heads out of 10 Iterations
# H0 -> Coin is fair.
# Ha -> Coin is biased towards heads. -> Ha: P(Heads) > 0.5
# P(No. of Heads = 7 | 8 | 9 | 10)

from scipy.stats import binom

1 - binom.cdf(k = 6, n = 10, p = 0.5)

0.171875

In [3]:
binom.pmf(k = 7, n = 10, p = 0.5) + binom.pmf(k = 8, n = 10, p = 0.5) + binom.pmf(k = 9, n = 10, p = 0.5) + binom.pmf(k = 10, n = 10, p = 0.5)

0.17187499999999994

In [4]:
# p - value is greater than 0.05 ie. Significance Level
# Therefore, Null Hypothesis can't be rejected

In [5]:
# Scenario 2 -> 70+ Heads out of 100 Iterations
# H0 -> Coin is fair.
# Ha -> Coin is biased towards heads. -> Ha: P(Heads) > 0.5
# P(No. of Heads >= 70)

1 - binom.cdf(k = 69, n = 100, p = 0.5)

3.925069822796612e-05

In [6]:
# p - value is much smaller than 0.05
# Therefore, We can reject the Null Hypothesis

# Hypothesis Testing Framework

- Setup a Null and Alternate Hypothesis
- Choose the distribution (Gaussian, Binomial...)
- Select the left / right / two-tailed test as per the hypothesis
- Compute the p - value
- Compare p - value to significance level

# Marketing Case Study

- Retail Chain with `2000` stores in India
- Shampoo Sales in consideration
    - Mean -> `1800`
    - Standard Deviation -> `100`
- Marketing team to be hired.
    - Not cheap
    - Can't give all stores to a team
    - Need Trial run first
    
Experiment with 2 marketing firms - 
- Firm A - 
    - Worked on `50` stores
    - Average `1850` bottles sold
- Firm B - 
    - Worked on `5` stores
    - Average `1900` bottles sold

In [7]:
# Let's set the Significance Level to 0.01
# Confidence Level = 99%!

In [9]:
mu = 1800
std_dev = 100

In [8]:
'''
Firm A ->

H0: Marketing firm has no effect on sales. mu = 1800.
Ha: Marketing firm has a positive effect on sales. mu > 1800.
'''

'\nFirm A ->\n\nH0: Marketing firm has no effect on sales. mu = 1800.\nHa: Marketing firm has a positive effect on sales. mu > 1800.\n'

In [10]:
# 50 stores -> CLT possible -> Sampling 1000 times -> Gaussian

mu_a = 1800
std_dev_a = 100 / (50 ** 0.5)

In [11]:
# Right tailed test as sales > 1800

observed_mu_a = 1850

# P(m > 1850 | H0 is True)

In [14]:
z_a = (1850 - mu_a) / std_dev_a 

In [16]:
z_a # Z - STATISTIC or TEST STATISTIC

3.5355339059327378

In [17]:
import numpy as np
from scipy.stats import norm

In [18]:
p_a = 1 - norm.cdf(z_a)

print(f"P - Value for Firm A - {p_a}")

P - Value for Firm A - 0.00020347600872250293


In [19]:
# Since p value is much lesser than the significance level
# We can reject the Null Hypothesis!

In [20]:
'''
Firm B ->

H0: Marketing firm has no effect on sales. mu = 1800.
Ha: Marketing firm has a positive effect on sales. mu > 1800.
'''

'\nFirm B ->\n\nH0: Marketing firm has no effect on sales. mu = 1800.\nHa: Marketing firm has a positive effect on sales. mu > 1800.\n'

In [24]:
# 5 stores -> Approximation that it follows Gaussian Distribution

mu_b = 1800
std_dev_b = 100 / (5 ** 0.5)

In [22]:
# Right tailed test as sales > 1800

observed_mu_b = 1900

# P(m > 1900 | H0 is True)

In [25]:
z_b = (1900 - mu_b) / std_dev_b

In [26]:
z_b

2.23606797749979

In [27]:
p_b = 1 - norm.cdf(z_b)

In [28]:
print(f"P - Value for Firm B - {p_b}")

P - Value for Firm B - 0.0126736593387341


In [29]:
# P value is bigger than the significance level.
# Null Hypothesis is accepted.

In [31]:
'''
Quiz 4 - 

A fitness App claims that its users walk an average of 8,000 steps per day.
A random sample of 30 users showed an average of 7,600 steps per day with a standard deviation of 1,200 steps.
Conduct a right-tailed Z-test at a 5% significance level to determine if the App's claim is supported.
What is the p-value?
'''

"\nQuiz 4 - \n\nA fitness App claims that its users walk an average of 8,000 steps per day.\nA random sample of 30 users showed an average of 7,600 steps per day with a standard deviation of 1,200 steps.\nConduct a right-tailed Z-test at a 5% significance level to determine if the App's claim is supported.\nWhat is the p-value?\n"

In [32]:
population_mean = 8000
population_std_dev = 1200
sample_size = 30
sample_mean = 7600
alpha = 0.05 # significance level

In [33]:
z_score = (sample_mean - population_mean) / (population_std_dev / (sample_size ** 0.5))

In [34]:
z_score

-1.8257418583505536

In [35]:
p_value = 1 - norm.cdf(z_score)

In [36]:
p_value

0.9660554225690855

## Marketing Case Study Continued

- What should be the minimmum weekly average sales for Firm A / B to convince us that their marketing efforts had a positive effect with confidence level of 99%.

In [37]:
mu = 1800
std_dev = 100

In [38]:
# Firm A

mu_a = 1800
std_dev_a = 100 / (50 ** 0.5)

In [39]:
# z = x - mu_a / std_dev_a

In [40]:
# At the Critical Points, p - value is equal to the significance level

In [47]:
p_a = 0.01
z_critical = norm.ppf(0.99)

In [48]:
z_critical

2.3263478740408408

In [43]:
# Another function to calculate z in such a case using significance level

In [45]:
norm.isf(q = 0.01)

2.3263478740408408

In [49]:
x = z_critical * std_dev_a + mu_a

In [50]:
x

1832.8995271426638

In [51]:
# Firm B

mu_b = 1800
std_dev_b = 100 / (5 ** 0.5) 

In [52]:
# Since p - value is same i.e. 0.01
# z - value will also come out as same

In [53]:
z_b = norm.ppf(0.99)

In [54]:
z_b

2.3263478740408408

In [55]:
x = z_b * std_dev_b + mu_b

In [56]:
x

1904.0374397133487

In [57]:
'''
Quiz 5 - 

In a dataset of exam scores with a mean of 60 and a standard deviation of 15,
What is the critical value for the corresponding Z-score at a 95% confidence level?
'''

'\nQuiz 5 - \n\nIn a dataset of exam scores with a mean of 60 and a standard deviation of 15,\nWhat is the critical value for the corresponding Z-score at a 95% confidence level?\n'

In [58]:
x = norm.ppf(0.95) * 15 + 60

In [59]:
x

84.67280440427209

In [60]:
# Confidence Interval -> point estimate +- margin of error

In [62]:
# CI = Sample Mean +- (Z * (SD / sqrt(Sample Size)))

In [63]:
# Firm A
(1817.1, 1882.9)

(1817.1, 1882.9)

In [64]:
# Firm B
(1795.9, 2004)

(1795.9, 200.4)

In [68]:
margin_of_error_b = norm.ppf(0.99) * (100 / (5 ** 0.5))

In [69]:
(1900 - margin_of_error_b, 1900 + margin_of_error_b)

(1795.9625602866513, 2004.0374397133487)

In [70]:
margin_of_error_a = norm.ppf(0.99) * (100 / (50 ** 0.5))

In [72]:
(1850 - margin_of_error_a, 1850 + margin_of_error_a)

(1817.1004728573362, 1882.8995271426638)

In [73]:
norm.interval(0.99, loc = 1850, scale = 100 / (50 ** 0.5))

(1813.572272645631, 1886.427727354369)