In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# stats.t.ppf?

In [42]:
def calculate_t_statistic(n, mu, std, M):
    se = std / np.sqrt(n)
    return round((M - mu) / se, 2)

def calculate_t_portion(n, alpha, tail_num):
    df = n - 1
    return round(stats.t.ppf(1 - alpha/tail_num, df=df), 3)

def calculate_r_squared(n, mu, std, M):
    t = calculate_t_statistic(n, mu, std, M)
    return t ** 2 / (t ** 2 + n - 1)

def ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=2):
    
    t, cr = calculate_t_statistic(n, mu, std, M), calculate_t_portion(n, alpha, tail_num)
    
    if tail_num == 2:
        
        rejection_decision = (t > cr) | (t < -1 * cr)
        region = f't > {cr} or t < -{cr}'
        criteria = f'two tail, alpha {alpha}'
        
    elif tail_num == 1:
        
        if t > 0:
        
            rejection_decision = (t > cr)
            region = f't > {cr}'
            
        else:
            
            rejection_decision = (t < -1 * cr)
            region = f't < -{cr}'
        
        criteria = f'one tail, alpha {alpha}'
        
    else:
        print('Should use tail_num 1 or 2.')
        return None
            
    print(f'[{criteria}] t_statistic:{t}, critical_region:{region}\n=> null hypothesis rejection [{rejection_decision}]')
    
def calculate_cohens_d(mu, std, M):
    return round(abs((M - mu) / std), 2)

def calculate_stat_power(n, mu, std, M):
    se = std / np.sqrt(n)
    z = ((mu + 1.96 * se) - M) / se
    return round(1 - stats.norm.cdf(z), 4)

# 유형
1. t test - 가설검정 단계에 따라, Treatment 효과(M - mu) 통계적 유의성(significance)를 판단
2. 표준편차(sigma), 샘플 개수(n)에 따른 귀무가설(null hypothesis) 기각의 관계
3. 단측(one-sided test) / 양측(two-sided test), 유의수준(alpha)에 따른 결과 차이
4. 신뢰구간(confidence interval) 구하는 법
5. 결정계수(r sqaured) 구하는 법
6. 샘플 수(n)과 결정계수(r squared)의 관계
7. raw datapoint 기반 문제풀이 

### t test - 가설검정 단계에 따라, Treatment 효과(M - mu) 통계적 유의성(significance)를 판단

---
![''](./09_src/09_09_01.png)

In [4]:
# stats?

In [10]:
n, mu = 12, 70
std, M = np.sqrt(297 / (n - 1)), 74.5

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:3.0, critical_region:t > 2.201 or t < -2.201
=> null hypothesis rejection [True]


---
![''](./09_src/09_10_01.png)

In [11]:
n, mu = 25, 20
std, M = np.sqrt(384 / (n - 1)), 22.2

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:2.75, critical_region:t > 2.064 or t < -2.064
=> null hypothesis rejection [True]


### 표준편차(sigma), 샘플 개수(n)에 따른 귀무가설(null hypothesis) 기각의 관계

---
![''](./09_src/09_11_01.png)

In [12]:
n, mu = 16, 30
std, M = 3, 31.3

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:1.73, critical_region:t > 2.131 or t < -2.131
=> null hypothesis rejection [False]


In [13]:
n, mu = 36, 30
std, M = 3, 31.3

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:2.6, critical_region:t > 2.03 or t < -2.03
=> null hypothesis rejection [True]


---
![''](./09_src/09_12_01.png)

![''](./09_src/09_12_02.png)

In [14]:
n, mu = 8, 40
std, M = np.sqrt(32), 35

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:-2.5, critical_region:t > 2.365 or t < -2.365
=> null hypothesis rejection [True]


In [15]:
n, mu = 8, 40
std, M = np.sqrt(72), 35

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:-1.67, critical_region:t > 2.365 or t < -2.365
=> null hypothesis rejection [False]


### 단측(one-sided test) / 양측(two-sided test), 유의수준(alpha)에 따른 결과 차이

---
![''](./09_src/09_13_01.png)

In [16]:
n, mu = 9, 3.1
std, M = np.sqrt(162 / (n - 1)), 6.4

ttest_1samp_from_stats(n, mu, std, M)

[two tail, alpha 0.05] t_statistic:2.2, critical_region:t > 2.306 or t < -2.306
=> null hypothesis rejection [False]


In [17]:
n, mu = 9, 3.1
std, M = np.sqrt(162 / (n - 1)), 6.4

ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=1)

[one tail, alpha 0.05] t_statistic:2.2, critical_region:t > 1.86
=> null hypothesis rejection [True]


---
![''](./09_src/09_16_01.png)

In [18]:
n, mu = 16, 73.4
std, M = 8.4, 78.3

ttest_1samp_from_stats(n, mu, std, M, alpha=0.01, tail_num=1)

[one tail, alpha 0.01] t_statistic:2.33, critical_region:t > 2.602
=> null hypothesis rejection [False]


---
![''](./09_src/09_21_01.png)

In [19]:
n, mu = 16, 4
std, M = 1.04, 4.53

ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=1)

[one tail, alpha 0.05] t_statistic:2.04, critical_region:t > 1.753
=> null hypothesis rejection [True]


In [20]:
n, mu = 16, 4
std, M = 1.18, 3.30

ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=1)

[one tail, alpha 0.05] t_statistic:-2.37, critical_region:t < -1.753
=> null hypothesis rejection [True]


### 신뢰구간(confidence interval), 결정계수(r sqaured) 구하는 법

---
![''](./09_src/09_14_01.png)

![''](./09_src/09_14_02.png)

In [21]:
n, mu = 15, 60
std, M = np.sqrt(210 / (n - 1)), 34.5

ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=1)

[one tail, alpha 0.05] t_statistic:-25.5, critical_region:t < -1.761
=> null hypothesis rejection [True]


In [20]:
def calculate_cohens_d(mu, std, M):
    return round(abs((M - mu) / std), 2)

[one tail, alpha 0.05] t_statistic:-2.37, critical_region:t < -1.753
=> null hypothesis rejection [True]


In [22]:
cohens_d = calculate_cohens_d(mu, std, M)

cohens_d

6.58

---
![''](./09_src/09_15_01.png)

In [24]:
n, mu = 16, 15.1
std, M = np.sqrt(240 / (n - 1)), 23.3

ttest_1samp_from_stats(n, mu, std, M, alpha=0.01, tail_num=2)

[two tail, alpha 0.01] t_statistic:8.2, critical_region:t > 2.947 or t < -2.947
=> null hypothesis rejection [True]


In [25]:
std

4.0

In [31]:
ci_start = M - calculate_t_portion(n, alpha=0.1, tail_num=2) * (std / np.sqrt(n))
ci_end = M + calculate_t_portion(n, alpha=0.1, tail_num=2) * (std / np.sqrt(n))

ci_start, ci_end

(21.547, 25.053)

---
![''](./09_src/09_17_01.png)

![''](./09_src/09_17_02.png)

In [32]:
n, mu = 9, 81.7
std, M = 5.7, 77.2

ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=2)

[two tail, alpha 0.05] t_statistic:-2.37, critical_region:t > 2.306 or t < -2.306
=> null hypothesis rejection [True]


In [33]:
ci_start = M - calculate_t_portion(n, alpha=0.1, tail_num=2) * (std / np.sqrt(n))
ci_end = M + calculate_t_portion(n, alpha=0.1, tail_num=2) * (std / np.sqrt(n))

ci_start, ci_end

(73.666, 80.73400000000001)

---
![''](./09_src/09_20_01.png)

![''](./09_src/09_20_02.png)

In [44]:
n, mu = 25, 10
std, M = 1, 12.2

ttest_1samp_from_stats(n, mu, std, M, alpha=0.01, tail_num=1)

[one tail, alpha 0.01] t_statistic:11.0, critical_region:t > 2.492
=> null hypothesis rejection [True]


In [45]:
cohens_d = calculate_cohens_d(mu, std, M)

r_squared = calculate_r_squared(n, mu, std, M)

cohens_d, r_squared

(2.2, 0.8344827586206897)

### 샘플 수(n)과 결정계수(r squared)의 관계

---
![''](./09_src/09_18_01.png)

In [46]:
n, mu = 16, 45
std, M = 8, 49.2

In [47]:
cohens_d = calculate_cohens_d(mu, std, M)

r_squared = calculate_r_squared(n, mu, std, M)

cohens_d, r_squared

(0.53, 0.22720247295208656)

In [48]:
n, mu = 16, 45
std, M = 20, 49.2

In [49]:
cohens_d = calculate_cohens_d(mu, std, M)

r_squared = calculate_r_squared(n, mu, std, M)

cohens_d, r_squared

(0.21, 0.04492665036674816)

---
![''](./09_src/09_19_01.png)

In [50]:
n, mu = 9, 45
std, M = 12, 49

In [51]:
cohens_d = calculate_cohens_d(mu, std, M)

r_squared = calculate_r_squared(n, mu, std, M)

cohens_d, r_squared

(0.33, 0.1111111111111111)

In [52]:
n, mu = 16, 45
std, M = 12, 49

In [53]:
cohens_d = calculate_cohens_d(mu, std, M)

r_squared = calculate_r_squared(n, mu, std, M)

cohens_d, r_squared

(0.33, 0.10548694309107934)

### raw datapoint 기반 문제풀이

---
![''](./09_src/09_22_01.png)

![''](./09_src/09_22_02.png)

In [54]:
raw_dataset = [38, 37, 41, 35, 42, 40, 33, 33, 36, 38, 32, 39]

In [55]:
n, mu = 12, 40

In [64]:
ss = np.sum([r**2 for r in raw_dataset]) - (np.sum(raw_dataset)**2 / n)
std = np.sqrt(ss / (n - 1))
M = np.mean(raw_dataset)

std, M

(3.2752515517548764, 37.0)

In [65]:
ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=2)

[two tail, alpha 0.05] t_statistic:-3.17, critical_region:t > 2.201 or t < -2.201
=> null hypothesis rejection [True]


In [68]:
stats.ttest_1samp(raw_dataset, [mu])

Ttest_1sampResult(statistic=array([-3.17297914]), pvalue=array([0.00887141]))

In [69]:
calculate_cohens_d(mu, std, M)

0.92

---
![''](./09_src/09_23_01.png)

![''](./09_src/09_23_02.png)

In [70]:
raw_dataset = [53, 57, 61, 49, 52, 56, 58, 62, 51, 56]

In [71]:
n, mu = 10, 50

In [72]:
ss = np.sum([r**2 for r in raw_dataset]) - (np.sum(raw_dataset)**2 / n)
std = np.sqrt(ss / (n - 1))
M = np.mean(raw_dataset)

std, M

(4.249182927993988, 55.5)

In [73]:
ttest_1samp_from_stats(n, mu, std, M, alpha=0.05, tail_num=2)

[two tail, alpha 0.05] t_statistic:4.09, critical_region:t > 2.262 or t < -2.262
=> null hypothesis rejection [True]


In [74]:
stats.ttest_1samp(raw_dataset, [mu])

Ttest_1sampResult(statistic=array([4.09314624]), pvalue=array([0.00270428]))

In [75]:
calculate_cohens_d(mu, std, M)

1.29