# 均值检验 --- 小样本  t 检验

## two tail/ one sample/ check mean (z test)

In [None]:
def ztest_mean_two_tail(h0, sample_mean, sample_n, std, alpha): 
    pass

In [None]:
h0 = 5000
sample_n = 6102
issue = 0



## two tail/ one sample/ check mean (t test)

In [27]:
import numpy as np
import scipy.stats as st

def ttest_mean_two_tail(h0, sample_mean, sample_n, std, alpha): 
    sample_mean_norm = (sample_mean - h0)/ (std / (sample_n ** .5))
    df = sample_mean - 1
    t_score_p = st.t.cdf( abs(sample_mean_norm), df)
    p_value = 2 * (1-t_score_p)

    lower = st.t.ppf(alpha/2, df)
    upper = st.t.ppf( 1 - alpha/2, df)

    print ('t_score_p: {:.3f}'.format(t_score_p))
    print ('p_value: {:.3f}'.format(p_value))
    print ('t: {:.3f}'.format(sample_mean_norm))
    print ('lower limit: {:.3f}'.format(lower))
    print ('upper limit: {:.3f}'.format(upper))

h0 = 12
data = np.array([12.2, 10.8, 12, 11.8, 11.9, 12.4, 11.3, 12.2, 12, 12.3])

alpha = .05
sample_n = 10
sample_mean = np.mean(data)
sample_std = np.std(data)
ttest_mean_two_tail(h0, sample_mean, sample_n, sample_std, alpha)

## check var

## two samples/ check mean --- t-test part

### For the formal use, suggestion to use the ttest_ind in the scipy.stats

### For the stats_hypothesis, take the .py file as the correct version due to there are some changes there. 


almost all the situation to compare two means in two samples. 

both samples follow the normal distribution. 

- use z-test: 
    
    - samples qty bigger than 30 

    - samples qty smaller than 30, but known variation. 

- use t test: 

    - var1 == var2, both unknown

    - var1 != var2, both unknown, n1 == n2

    - var1 != var2, both unknown, n1 != n2

In [10]:
import numpy as np
import scipy.stats as st
from scipy.stats import ttest_ind, ttest_ind_from_stats

class CompareTwoSamples_Mean(): 
    def __init__(self, sample_1, sample_2, var_1 = None, var_2 = None): 
        '''
        sample_1 and sample_2 are np.array
        var_1 and var_2 is float value
        '''
        self.sample_1, self.sample_2 = sample_1, sample_2
        
        self.mean_1, self.mean_2 = np.mean(self.sample_1), np.mean(self.sample_2)
        
        if (var_1 is not None) and (var_2 is not None): 
            self.var_1, self.var_2 = var_1, var_2
            self.var_known = True
        else: 
            self.var_1, self.var_2 = np.var(self.sample_1), np.var(self.sample_2)
            self.var_known = False
        
        self.n_1, self.n_2 = len(self.sample_1), len(self.sample_2)
    
    def get_p(self): 
        
        df = 0

        if (self.n_1 >= 30 and self.n_2 >= 30) or self.var_known: 
            # z test due to sample size bigger than 30
            # or var known though size is not bigger than 30
            p_value = (self.mean_1 - self.mean_2)/ (self.var_1/ self.n_1 + self.var_2/ self.n_2) ** .5
            p = st.norm.pdf(p_value)

            return p_value, p, df
        else:
            if self.var_1 == self.var_2: 
                # var1 == var2 unknown
                Sp = ((self.var_1 * (self.n_1 - 1) + self.var_2 * (self.n_2 - 1)) / (self.n_1 + self.n_2 - 2)) ** .5
                dividend = Sp * ((1.0/ self.n_1 + 1.0/ self.n_2) ** .5)
                df = self.n_1 + self.n_2 - 2

            elif self.var_1 != self.var_2 and self.n_1 == self.n_2:
                # var1 != var2 and both unknown and n1 == n2
                dividend = (self.var_1/ self.n_1 + self.var_2 / self.n_2) ** .5
                df = self.n_1 + self.n_2 - 2

            elif self.var_1 != self.var_2 and self.n_1 != self.n_2:
                # var1 != var2 and both unknown and n1 != n2
                dividend = (self.var_1/ self.n_1 + self.var_2 / self.n_2) ** .5
                part_1 = self.var_1 / self.n_1
                part_2 = self.var_2 / self.n_2
                df = (part_1 + part_2) ** 2/ (part_1 ** 2 / (self.n_1 - 1) + part_2 ** 2 / (self.n_2 - 1))
            else: 
                print ('the situation is not listed.')
        
            p_value = (self.mean_1 - self.mean_2)/ dividend
            p = st.t.pdf(p_value, df)

            return p_value, p, df
        

# both are media loading category
sample_1 = np.array([325, 257, 303, 315, 380])
sample_2 = np.array([368, 390, 379, 260])

print ('mean difference: {:.2f}'.format(np.mean(sample_1) - np.mean(sample_2)))
print ('median difference: {:.2f}'.format(np.median(sample_1) - np.median(sample_2)))

mean_compare = CompareTwoSamples_Mean(sample_1, sample_2)
p_value, p_w, _, _, _ = mean_compare.get_p()

t, p = ttest_ind(sample_1, sample_2, equal_var = False)


print (p_value, p_w)
print (t, p)

mean difference: -33.25
median difference: -58.50


ValueError: not enough values to unpack (expected 5, got 3)

### ttest_ind usage

Condition: 

- two samples

- small sample size

- var not equal

- n not equal

Confidence Level: 

- use the func as

    `stats.t.interval(.95, df, loc, scale)`

- get the loc and scale from my stats_hypothesis.py

In [7]:
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy import stats
import pandas as pd

data = pd.read_csv('ChickData.csv')

mm = data.loc[data['feed'] == 'meatmeal', 'weight']
cs = data.loc[data['feed'] == 'casein', 'weight']
# print (mm.shape, cs.shape)
mm_mean = mm.mean()
cs_mean = cs.mean()


t, p = ttest_ind(mm, cs, equal_var = False)
print (t, p)

# following data got from my package
df = 20.815032454861285
loc = -46.67424242424238 
scale = 25.792478386757598
print (stats.t.interval(.95, df, loc, scale))


-1.7288013416117558 0.09866221558427586
(-100.3416751724738, 6.99319032398904)
(-48.3112347079135, -45.03725014180574)


## two samples/ check mean --- wilcox-test part

## two samples/ check mean --- ks part

## 概率论数理统计与随机过程

### ch8 假设检验

已知分布，mean and std，有若干sample，求是否是有显著差异；
 
用的mean和std都是从sample上求得；但是因为已知是正态分布，所以直接用z score求


In [30]:
import numpy as np
import scipy.stats as st
# known
sample_ls = np.array([6.9, 6.7, 5.8, 7.0, 6.8, 5.2, 7.1, 5.6, 6.5])
mean = 6.0
std = .36
sample_n = len(sample_ls)

# check if samples is different than before
t_value = np.mean(sample_ls) / (np.std(sample_ls) / (std ** .5))
c_value = st.norm.ppf(.975)

difference = c_value * (0.6 / (sample_n ** .5))
df = sample_n - 1
t_score_p = st.t.ppf(.975, df)

print ('sample_mean is :{:.3f}'.format(np.mean(sample_ls)))
print ('sample_std is : {:.3f}'.format(np.std(sample_ls)))
print ('threshold_diff is: {:.3f}'.format(difference))
print ('actual_diff is : {:.3f}'.format(actual_difference))


2.3060041350333704
1.959963984540054
sample_mean is :6.400
sample_std is : 0.650
threshold_diff is: 0.392
actual_diff is : 0.400


#### std 已知
已知分布，mean and std, 已知sample数量和sample mean, sample std，求sample mean和一般是否有差异, 已知是正态分布；

用的mean和std都是从sample上求得，所以直接用z score求

In [38]:
import numpy as np
import scipy.stats as st

mean = 128
std = 15

sample_n = 72
sample_mean = 126.07
sample_std = 15
alpha = .05

z_value = (mean - sample_mean)/ (sample_std / (sample_n ** .5))
print (z_value)
print ('actualy p value is : {}'.format((1 - st.norm.cdf(z_value))*2))
z_score_alpha = st.norm.ppf(alpha/2)

print ('threshold data is : {}'.format(mean + z_score_alpha * sample_std / (sample_n ** .5)))


1.0917728701520333
actualy p value is : 0.2749329465332897
threshold data is : 124.5352404391258


#### std 未知

此处用了t分布

已知服从正态分布，已知sample_mean， sample_n, sample_std

求 平均是否大于某个数字

In [42]:
import numpy as np
import scipy.stats as st

sample_n = 25
sample_mean = 66.45
sample_std = 20.32

check_mean = 60

t_value = (sample_mean - check_mean)/ (sample_std/(sample_n ** .5))

t_score_p = st.t.cdf(t_value, sample_n - 1)
print (t_score_p)

0.9372112801064751


#### 成对数据的t检验

用的mean，std 和数量都是已 两边数据相减之后的差值来计算的。因为n < 30，所以用t分布


In [52]:
import numpy as np
import scipy.stats as st

sample_A = np.array([39, 37, 36, 41, 34, 38, 43, 45])
sample_B = np.array([35, 38, 37, 39, 36, 40, 41, 42])

sample_diff = sample_B - sample_A
sample_df_mean = np.mean(sample_diff)
sample_A_std = np.std(sample_A)/ (len(sample_A) ** 0.5)
sample_B_std = np.std(sample_B)/ (len(sample_B) ** 0.5)
sample_bi_std = (np.std(sample_A) ** 2 / len(sample_A) + np.std(sample_B) ** 2 / len(sample_B)) ** .5
sample_std = np.std(sample_diff)
print (np.mean(sample_diff))
print (sample_std)
t_value = np.mean(sample_diff) / (sample_std / (len(sample_A) ** .5))
print (t_value)
t_score_p = st.t.cdf(abs(t_value), len(sample_A) - 1)
print (2*(1 - t_score_p))

-0.625
2.2325713874364688
-0.7918075824648959
0.4544633553335089


#### 方差的假设检验

(n-1) s^2 / std ^2 服从卡方分布

假设mean 未知，查看两个sample中的方差是否相等。

In [56]:
import numpy as np
import scipy.stats as st
# 8.1.2
mean = 180
std = 10
# dist = norm
# h0: sample_std <=10; h1: sample_std >10
sample_n = 12
sample_std = 14

chi_value = (sample_n - 1) * (sample_std ** 2) / (std ** 2)
print (chi_value)
chi_value_p = st.chi2.cdf(chi_value, sample_n - 1)
print ('percent of chi-square is: {:.3f}'.format(chi_value_p))

# till 8.3

21.56
percent of chi-square is: 0.972


In [None]:
# 8.3.1/ 7.4.5
import numpy as np
import scipy.stats as st

boy_n = 19
mean_boy = 242.793
std_boy = 16.566
girl_n = 17
mean_girl = 297.783
std_girl = 19.047
# two sets independent, h0: std_boy == std_girl; h1 <>s
s2 = 

In [None]:
# 8.1.3
sample_boy = np.array([61, 73, 58, 64, 70, 64, 72, 60, 65, 80, 55, 72, 56, 56, 74, 65])
sample_girl = np.array([83, 58, 70, 56, 76, 64, 80, 68, 78, 108, 76, 70, 97])
# dist = norm
# h0: mean_boy == mean_girl

In [None]:
# 8.1.4
sup_p = [9, 3, 3, 1]
data = [315, 108, 101, 32] 