## Paired sample means t test

### This scenario involves data measured of the same entity twice and the comparison is made between their means

In [47]:
import pandas as pd
import os
import stats_distributions as stds
import statsmodels.stats.weightstats as stsm
import numpy as np
from numpy import sqrt, abs, round
from scipy.stats import norm

from scipy import stats


In [7]:
os.chdir("C:\\Users\\satish\\Desktop")

In [9]:
df = pd.read_excel("MSE_data.xlsx", sheet_name = "HT - Paired sample")

In [10]:
df.head(10)

Unnamed: 0,x1,x2
0,1005,963
1,1035,1027
2,1281,1272
3,1051,1079
4,1034,1070
5,1079,1173
6,1104,1067
7,1439,1347
8,1029,1100
9,1160,1204


In [18]:
# H0 : u(D) = 0

alpha = 0.05
n = 10
t_stat, pval = stats.ttest_rel(df['x1'], df['x2'])

print("t_critical: + %.5f ; alpha: %.5f" %(stds.pval_to_t(alpha/2, n-1, "left"), alpha))
print("t_stat: %.3f ; p_value: %.3f" %(t_stat, pval))

if pval < 0.05:
    print("reject null hypothesis")
else:
    print("not reject null hypothesis")

t_critical: + -2.26216 ; alpha: 0.05000
t_stat: -0.474 ; p_value: 0.647
not reject null hypothesis


In [22]:
# this implies means can be the same 

# For specific u(d0) other than 0, we can use https://www.statsmodels.org/stable/generated/statsmodels.stats.weightstats.ttost_paired.html#statsmodels.stats.weightstats.ttost_paired

## Independent sample means t test

In [32]:
# H0 : u(D) = 0
df = pd.read_excel("MSE_data.xlsx", sheet_name = "HT - Independent sample")

### Equal pop variances

In [39]:
# assume population variances are equal => uservar = "pooled"; value = u(d0)
t_stat, pval, deg_freedom = stsm.ttest_ind(df["x1"], df["x2"], usevar = "pooled", value = 0)

In [40]:
print("t_critical: + %.5f ; alpha: %.5f" %(stds.pval_to_t(alpha/2, n-1, "left"), alpha))
print("t_stat: %.3f ; p_value: %.3f ; deg_freedom: %d" %(t_stat, pval, deg_freedom))

if pval < 0.05:
    print("reject null hypothesis")
else:
    print("not reject null hypothesis")

t_critical: + -2.26216 ; alpha: 0.05000
t_stat: -0.533 ; p_value: 0.600 ; deg_freedom: 18
not reject null hypothesis


In [None]:
# this implies means can be the same 

### Unequal pop variances

In [41]:
# assume population variances are unequal => uservar = "unequal"
t_stat, pval, deg_freedom = stsm.ttest_ind(df["x1"], df["x2"], usevar = "unequal", value = 0)

In [42]:
print("t_critical: + %.5f ; alpha: %.5f" %(stds.pval_to_t(alpha/2, n-1, "left"), alpha))
print("t_stat: %.3f ; p_value: %.3f ; deg_freedom: %d" %(t_stat, pval, deg_freedom))

if pval < 0.05:
    print("reject null hypothesis")
else:
    print("not reject null hypothesis")

t_critical: + -2.26216 ; alpha: 0.05000
t_stat: -0.533 ; p_value: 0.601 ; deg_freedom: 16
not reject null hypothesis


In [None]:
# this implies means can be the same 


## z test for two sample when pop stdev are known

In [44]:
def z_two_samp(m1, m2, sigma1, sigma2, n1, n2):
    
    ovr_sigma = sqrt(sigma1**2/n1 + sigma2**2/n2)
    z = (m1 - m2)/ovr_sigma
    
    # since two tail, find p value this way
    pval = 2*(1 - norm.cdf(abs(z)))
    return z, pval

In [52]:
# assume pop stddev to be same as samp stdev here
z_stat, p_val = z_two_samp(df["x1"].mean(), df["x2"].mean(), df["x1"].std(), df["x2"].std(), df["x1"].shape[0], 
                            df["x2"].shape[0])

print("z_critical: + %.5f ; alpha: %.5f" %(stds.pval_to_z(alpha/2, "left"), alpha))
print("z_stat: %.5f ; p_value: %.5f " %(z_stat, p_val))


if (p_val < 0.05):
    print('Reject Null Hypothesis : Significant difference')
else:
    print('Do not reject Null Hypothesis : Not Significant difference')


z_critical: + -1.95996 ; alpha: 0.05000
z_stat: -0.53311 ; p_value: 0.59395 
Do not reject Null Hypothesis : Not Significant difference


## F test to Compare two population variances 

In [75]:
import numpy as np

#define F-test function
def f_test(x, y, pos = "left"):   
    x = np.array(x)
    y = np.array(y)
    f_stat = np.var(x, ddof=1)/np.var(y, ddof=1) #calculate F test statistic 
    dfn = x.size - 1 #define degrees of freedom numerator 
    dfd = y.size - 1 #define degrees of freedom denominator 
    p_value = stds.f_to_pval(f_stat, dfn, dfd, pos)
    return f_stat, p_value


In [76]:
f_stat, p_value = f_test(df["x1"], df["x2"], "left")
print("f_stat: %.5f ; p_value: %.5f " %(f_stat, p_val))

if (p_val < 0.05):
    print('Reject Null Hypothesis : Significant difference')
else:
    print('Do not reject Null Hypothesis : Not Significant difference')

f_stat: 1.97909 ; p_value: 0.59395 
Do not reject Null Hypothesis : Not Significant difference


In [None]:
# in case of two tail, get lower and upper critical values that cover the area of CI mentioned