## statistical hypothesis testing


In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import f_oneway

In [54]:
sns.set()

In [55]:
# https://www.kaggle.com/c/titanic/data
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [56]:
population = df.dropna()

In [57]:
sample = population.sample(n=100, random_state=1)

### ratio diff hypothesis testing

$$\displaystyle z=\frac{\widehat{p}_{1}-\widehat{p}_{2}}{\sqrt{\widehat{p}(1-\widehat{p}) \left(\frac{1}{n_1}+\frac{1}{n_2} \right)}}$$

$H_0$: survived ratio is no diff between gender  
$H_1$: female survived than male


In [58]:
male_survived = sample[sample["sex"] == "male"]["survived"]
female_survived = sample[sample["sex"] == "female"]["survived"]

In [59]:
proportions_ztest(
    [
        male_survived[male_survived == 1].count(),
        female_survived[female_survived == 1].count(),
    ],
    [male_survived.count(), female_survived.count()],
    alternative="smaller",
)

(-4.812570498370754, 7.450064270605205e-07)

### mean diff hypothesis testing

$$\displaystyle t=\frac{\overline{x}_{1}-\overline{x}_{2}}{s\sqrt{\frac{1}{n_{1}}+\frac{1}{n_{2}}}}$$

$H_0$: age mean has no diff between gender  
$H_1$: age mean has diff between gender


In [60]:
male_age = sample[sample["sex"] == "male"]["age"].dropna()
female_age = sample[sample["sex"] == "female"]["age"].dropna()
(np.mean(male_age), np.mean(female_age))

(38.279583333333335, 33.86538461538461)

In [61]:
# student's t-test
print(stats.ttest_ind(male_age, female_age, equal_var=True, alternative="two-sided"))

# welch's t-test
print(stats.ttest_ind(male_age, female_age, equal_var=False, alternative="two-sided"))

Ttest_indResult(statistic=1.4186445694606509, pvalue=0.15917512786763272)
Ttest_indResult(statistic=1.4136161493636352, pvalue=0.16073295193859768)


### testing for goodness of fit

$$\displaystyle \chi^{2} = \sum_{i=1}^{n} \frac{(X_i-E)^{2}}{E}$$

$H_0$: pclass is equality  
$H_1$: pclass is inequality

In [62]:
observed = np.array(sample["pclass"].value_counts())
expected = np.array([len(sample["pclass"]) / 3] * 3)
(observed, expected)

(array([90,  7,  3]), array([33.33333333, 33.33333333, 33.33333333]))

In [63]:
stats.chisquare(observed, expected)

Power_divergenceResult(statistic=144.73999999999998, pvalue=3.7162792859465714e-32)

### test of independence

$H_0$: gender is not associated with number of surviving  
$H_1$: gender is associated with number of surviving

In [64]:
cross_table = pd.crosstab(sample["sex"], sample["survived"])
cross_table

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,5,47
male,26,22


In [65]:
stats.chi2_contingency(cross_table, correction=False)

Chi2ContingencyResult(statistic=23.160834801788514, pvalue=1.490012854121048e-06, dof=1, expected_freq=array([[16.12, 35.88],
       [14.88, 33.12]]))

### test of homogeneity of variance

$$\displaystyle F=\frac{s_{1}^{2}}{s_{2}^{2}}$$

$H_0$: age variance has no diff between gender  
$H_1$: age variance has diff between gender

In [66]:
male_age = sample[sample["sex"] == "male"]["age"].dropna()
female_age = sample[sample["sex"] == "female"]["age"].dropna()
(np.var(male_age), np.var(female_age))

(258.3421748263889, 216.96264792899407)

In [67]:
s_1 = stats.tvar(male_age)
s_2 = stats.tvar(female_age)
degree_of_freedoms = (len(male_age) - 1, len(female_age) - 1)

F = s_1 / s_2
stats.f(*degree_of_freedoms).sf(F)

0.26852752398819046