## confidence interval


In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [33]:
# Apply seaborn's default theme to matplotlib figures created after this point.
sns.set()

In [34]:
# Titanic passenger table, loaded through seaborn's example-dataset helper.
# Data description: https://www.kaggle.com/c/titanic/data
df = sns.load_dataset("titanic")

# Preview the first five rows to check the columns and their coding.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [35]:
# Number of passenger rows in the table.
df.shape[0]

891

### ratio confidence interval

$$\displaystyle \widehat{p}-z_{\alpha/2} \times \sqrt{\frac{\widehat{p}(1-\widehat{p})}{n}} \leq p \leq \widehat{p} + z_{\alpha/2} \times \sqrt{\frac{\widehat{p}(1-\widehat{p})}{n}}$$

In [36]:
# The whole dataset plays the role of the population in the simulation below.
# `survived` is a 0/1 flag; 1 marks a passenger who survived.
is_survived = 1
population_survived = df["survived"].dropna()

# Population proportion p: matching rows divided by all (non-missing) rows.
population_survived_ratio = (population_survived == is_survived).sum() / len(
    population_survived
)
population_survived_ratio

0.3838383838383838

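Estimate the probability of survival among Titanic passengers: repeatedly draw a random sample, build a 95% confidence interval from it, and count how often the interval contains the population ratio computed above.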

In [37]:
# Simulation: how often does a 95% interval built from a sample contain the
# population survival ratio?
number_of_samples = 100
sample_size = 100
# 0.95 is the confidence level (1 - alpha), not the significance level alpha.
confidence_level = 0.95
# Seeded generator so that Restart & Run All reproduces the same samples.
rng = np.random.RandomState(0)
results = []

for trial in range(number_of_samples):
    sample = population_survived.sample(sample_size, random_state=rng)
    sample_survived_ratio = sample[sample == is_survived].count() / sample.count()
    # Interval for the survivor *count* under Binomial(sample_size, p_hat).
    # Bound names deliberately avoid `min`/`max`, which would rebind the
    # builtins for every later cell in the notebook.
    # NOTE(review): this uses binomial quantiles rather than the
    # normal-approximation formula written in the markdown above.
    lower_count, upper_count = stats.binom.interval(
        confidence_level, sample_size, sample_survived_ratio
    )
    # Rescale count bounds to proportions.
    min_ratio = lower_count / sample_size
    max_ratio = upper_count / sample_size
    results.append(
        {
            "min": min_ratio,
            "max": max_ratio,
            "is_in_confidence_interval": min_ratio
            < population_survived_ratio
            < max_ratio,
        }
    )

# Build the summary table once and reuse it for both printouts.
summary = pd.DataFrame(results)
print(summary["is_in_confidence_interval"].value_counts())
print(summary)

True     94
False     6
Name: is_in_confidence_interval, dtype: int64
     min   max  is_in_confidence_interval
0   0.26  0.44                       True
1   0.31  0.50                       True
2   0.23  0.41                       True
3   0.27  0.46                       True
4   0.27  0.46                       True
..   ...   ...                        ...
95  0.27  0.46                       True
96  0.33  0.53                       True
97  0.22  0.40                       True
98  0.22  0.40                       True
99  0.27  0.46                       True

[100 rows x 3 columns]


### mean confidence interval by $t$ distribution

$$\displaystyle \overline{x}-t_{\alpha/2}(n-1) \times \sqrt{\frac{s^{2}}{n}} \leq \mu  \leq \overline{x}+t_{\alpha/2}(n-1) \times \sqrt{\frac{s^{2}}{n}}$$

In [38]:
# Ages of all passengers with a recorded age; rows with a missing age are excluded.
population_age = df.loc[df["age"].notna(), "age"]

# Population mean that the sampled intervals below are checked against.
population_age_mean = population_age.mean()
population_age_mean

29.69911764705882

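Estimate the mean age of Titanic passengers: repeatedly draw a small random sample, build a 95% $t$-based confidence interval for the mean, and count how often it contains the population mean computed above.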

In [39]:
# Simulation: how often does a 95% t-interval built from a small sample
# contain the population mean age?
number_of_samples = 100
sample_size = 10
degree_of_freedom = sample_size - 1
# 0.95 is the confidence level (1 - alpha), not the significance level alpha.
confidence_level = 0.95
# Seeded generator so that Restart & Run All reproduces the same samples.
rng = np.random.RandomState(0)
results = []

for trial in range(number_of_samples):
    sample = population_age.sample(sample_size, random_state=rng)
    sample_age_mean = sample.mean()
    sample_age_tvar = stats.tvar(sample)  # unbiased sample variance s^2
    standard_error = np.sqrt(sample_age_tvar / sample_size)
    # Call `interval` on the unfrozen distribution: a frozen `stats.t(df)` has
    # its parameters fixed at construction, so per-call loc/scale belong here.
    # Bound names avoid `min`/`max` so the builtins are not rebound.
    lower, upper = stats.t.interval(
        confidence_level,
        degree_of_freedom,
        loc=sample_age_mean,
        scale=standard_error,
    )
    results.append(
        {
            "min": lower,
            "max": upper,
            "is_in_confidence_interval": lower < population_age_mean < upper,
        }
    )

# Build the summary table once and reuse it for both printouts.
summary = pd.DataFrame(results)
print(summary["is_in_confidence_interval"].value_counts())
print(summary)

True     95
False     5
Name: is_in_confidence_interval, dtype: int64
          min        max  is_in_confidence_interval
0   15.310394  40.689606                       True
1   15.767059  37.032941                       True
2   23.461792  48.338208                       True
3   14.018880  46.881120                       True
4    9.139124  36.194876                       True
..        ...        ...                        ...
95  10.153935  33.846065                       True
96  19.798234  36.001766                       True
97  18.651906  46.048094                       True
98  14.483014  40.316986                       True
99  15.660911  39.039089                       True

[100 rows x 3 columns]


### variance confidence interval by $\chi^2$ distribution

$$\displaystyle \frac{(n-1)s^{2}}{\chi_{\alpha/2}^{2}(n-1)} \leq \sigma^{2} \leq \frac{(n-1)s^{2}}{\chi_{1-\alpha/2}^{2}(n-1)}$$

In [43]:
# Re-derive the age population so this cell does not depend on the mean section.
population_age = df.loc[df["age"].notna(), "age"]

# Variance with the (n - 1) denominator, which is pandas' default, spelled out.
population_age_var = population_age.var(ddof=1)
population_age_var

211.01912474630805

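Estimate the variance of age among Titanic passengers: repeatedly draw a small random sample, build a 95% $\chi^2$-based confidence interval for the variance, and count how often it contains the population variance computed above.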

In [48]:
# Simulation: how often does a 95% chi-square interval built from a small
# sample contain the population variance of age?
number_of_samples = 100
sample_size = 10
degree_of_freedom = sample_size - 1
# 0.95 is the confidence level (1 - alpha), not the significance level alpha.
confidence_level = 0.95
# Seeded generator so that Restart & Run All reproduces the same samples.
rng = np.random.RandomState(0)
results = []

# The chi-square quantiles depend only on the degrees of freedom and the
# confidence level, so compute them once instead of on every iteration.
# Names avoid `min`/`max` so the builtins are not rebound.
chi2_lower, chi2_upper = stats.chi2(degree_of_freedom).interval(confidence_level)

for trial in range(number_of_samples):
    sample = population_age.sample(sample_size, random_state=rng)
    sample_age_tvar = stats.tvar(sample)  # unbiased sample variance s^2
    # Dividing by the larger quantile gives the lower bound, and vice versa.
    min_var = degree_of_freedom * sample_age_tvar / chi2_upper
    max_var = degree_of_freedom * sample_age_tvar / chi2_lower
    results.append(
        {
            "min": min_var,
            "max": max_var,
            "is_in_confidence_interval": min_var < population_age_var < max_var,
        }
    )

# Build the summary table once and reuse it for both printouts.
summary = pd.DataFrame(results)
print(summary["is_in_confidence_interval"].value_counts())
print(summary)

True     98
False     2
Name: is_in_confidence_interval, dtype: int64
           min          max  is_in_confidence_interval
0    71.915140   506.602844                       True
1    98.729061   695.492261                       True
2    86.969468   612.652360                       True
3    54.587219   384.537120                       True
4    42.102180   296.586844                       True
..         ...          ...                        ...
95   89.424987   629.950146                       True
96   65.715989   462.933218                       True
97   92.988571   655.053651                       True
98  165.018047  1162.461934                       True
99   50.697144   357.133665                       True

[100 rows x 3 columns]
