In [37]:
import typing

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.weightstats import _tconfint_generic

In [11]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [43]:
def conf_int(feature: pd.Series, alpha: float = 0.05) -> typing.Tuple[np.float64, np.float64]:
    """Two-sided Student t confidence interval for the mean of ``feature``.

    The interval is ``mean ± t_{n-1, 1-alpha/2} * S / sqrt(n)``, where ``S`` is
    the sample standard deviation (``ddof=1``) and ``n`` is the number of
    non-missing observations.

    Args:
        feature: Numeric sample; missing values are ignored.
        alpha: Significance level; the interval has confidence ``1 - alpha``.

    Returns:
        ``(lower, upper)`` bounds of the confidence interval.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if
            ``feature`` has fewer than two non-missing observations.
    """
    if not 0 < alpha < 1:
        raise ValueError(f'alpha must be strictly between 0 and 1, got {alpha}')

    # count() skips NaN exactly like mean()/std() do, so n and the degrees of
    # freedom stay consistent with the statistics (len() would count NaN too).
    n_obs = int(feature.count())
    if n_obs < 2:
        raise ValueError('at least two non-missing observations are required')

    feature_mean = feature.mean()
    # np.sqrt instead of a bare sqrt: no dependency on names injected by %pylab.
    feature_mean_std = feature.std(ddof=1) / np.sqrt(n_obs)

    # Public SciPy quantile function instead of a private statsmodels helper.
    t_crit = stats.t.ppf(1 - alpha / 2, df=n_obs - 1)
    half_width = t_crit * feature_mean_std

    return feature_mean - half_width, feature_mean + half_width

## Данные
Для 61 большого города в Англии и Уэльсе известны средняя годовая смертность на 100000 населения (по данным 1958–1964) и концентрация кальция в питьевой воде (в частях на миллион). Чем выше концентрация кальция, тем жёстче вода. Города дополнительно поделены на северные и южные.

In [2]:
# Relative path of the water dataset file that is loaded into `water_data`.
data_path = './data/water.txt'

In [3]:
# Load the dataset with pandas' read_table defaults (no separator/header options are passed).
water_data = pd.read_table(data_path)

In [4]:
# Preview the first rows of the loaded frame.
water_data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [15]:
# Count missing values in each column.
water_data.isna().sum()

location     0
town         0
mortality    0
hardness     0
dtype: int64

In [24]:
# (rows, columns) of the frame.
water_data.shape

(61, 4)

In [30]:
# Number of rows for each distinct value of the 'location' column.
water_data['location'].value_counts()

North    35
South    26
Name: location, dtype: int64

### Построим 95% доверительный интервал для средней годовой смертности в больших городах
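Построим t-интервал с $n-1$ степенями свободы  
$$\bar{X}_n \pm t_{n-1,\,1-\frac{\alpha}{2}} \frac{S}{\sqrt{n}},$$
где $S$ — выборочное стандартное отклонение, $n$ — размер выборки.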

In [48]:
# Lower/upper bounds from conf_int (default alpha=0.05) for 'mortality' over all rows.
mortality_conf_int_l, mortality_conf_int_u = conf_int(water_data['mortality'])

In [49]:
# In an f-string only braces are doubled to escape them; a literal percent sign is a single '%'.
print(f'Mortality mean 95% confidence interval ({mortality_conf_int_l}, {mortality_conf_int_u})')

Mortality mean 95%% confidence interval (1476.0833413552848, 1572.2117406119285)


### Построим 95% доверительный интервал для средней годовой смертности по всем южным городам

In [25]:
# Keep only the rows whose 'location' equals 'South'.
south_water_data = water_data.query("location == 'South'")

In [50]:
# Lower/upper bounds from conf_int (default alpha=0.05) for 'mortality' in the South subset.
south_mortality_conf_int_l, south_mortality_conf_int_u = conf_int(south_water_data['mortality'])

In [29]:
# In an f-string only braces are doubled to escape them; a literal percent sign is a single '%'.
print(f'Mortality on South mean 95% confidence interval ({south_mortality_conf_int_l}, {south_mortality_conf_int_u})')

Mortality on South mean 95%% confidence interval (1320.1517462936238, 1433.463638321761)


### Построим 95% доверительный интервал для средней годовой смертности по всем северным городам

In [31]:
# Keep only the rows whose 'location' equals 'North'.
north_water_data = water_data.query("location == 'North'")

In [52]:
# Lower/upper bounds from conf_int (default alpha=0.05) for 'mortality' in the North subset.
north_mortality_conf_int_l, north_mortality_conf_int_u = conf_int(north_water_data['mortality'])

In [35]:
# In an f-string only braces are doubled to escape them; a literal percent sign is a single '%'.
print(f'Mortality on North mean 95% confidence interval ({north_mortality_conf_int_l}, {north_mortality_conf_int_u})')

Mortality on North mean 95%% confidence interval (1586.5605251961385, 1680.6394748038613)


### Построим 95% доверительные интервалы для средней жёсткости воды в северных и южных городах

In [54]:
# Lower/upper bounds from conf_int (default alpha=0.05) for 'hardness' in the South subset.
south_hardness_conf_int_l, south_hardness_conf_int_u = conf_int(south_water_data['hardness'])

In [55]:
# In an f-string only braces are doubled to escape them; a literal percent sign is a single '%'.
print(f'Hardness on South mean 95% confidence interval ({south_hardness_conf_int_l}, {south_hardness_conf_int_u})')

Hardness on South mean 95%% confidence interval (53.467198692036106, 86.07126284642544)


In [56]:
# Lower/upper bounds from conf_int (default alpha=0.05) for 'hardness' in the North subset.
north_hardness_conf_int_l, north_hardness_conf_int_u = conf_int(north_water_data['hardness'])

In [57]:
# In an f-string only braces are doubled to escape them; a literal percent sign is a single '%'.
print(f'Hardness on North mean 95% confidence interval ({north_hardness_conf_int_l}, {north_hardness_conf_int_u})')

Hardness on North mean 95%% confidence interval (21.42248728572426, 39.37751271427574)
