# Доверительные интервалы для доли 

## Генерация данных

In [51]:
import numpy as np

In [52]:
np.random.seed(1)
statistical_population = np.random.randint(2, size=100000)
#unique, counts = np.unique(statistical_population, return_counts=True)
#dict(zip(unique, counts))
random_sample = np.random.choice(statistical_population, size=1000)

In [40]:
#истинное значение доли
statistical_population.mean()

0.49771

## Точечная оценка доли

In [53]:
random_sample

array([1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,

In [42]:
len(random_sample)

1000

## Доверительный интервал для доли

In [43]:
from statsmodels.stats.proportion import proportion_confint

### Доверительный интервал на основе нормального распределения

$$\hat{p}\pm z_{1-\frac{\alpha}{2}} \sqrt{\frac{\hat{p}\left(1-\hat{p}\right)}{n}}$$

In [44]:
normal_interval = proportion_confint(sum(random_sample), len(random_sample), method='normal')

In [45]:
normal_interval

(0.4710104963037765, 0.5329895036962234)

In [46]:
print('normal_interval [%f, %f] with width %f' % (normal_interval[0], 
                                                 normal_interval[1],
                                                 normal_interval[1]-normal_interval[0]))

normal_interval [0.471010, 0.532990] with width 0.061979


### Доверительный интервал Уилсона

$$\frac1{ 1 + \frac{z^2}{n} } \left( \hat{p} + \frac{z^2}{2n} \pm z \sqrt{ \frac{ \hat{p}\left(1-\hat{p}\right)}{n} + \frac{
z^2}{4n^2} } \right), \;\; z \equiv z_{1-\frac{\alpha}{2}}$$ 

In [47]:
wilson_interval = proportion_confint(sum(random_sample), len(random_sample), method='wilson')

In [48]:
print('wilson_interval [%f. %f] with width %f' % (wilson_interval[0],
                                                 wilson_interval[1],
                                                 wilson_interval[1]-wilson_interval[0]))

wilson_interval [0.471062. 0.532922] with width 0.061860


## Размер выборки для интервала заданной ширины

In [49]:
from statsmodels.stats.proportion import samplesize_confint_proportion

In [36]:
n_samples = int(np.ceil(samplesize_confint_proportion(random_sample.mean(), 0.01)))
n_samples

9604

In [37]:
len(random_sample)

1000

In [13]:
np.random.seed(1)
random_sample = np.random.choice(statistical_population, size = n_samples)

In [14]:
normal_interval = proportion_confint(sum(random_sample), len(random_sample), method='normal')

In [15]:
print('normal_interval [%f, %f] with width %f' % (normal_interval[0],
                                                  normal_interval[1],
                                                  normal_interval[1] - normal_interval[0]))

normal_interval [0.481776, 0.501773] with width 0.019997
