# 3-7参数估计

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

%precision 3
%matplotlib inline

In [4]:
# Load the fish-length data and keep only the 'length' column as a Series.
fish = pd.read_csv("3-7-1-fish_length.csv").loc[:, 'length']
fish

0    4.352982
1    3.735304
2    5.944617
3    3.798326
4    4.087688
5    5.265985
6    3.272614
7    3.526691
8    4.150083
9    3.736104
Name: length, dtype: float64

* 点估计：直接指定总体分布的参数为某一个值的估计方法

In [6]:
# Point estimation: take the sample mean and the unbiased sample variance
# (ddof=1) as the estimates of the population mean and variance.
mu = fish.mean()
sigma_2 = fish.var(ddof=1)

(mu, sigma_2)

(4.187, 0.680)

* 区间估计： 估计值具有一定范围的估计方法，使用概率的方法计算这个范围
* 置信水平： 表示区间估计结果可信程度的概率（例如 95%）
* 置信区间： 满足某个置信水平的区间
* 置信界限： 置信区间的上界值（上置信界限）和下界值（下置信界限）
* 自由度：样本容量-1

In [7]:
# Degrees of freedom passed to the t distribution: sample size minus one.
df = fish.size - 1
df

9

In [8]:
# Sample standard deviation (ddof=1) and the standard error of the mean,
# i.e. the standard deviation divided by the square root of the sample size.
n_fish = fish.size
sigma = fish.std(ddof=1)
se = sigma / np.sqrt(n_fish)
(sigma, se)

(0.825, 0.261)

In [11]:
# 95% confidence interval for the population mean: a t distribution with
# `df` degrees of freedom, centred on the sample mean and scaled by the
# standard error.
interval = stats.t(df=df, loc=mu, scale=se).interval(confidence=0.95)
interval

(3.597, 4.777)

* 样本方差越大，说明数据越分散（离均值越远），对总体均值的估计就越不可靠，置信区间也随之变宽

In [13]:
# Inflate the standard deviation 10x (which is the variance 100x, not 10x):
# the standard error grows by the same factor of 10, so the 95% confidence
# interval becomes ten times wider.
se2 = (sigma * 10) / np.sqrt(len(fish))
stats.t(df=df, loc=mu, scale=se2).interval(confidence=0.95)

(-1.713, 10.087)

* 区间估计结果的解读：从总体中反复抽样，并对每个样本计算 95% 置信区间，这些区间中约有 95% 会包含真正的总体均值

In [14]:
# One boolean flag per simulated sample, all False to begin with; the
# simulation cell flips a flag to True when that sample's confidence
# interval contains the true population mean.
be_included_array = np.zeros(shape=20000, dtype='bool')
be_included_array

array([False, False, False, ..., False, False, False])

In [15]:
np.random.seed(1)

# Coverage simulation: repeatedly draw a sample of size 10 from a normal
# population whose mean is known, build a 95% confidence interval for the
# mean from each sample, and record whether the interval covers the true mean.
true_mean = 4
norm_dist = stats.norm(loc=true_mean, scale=0.8)

# Tie the number of trials to the flag array so the two cannot drift apart.
n_trials = len(be_included_array)
for i in range(n_trials):
    sample = norm_dist.rvs(size=10)
    # The sim_ prefix keeps these per-sample values from overwriting the
    # notebook-level df / mu / se / interval computed from the fish data,
    # so re-running the earlier cells afterwards still gives the same result.
    sim_df = len(sample) - 1
    sim_mu = np.mean(sample)
    sim_se = np.std(sample, ddof=1) / np.sqrt(len(sample))
    lower, upper = stats.t.interval(0.95, sim_df, sim_mu, sim_se)
    # Store the result either way, so a re-run never keeps a stale True.
    be_included_array[i] = lower <= true_mean <= upper

# Fraction of intervals that contain the true mean (expected to be near 0.95).
be_included_array.mean()

0.948