首先导入数据，这里使用的是一次数据竞赛的train_label的数据，即房租的价格

In [7]:
import pandas as pd
import numpy as np
from scipy import stats
# Load the competition training table and keep only the rent-price column,
# which the rest of the notebook treats as the full population.
data = pd.read_csv('train_data.csv')
tradeMoney = data['tradeMoney']

In [45]:
# Mean and standard deviation (ddof=1) of the whole price column; later cells
# use these as the "known" population parameters.
tradeMoney_mean = tradeMoney.mean()
tradeMoney_std = tradeMoney.std(ddof=1)
print(tradeMoney_mean, tradeMoney_std, sep='\n')

8837.074227557916
551428.6590976383


查看数据的描述信息

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the full column.
tradeMoney.describe()

count    4.144000e+04
mean     8.837074e+03
std      5.514287e+05
min      0.000000e+00
25%      2.800000e+03
50%      4.000000e+03
75%      5.500000e+03
max      1.000000e+08
Name: tradeMoney, dtype: float64

选取样本容量为100的一个样本

In [26]:
# Draw one sample of 100 rows. No random_state is passed, so a re-run will
# generally pick different rows and give different statistics.
tradeMoney_sam = tradeMoney.sample(100)
tradeMoney_sam.describe()

count      100.000000
mean      5026.100000
std       3400.800231
min       1190.000000
25%       2882.500000
50%       4250.000000
75%       5975.000000
max      24500.000000
Name: tradeMoney, dtype: float64

* 一个正态总体方差已知，均值的区间估计，使用的是正态分布
* np.std 默认（ddof=0）求得的标准差是有偏的（除以 n），这里我们需要的是基于无偏样本方差的样本标准差（除以 n-1），所以需要加上ddof=1

In [50]:
# Confidence interval for a mean under the normal (z) model; the population
# standard deviation is supplied by the caller rather than estimated.
def norm_conf(data, std, confidence=0.95):
    """Return the two-sided z confidence interval for the mean of *data*.

    Args:
        data: Sized collection of numeric observations (list, array, Series).
        std: Population standard deviation, treated as known.
        confidence: Confidence level, strictly between 0 and 1.

    Returns:
        Tuple ``(lower_limit, upper_limit)`` centred on the sample mean.

    Raises:
        ValueError: If *data* is empty, *std* is negative, or *confidence*
            is not strictly between 0 and 1.
    """
    sample_size = len(data)
    # Without these guards the function silently returns nan/inf limits
    # (empty data, bad confidence) or an inverted interval (negative std).
    if sample_size == 0:
        raise ValueError("data must contain at least one observation")
    if std < 0:
        raise ValueError("std must be non-negative")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    sample_mean = np.mean(data)
    alpha = 1 - confidence  # significance level
    # Upper-tail critical value: isf(alpha / 2) leaves alpha / 2 in each tail.
    norm_score = stats.norm.isf(alpha / 2)
    ME = std / np.sqrt(sample_size) * norm_score  # margin of error
    lower_limit = sample_mean - ME
    upper_limit = sample_mean + ME
    return lower_limit, upper_limit

In [29]:
# Default 95% z-interval from the 100-row sample, with the full-column std as the known sigma.
norm_conf(tradeMoney_sam, tradeMoney_std)

(-103051.931187, 113104.131187)


* 一个正态总体，方差未知，均值的区间估计，使用的是t分布

In [58]:
# Confidence interval for a mean under Student's t distribution; the standard
# deviation is estimated from the sample itself.
def ttest_conf(data, confidence=0.95, *, verbose=True):
    """Return the two-sided t confidence interval for the mean of *data*.

    Args:
        data: Sized collection of numeric observations (list, array, Series).
        confidence: Confidence level, strictly between 0 and 1.
        verbose: When true (the default, matching the original behaviour),
            print the interval before returning it. Pass ``False`` when
            calling in a loop to avoid one output line per call.

    Returns:
        Tuple ``(lower_limit, upper_limit)`` centred on the sample mean.

    Raises:
        ValueError: If *data* has fewer than two observations or
            *confidence* is not strictly between 0 and 1.
    """
    sample_size = len(data)
    # ddof=1 and df = n - 1 both need n >= 2; with less the limits are nan.
    if sample_size < 2:
        raise ValueError("data must contain at least two observations")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    sample_mean = np.mean(data)
    sample_std = np.std(data, ddof=1)  # sample standard deviation (n - 1)
    alpha = 1 - confidence  # significance level
    # Upper-tail critical value with n - 1 degrees of freedom.
    t_score = stats.t.isf(alpha / 2, df=sample_size - 1)
    ME = t_score * sample_std / np.sqrt(sample_size)  # margin of error
    lower_limit = sample_mean - ME
    upper_limit = sample_mean + ME
    if verbose:
        print(  '( %.6f, %.6f)' % (lower_limit, upper_limit))
    return lower_limit, upper_limit

In [35]:
# Default 95% t-interval for the mean, using only the 100-row sample.
ttest_conf(tradeMoney_sam)

( 4351.307453, 5700.892547)


In [52]:
# Repeated-sampling check of the known-variance (normal) interval: draw 1000
# samples of 100 rows with replacement and count how often the interval
# contains the full-column mean.
scale_means = []
size = 0
for _ in range(1000):
    tradeMoney_sample = tradeMoney.sample(100, replace=True)
    lower_limit_norm, upper_limit_norm = norm_conf(tradeMoney_sample, tradeMoney_std)
    # Add 1 when the interval covers the mean, 0 otherwise.
    size += int(lower_limit_norm <= tradeMoney_mean <= upper_limit_norm)
print('一个正态总体，方差已知，均值的区间估计的准确度为：', size / 1000)

一个正态总体，方差已知，均值的区间估计的准确度为： 0.992


In [59]:
# Repeated-sampling check of the variance-unknown (t) interval: draw 1000
# samples of 100 rows with replacement and count how often the interval
# contains the full-column mean.
scale_means = []
size = 0
for _ in range(1000):
    tradeMoney_sample = tradeMoney.sample(100, replace=True)
    lower_limit_t, upper_limit_t = ttest_conf(tradeMoney_sample)
    if tradeMoney_mean >= lower_limit_t and tradeMoney_mean <= upper_limit_t:
        size += 1
# The label previously read "方差已知" (variance known), copied from the z-interval
# cell; this cell evaluates the variance-unknown t interval.
print('一个正态总体，方差未知，均值的区间估计的准确度为：', size / 1000)

( 4100.090217, 5404.309783)
( 4199.073783, 6347.486217)
( 4104.042986, 5052.157014)
( 4020.200282, 5606.199718)
( 4221.990344, 5627.209656)
( 4248.126996, 5591.273004)
( 4060.628165, 5727.371835)
( 4269.618079, 6393.981921)
( 3880.595470, 5288.844530)
( 3831.101297, 6506.898703)
( 4129.343095, 5665.316905)
( 4064.648134, 6444.471866)
( 4301.457156, 5813.802844)
( 4224.787548, 5987.212452)
( 4135.242232, 5387.957768)
( 3881.959220, 5547.380780)
( 3685.569110, 4914.230890)
( 4126.991567, 5670.808433)
( 4164.992325, 5504.267675)
( 3784.130395, 4593.869605)
( 4547.483786, 6210.436214)
( 4306.178809, 5405.721191)
( -979303.435783, 2988937.635583)
( 4228.520927, 5423.779073)
( 3933.779086, 5443.200914)
( 3870.107773, 5044.892227)
( 3827.209063, 4733.790937)
( 4520.821993, 6249.578007)
( 4086.243903, 5673.956097)
( 4200.901639, 5437.158361)
( 3501.084344, 4645.115656)
( 4140.990568, 5958.009432)
( 3907.516702, 5550.083298)
( 3828.135023, 6136.064977)
( 3977.801768, 4900.758232)
( 4009.857841,