# **时间洞察力特质和人格特质对饮食行为的影响**

Data from **Eisenberg2019**. Measurement instruments:

**Independent Variables (IV):** Ten-Item Personality Inventory (TIPI); Zimbardo Time Perspective Inventory (ZTPI)
**Dependent Variables (DV):** Three-Factor Eating Questionnaire (TFEQ-R18)

In [1]:
#加载需要使用的库
%matplotlib inline
import numpy as np 
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import arviz as az
import pymc3 as pm
from mpl_toolkits.mplot3d import Axes3D



## 数据预处理和可视化

In [2]:
# Preprocess the 3 factor Eating Questionnaire (TFEQ-R18) responses
np.random.seed(123)  # fix NumPy's global seed so later random draws are repeatable
data0 = pd.read_csv("/home/mw/input/1242517/eating_survey.csv.gz").set_index('worker_id')
# Per-worker total over every response row (kept for later use)
es = data0.groupby('worker_id')['response'].sum()

# Wide table with one column per item: item k is read from the rows where
# question_num == k + 1 (presumably question_num 1 is not a scored item --
# verify against the data file).
thr_e = pd.DataFrame()
es_item = pd.DataFrame()
for item in range(1, 19):
    answers = data0.loc[data0['question_num'] == item + 1, 'response']
    es_item['es{}'.format(item)] = answers

# Item columns that make up each of the three subscale scores
cog = ['es{}'.format(k) for k in (2, 11, 12, 15, 16, 18)]
uncon = ['es{}'.format(k) for k in (1, 4, 5, 7, 8, 9, 13, 14, 17)]
emo = ['es{}'.format(k) for k in (3, 6, 10)]
# Each subscale score is the row-wise sum of its items
for score_name, item_cols in (('cog_res', cog), ('uncon_e', uncon), ('emo_e', emo)):
    thr_e[score_name] = es_item[item_cols].sum(axis=1)
thr_e.describe()  # descriptive statistics

Unnamed: 0,cog_res,uncon_e,emo_e
count,522.0,522.0,522.0
mean,13.639847,18.689655,6.264368
std,4.017785,5.301961,2.658242
min,6.0,9.0,3.0
25%,11.0,15.0,4.0
50%,14.0,18.0,6.0
75%,17.0,22.0,8.0
max,24.0,32.0,12.0


In [3]:
thr_e.plot.density()  # Density plot of the scale scores held in thr_e

<AxesSubplot:ylabel='Density'>

In [4]:
thr_e.plot.hist()  # Histogram of the scale scores held in thr_e

<AxesSubplot:ylabel='Frequency'>

In [5]:
# Preprocess the Ten-Item Personality Inventory (TIPI) responses
data1 = pd.read_csv("/home/mw/input/1258529/ten_item_personality_survey.csv.gz").set_index('worker_id')

# One column per item: item k is read from the rows where question_num == k + 2.
# Even-numbered items are recoded as abs(8 - response) (reverse scoring,
# presumably on a 1-7 scale -- verify against the questionnaire).
ten_p = pd.DataFrame()
for item in range(1, 11):
    answers = data1.loc[data1['question_num'] == item + 2, 'response']
    if item % 2 == 0:
        answers = abs(8 - answers)
    ten_p['ten_p{}'.format(item)] = answers

# Each trait score is the sum of item k and item k + 5
five_p = pd.DataFrame()
for offset, trait in enumerate(['Extra', 'Agree', 'Cons', 'Emo', 'Open'], start=1):
    five_p[trait] = ten_p['ten_p{}'.format(offset)] + ten_p['ten_p{}'.format(offset + 5)]
five_p.describe()

Unnamed: 0,Extra,Agree,Cons,Emo,Open
count,522.0,522.0,522.0,522.0,522.0
mean,8.519157,8.078544,8.216475,8.494253,8.162835
std,1.421192,1.741323,1.297079,1.357769,1.758532
min,2.0,2.0,5.0,3.0,2.0
25%,8.0,7.0,7.0,8.0,7.0
50%,8.0,8.0,8.0,8.0,8.0
75%,9.0,9.0,9.0,9.0,9.0
max,13.0,14.0,14.0,14.0,14.0


In [6]:
# Density plot of the trait scores held in five_p
five_p.plot.density()

<AxesSubplot:ylabel='Density'>

In [7]:
# Histogram of the trait scores held in five_p
five_p.plot.hist()

<AxesSubplot:ylabel='Frequency'>

In [8]:
# Preprocess the Zimbardo Time Perspective Inventory (ZTPI) responses
data2 = pd.read_csv("/home/mw/input/1258529/time_perspective_survey.csv.gz").set_index('worker_id')

# One column per item: item k is read from the rows where question_num == k + 1.
# The listed items are recoded as abs(6 - response) (reverse scoring,
# presumably on a 1-5 scale -- verify against the questionnaire).
reverse_items = (9, 24, 25, 41, 56)
tp = pd.DataFrame()
for item in range(1, 57):
    answers = data2.loc[data2['question_num'] == item + 1, 'response']
    if item in reverse_items:
        answers = abs(6 - answers)
    tp['time_p{}'.format(item)] = answers

# Item columns belonging to each of the five time-perspective subscales
Past_Neg = ['time_p{}'.format(k) for k in (4, 5, 16, 22, 27, 33, 34, 36, 50, 54)]
Present_Hed = ['time_p{}'.format(k) for k in (1, 8, 12, 17, 19, 23, 26, 28, 31, 32, 42, 44, 46, 48, 55)]
Future = ['time_p{}'.format(k) for k in (6, 9, 10, 13, 18, 21, 24, 30, 40, 43, 45, 51, 56)]
Past_Pos = ['time_p{}'.format(k) for k in (2, 7, 11, 15, 20, 25, 29, 41, 49)]
Present_Fat = ['time_p{}'.format(k) for k in (3, 14, 35, 37, 38, 39, 47, 52, 53)]

# Each subscale score is the row-wise mean of its items
five_tp = pd.DataFrame()
subscales = (
    ('Past_Neg', Past_Neg),
    ('Present_Hed', Present_Hed),
    ('Future', Future),
    ('Past_Pos', Past_Pos),
    ('Present_Fat', Present_Fat),
)
for scale, item_cols in subscales:
    five_tp[scale] = tp[item_cols].mean(axis=1)
five_tp.describe()

Unnamed: 0,Past_Neg,Present_Hed,Future,Past_Pos,Present_Fat
count,522.0,522.0,522.0,522.0,522.0
mean,3.10364,2.957727,3.598291,3.275223,2.374202
std,0.804072,0.608542,0.392872,0.525825,0.656916
min,1.0,1.4,2.076923,1.666667,1.0
25%,2.5,2.533333,3.384615,3.0,2.0
50%,3.1,2.933333,3.615385,3.333333,2.333333
75%,3.7,3.333333,3.846154,3.666667,2.777778
max,4.9,4.666667,5.0,4.444444,4.555556


In [9]:
# Density plot of the subscale scores held in five_tp
five_tp.plot.density()

<AxesSubplot:ylabel='Density'>

In [10]:
# Histogram of the subscale scores held in five_tp
five_tp.plot.hist()

<AxesSubplot:ylabel='Frequency'>

In [11]:
# Merge the three score tables column-wise (before standardisation).
# The frames are aligned on their shared worker_id index.
data_all = pd.concat([five_tp, five_p, thr_e], axis=1)
# Only the last expression of a cell is displayed, so the merged table is
# shown here; summary statistics are inspected in a later cell.
data_all

Unnamed: 0_level_0,Past_Neg,Present_Hed,Future,Past_Pos,Present_Fat,Extra,Agree,Cons,Emo,Open,cog_res,uncon_e,emo_e
worker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
s001,2.7,3.333333,3.384615,3.333333,2.111111,7,5,7,8,8,7,14,3
s002,2.8,4.133333,3.461538,3.666667,3.000000,12,8,10,10,7,9,28,10
s003,3.4,3.200000,3.230769,3.666667,3.111111,5,7,7,10,8,15,13,4
s004,2.1,2.666667,3.846154,3.444444,2.000000,8,7,8,8,8,18,16,3
s005,3.1,3.266667,3.692308,3.111111,2.222222,10,6,8,8,8,11,20,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
s554,2.4,2.733333,4.076923,3.555556,1.777778,9,8,10,9,8,17,27,9
s556,2.4,3.800000,4.076923,3.444444,2.333333,7,10,11,10,6,20,18,3
s557,4.0,3.000000,4.000000,4.000000,2.444444,10,11,8,8,6,9,20,4
s559,3.6,2.466667,3.538462,3.222222,2.777778,8,9,11,8,11,10,28,12


In [12]:
# Show the column names of the merged table as a plain list
list(data_all.columns)

['Past_Neg',
 'Present_Hed',
 'Future',
 'Past_Pos',
 'Present_Fat',
 'Extra',
 'Agree',
 'Cons',
 'Emo',
 'Open',
 'cog_res',
 'uncon_e',
 'emo_e']

In [13]:
# Standardise every column to z-scores: subtract the column mean, then
# divide by the column's standard deviation as returned by Series.std().
for col in data_all.columns:
    centered = data_all[col] - data_all[col].mean()
    data_all[col] = centered / data_all[col].std()
data_all.head(10)

Unnamed: 0_level_0,Past_Neg,Present_Hed,Future,Past_Pos,Present_Fat,Extra,Agree,Cons,Emo,Open,cog_res,uncon_e,emo_e
worker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
s001,-0.501995,0.617224,-0.54388,0.110512,-0.400493,-1.068932,-1.767934,-0.937857,-0.364018,-0.092597,-1.652614,-0.884513,-1.228017
s002,-0.377628,1.931842,-0.348083,0.744437,0.95263,2.449242,-0.045106,1.375031,1.108986,-0.661253,-1.154827,1.756019,1.405302
s003,0.368574,0.398121,-0.935474,0.744437,1.12177,-2.476202,-0.619382,-0.937857,1.108986,-0.092597,0.338533,-1.073123,-0.851829
s004,-1.248196,-0.478291,0.630901,0.32182,-0.569634,-0.365297,-0.619382,-0.166894,-0.364018,-0.092597,1.085213,-0.507294,-1.228017
s005,-0.004527,0.507673,0.239307,-0.312105,-0.231353,1.041973,-1.193658,-0.166894,-0.364018,-0.092597,-0.65704,0.247143,-1.228017
s006,1.239143,1.931842,0.435104,1.167053,1.290911,-0.365297,-0.045106,-0.166894,-0.364018,-0.092597,-0.408147,-1.82756,-1.228017
s007,1.114776,0.836327,-0.739677,0.744437,1.460051,1.041973,2.826274,-0.166894,1.845488,-0.661253,0.83632,-0.130075,-0.851829
s008,0.866042,-0.368739,-0.935474,-0.523413,1.12177,-1.772567,-0.045106,0.604069,0.372484,-0.661253,-1.901507,2.133238,2.157679
s009,-0.128894,0.945879,-0.739677,0.110512,0.95263,-0.365297,0.52917,1.375031,1.845488,-0.092597,1.085213,1.3788,0.276736
s010,-0.128894,0.836327,-0.54388,0.110512,-0.569634,0.338338,1.103446,-0.166894,0.372484,-0.661253,1.085213,-0.695904,-0.099452


In [14]:
data_all.describe() # Descriptive statistics after standardisation

Unnamed: 0,Past_Neg,Present_Hed,Future,Past_Pos,Present_Fat,Extra,Agree,Cons,Emo,Open,cog_res,uncon_e,emo_e
count,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0,522.0
mean,1.293984e-15,2.186416e-16,1.216056e-14,-3.458972e-15,1.675969e-16,-1.561118e-16,-5.580891e-16,-9.039172000000001e-17,6.557122e-16,-4.283504e-16,1.114477e-16,-1.069813e-16,-2.0949610000000003e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.616233,-2.55977,-3.872426,-3.059112,-2.091898,-4.587106,-3.490761,-2.479783,-4.04653,-3.504534,-1.901507,-1.82756,-1.228017
25%,-0.7507286,-0.6973941,-0.54388,-0.523413,-0.5696339,-0.365297,-0.6193819,-0.937857,-0.3640184,-0.6612533,-0.6570404,-0.6959038,-0.851829
50%,-0.004526767,-0.04008494,0.0435104,0.1105118,-0.06221257,-0.365297,-0.04510596,-0.1668942,-0.3640184,-0.09259725,0.08963976,-0.1300755,-0.09945212
75%,0.741675,0.6172242,0.6309008,0.7444366,0.6143492,0.3383378,0.5291699,0.6040685,0.3724839,0.4760588,0.8363199,0.6243623,0.6529248
max,2.234079,2.808255,3.567853,2.223594,3.320596,3.152877,3.400549,4.458882,4.054995,3.319339,2.578574,2.510457,2.157679


## 线性模型建构前的相关性分析

In [15]:
# Correlation analysis
x = data_all.values
n_vars = x.shape[1]
# cor[i, j] stores the pair returned by stats.pearsonr for columns i and j:
# index 0 is the Pearson correlation, index 1 its p-value.
cor = np.zeros((n_vars, n_vars, 2))
for i, col_i in enumerate(x.T):
    for j, col_j in enumerate(x.T):
        cor[i, j] = stats.pearsonr(col_i, col_j)

In [16]:
# Significance of the correlations: the p-values are the second slice of
# the pairwise result array built in the previous cell.
p_value = cor[:, :, 1].copy()
p_value_matrix = p_value  # second name for the same array
# Correlation coefficients, with columns treated as the variables
correlation_matrix = np.corrcoef(data_all, rowvar=False)

In [17]:
# Visualise the correlation and p-value matrices side by side
fig = plt.figure(figsize=(15, 7.5)) # Create the figure that holds both heat maps

ax = fig.add_subplot(121) # Left panel of a 1x2 subplot grid
hot_img0 = ax.matshow(np.abs(correlation_matrix), vmin=0, vmax=1) 
 # Heat map of the absolute correlations, colour scale fixed to [0, 1]
fig.colorbar(hot_img0) # Colour bar for this heat map
ticks = np.arange(0, 13, 1) # Tick positions 0..12 (the stop value 13 is excluded), step 1
ax.set_xticks(ticks) # Place the x-axis ticks
ax.set_yticks(ticks) # Place the y-axis ticks
# Rotation and title go through pyplot, i.e. they act on the current axes
plt.xticks(rotation=60)
plt.yticks(rotation=30)
plt.title('Correlation Matrix',fontsize=15,fontweight='bold') 
names = data_all.columns # Column names used as tick labels
ax.set_xticklabels(names) # Label the x-axis ticks
ax.set_yticklabels(names) # Label the y-axis ticks

ax = fig.add_subplot(122) # Right panel of the 1x2 subplot grid
hot_img1 = ax.matshow(p_value, vmin=0, vmax=1) 
 # Heat map of the p-values, colour scale fixed to [0, 1]
fig.colorbar(hot_img1) # Colour bar for this heat map
ticks = np.arange(0, 13, 1) # Tick positions 0..12 (the stop value 13 is excluded), step 1
ax.set_xticks(ticks) # Place the x-axis ticks
ax.set_yticks(ticks) # Place the y-axis ticks
plt.xticks(rotation=60)
plt.yticks(rotation=30)
plt.title('p-value Matrix',fontsize=15,fontweight='bold') 
names = data_all.columns # Column names used as tick labels
ax.set_xticklabels(names) # Label the x-axis ticks
ax.set_yticklabels(names) # Label the y-axis ticks

[Text(0, 0, 'Past_Neg'),
 Text(0, 1, 'Present_Hed'),
 Text(0, 2, 'Future'),
 Text(0, 3, 'Past_Pos'),
 Text(0, 4, 'Present_Fat'),
 Text(0, 5, 'Extra'),
 Text(0, 6, 'Agree'),
 Text(0, 7, 'Cons'),
 Text(0, 8, 'Emo'),
 Text(0, 9, 'Open'),
 Text(0, 10, 'cog_res'),
 Text(0, 11, 'uncon_e'),
 Text(0, 12, 'emo_e')]

In [31]:
with pm.Model() as model1:
    # Priors: alpha (intercept), beta (six slopes) and sigma (three residual scales) are random variables
    alpha = pm.Normal('alpha',mu=0,sd=1)
    beta = pm.Normal('beta',mu=0,sd=1, shape=6)  
    sigma = pm.HalfNormal('sigma',sd=1, shape=3)
    # Predictors x1..x6 are standardised scores taken from data_all
    x1 = pm.Data("x1", data_all['Future'])
    x2 = pm.Data("x2", data_all['Extra'])
    x3 = pm.Data("x3", data_all['Agree'])
    x4 = pm.Data("x4", data_all['Cons'])
    x5 = pm.Data("x5", data_all['Emo'])
    x6 = pm.Data("x6", data_all['Open'])
    # mu is a deterministic variable: its value is fully determined by the right-hand side
    mu = pm.Deterministic("mu", alpha + beta[0]*x1 + beta[1]*x2 + beta[2]*x3 + beta[3]*x4 + beta[4]*x5 + beta[5]*x6) 
    # Observed outcomes (the likelihood): `observed=` marks these variables as already
    # observed, so their values are not changed by the fitting algorithm.
    # Each outcome is assumed to be normally distributed, with its own sigma entry.
    # NOTE(review): all three outcomes share the same mu (one alpha/beta set) -- confirm this is intended.
    y_cr = pm.Normal('y_cr', mu=mu, sd=sigma[0], observed=data_all['cog_res'] )
    y_ue = pm.Normal('y_ue', mu=mu, sd=sigma[1], observed=data_all['uncon_e'] )
    y_ee = pm.Normal('y_ee', mu=mu, sd=sigma[2], observed=data_all['emo_e'] )
    # Prior predictive check: draw 50 samples from the priors
    prior_checks = pm.sample_prior_predictive(samples=50)

In [36]:
# Grid of 50 evenly spaced made-up predictor values covering [-5, 5]
x = np.linspace(-5, 5, 50)

# One line per prior draw: start at the intercept and add each of the six
# slope terms in turn, feeding the same grid to every predictor.
for a, b in zip(prior_checks["alpha"], prior_checks["beta"]):
    y = a
    for k in range(6):
        y = y + b[k] * x
    plt.plot(x, y)

In [25]:
# Render model1 as a Graphviz graph
pm.model_to_graphviz(model1)

In [32]:
# Sampling still runs inside the model context
with model1:
    # Sample with MCMC: `draws` is the number of draws and `tune` the number of tuning
    # iterations, which are discarded once sampling finishes.
    # `target_accept` is the target acceptance rate. `return_inferencedata` is not passed
    # here, so the return type is this PyMC3 version's default (verify before relying on it).
    # `chains` is the number of chains and `cores` the number of CPUs used; chains can run
    # in parallel on several CPUs (2 CPUs are used on the HeyWhale platform).
    trace1 = pm.sample(draws = 2000, tune=1000, target_accept=0.9,chains=2, cores= 2,progressbar = True)

  
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, beta, alpha]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 6 seconds.


In [27]:
# Trace plots for selected parameters of trace1: 'alpha', 'beta' and 'sigma'
az.plot_trace(trace1,var_names=['alpha','beta','sigma'])



array([[<AxesSubplot:title={'center':'alpha'}>,
        <AxesSubplot:title={'center':'alpha'}>],
       [<AxesSubplot:title={'center':'beta'}>,
        <AxesSubplot:title={'center':'beta'}>],
       [<AxesSubplot:title={'center':'sigma'}>,
        <AxesSubplot:title={'center':'sigma'}>]], dtype=object)

In [28]:
# Sampling diagnostics only (kind="diagnostics") for alpha, beta and sigma
az.summary(trace1, var_names=['alpha','beta','sigma'], kind="diagnostics")



Unnamed: 0,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,0.001,0.001,5299.0,2782.0,1.0
beta[0],0.001,0.0,5786.0,3233.0,1.0
beta[1],0.001,0.001,5252.0,2949.0,1.0
beta[2],0.001,0.001,5725.0,3232.0,1.0
beta[3],0.001,0.001,5296.0,3148.0,1.0
beta[4],0.001,0.001,5357.0,3332.0,1.0
beta[5],0.001,0.001,5339.0,3302.0,1.0
sigma,0.0,0.0,5042.0,2956.0,1.0


In [29]:
with model1:
    # Draw posterior predictive samples from the posterior stored in trace1,
    # restricted to the listed variables (only the y_cr outcome is included here)
    ppc_y1 = pm.sample_posterior_predictive(trace1, var_names=["mu", "sigma", "y_cr"])
    # Convert the PyMC3 results into an InferenceData object
    ppc_data1 = az.from_pymc3(trace = trace1, posterior_predictive=ppc_y1)

In [30]:
# Posterior predictive check plot for model1
az.plot_ppc(ppc_data1)

<AxesSubplot:xlabel='y_cr'>

  func(*args, **kwargs)


# model3

In [37]:
with pm.Model() as model3:
    # Priors. beta holds one slope per predictor (6) and sigma one residual
    # scale per outcome (3). Both are indexed below, so they must be declared
    # with an explicit shape: a scalar prior cannot be subscripted.
    alpha = pm.Normal('alpha', mu=-2, sd=1)
    beta = pm.Normal('beta', mu=0, sd=1, shape=6)
    sigma = pm.HalfNormal('sigma', sd=1, shape=3)
    # Predictors x1..x6 are standardised scores taken from data_all
    x1 = pm.Data("x1", data_all['Future'])
    x2 = pm.Data("x2", data_all['Extra'])
    x3 = pm.Data("x3", data_all['Agree'])
    x4 = pm.Data("x4", data_all['Cons'])
    x5 = pm.Data("x5", data_all['Emo'])
    x6 = pm.Data("x6", data_all['Open'])
    # mu is a deterministic variable: its value is fully determined by the right-hand side
    mu = pm.Deterministic("mu", alpha + beta[0]*x1 + beta[1]*x2 + beta[2]*x3 + beta[3]*x4 + beta[4]*x5 + beta[5]*x6)
    # Observed outcomes (the likelihood): `observed=` marks these variables as
    # already observed, so their values are not changed by the fitting algorithm.
    # Each outcome is assumed to be normally distributed, with its own sigma entry.
    # NOTE(review): all three outcomes share the same mu (one alpha/beta set) -- confirm this is intended.
    y_cr = pm.Normal('y_cr', mu=mu, sd=sigma[0], observed=data_all['cog_res'])
    y_ue = pm.Normal('y_ue', mu=mu, sd=sigma[1], observed=data_all['uncon_e'])
    y_ee = pm.Normal('y_ee', mu=mu, sd=sigma[2], observed=data_all['emo_e'])
    # Prior predictive check: draw 50 samples from the priors
    prior_checks = pm.sample_prior_predictive(samples=50)

In [39]:
# Grid of 50 evenly spaced made-up predictor values covering [-5, 5]
x = np.linspace(-5, 5, 50)

# One line per prior draw: start at the intercept and add each of the six
# slope terms in turn, feeding the same grid to every predictor.
for a, b in zip(prior_checks["alpha"], prior_checks["beta"]):
    y = a
    for k in range(6):
        y = y + b[k] * x
    plt.plot(x, y)

In [40]:
# Render model3 as a Graphviz graph
pm.model_to_graphviz(model3)

In [41]:
# Sampling still runs inside the model context
with model3:
    # Sample with MCMC: `draws` is the number of draws and `tune` the number of tuning
    # iterations, which are discarded once sampling finishes.
    # `target_accept` is the target acceptance rate. `return_inferencedata` is not passed
    # here, so the return type is this PyMC3 version's default (verify before relying on it).
    # `chains` is the number of chains and `cores` the number of CPUs used; chains can run
    # in parallel on several CPUs (2 CPUs are used on the HeyWhale platform).
    trace3 = pm.sample(draws = 2000, tune=1000, target_accept=0.9,chains=2, cores= 2,progressbar = True)

  
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, beta, alpha]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 7 seconds.


In [42]:
# Trace plots for selected parameters of trace3: 'alpha', 'beta' and 'sigma'
az.plot_trace(trace3,var_names=['alpha','beta','sigma'])



array([[<AxesSubplot:title={'center':'alpha'}>,
        <AxesSubplot:title={'center':'alpha'}>],
       [<AxesSubplot:title={'center':'beta'}>,
        <AxesSubplot:title={'center':'beta'}>],
       [<AxesSubplot:title={'center':'sigma'}>,
        <AxesSubplot:title={'center':'sigma'}>]], dtype=object)

In [43]:
# Sampling diagnostics only (kind="diagnostics") for alpha, beta and sigma
az.summary(trace3, var_names=['alpha','beta','sigma'], kind="diagnostics")



Unnamed: 0,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,0.0,0.0,6951.0,2895.0,1.0
beta[0],0.0,0.0,6988.0,3439.0,1.0
beta[1],0.0,0.0,6838.0,3075.0,1.0
beta[2],0.0,0.0,7138.0,3135.0,1.0
beta[3],0.0,0.0,6275.0,2904.0,1.0
beta[4],0.0,0.0,6103.0,2744.0,1.0
beta[5],0.0,0.0,7074.0,3220.0,1.0
sigma[0],0.0,0.0,6816.0,3328.0,1.0
sigma[1],0.0,0.0,7835.0,2816.0,1.0
sigma[2],0.0,0.0,6513.0,3007.0,1.0


In [46]:
with model3:
    # Draw posterior predictive samples from the posterior stored in trace3,
    # restricted to the listed variables (all three outcomes are included)
    ppc_y3 = pm.sample_posterior_predictive(trace3, var_names=["mu", "sigma", "y_cr","y_ue","y_ee"])
    # Convert the PyMC3 results into an InferenceData object
    ppc_data3 = az.from_pymc3(trace = trace3, posterior_predictive=ppc_y3)

In [47]:
# Posterior predictive check plots for model3
az.plot_ppc(ppc_data3)

array([<AxesSubplot:xlabel='y_cr'>, <AxesSubplot:xlabel='y_ue'>,
       <AxesSubplot:xlabel='y_ee'>], dtype=object)

  func(*args, **kwargs)


In [52]:
with pm.Model() as exgaussian:
    # Priors: alpha (intercept), beta (six slopes), sigma and nu are random variables
    alpha = pm.Normal('alpha',mu=-1,sd=1)
    beta = pm.Normal('beta',mu=0,sd=1,shape=6)
    sigma = pm.HalfNormal('sigma',sd=1)
    nu = pm.HalfNormal('nu',sd=1)
    # Predictors x1..x6 are standardised scores taken from data_all
    x1 = pm.Data("x1", data_all['Future'])
    x2 = pm.Data("x2", data_all['Extra'])
    x3 = pm.Data("x3", data_all['Agree'])
    x4 = pm.Data("x4", data_all['Cons'])
    x5 = pm.Data("x5", data_all['Emo'])
    x6 = pm.Data("x6", data_all['Open'])
    # mu is a deterministic variable: its value is fully determined by the right-hand side
    mu = pm.Deterministic("mu", alpha + beta[0]*x1 + beta[1]*x2 + beta[2]*x3 + beta[3]*x4 + beta[4]*x5 + beta[5]*x6)
    # Observed outcome (the likelihood): `observed=` marks this variable as already
    # observed, so its value is not changed by the fitting algorithm.
    # The outcome emo_e is modelled with an ex-Gaussian likelihood (not a normal one)
    y_ee = pm.ExGaussian('y_ee', mu=mu, sigma=sigma, nu=nu, observed=data_all['emo_e'] )
    # Prior predictive check: draw 50 samples from the priors
    prior_checks = pm.sample_prior_predictive(samples=50)

In [50]:
# Grid of 50 evenly spaced made-up predictor values covering [-5, 5]
x = np.linspace(-5, 5, 50)

# One line per prior draw: start at the intercept and add each of the six
# slope terms in turn, feeding the same grid to every predictor.
for a, b in zip(prior_checks["alpha"], prior_checks["beta"]):
    y = a
    for k in range(6):
        y = y + b[k] * x
    plt.plot(x, y)

In [53]:
# Render the exgaussian model as a Graphviz graph
pm.model_to_graphviz(exgaussian)

In [54]:
# Sampling still runs inside the model context
with exgaussian:
    # Sample with MCMC: `draws` is the number of draws and `tune` the number of tuning
    # iterations, which are discarded once sampling finishes.
    # `target_accept` is the target acceptance rate. `return_inferencedata` is not passed
    # here, so the return type is this PyMC3 version's default (verify before relying on it).
    # `chains` is the number of chains and `cores` the number of CPUs used; chains can run
    # in parallel on several CPUs (2 CPUs are used on the HeyWhale platform).
    # NOTE(review): the recorded run of this cell reports divergences after tuning and a low
    # effective sample size; treat this trace with caution until that is addressed.
    trace5 = pm.sample(draws = 2000, tune=1000, target_accept=0.9,chains=2, cores= 2,progressbar = True)

  
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [nu, sigma, beta, alpha]


Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 29 seconds.
There were 116 divergences after tuning. Increase `target_accept` or reparameterize.
There were 158 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 10% for some parameters.


In [55]:
# Trace plots for selected parameters of trace5: 'alpha', 'beta', 'sigma' and 'nu'
az.plot_trace(trace5,var_names=['alpha','beta','sigma','nu'])



array([[<AxesSubplot:title={'center':'alpha'}>,
        <AxesSubplot:title={'center':'alpha'}>],
       [<AxesSubplot:title={'center':'beta'}>,
        <AxesSubplot:title={'center':'beta'}>],
       [<AxesSubplot:title={'center':'sigma'}>,
        <AxesSubplot:title={'center':'sigma'}>],
       [<AxesSubplot:title={'center':'nu'}>,
        <AxesSubplot:title={'center':'nu'}>]], dtype=object)

In [57]:
with exgaussian:
    # Draw posterior predictive samples from the posterior stored in trace5,
    # restricted to the listed variables
    ppc_y5 = pm.sample_posterior_predictive(trace5, var_names=["mu", "sigma","y_ee"])
    # Convert the PyMC3 results into an InferenceData object
    ppc_data5 = az.from_pymc3(trace = trace5, posterior_predictive=ppc_y5)

In [58]:
# Posterior predictive check plot for the exgaussian model
az.plot_ppc(ppc_data5)

<AxesSubplot:xlabel='y_ee'>