In [6]:
#####  RUN NOTES
# Justice Conformity study3, HDDM analysis
# Written by Jae-Young Son
# Last modified 08-13-20 for HDDM workshop (Carney Computational Modeling Workshop)


#####  SET UP ENVIRONMENT

# Import requisite packages
import pandas as pd
import hddm
import pickle
from patsy import dmatrix
from kabuki.utils import concat_models
import pathlib

# Name this model
modelName = 'categorical_unfair'  # Change this!

# Check whether save directories exist; if not, create them
pathlib.Path('./PPC/').mkdir(parents=True, exist_ok=True)


##### PATCHES SO YOU CAN GENERATE REGRESSION MODEL PREDICTIONS

import pymc as pm
import numpy as np
import pymc.progressbar as pbar

def _parents_to_random_posterior_sample(bottom_node, pos=None):
    """Walks through parents and sets them to pos sample."""
    for i, parent in enumerate(bottom_node.extended_parents):
        if not isinstance(parent, pm.Node): # Skip non-stochastic nodes
            continue

        if pos is None:
            # Set to random posterior position
            pos = np.random.randint(0, len(parent.trace()))

        assert len(parent.trace()) >= pos, "pos larger than posterior sample size"
        parent.value = parent.trace()[pos]

def _post_pred_generate(bottom_node, samples=500, data=None, append_data=True):
    """Generate posterior predictive data from a single observed node."""
    datasets = []
    ##############################
    # Sample and generate stats
    for sample in range(samples):
        _parents_to_random_posterior_sample(bottom_node)
        # Generate data from bottom node
        sampled_data = bottom_node.random()
        sampled_data.reset_index(inplace=True)
        if append_data and data is not None:
            sampled_data = sampled_data.join(data.reset_index(), lsuffix='_sampled')
        datasets.append(sampled_data)
    return datasets

def post_pred_gen(model, groupby=None, samples=500, append_data=False, progress_bar=True):
    results = {}

    # Progress bar
    if progress_bar:
        n_iter = len(model.get_observeds())
        bar = pbar.progress_bar(n_iter)
        bar_iter = 0
    else:
        print("Sampling...")

    if groupby is None:
        iter_data = ((name, model.data.loc[obs['node'].value.index]) for name, obs in model.iter_observeds())
    else:
        iter_data = model.data.groupby(groupby)

    for name, data in iter_data:
        node = model.get_data_nodes(data.index)

        if progress_bar:
            bar_iter += 1
            bar.update(bar_iter)

        if node is None or not hasattr(node, 'random'):
            continue # Skip

        ##############################
        # Sample and generate stats
        datasets = _post_pred_generate(node, samples=samples, data=data, append_data=append_data)
        results[name] = pd.concat(datasets, names=['sample'], keys=list(range(len(datasets))))

    if progress_bar:
        bar_iter += 1
        bar.update(bar_iter)

    return pd.concat(results, names=['node'])

In [2]:
##### LOAD PREVIOUSLY-ESTIMATED MODELS

# Create empty array that will eventually store our models
models = []

# Loop over 5 times to load in our 5 estimated models
for i in range(5):
    m = pickle.load(open('./Models/'+modelName+'_'+'%s'%i, 'rb'))
    models.append(m)

# Combine traces
m = concat_models(models)



In [7]:
# POSTERIOR PREDICTIVE CHECK (PPC)

# Generate predicted results from previously-estimated models
ppc_data = post_pred_gen(m, samples=2, append_data=True)
ppc_data.to_csv('./PPC/'+modelName+'_simData.csv')


### Note: don't need to load raw data since the PPC functions don't work
# Load data from csv file into a NumPy structured array
# data = hddm.load_csv('JustCon3_ddm.csv')  # Change this!
# data = data[data.fair_cat=='unfair']

### Note: don't really need to do this since the PPC functions don't work
# Take a look at each of your PPC nodes before looking at the aggregate stats
# This ensures that the appropriate stats were computed for each node/subject
# ppc_compare = hddm.utils.post_pred_stats(data, ppc_data, call_compare=False)
# print(ppc_compare)

### Note: these functions don't produce any meaningful statistics for regression models
# Get PPC stats comparing observed and simulated data
# ppc_compare = hddm.utils.post_pred_stats(data, ppc_data)
# ppc_compare.to_csv('./Results/04_a_va_PPC_stats.csv')  # Change this!

 [------------------107%------------------] 44 of 41 complete in 40.4 sec