In [None]:
#####  RUN NOTES
# Justice Conformity study3, HDDM analysis
# Written by Jae-Young Son
# Last modified 08-13-20 for HDDM workshop (Carney Computational Modeling Workshop)


#####  SET UP ENVIRONMENT

# Import requisite packages
import matplotlib
# If using the cluster, uncomment the following line & run before importing matplotlib.pyplot or pylab
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import hddm
import pandas as pd
import pickle
from patsy import dmatrix
from kabuki.analyze import gelman_rubin
from kabuki.utils import concat_models
import pathlib

# Name this model (useful if you're running different models in different scripts)
# This string is reused as the filename prefix for everything written to ./Models/ and ./Results/,
# and as the name of this model's subfolder under ./Plots/
modelName = 'categorical_unfair'  # Change this!

# Get around a problem with saving regression outputs in Python 3
def savePatch(self, fname):
    """Serialize this object to disk with pickle.

    Parameters
    ----------
    fname : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at this location is overwritten.

    Returns
    -------
    None
    """
    from pickle import dump  # local import keeps the helper self-contained

    with open(fname, mode='wb') as pickle_file:
        dump(self, pickle_file)
# Attach the helper to the HDDM class so that model objects can call .savePatch(fname)
# (this is how each chain gets saved after sampling)
hddm.HDDM.savePatch = savePatch

# Make sure every output folder exists before anything gets written to it;
# folders that are already present are left untouched
for out_dir in ('./Models/', './Results/', './Plots/' + modelName):
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

In [None]:
##### LOAD DATA

# Read the csv file (the code below treats the result as a pandas-style table:
# column access by name, boolean row filtering, .head())
data = hddm.load_csv('JustCon3_ddm.csv')  # Change this!

# Keep only the rows whose fair_cat column equals 'unfair'
data = data.loc[data['fair_cat'] == 'unfair']

# Preview the first 10 rows to confirm this is the data you expect/want
data.head(10)

In [None]:
#####  RUN REGRESSION MODELS
# Notes: You don't strictly have to run multiple chains in order to estimate parameters.
#        Here, we're drawing a total of 50 samples (10 per chain, 5 chains). You could do that in a single chain.
#        However, estimation is only one part of what we want to do. We also want to verify that our estimates
#        are *reliable*. If we ran this analysis many more times, would we get comparable estimates each time?
#        My preference for checking convergence is to use the Gelman-Rubin r-hat statistic. Basically, you run
#        multiple chains, then find the between-chain to within-chain variance ratio. You then check whether this
#        ratio (the scale reduction factor) exceeds some tolerance threshold that you define. I've seen 1.2 used,
#        but I personally favor the stricter threshold of 1.1. Up to you.

# Create empty list that will eventually store our models (one entry per chain)
models = []

# Regression formulas, shared by every chain. Each one models a parameter (v, a) as a function of the
# categorical predictor rev_cat, using 'Alone' as the Treatment (reference) level.
# This is deliberately a *list*, not a set: sets of strings iterate in an order that can change
# from one Python session to the next, which would make the order the formulas are registered in
# non-reproducible across kernel restarts.
regression_formulas = [
    # Change this!
    "v ~ C(rev_cat, Treatment('Alone'))",
    "a ~ C(rev_cat, Treatment('Alone'))",
]

# Loop over 5 times to get 5 chains
for i in range(5):
    # Define regression model
    m = hddm.HDDMRegressor(data,
                           regression_formulas,
                           group_only_regressors=True,
                           p_outlier=.05,
                           include={'z'})
    m.find_starting_values()

    # Start sampling from the defined regression model
    #      For the sake of demonstration, we only draw 10 samples (not nearly enough)
    #      It's also conventional to discard/"burn" the first X samples, as they're typically not good estimates
    #      (burn=0 here, so nothing is discarded in this demo)
    #      Instead of storing traces in RAM, we're going to save them to our hard drive
    m.sample(10, burn=0, dbname='./Models/'+modelName+'_%s.db'%i, db='pickle')
    m.savePatch('./Models/'+modelName+'_%s'%i)

    # Once you're finished running a chain, add that chain to your list of models
    models.append(m)

In [None]:
##### EXAMINE MODEL OUTPUT

# Pool all chains into a single model and pull out its traces
# (the pooled traces are only valid to look at if the chains converged!)
m_comb = concat_models(models)
m_comb_export = m_comb.get_traces()

# Write the pooled traces to the Results folder as a csv
traces_path = './Results/'+modelName+'_traces.csv'
m_comb_export.to_csv(traces_path)

# Report DIC (model fit statistic like AIC/BIC, useful for model selection)
print("DIC: %f" % (m_comb.dic,))