In [2]:
from a2c.model import Model
from a2c.runner import Runner
import tensorflow as tf
from bandits import *
import numpy as np
from collections import defaultdict
import scipy.signal
from scipy.stats import spearmanr

### Choosing the model

In [4]:
# Hyper-parameters forwarded to Model(...) as keyword arguments.
# 'path' points at this agent's experiment directory (presumably where
# load_model() looks for the saved weights -- TODO confirm against a2c.model).
model_config = {'scope' : 'agent_test', 'lstm_units' : 12, 
                'gamma' : 0.5, 'ent_coef' : 0.05, 'vf_coef' : 0.5,
                'max_grad_norm' : 50,  
                'lr' : 5e-4, 'lr_half_period' : 1500000, 'anneal_lr' : False, 
                'path': './experiments/12unitsmodel/'}

# Sizes passed positionally to Model(nactions, nobs, ...).
nactions = 2
nobs = 4

# Clear the default TensorFlow graph before opening a new session (TF1-style
# API), so that re-running this cell starts from a clean graph.
tf.reset_default_graph()
sess = tf.Session()


# Bandit environments handed to runner.run_training(...) in the training cell;
# each one receives the same P_intervals = [(0.01, 0.99)].
train_bandits = [second_order_bandit(P_intervals = [(0.01, 0.99)]),
                 first_order_bandit(P_intervals = [(0.01, 0.99)]),
                 zero_order_bandit(P_intervals = [(0.01, 0.99)])]

# Build the agent on the session created above, then restore its saved state
# (load_model() prints a summary of what was loaded).
model = Model(nactions, nobs, sess = sess, **model_config)
model.load_model()

# The runner is the object later cells call run_training / run_evaluation on.
runner = Runner(model)

Successfully loaded model "agent_test":
  "agent_test" was trained for 327000 epochs, using 26160000 timesteps
  "agent_test" has following parameters: lr = 0.000500


In [3]:
# Optional: continue training the agent on train_bandits. Skip this cell when
# the saved model has already been loaded.
# NOTE(review): this call is long-running -- the recorded run logged roughly
# 2.3 ep/second and was stopped by hand (KeyboardInterrupt) after about
# 55,500 episodes.
runner.run_training(1000000, train_bandits)


Savinig model after 500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 1000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 1500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 2000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 2500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 3000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 3500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 4000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 4500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 5000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 5500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 6000 episodes of training
Average env speed is 2.3 ep/second

Savinig model af


Savinig model after 50000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 50500 episodes of training
Average env speed is 2.2 ep/second

Savinig model after 51000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 51500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 52000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 52500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 53000 episodes of training
Average env speed is 2.2 ep/second

Savinig model after 53500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 54000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 54500 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 55000 episodes of training
Average env speed is 2.3 ep/second

Savinig model after 55500 episodes of training
Average env speed is 2.3 ep/second

Sav

KeyboardInterrupt: 

### Human data

In [5]:
# Load each subject's behavioural record and build one bandit per
# (subject, condition) pair, so the model can be evaluated on the human data.

path_to_data = './data/' 

# IDs of the subjects whose recordings are used; stored as strings because
# they are only ever interpolated into file names and dictionary keys.
good_subjects = [str(s) for s in
                 [3,4,5,6,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,28,29,30,31,
                  32,33,34,35,36,37,38,39,40,41]]

# Per-run columns of the data files, keyed by 'Subject<id>Cond<n>'.
actions_sub = defaultdict(dict)
obses_sub = defaultdict(dict)
rewards_sub = defaultdict(dict)

# One bandit per run, in condition-major order (all subjects for condition 1,
# then condition 2, ...).
test_bandits = []

for cond in ['1', '2', '3', '4']:
    for sub in good_subjects:
        key = 'Subject%sCond%s' % (sub, cond)
        path = path_to_data + 'Subject%s_Cond%s_behavior.txt' % (sub, cond)

        data = np.loadtxt(path, dtype = 'int')
        # Shift the first two columns down by one
        # (presumably 1-based codes -> 0-based indices; TODO confirm).
        data[:, :2] -= 1

        # Column 0 -> actions, column 1 -> observations, column 2 -> rewards.
        actions_sub[key] = data[:, 0]
        obses_sub[key] = data[:, 1]
        rewards_sub[key] = data[:, 2]

        # The bandit is built from the same column that is stored in obses_sub
        # and is named after the run so it can be matched to that run later.
        bandit = determenistic_bandit(data[:, 1])
        bandit.name = key
        test_bandits.append(bandit)

In [53]:
# Run the model on the bandits rebuilt from the human data (test_bandits).
# `stats` is indexed by per-run key in the analysis cells, which read its
# 'lstm_outputs' and 'lstm_states' entries.
# NOTE(review): the first argument (1) is presumably the number of evaluation
# passes -- confirm against Runner.run_evaluation.
rewards, stats = runner.run_evaluation(1, test_bandits)
# Keys of `stats`; the analysis cells use each key to index both `stats` and
# obses_sub.
subject_keys = stats.keys()

### Analyzing each unit in the LSTM

In [61]:
# Spearman rank correlation between each LSTM unit's output and the subject's
# observation sequence (obses_sub) at lags 0, 1 and 2, i.e. activity at step t
# against the sequence value at t, t-1 and t-2. Each correlation is averaged
# over all subject/condition runs. Result: one row per unit, one column per lag.
# NOTE(review): the original note called the correlated signal "rewards", but
# the code reads obses_sub, not rewards_sub -- confirm which one is intended.

# for outputs

# Convert every run's recording once (rather than once per unit) and keep the
# 2-D slice the loop reads: axis 0 is the step axis that gets lag-shifted,
# axis 1 is the unit index.
lstm_outputs_by_key = {k: np.array(stats[k]['lstm_outputs'])[0, :, 0, 0, :]
                       for k in subject_keys}

correlations  = []
for comp in range(12):  # 12 matches 'lstm_units' in model_config
    corr_list = []
    for k in subject_keys:
        outputs = lstm_outputs_by_key[k][:, comp]
        # Min-max scale the unit's trace to [0, 1] within this run.
        outputs_sc = (outputs - np.min(outputs)) / (np.max(outputs) - np.min(outputs))
        obs = obses_sub[k]

        # Pair activity[lag:] with obs[:len - lag]; writing the end index as
        # len(obs) - lag keeps lag 0 on the full sequence (obs[:-0] would be empty).
        corr_list.append(tuple(
            spearmanr(outputs_sc[lag:], obs[:len(obs) - lag])[0]
            for lag in (0, 1, 2)
        ))

    # Average over runs, separately for each lag.
    correlations.append(np.mean(corr_list, 0))

correlations = np.round(correlations, 2)
correlations

array([[-0.01,  0.01,  0.06],
       [-0.02, -0.09,  0.04],
       [ 0.08, -0.75, -0.26],
       [-0.05, -0.25, -0.2 ],
       [ 0.29, -0.02,  0.27],
       [ 0.12, -0.01,  0.42],
       [-0.02,  0.01,  0.02],
       [-0.15, -0.52,  0.14],
       [-0.25,  0.28, -0.57],
       [ 0.05, -0.21,  0.09],
       [-0.02,  0.2 , -0.25],
       [-0.06, -0.13, -0.02]])

In [62]:
# Spearman rank correlation between each LSTM unit's *state* (the
# 'lstm_states' recording) and the subject's observation sequence (obses_sub)
# at lags 0, 1 and 2, i.e. state at step t against the sequence value at
# t, t-1 and t-2. Each correlation is averaged over all subject/condition
# runs. Result: one row per unit, one column per lag.
# NOTE(review): the original note called the correlated signal "rewards", but
# the code reads obses_sub, not rewards_sub -- confirm which one is intended.

# Convert every run's recording once (rather than once per unit) and keep the
# 2-D slice the loop reads: axis 0 is the step axis that gets lag-shifted,
# axis 1 is the unit index.
lstm_states_by_key = {k: np.array(stats[k]['lstm_states'])[0, :, 0, 0, :]
                      for k in subject_keys}

correlations_states  = []
for comp in range(12):  # 12 matches 'lstm_units' in model_config
    corr_list = []
    for k in subject_keys:
        states = lstm_states_by_key[k][:, comp]
        # Min-max scale the unit's trace to [0, 1] within this run.
        states_sc = (states - np.min(states)) / (np.max(states) - np.min(states))
        obs = obses_sub[k]

        # Pair state[lag:] with obs[:len - lag]; writing the end index as
        # len(obs) - lag keeps lag 0 on the full sequence (obs[:-0] would be empty).
        corr_list.append(tuple(
            spearmanr(states_sc[lag:], obs[:len(obs) - lag])[0]
            for lag in (0, 1, 2)
        ))

    # Average over runs, separately for each lag.
    correlations_states.append(np.mean(corr_list, 0))

correlations_states = np.round(correlations_states, 2)
correlations_states

array([[-0.01,  0.02,  0.06],
       [-0.01, -0.08,  0.06],
       [ 0.08, -0.75, -0.28],
       [-0.05, -0.26, -0.21],
       [ 0.12,  0.49,  0.06],
       [ 0.08, -0.19,  0.47],
       [-0.  , -0.  ,  0.  ],
       [-0.13, -0.27, -0.31],
       [-0.2 ,  0.14, -0.58],
       [ 0.05, -0.07,  0.14],
       [-0.02,  0.03,  0.07],
       [ 0.  ,  0.  ,  0.01]])

In [63]:
# Agreement scores for LSTM outputs. Instead of a rank correlation, each
# unit's min-max scaled output is binarized at several thresholds
# (1 where scaled value <= threshold, else 0) and compared element-wise with
# the subject's observation sequence (obses_sub) at t, t-1 and t-2.
# NOTE(review): the original note called the compared signal "reward", but the
# code reads obses_sub, not rewards_sub -- confirm which one is intended.
scores = defaultdict(dict)

# Convert every run's recording once and keep the 2-D slice the loop reads
# (axis 0: the step axis that gets lag-shifted, axis 1: unit index).
lstm_outputs_by_key = {k: np.array(stats[k]['lstm_outputs'])[0, :, 0, 0, :]
                       for k in subject_keys}

for comp in range(12):  # 12 matches 'lstm_units' in model_config
    # The scaled trace does not depend on the threshold, so compute it once
    # per (unit, run) instead of once per threshold.
    scaled_by_key = {}
    for k in subject_keys:
        outputs = lstm_outputs_by_key[k][:, comp]
        scaled_by_key[k] = (outputs - np.min(outputs)) / (np.max(outputs) - np.min(outputs))

    for th in [0.2, 0.3, 0.4, 0.5, 0.6]:
        scores_list = []
        for k in subject_keys:
            obs = obses_sub[k]
            outputs_bin = (scaled_by_key[k] <= th).astype('int')

            lag_scores = []
            for lag in (0, 1, 2):
                # Fraction of steps where the binary trace equals the sequence
                # value `lag` steps earlier (len(obs) - lag keeps lag 0 whole).
                agreement = np.mean(outputs_bin[lag:] == obs[:len(obs) - lag])
                # Fold around 0.5 so a consistently inverted unit scores as
                # high as a matching one.
                lag_scores.append(max(agreement, 1 - agreement))

            scores_list.append(tuple(lag_scores))
        # Mean over runs per lag, rounded to two decimals.
        scores[comp][th] = np.round(np.mean(scores_list, 0), 2)

# Report, per unit, the threshold whose best-lag score is highest
# (np.argmax picks the first threshold on ties).
for comp in scores:
    keys = list(scores[comp].keys())
    ths_scores = [np.max(scores[comp][th]) for th in keys]
    best_th = keys[np.argmax(ths_scores)]
    print('Comp %d, best th %.1f, scores' % (comp, best_th), scores[comp][best_th] )

Comp 0, best th 0.4, scores [0.53 0.55 0.53]
Comp 1, best th 0.3, scores [0.54 0.55 0.54]
Comp 2, best th 0.3, scores [0.66 0.88 0.71]
Comp 3, best th 0.5, scores [0.56 0.6  0.58]
Comp 4, best th 0.6, scores [0.64 0.71 0.73]
Comp 5, best th 0.4, scores [0.61 0.62 0.72]
Comp 6, best th 0.2, scores [0.53 0.53 0.54]
Comp 7, best th 0.3, scores [0.63 0.75 0.7 ]
Comp 8, best th 0.5, scores [0.64 0.72 0.77]
Comp 9, best th 0.2, scores [0.55 0.57 0.56]
Comp 10, best th 0.2, scores [0.56 0.55 0.6 ]
Comp 11, best th 0.5, scores [0.53 0.54 0.53]


In [64]:
# Agreement scores for LSTM states. Instead of a rank correlation, each
# unit's min-max scaled state (the 'lstm_states' recording) is binarized at
# several thresholds (1 where scaled value <= threshold, else 0) and compared
# element-wise with the subject's observation sequence (obses_sub) at
# t, t-1 and t-2.
# NOTE(review): the original note called the compared signal "reward", but the
# code reads obses_sub, not rewards_sub -- confirm which one is intended.
scores = defaultdict(dict)

# Convert every run's recording once and keep the 2-D slice the loop reads
# (axis 0: the step axis that gets lag-shifted, axis 1: unit index).
lstm_states_by_key = {k: np.array(stats[k]['lstm_states'])[0, :, 0, 0, :]
                      for k in subject_keys}

for comp in range(12):  # 12 matches 'lstm_units' in model_config
    # The scaled trace does not depend on the threshold, so compute it once
    # per (unit, run) instead of once per threshold.
    scaled_by_key = {}
    for k in subject_keys:
        states = lstm_states_by_key[k][:, comp]
        scaled_by_key[k] = (states - np.min(states)) / (np.max(states) - np.min(states))

    for th in [0.2, 0.3, 0.4, 0.5, 0.6]:
        scores_list = []
        for k in subject_keys:
            obs = obses_sub[k]
            states_bin = (scaled_by_key[k] <= th).astype('int')

            lag_scores = []
            for lag in (0, 1, 2):
                # Fraction of steps where the binary trace equals the sequence
                # value `lag` steps earlier (len(obs) - lag keeps lag 0 whole).
                agreement = np.mean(states_bin[lag:] == obs[:len(obs) - lag])
                # Fold around 0.5 so a consistently inverted unit scores as
                # high as a matching one.
                lag_scores.append(max(agreement, 1 - agreement))

            scores_list.append(tuple(lag_scores))
        # Mean over runs per lag, rounded to two decimals.
        scores[comp][th] = np.round(np.mean(scores_list, 0), 2)

# Report, per unit, the threshold whose best-lag score is highest
# (np.argmax picks the first threshold on ties).
for comp in scores:
    keys = list(scores[comp].keys())
    ths_scores = [np.max(scores[comp][th]) for th in keys]
    best_th = keys[np.argmax(ths_scores)]
    print('Comp %d, best th %.1f, scores' % (comp, best_th), scores[comp][best_th] )

Comp 0, best th 0.5, scores [0.54 0.56 0.55]
Comp 1, best th 0.3, scores [0.54 0.55 0.54]
Comp 2, best th 0.4, scores [0.66 0.82 0.73]
Comp 3, best th 0.5, scores [0.56 0.6  0.58]
Comp 4, best th 0.6, scores [0.6  0.73 0.7 ]
Comp 5, best th 0.5, scores [0.61 0.66 0.73]
Comp 6, best th 0.4, scores [0.56 0.56 0.56]
Comp 7, best th 0.5, scores [0.59 0.63 0.64]
Comp 8, best th 0.5, scores [0.64 0.71 0.8 ]
Comp 9, best th 0.4, scores [0.55 0.58 0.57]
Comp 10, best th 0.3, scores [0.54 0.54 0.54]
Comp 11, best th 0.5, scores [0.56 0.56 0.56]
