In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tqdm.notebook import tqdm
import numpy as np
from matplotlib import pyplot as plt
import modelutils_v2 as modelutils
import copy,random
from scipy import stats as st

The state fed into the networks is a vector of 11650 elements, and the action is a vector of `npeople` elements (one entry per person). The critic network maps a concatenated state and action vector to a single scalar value. The actor network maps a state to an action. Each element of the actor's output is used as the parameter theta of a Bernoulli trial that decides whether the corresponding individual will be tested.

In [3]:
# Number of individuals; also the length of the action vector (one entry per person)
npeople = 100
# Width used for the actor's first Dense layer (the critic's first layer uses inputlen+npeople)
inputlen = 11650+npeople #we'll need to change the 11650 if the underlying model changes
# Number of locations; sets the number of rows of the home-location matrix
nlocations=10

In [None]:
class Critic(Model):
  """Critic network: maps a concatenated (state, action) vector to one scalar.

  A stack of fully connected ReLU layers that narrows down to a single output
  unit. `call` expects a batched 2-D input (one row per state/action pair),
  since Keras Dense layers require at least two dimensions.
  """

  def __init__(self):
    # Zero-argument super() replaces `super(MyModel, self)`, which named a
    # class that does not exist and raised NameError on instantiation.
    super().__init__()
    # NOTE: the first argument of Dense is the layer's OUTPUT width; the input
    # width is inferred from the data on the first call.
    self.d1 = Dense(inputlen+npeople, activation='relu')
    self.d2 = Dense(12000,activation='relu')
    self.d3 = Dense(5000,activation='relu')
    self.d4 = Dense(1000,activation='relu')
    self.d5 = Dense(500,activation='relu')
    self.d6 = Dense(100,activation='relu')
    # NOTE(review): a ReLU output cannot produce negative value estimates; if
    # rewards can be negative this should probably be linear -- confirm.
    self.dout = Dense(1,activation='relu')

  def call(self, x):
    """Run `x` through every hidden layer in order and return the scalar head."""
    for layer in (self.d1, self.d2, self.d3, self.d4, self.d5, self.d6):
      x = layer(x)
    return self.dout(x)

class Actor(Model):
  """Actor network: maps a state vector to an action vector of length `npeople`.

  A stack of fully connected ReLU layers followed by an `npeople`-wide output
  layer. `call` expects a batched 2-D input (one row per state), since Keras
  Dense layers require at least two dimensions.
  """

  def __init__(self):
    # Zero-argument super() replaces `super(MyModel, self)`, which named a
    # class that does not exist and raised NameError on instantiation.
    super().__init__()
    # NOTE: the first argument of Dense is the layer's OUTPUT width; the input
    # width is inferred from the data on the first call.
    self.d1 = Dense(inputlen, activation='relu')
    # (a duplicated `self.d2 = ...` line that built and immediately discarded
    # an extra 12000-unit layer has been removed)
    self.d2 = Dense(12000,activation='relu')
    self.d3 = Dense(5000,activation='relu')
    self.d4 = Dense(1000,activation='relu')
    self.d5 = Dense(500,activation='relu')
    self.d6 = Dense(250,activation='relu')
    # NOTE(review): softmax forces the npeople outputs to sum to 1. If each
    # output is meant to be an independent per-person Bernoulli probability,
    # 'sigmoid' is likely the intended activation -- confirm before changing.
    self.dout = Dense(npeople,activation='softmax')

  def call(self, x):
    """Run `x` through every hidden layer in order and return the action head."""
    for layer in (self.d1, self.d2, self.d3, self.d4, self.d5, self.d6):
      x = layer(x)
    return self.dout(x)

# Instantiate two copies of each network: the "raw" copy is the one called with
# training=True in train_step, the "target" copy is used when building targets.
# Construction order (target first, then raw) is kept as before.
critic_target, critic_raw = Critic(), Critic()
actor_target, actor_raw = Actor(), Actor()

In [None]:

# Adam optimizer with Keras' default settings (no learning rate is passed)
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Running-mean metric; its result() is what the main training loop prints
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [7]:
# Number of iterations of the main training loop
episodes = 5
# Number of transitions placed in the replay buffer when it is first filled
bufferlen = 10
# Number of transitions sampled from the replay buffer per training step
batchsize = 8
# Multiplier on the target critic's value in y_i = r_i + gamma*Q'(s_{i+1}, a_{i+1})
# NOTE(review): 1e-3 is unusually small for a discount factor -- confirm
gamma=1e-3
# Not referenced in the visible cells; presumably the target-network soft-update rate -- TODO confirm
tau = 1e-1

def _random_binary_action(n, test_prob=0.2):
    """Return a length-`n` float array of 0./1. flags, each 1 with probability `test_prob`.

    Replaces the three-step 999-sentinel thresholding, which also left any draw
    exactly equal to the threshold unconverted.
    """
    return (st.uniform.rvs(size=n) < test_prob).astype(float)


# One-hot home assignment: each person's column gets a single 1 in a uniformly
# chosen location row.
homelocs = np.zeros([nlocations,npeople])
homelocs[np.random.choice(nlocations, size=npeople), np.arange(npeople)] = 1.

init_state = modelutils.state(homelocs)

# First transition. The action must be drawn here: previously `rand_action` was
# thresholded before it was ever assigned, which fails on a fresh kernel.
rand_action = _random_binary_action(npeople)
next_state = init_state.update_state(rand_action)

# Each buffer entry is a (state, action, reward, next_state) tuple
replay_buffer = [(init_state,rand_action,next_state.reward(),next_state)]

# Fill the rest of the buffer by repeatedly stepping from the latest next_state
# with a fresh random action.
for step_idx in range(bufferlen-1):
    print(step_idx,'/',bufferlen-1)
    init_state = replay_buffer[-1][-1]

    rand_action = _random_binary_action(npeople)
    next_state = init_state.update_state(rand_action)

    replay_buffer.append((init_state,rand_action,next_state.reward(),next_state))

0 / 9
1 / 9
2 / 9
3 / 9
4 / 9
5 / 9
6 / 9
7 / 9
8 / 9


In [None]:
def train_step(replay_buffer):
  """Advance the environment one step with the actor, then update the critic once.

  Not decorated with @tf.function: the body mutates a Python list, steps a
  Python environment object and samples with SciPy/NumPy, none of which can be
  traced into a TensorFlow graph, so it has to run eagerly.

  Args:
    replay_buffer: list of (state, action, reward, next_state) tuples. Mutated
      in place: the new transition is appended and the oldest one is dropped,
      so the length stays constant.

  Returns:
    The scalar critic loss (mean squared error) for the sampled minibatch.

  TODO: the actor update and the target-network updates are not implemented
  yet; only `critic_raw` is trained here.
  """

  def _flat(state):
    # assumes state.flatten() returns a 1-D numeric vector -- TODO confirm
    return np.asarray(state.flatten(), dtype=np.float32)

  # --- 1. create a new sars tuple based on the action chosen by the actor ---
  init_state = replay_buffer[-1][-1]
  # Dense layers need a leading batch axis, so feed a (1, state_len) array
  actor_network_input = _flat(init_state)[np.newaxis, :]

  # the actor outputs probabilities; convert them to a binary action with one
  # Bernoulli trial per element
  action_probs = actor_raw(actor_network_input, training=True).numpy()[0]
  binary_action = (st.uniform.rvs(size=action_probs.shape) < action_probs).astype(float)

  next_state = init_state.update_state(binary_action)
  # store the action that was actually taken (previously the unrelated global
  # `rand_action` was stored)
  replay_buffer.append((init_state,binary_action,next_state.reward(),next_state))
  # drop the OLDEST transition so the buffer does not grow endlessly; a bare
  # pop() would remove the transition that was just appended
  replay_buffer.pop(0)

  # --- 2. grab a minibatch from the replay buffer for training ---
  training_minibatch = random.sample(replay_buffer,batchsize)
  prev_statelist, prev_actionlist, rewardlist, statelist = zip(*training_minibatch)

  prev_state_batch = np.stack([_flat(s) for s in prev_statelist])
  next_state_batch = np.stack([_flat(s) for s in statelist])
  # stored actions are already binary, so they are used as-is (no resampling)
  prev_action_batch = np.stack([np.asarray(a, dtype=np.float32) for a in prev_actionlist])
  # rewards are assumed to be scalars; shaped (batch, 1) to match the critic output
  reward_batch = np.asarray(rewardlist, dtype=np.float32).reshape(-1, 1)

  # --- 3. targets: yi = ri + gamma*q'(s_{i+1},u'(s_{i+1})) ---
  # Built outside the gradient tape and converted to NumPy so they are constants.
  target_probs = actor_target(next_state_batch).numpy()
  target_actions = (st.uniform.rvs(size=target_probs.shape) < target_probs).astype(np.float32)
  target_values = critic_target(np.concatenate([next_state_batch, target_actions], axis=1)).numpy()
  yi = reward_batch + gamma * target_values

  # --- 4. critic loss and update ---
  critic_input = np.concatenate([prev_state_batch, prev_action_batch], axis=1)
  with tf.GradientTape() as tape:
    # keep the prediction as a tensor (no np.array) so gradients can flow, and
    # do not scale it by gamma: gamma belongs only in the target
    critic_raw_output = critic_raw(critic_input, training=True)
    # the loss object must be instantiated first and then called on (y_true, y_pred)
    critic_loss = tf.keras.losses.MeanSquaredError()(yi, critic_raw_output)

  gradients = tape.gradient(critic_loss, critic_raw.trainable_variables)
  optimizer.apply_gradients(zip(gradients, critic_raw.trainable_variables))

  train_loss(critic_loss)
  return critic_loss

In [None]:
#main training loop
for episode_idx in tqdm(range(episodes)):

  # train_step takes the replay buffer as its only argument. The previous loop
  # over `train_ds` / (images, labels) referred to names that do not exist in
  # this notebook.
  # TODO: decide how many training steps make up one episode; one is run here.
  train_step(replay_buffer)

  # The template has exactly as many placeholders as values supplied (the old
  # five-placeholder template raised IndexError). train_loss is never reset, so
  # the printed value is the running mean over all steps so far.
  template = 'Episode {}, Loss: {}'
  print(template.format(episode_idx + 1,
                        train_loss.result()))