# Reinforce

$J(\theta) 
= \sum_{s \in \mathcal{S}} d^\pi(s) V^\pi(s) 
= \sum_{s \in \mathcal{S}} d^\pi(s) \sum_{a \in \mathcal{A}} \pi_\theta(a \vert s) Q^\pi(s, a)$

### Loss
https://aleksispi.github.io/assets/pg_autodiff.pdf

In [1]:
import numpy as np

In [2]:
import tensorflow as tf



In [3]:
import tensorflow.keras.backend as K

def policy_gradient_loss(Returns):
    def modified_crossentropy(action, action_probs):
        action_masks = action#tf.one_hot(action, n_acts)
        log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(action_probs), axis=1)
        loss = -tf.reduce_mean(Returns * log_probs)
        return loss
    return modified_crossentropy

In [4]:
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class PGAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.states = []
        self.actions = []
        self.rewards = []
        self.model, self.model_trainable = self._build_model()

    def _build_model(self):
        input_state = Input(name='input_state', shape=(self.state_size,), dtype='float32')
        input_discount_reward = Input(name='input_discount_reward', shape=(1,), dtype='float32')
        x = Dense(32, activation='tanh')(input_state)
        x = Dense(self.action_size, activation='softmax')(x)
        model = Model(inputs=input_state, outputs=x)
        return model, 0

    def memorize(self, state, action, reward):
        y = np.zeros([self.action_size])
        y[action] = 1
        self.actions.append(np.array(y).astype('float32'))
        self.states.append(state)
        self.rewards.append(reward)

    def act(self, state):
        state = state.reshape([1, state.shape[0]])
        prob = self.model.predict(state, batch_size=1).flatten()
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        
        return action, prob

    def discount_rewards(self, rewards):
        sum_re = sum(rewards)
        return np.array([sum_re] * len(rewards))

    def train(self):
        rewards = self.discount_rewards(np.vstack(self.rewards))
        X = np.squeeze(np.vstack(self.states))
        Y = np.squeeze(np.vstack(self.actions))
        l = policy_gradient_loss(self.rewards)
        self.model.compile(loss=l, optimizer=Adam(learning_rate=1e-2))
        self.model.train_on_batch(X,Y)
        self.states, self.actions, self.rewards = [], [], []

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [5]:

if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    #env = gym.make('MountainCar-v0')
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)
    batch = 5000
    epoch = 0
    while True: #epochs
        agent.states, agent.actions, agent.rewards = [], [], []
        states = []
        actions = []
        rewards = []
        state = env.reset()       # first obs comes from starting distribution
        
        while True:
            states.append(state)
            action, prob = agent.act(state)
            y = np.zeros([agent.action_size])
            y[action] = 1
            actions.append(np.array(y).astype('float32'))
            state, reward, done, info = env.step(action)
            rewards.append(reward)
            score += reward
            if done:
                episode += 1
                agent.states += states
                agent.actions += actions
                agent.rewards += [sum(rewards)] * len(rewards)
                states = []
                actions = []
                rewards = []
                print('Episode: %d - Score: %f.' % (episode, score))
                score = 0
                state = env.reset()
                if len(agent.states) > batch:
                    break
        print('train', epoch)
        epoch += 1
        agent.train()

Episode: 1 - Score: 17.000000.
Episode: 2 - Score: 20.000000.
Episode: 3 - Score: 33.000000.
Episode: 4 - Score: 28.000000.
Episode: 5 - Score: 32.000000.
Episode: 6 - Score: 14.000000.
Episode: 7 - Score: 24.000000.
Episode: 8 - Score: 45.000000.
Episode: 9 - Score: 19.000000.
Episode: 10 - Score: 39.000000.
Episode: 11 - Score: 24.000000.
Episode: 12 - Score: 15.000000.
Episode: 13 - Score: 22.000000.
Episode: 14 - Score: 18.000000.
Episode: 15 - Score: 26.000000.
Episode: 16 - Score: 16.000000.
Episode: 17 - Score: 14.000000.
Episode: 18 - Score: 33.000000.
Episode: 19 - Score: 26.000000.
Episode: 20 - Score: 21.000000.
Episode: 21 - Score: 17.000000.
Episode: 22 - Score: 20.000000.
Episode: 23 - Score: 18.000000.
Episode: 24 - Score: 56.000000.
Episode: 25 - Score: 18.000000.
Episode: 26 - Score: 14.000000.
Episode: 27 - Score: 11.000000.
Episode: 28 - Score: 17.000000.
Episode: 29 - Score: 11.000000.
Episode: 30 - Score: 46.000000.
Episode: 31 - Score: 19.000000.
Episode: 32 - Sco

Episode: 253 - Score: 12.000000.
Episode: 254 - Score: 41.000000.
Episode: 255 - Score: 61.000000.
Episode: 256 - Score: 13.000000.
Episode: 257 - Score: 56.000000.
Episode: 258 - Score: 55.000000.
Episode: 259 - Score: 20.000000.
Episode: 260 - Score: 16.000000.
Episode: 261 - Score: 39.000000.
Episode: 262 - Score: 28.000000.
Episode: 263 - Score: 44.000000.
Episode: 264 - Score: 13.000000.
Episode: 265 - Score: 25.000000.
Episode: 266 - Score: 22.000000.
Episode: 267 - Score: 30.000000.
Episode: 268 - Score: 13.000000.
Episode: 269 - Score: 56.000000.
Episode: 270 - Score: 17.000000.
Episode: 271 - Score: 23.000000.
Episode: 272 - Score: 14.000000.
Episode: 273 - Score: 28.000000.
Episode: 274 - Score: 16.000000.
Episode: 275 - Score: 10.000000.
Episode: 276 - Score: 40.000000.
Episode: 277 - Score: 80.000000.
Episode: 278 - Score: 42.000000.
Episode: 279 - Score: 22.000000.
Episode: 280 - Score: 15.000000.
Episode: 281 - Score: 17.000000.
Episode: 282 - Score: 32.000000.
Episode: 2

Episode: 501 - Score: 30.000000.
Episode: 502 - Score: 19.000000.
Episode: 503 - Score: 43.000000.
Episode: 504 - Score: 58.000000.
Episode: 505 - Score: 16.000000.
Episode: 506 - Score: 19.000000.
Episode: 507 - Score: 20.000000.
Episode: 508 - Score: 21.000000.
Episode: 509 - Score: 37.000000.
Episode: 510 - Score: 13.000000.
Episode: 511 - Score: 94.000000.
Episode: 512 - Score: 58.000000.
Episode: 513 - Score: 15.000000.
Episode: 514 - Score: 20.000000.
Episode: 515 - Score: 78.000000.
Episode: 516 - Score: 18.000000.
Episode: 517 - Score: 20.000000.
Episode: 518 - Score: 45.000000.
Episode: 519 - Score: 26.000000.
Episode: 520 - Score: 20.000000.
Episode: 521 - Score: 35.000000.
Episode: 522 - Score: 16.000000.
Episode: 523 - Score: 23.000000.
Episode: 524 - Score: 39.000000.
Episode: 525 - Score: 18.000000.
Episode: 526 - Score: 48.000000.
Episode: 527 - Score: 62.000000.
Episode: 528 - Score: 24.000000.
Episode: 529 - Score: 14.000000.
Episode: 530 - Score: 31.000000.
Episode: 5

Episode: 749 - Score: 36.000000.
Episode: 750 - Score: 26.000000.
Episode: 751 - Score: 52.000000.
Episode: 752 - Score: 45.000000.
Episode: 753 - Score: 34.000000.
Episode: 754 - Score: 31.000000.
Episode: 755 - Score: 61.000000.
Episode: 756 - Score: 85.000000.
Episode: 757 - Score: 53.000000.
Episode: 758 - Score: 29.000000.
Episode: 759 - Score: 70.000000.
Episode: 760 - Score: 36.000000.
Episode: 761 - Score: 38.000000.
Episode: 762 - Score: 123.000000.
Episode: 763 - Score: 39.000000.
Episode: 764 - Score: 76.000000.
Episode: 765 - Score: 12.000000.
Episode: 766 - Score: 68.000000.
Episode: 767 - Score: 76.000000.
Episode: 768 - Score: 64.000000.
Episode: 769 - Score: 26.000000.
Episode: 770 - Score: 58.000000.
Episode: 771 - Score: 69.000000.
Episode: 772 - Score: 59.000000.
Episode: 773 - Score: 30.000000.
Episode: 774 - Score: 94.000000.
Episode: 775 - Score: 18.000000.
Episode: 776 - Score: 82.000000.
Episode: 777 - Score: 83.000000.
Episode: 778 - Score: 41.000000.
Episode: 

Episode: 996 - Score: 52.000000.
Episode: 997 - Score: 68.000000.
Episode: 998 - Score: 22.000000.
Episode: 999 - Score: 50.000000.
Episode: 1000 - Score: 174.000000.
Episode: 1001 - Score: 62.000000.
Episode: 1002 - Score: 79.000000.
Episode: 1003 - Score: 42.000000.
Episode: 1004 - Score: 24.000000.
Episode: 1005 - Score: 159.000000.
train 7
Episode: 1006 - Score: 39.000000.
Episode: 1007 - Score: 114.000000.
Episode: 1008 - Score: 126.000000.
Episode: 1009 - Score: 87.000000.
Episode: 1010 - Score: 93.000000.
Episode: 1011 - Score: 153.000000.
Episode: 1012 - Score: 113.000000.
Episode: 1013 - Score: 111.000000.
Episode: 1014 - Score: 69.000000.
Episode: 1015 - Score: 200.000000.
Episode: 1016 - Score: 200.000000.
Episode: 1017 - Score: 49.000000.
Episode: 1018 - Score: 60.000000.
Episode: 1019 - Score: 57.000000.
Episode: 1020 - Score: 83.000000.
Episode: 1021 - Score: 88.000000.
Episode: 1022 - Score: 67.000000.
Episode: 1023 - Score: 118.000000.
Episode: 1024 - Score: 41.000000.


KeyboardInterrupt: 

In [None]:
env = gym.make('MountainCar-v0')
env.reset()
for _ in range(3):
    action = env.action_space.sample()
    print('action: ')
    print(action)
    state, reward, done, info = env.step(action)
    print('state: ')
    print(state)
    print('reward: ')
    print(reward)
    print()