$J(\theta) 
= \sum_{s \in \mathcal{S}} d^\pi(s) V^\pi(s) 
= \sum_{s \in \mathcal{S}} d^\pi(s) \sum_{a \in \mathcal{A}} \pi_\theta(a \vert s) Q^\pi(s, a)$

In [1]:
import numpy as np

In [2]:
import tensorflow.keras.backend as K

def policy_gradient_loss(Returns):
    def modified_crossentropy(action, action_probs):
        cost = K.categorical_crossentropy(action,action_probs,from_logits=False)* Returns
        return -K.mean(cost)
    return modified_crossentropy



In [11]:
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop


class PGAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.memory = [{'states':[], 'gradients':[], 'rewards':[]}]
        self.states = []
        self.gradients = []
        self.rewards = []
        #self.probs = []
        self.model, self.model_trainable = self._build_model()
        #self.model_trainable.summary()

    def _build_model(self):
        input_state = Input(name='input_state', shape=(self.state_size,), dtype='float32')
        input_discount_reward = Input(name='input_discount_reward', shape=(1,), dtype='float32')
        x = Dense(32, activation='relu')(input_state)
        x = Dense(32, activation='relu')(x)
        x = Dense(32, activation='relu')(x)
        x = Dense(32, activation='relu')(x)
        x = Dense(self.action_size, activation='softmax')(x)
        #x_trainable = -multiply([input_discount_reward, x])
        model = Model(inputs=input_state, outputs=x)
        #model_trainable = Model(inputs=[input_state, input_discount_reward], outputs=x_trainable)
        #model_trainable.compile(loss='categorical_crossentropy')
        return model, 0#model_trainable

    def memorize(self, state, action, prob, reward):
        y = np.zeros([self.action_size])
        y[action] = 1
        self.memory[-1]['gradients'].append(np.array(y).astype('float32'))
        self.memory[-1]['states'].append(state)
        self.memory[-1]['rewards'].append(reward)

    def act(self, state):
        state = state.reshape([1, state.shape[0]])
        prob = self.model.predict(state, batch_size=1).flatten()
        #self.probs.append(prob)
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        return action, prob

    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        discounted_rewards[-1] = rewards[-1]
        for t in range(rewards.size - 1, 0, -1):
            discounted_rewards[t - 1] = self.gamma * discounted_rewards[t] + rewards[t - 1]
        return discounted_rewards

    def train(self):
        rewards = self.discount_rewards(np.vstack(self.memory[-1]['rewards']))
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-7)
        rewards = np.squeeze(np.vstack([rewards]))
        self.memory[-1]['discount_rewards'] = reward
        for i in range(len(self.memory)):
            X = np.squeeze(np.vstack([self.memory[i]['states']]))
            Y = np.squeeze(np.vstack([self.memory[i]['gradients']]))
            l = policy_gradient_loss([self.memory[i]['discount_rewards']])
            print(X.shape)
            print(Y.shape)
            print(rewards.shape)
            #self.model.compile(loss='categorical_crossentropy', optimizer='adam',  loss_weights=rewards)
            self.model.compile(loss=l, optimizer=RMSprop(learning_rate=0.00001, rho=0.9))
            self.model.train_on_batch(X,Y)
            #self.model_trainable.fit([X,rewards], Y)
            #self.model.train_on_batch(X, Y)
        #self.states, self.probs, self.gradients, self.rewards = [], [], [], []
        if len(self.memory) > 20 :
            self.memory.pop(0)
        self.memory.append({'states':[], 'gradients':[], 'rewards':[]})

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [12]:
env = gym.make('MountainCar-v0')
#env = gym.make('LunarLander-v2')
#env = gym.make('LunarLanderContinuous-v2')
env.reset()
for _ in range(3):
    action = env.action_space.sample()
    print('action: ')
    print(action)
    state, reward, done, info = env.step(action)
    print('state: ')
    print(state)
    print('reward: ')
    print(reward)
    print()

action: 
1
state: 
[-4.98782030e-01 -1.87354939e-04]
reward: 
-1.0

action: 
2
state: 
[-0.49815534  0.00062669]
reward: 
-1.0

action: 
1
state: 
[-4.97719287e-01  4.36050829e-04]
reward: 
-1.0



In [13]:

if __name__ == "__main__":
    env = gym.make('LunarLander-v2')
    #env = gym.make('MountainCar-v0')
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)
    while True:
        action, prob = agent.act(state)
        state, reward, done, info = env.step(action)
        score += reward
        agent.memorize(state, action, prob, reward)

        if done:
            episode += 1
            agent.train()
            print('train')
            print('Episode: %d - Score: %f.' % (episode, score))
            score = 0
            state = env.reset()

(126, 8)
(126, 4)
(126,)
train
Episode: 1 - Score: -270.260663.
(126, 8)
(126, 4)
(75,)
(75, 8)
(75, 4)
(75,)
train
Episode: 2 - Score: -181.541671.
(126, 8)
(126, 4)
(89,)
(75, 8)
(75, 4)
(89,)
(89, 8)
(89, 4)
(89,)
train
Episode: 3 - Score: -126.860057.
(126, 8)
(126, 4)
(63,)
(75, 8)
(75, 4)
(63,)
(89, 8)
(89, 4)
(63,)
(63, 8)
(63, 4)
(63,)
train
Episode: 4 - Score: -43.568800.
(126, 8)
(126, 4)
(75,)
(75, 8)
(75, 4)
(75,)
(89, 8)
(89, 4)
(75,)
(63, 8)
(63, 4)
(75,)
(75, 8)
(75, 4)
(75,)
train
Episode: 5 - Score: -89.531249.
(126, 8)
(126, 4)
(83,)
(75, 8)
(75, 4)
(83,)
(89, 8)
(89, 4)
(83,)
(63, 8)
(63, 4)
(83,)
(75, 8)
(75, 4)
(83,)
(83, 8)
(83, 4)
(83,)
train
Episode: 6 - Score: -86.125232.
(126, 8)
(126, 4)
(87,)
(75, 8)
(75, 4)
(87,)
(89, 8)
(89, 4)
(87,)
(63, 8)
(63, 4)
(87,)
(75, 8)
(75, 4)
(87,)
(83, 8)
(83, 4)
(87,)
(87, 8)
(87, 4)
(87,)
train
Episode: 7 - Score: -95.351071.
(126, 8)
(126, 4)
(79,)
(75, 8)
(75, 4)
(79,)
(89, 8)
(89, 4)
(79,)
(63, 8)
(63, 4)
(79,)
(75, 8)
(7

(79, 8)
(79, 4)
(74,)
(95, 8)
(95, 4)
(74,)
(99, 8)
(99, 4)
(74,)
(125, 8)
(125, 4)
(74,)
(122, 8)
(122, 4)
(74,)
(68, 8)
(68, 4)
(74,)
(86, 8)
(86, 4)
(74,)
(62, 8)
(62, 4)
(74,)
(138, 8)
(138, 4)
(74,)
(92, 8)
(92, 4)
(74,)
(111, 8)
(111, 4)
(74,)
(86, 8)
(86, 4)
(74,)
(96, 8)
(96, 4)
(74,)
(87, 8)
(87, 4)
(74,)
(114, 8)
(114, 4)
(74,)
(71, 8)
(71, 4)
(74,)
(68, 8)
(68, 4)
(74,)
(118, 8)
(118, 4)
(74,)
(74, 8)
(74, 4)
(74,)
train
Episode: 26 - Score: -120.300833.
(87, 8)
(87, 4)
(111,)
(79, 8)
(79, 4)
(111,)
(95, 8)
(95, 4)
(111,)
(99, 8)
(99, 4)
(111,)
(125, 8)
(125, 4)
(111,)
(122, 8)
(122, 4)
(111,)
(68, 8)
(68, 4)
(111,)
(86, 8)
(86, 4)
(111,)
(62, 8)
(62, 4)
(111,)
(138, 8)
(138, 4)
(111,)
(92, 8)
(92, 4)
(111,)
(111, 8)
(111, 4)
(111,)
(86, 8)
(86, 4)
(111,)
(96, 8)
(96, 4)
(111,)
(87, 8)
(87, 4)
(111,)
(114, 8)
(114, 4)
(111,)
(71, 8)
(71, 4)
(111,)
(68, 8)
(68, 4)
(111,)
(118, 8)
(118, 4)
(111,)
(74, 8)
(74, 4)
(111,)
(111, 8)
(111, 4)
(111,)
train
Episode: 27 - Score: -180.0

(94, 8)
(94, 4)
(86,)
(128, 8)
(128, 4)
(86,)
(112, 8)
(112, 4)
(86,)
(79, 8)
(79, 4)
(86,)
(86, 8)
(86, 4)
(86,)
train
Episode: 41 - Score: -142.786382.
(114, 8)
(114, 4)
(91,)
(71, 8)
(71, 4)
(91,)
(68, 8)
(68, 4)
(91,)
(118, 8)
(118, 4)
(91,)
(74, 8)
(74, 4)
(91,)
(111, 8)
(111, 4)
(91,)
(121, 8)
(121, 4)
(91,)
(107, 8)
(107, 4)
(91,)
(75, 8)
(75, 4)
(91,)
(67, 8)
(67, 4)
(91,)
(76, 8)
(76, 4)
(91,)
(79, 8)
(79, 4)
(91,)
(78, 8)
(78, 4)
(91,)
(73, 8)
(73, 4)
(91,)
(112, 8)
(112, 4)
(91,)
(94, 8)
(94, 4)
(91,)
(128, 8)
(128, 4)
(91,)
(112, 8)
(112, 4)
(91,)
(79, 8)
(79, 4)
(91,)
(86, 8)
(86, 4)
(91,)
(91, 8)
(91, 4)
(91,)
train
Episode: 42 - Score: -205.211566.
(71, 8)
(71, 4)
(84,)
(68, 8)
(68, 4)
(84,)
(118, 8)
(118, 4)
(84,)
(74, 8)
(74, 4)
(84,)
(111, 8)
(111, 4)
(84,)
(121, 8)
(121, 4)
(84,)
(107, 8)
(107, 4)
(84,)
(75, 8)
(75, 4)
(84,)
(67, 8)
(67, 4)
(84,)
(76, 8)
(76, 4)
(84,)
(79, 8)
(79, 4)
(84,)
(78, 8)
(78, 4)
(84,)
(73, 8)
(73, 4)
(84,)
(112, 8)
(112, 4)
(84,)
(94, 8)
(9

(114, 8)
(114, 4)
(76,)
(118, 8)
(118, 4)
(76,)
(84, 8)
(84, 4)
(76,)
(92, 8)
(92, 4)
(76,)
(72, 8)
(72, 4)
(76,)
(90, 8)
(90, 4)
(76,)
(101, 8)
(101, 4)
(76,)
(97, 8)
(97, 4)
(76,)
(96, 8)
(96, 4)
(76,)
(78, 8)
(78, 4)
(76,)
(76, 8)
(76, 4)
(76,)
train
Episode: 57 - Score: -85.869721.
(128, 8)
(128, 4)
(94,)
(112, 8)
(112, 4)
(94,)
(79, 8)
(79, 4)
(94,)
(86, 8)
(86, 4)
(94,)
(91, 8)
(91, 4)
(94,)
(84, 8)
(84, 4)
(94,)
(58, 8)
(58, 4)
(94,)
(100, 8)
(100, 4)
(94,)
(64, 8)
(64, 4)
(94,)
(114, 8)
(114, 4)
(94,)
(118, 8)
(118, 4)
(94,)
(84, 8)
(84, 4)
(94,)
(92, 8)
(92, 4)
(94,)
(72, 8)
(72, 4)
(94,)
(90, 8)
(90, 4)
(94,)
(101, 8)
(101, 4)
(94,)
(97, 8)
(97, 4)
(94,)
(96, 8)
(96, 4)
(94,)
(78, 8)
(78, 4)
(94,)
(76, 8)
(76, 4)
(94,)
(94, 8)
(94, 4)
(94,)
train
Episode: 58 - Score: -321.943741.
(112, 8)
(112, 4)
(117,)
(79, 8)
(79, 4)
(117,)
(86, 8)
(86, 4)
(117,)
(91, 8)
(91, 4)
(117,)
(84, 8)
(84, 4)
(117,)
(58, 8)
(58, 4)
(117,)
(100, 8)
(100, 4)
(117,)
(64, 8)
(64, 4)
(117,)
(114, 8)
(1

(76, 8)
(76, 4)
(64,)
(99, 8)
(99, 4)
(64,)
(75, 8)
(75, 4)
(64,)
(111, 8)
(111, 4)
(64,)
(72, 8)
(72, 4)
(64,)
(73, 8)
(73, 4)
(64,)
(99, 8)
(99, 4)
(64,)
(88, 8)
(88, 4)
(64,)
(80, 8)
(80, 4)
(64,)
(105, 8)
(105, 4)
(64,)
(70, 8)
(70, 4)
(64,)
(72, 8)
(72, 4)
(64,)
(64, 8)
(64, 4)
(64,)
train
Episode: 73 - Score: -180.631984.
(97, 8)
(97, 4)
(79,)
(96, 8)
(96, 4)
(79,)
(78, 8)
(78, 4)
(79,)
(76, 8)
(76, 4)
(79,)
(94, 8)
(94, 4)
(79,)
(117, 8)
(117, 4)
(79,)
(80, 8)
(80, 4)
(79,)
(76, 8)
(76, 4)
(79,)
(99, 8)
(99, 4)
(79,)
(75, 8)
(75, 4)
(79,)
(111, 8)
(111, 4)
(79,)
(72, 8)
(72, 4)
(79,)
(73, 8)
(73, 4)
(79,)
(99, 8)
(99, 4)
(79,)
(88, 8)
(88, 4)
(79,)
(80, 8)
(80, 4)
(79,)
(105, 8)
(105, 4)
(79,)
(70, 8)
(70, 4)
(79,)
(72, 8)
(72, 4)
(79,)
(64, 8)
(64, 4)
(79,)
(79, 8)
(79, 4)
(79,)
train
Episode: 74 - Score: -174.632254.
(96, 8)
(96, 4)
(82,)
(78, 8)
(78, 4)
(82,)
(76, 8)
(76, 4)
(82,)
(94, 8)
(94, 4)
(82,)
(117, 8)
(117, 4)
(82,)
(80, 8)
(80, 4)
(82,)
(76, 8)
(76, 4)
(82,)
(99, 8

KeyboardInterrupt: 