<a href="https://colab.research.google.com/github/sugiyama404/ReinforcementLearningForGymOrAtari/blob/main/A2C/A2CForCarRacing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip uninstall gym -y # gym 0.17.3 was broken at 2021/11/08
!pip install gym gym[box2d] tensorflow-addons  > /dev/null 2>&1

Found existing installation: gym 0.21.0
Uninstalling gym-0.21.0:
  Successfully uninstalled gym-0.21.0


In [2]:
import copy
import math
import random
import time
from dataclasses import dataclass, field
from datetime import datetime
from time import sleep

import gym
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from gym import wrappers
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.layers import Dense, ReLU, Input, Lambda, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Progbar
from tensorflow_addons.optimizers import RectifiedAdam


# Install Xvfb and pyvirtualdisplay, then start a virtual display.
# NOTE(review): presumably required so that `env.render()` works on a headless
# machine such as Colab — confirm.
# The import has to stay below the `pip install` line: the package is only
# available after that command has run.
!apt update  > /dev/null 2>&1
!apt install xvfb  > /dev/null 2>&1
!pip install pyvirtualdisplay  > /dev/null 2>&1
from pyvirtualdisplay import Display
d = Display()
# As the last expression of the cell, the value returned by start() is echoed
# as the cell output.
d.start()

<pyvirtualdisplay.display.Display at 0x7f17b85acb50>

In [3]:
class Brain:
    """Builds the shared actor-critic network and publishes it as ``Brain.model``.

    The network takes a 96x96x3 frame, runs it through three convolutions and two
    dense layers, and ends in two heads: a 5-way softmax (policy) and a single
    linear unit (state value). The compiled model is stored on the class itself,
    so every subclass reads the same network through ``Brain.model``.
    """

    def __init__(self):
        frame_shape = (96, 96, 3)
        n_actions = 5
        optimizer = RectifiedAdam(learning_rate=0.001, epsilon=0.1)

        frames = Input(shape=frame_shape)

        # (filters, kernel size, stride) of each convolution, applied in order.
        conv_specs = ((8, 8, 4), (16, 4, 2), (32, 3, 1))
        features = frames
        for filters, kernel, stride in conv_specs:
            features = Conv2D(filters,
                              kernel_size=(kernel, kernel),
                              strides=(stride, stride),
                              activation='relu')(features)
        features = Flatten()(features)

        # Fully connected trunk shared by both heads.
        for units in (512, 100):
            features = Dense(units, activation='relu')(features)

        policy_head = keras.layers.Dense(n_actions, activation="softmax")(features)
        value_head = keras.layers.Dense(1, activation="linear")(features)

        network = keras.Model(frames, [policy_head, value_head])
        network.compile(loss="mean_absolute_error", optimizer=optimizer)
        network.summary()

        # Stored on the class (not the instance) so all subclasses share one network.
        Brain.model = network

In [4]:
class Actor(Brain):
    """Samples discrete actions from the policy head of the shared ``Brain.model``."""

    def __init__(self):
        # Builds the network and stores it as ``Brain.model``.
        super().__init__()

    def policynetwork(self, state):
        """Sample an action index for a single observation.

        Args:
            state: one observation; it is wrapped into a batch of size 1 before
                being passed to the network.

        Returns:
            The sampled action index, drawn according to the policy head's
            probabilities.
        """
        act_p, _ = Brain.model(np.array([state]))

        # The softmax output is float32, so its sum can differ from 1 by more than
        # np.random.choice tolerates ("probabilities do not sum to 1").
        # Renormalise in float64 before sampling.
        probs = act_p[0].numpy().astype(np.float64)
        probs /= probs.sum()

        # Derive the number of actions from the network output instead of hard-coding it.
        return np.random.choice(len(probs), p=probs)

In [5]:
class Critic(Brain):
    """One-step advantage actor-critic update for the shared ``Brain.model``.

    Attributes:
        gamma: discount factor applied to the bootstrapped next-state value.
        beta: weight of the policy-entropy bonus in the policy loss.
    """

    def __init__(self):
        # Reuse the network when it already exists; build it otherwise, so a Critic
        # created before any Actor does not fail later with AttributeError.
        if not hasattr(Brain, "model"):
            super().__init__()

        self.gamma = 0.90
        self.beta  = 0.1

    def valuenetwork(self, val):
        """Run one gradient step on a mini-batch of transitions.

        Args:
            val: dict with the keys 'state', 'next_state', 'act', 'reward' and
                'done', each holding one entry per transition. A 'next_reward'
                key may be present but is not used.
        """
        states, next_states, actions = val['state'], val['next_state'], val['act']

        # Bring rewards and done flags to shape (batch, 1) / float32 so they line up
        # with the critic head; adding a (batch,) array to a (batch, 1) tensor would
        # silently broadcast to (batch, batch).
        rewards = tf.reshape(tf.cast(val['reward'], tf.float32), (-1, 1))
        dones = tf.reshape(tf.cast(val['done'], tf.float32), (-1, 1))

        with tf.GradientTape() as tape:

            act_p, v = Brain.model(states, training=True)
            _, next_v = Brain.model(next_states, training=True)

            # Probability the policy assigned to the action that was actually taken.
            onehot_actions = tf.one_hot(actions, act_p.shape[-1])
            a_pi = tf.reduce_sum(onehot_actions * act_p, axis=1, keepdims=True)
            a_pi = tf.clip_by_value(a_pi, 1e-10, 1.0)

            # TD target r + gamma * V(s'): the reward is always kept, the bootstrap
            # term is dropped on terminal transitions. The target is a constant for
            # the value regression, hence stop_gradient.
            q = tf.stop_gradient(rewards + self.gamma * (1.0 - dones) * next_v)
            advantage = q - v

            value_losses = self._value_losses(advantage)
            # The advantage only weights the policy gradient; it must not push
            # gradients into the value head through the policy loss.
            policy_losses = self._policy_losses(tf.stop_gradient(advantage), a_pi, act_p)
            total_loss = value_losses + policy_losses
            loss = tf.reduce_mean(total_loss)

        gradients = tape.gradient(loss, Brain.model.trainable_variables)
        Brain.model.optimizer.apply_gradients(zip(gradients, Brain.model.trainable_variables))

    def _value_losses(self, advantage):
        """Squared TD error per transition."""
        return (advantage)**2

    def _policy_losses(self, advantage, a_pi, act_p):
        """Negative of (log-probability * advantage + entropy bonus) per transition."""
        a = tf.math.log(a_pi) * advantage
        b = self._entropy(act_p)
        policy_losses = - ( a + b )
        return policy_losses

    def _entropy(self, act_p):
        """Entropy of the categorical policy, scaled by ``beta``; shape (batch, 1).

        Uses -sum(p * log p) over the action probabilities. The probabilities are
        clipped before the log so that a zero probability cannot produce NaN.
        """
        log_p = tf.math.log(tf.clip_by_value(act_p, 1e-10, 1.0))
        entropy = -tf.reduce_sum(act_p * log_p, axis=1, keepdims=True)
        return self.beta * entropy

In [6]:
@dataclass
class ExperiencesMemory:
    """Buffer of transitions collected since the last reset.

    Every field is an array that grows by one entry per stored transition.
    The array defaults use ``default_factory`` because ``np.ndarray`` is unhashable
    and Python 3.11+ refuses unhashable dataclass defaults at class definition time.
    """

    # Observations before / after each step, shape (n, 96, 96, 3).
    state : np.ndarray = field(default_factory=lambda: np.empty((0, 96, 96, 3)))
    next_state : np.ndarray = field(default_factory=lambda: np.empty((0, 96, 96, 3)))
    # Action index, reward and done flag of each step, shape (n,).
    action : np.ndarray = field(default_factory=lambda: np.array([], int))
    reward : np.ndarray = field(default_factory=lambda: np.array([]))
    done : np.ndarray = field(default_factory=lambda: np.array([]))
    # Number of transitions returned by get_experiences().
    batch_size : int = 32

    def reset_experiences(self):
        """Drop all stored transitions."""
        self.state = np.empty((0, 96, 96, 3))
        self.next_state = np.empty((0, 96, 96, 3))
        self.action = np.array([], int)
        self.reward = np.array([])
        self.done = np.array([])

    def set_experiences(self, state, next_state, action, reward, done):
        """Append one transition to the buffer."""
        state = np.reshape(state, [1, 96, 96, 3])
        self.state = np.append(self.state, state, axis=0)
        next_state = np.reshape(next_state, [1, 96, 96, 3])
        self.next_state = np.append(self.next_state, next_state, axis=0)
        self.action = np.append(self.action, np.array(action))
        self.reward = np.append(self.reward, np.array(reward))
        self.done = np.append(self.done, np.array(done))

    def get_experiences(self):
        """Return a random mini-batch of ``batch_size`` distinct transitions.

        The last stored transition is never sampled, so that the reward of the
        following step ('next_reward') exists for every sampled index.

        Returns:
            dict with the keys 'state', 'next_state', 'act', 'reward',
            'next_reward' and 'done'.
        """
        mb_index = np.random.choice((len(self.action) - 1), self.batch_size, replace=False)
        next_mb_index = mb_index + 1
        return {
            'state': self.state[mb_index],
            'next_state': self.next_state[mb_index],
            'act': self.action[mb_index],
            'reward': self.reward[mb_index],
            'next_reward': self.reward[next_mb_index],
            'done': self.done[mb_index],
        }

    def isGetter(self):
        """True when the buffer length is a multiple of 10 and exceeds batch_size + 1."""
        stored = len(self.action)
        return stored % 10 == 0 and stored > (self.batch_size + 1)

In [7]:
class Main:
    """Runs the training loop: play episodes, store transitions, train the critic.

    Args:
        env: environment exposing reset(), step(action), render() and close().
        actor: object whose policynetwork(state) returns a discrete action index.
        critic: object whose valuenetwork(batch) performs one training step.
        experiences: transition buffer exposing reset_experiences(),
            set_experiences(...), isGetter() and get_experiences().
        episodes_times: number of episodes to play.
    """

    # Discrete action index -> control vector passed to env.step(), one row per action.
    _ACTIONS = np.array([[ 0, 0, 0],  # [0]: straight
                         [ 0, 1, 0],  # [1]: acceleration
                         [ 0, 0, 1],  # [2]: decelerate
                         [ 1, 0, 0],  # [3]: Turn right
                         [-1, 0, 0]]) # [4]: Turn left

    def __init__(self, env, actor, critic, experiences, episodes_times = 1000):
        self.env = env
        self.actor = actor
        self.critic = critic
        self.experiences = experiences
        self.episodes_times = episodes_times

    def play_game(self):
        """Play ``episodes_times`` episodes and train whenever the buffer is ready.

        Progress is reported in blocks of 10 episodes; each block gets its own
        progress bar showing the mean score of the episodes in that block. The
        environment is closed when the loop ends, including when it ends with
        an exception.
        """
        block_size = 10
        metrics_names = ['score']

        try:
            for episode in range(self.episodes_times):

                if episode % block_size == 0:
                    # The last block may be shorter than 10 episodes. Plain arithmetic
                    # also works for fewer than 10 episodes in total, where slicing
                    # the number's string representation raised ValueError.
                    remaining = self.episodes_times - episode
                    pb_i = Progbar(min(block_size, remaining), stateful_metrics=metrics_names)
                    score_mean = np.array([])

                state = self.env.reset()
                done = False
                score = 0
                self.experiences.reset_experiences()

                while not done:
                    self.env.render()
                    action = self.actor.policynetwork(state)
                    tmp_action = self.action_clipping(action)
                    next_state, reward, done, info = self.env.step(tmp_action)
                    score += reward

                    self.experiences.set_experiences(state, next_state, action, reward, done)
                    if self.experiences.isGetter():
                        # Train on a mini-batch, then start collecting a fresh buffer.
                        m_batch = self.experiences.get_experiences()
                        self.critic.valuenetwork(m_batch)
                        self.experiences.reset_experiences()

                    state = next_state

                score_mean = np.append(score_mean, score)
                values = [('score', np.mean(score_mean))]
                pb_i.add(1, values=values)
        finally:
            # Release the environment even if an episode fails part-way through.
            self.env.close()

    def action_clipping(self, val):
        """Translate a discrete action index (0-4) into its control vector.

        Returns:
            A new length-3 integer array; a copy, so callers cannot modify the
            shared lookup table.
        """
        return self._ACTIONS[val].copy()

In [8]:
# Entry point of the notebook: build the agent and the environment, then train.
episodes_times = 200  # number of episodes to play
batch_size = 32       # transitions per training mini-batch

# Actor() constructs the shared network (Brain.model) that the Critic trains,
# so it is created first.
actor = Actor()
critic = Critic()
experiences = ExperiencesMemory(batch_size = batch_size)
# NOTE(review): 40 presumably corresponds to gym's ERROR log level — confirm.
gym.logger.set_level(40)
env = gym.make('CarRacing-v0')
env.unwrapped.verbose = 0
# Monitor writes its output to the current directory; the callable selects the
# episodes to record (every 25th, starting with episode 0).
env = wrappers.Monitor(env, './', force=True, video_callable=(lambda ep: ep % 25 == 0))
main = Main(env, actor, critic, experiences, episodes_times)
main.play_game()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 96, 96, 3)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 23, 23, 8)    1544        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 10, 10, 16)   2064        conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 8, 8, 32)     4640        conv2d_1[0][0]                   
______________________________________________________________________________________________