In [None]:
# Install the Box2D bindings used by the LunarLander-v2 environment created below.
# `%pip` (instead of `!pip3`) installs into the environment of the running kernel,
# not into whichever `pip3` happens to be first on the shell PATH.
%pip install box2d-py



In [None]:
import os
import gym
import numpy as np
from PPO import PPO

class PpoLearning():
    """Episode-driven training loop around a PPO agent.

    The agent object is expected to expose, as used by ``train``:
    ``get_action_value(state)`` returning ``(action, log_prob, value)``,
    ``optimize()``, and a ``memory`` attribute providing ``save_memory(...)``,
    ``reset()`` and the integer ``T``.
    """

    def __init__(self, no_games, PPO, env=None):
        """Store the training configuration.

        Args:
            no_games: Number of episodes to play in ``train``.
            PPO: The PPO agent instance to train.
            env: Optional environment to interact with. When omitted,
                ``train`` falls back to the module-level ``env`` variable,
                which is how this class was used originally.
        """
        self.no_games = no_games
        self.ppo = PPO
        self.env = env

    def train(self):
        """Play ``self.no_games`` episodes, optimising every ``memory.T`` steps.

        The step counter runs across episode boundaries, so an optimisation
        can be triggered in the middle of an episode and a single memory
        buffer can contain transitions from more than one episode.
        Transitions collected after the last multiple of ``memory.T`` are
        never used for an update.

        Every 10 episodes the mean total reward of the last 10 episodes and
        the number of optimisation calls so far are printed.
        """
        # Prefer the explicitly supplied environment; otherwise use the
        # notebook-global `env` (original behaviour).
        environment = self.env if self.env is not None else env

        game_rew_hist = []  # total reward of every finished episode
        iters = 0           # number of optimize() calls performed
        step = 0            # environment steps taken over the whole run

        # Use the value given to the constructor rather than a global
        # `no_games`, so the instance is self-contained.
        for game in range(1, self.no_games + 1):

            done = False
            game_total_rew = 0
            # reset() is used as returning the initial state directly.
            state = environment.reset()

            while not done:
                action, log_prob, value = self.ppo.get_action_value(state)
                # step() is unpacked as a 4-tuple (state, reward, done, info).
                next_state, reward, done, info = environment.step(action)
                step += 1
                game_total_rew += reward
                self.ppo.memory.save_memory(state, value, action, log_prob, reward, done)

                # Learn once a full batch of T transitions has been stored,
                # then clear the memory for the next batch.
                if step % self.ppo.memory.T == 0:
                    self.ppo.optimize()
                    iters += 1
                    self.ppo.memory.reset()
                state = next_state

            game_rew_hist.append(game_total_rew)

            if game % 10 == 0:
                avg_score = np.mean(game_rew_hist[-10:])

                print('Episode: ', game, 'average score:', avg_score,
                      'learning_iterations:', iters)

In [None]:
# ---- PPO hyper-parameters ----
T = 10             # timesteps collected per learning update
epochs = 4         # optimisation epochs per learning update
no_batches = 2     # number of batches the timesteps are split into
hidden_dim = 256
gamma = 0.99       # discount factor
gae_lambda = 0.95  # GAE smoothing parameter
clip = 0.2         # PPO clipping epsilon
lr_actor = 0.0003
lr_critic = 0.0003

# ---- length of the training run (episodes) ----
no_games = 400

# ---- environment ----
env = gym.make('LunarLander-v2')

# ---- agent ----
ppo = PPO(
    env=env,
    T=T,
    hidden_dim=hidden_dim,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip=clip,
    no_batches=no_batches,
    epochs=epochs,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
)

# ---- run training ----
training = PpoLearning(no_games, ppo)
training.train()

Episode:  10 average score: -300.6651341053953 learning_iterations: 97
Episode:  20 average score: -192.17584528859837 learning_iterations: 174
Episode:  30 average score: -185.8861826264018 learning_iterations: 262
Episode:  40 average score: -271.4786175337487 learning_iterations: 386
Episode:  50 average score: -178.13296602210326 learning_iterations: 732
Episode:  60 average score: -82.98857865063013 learning_iterations: 1348
Episode:  70 average score: -156.58717081276703 learning_iterations: 2059
Episode:  80 average score: -67.04186497424729 learning_iterations: 3034
Episode:  90 average score: -81.21109393901874 learning_iterations: 3892
Episode:  100 average score: -74.7623382420826 learning_iterations: 4756
Episode:  110 average score: -78.46268327336813 learning_iterations: 5756
Episode:  120 average score: -77.55109821764067 learning_iterations: 6734
Episode:  130 average score: 184.26495201350303 learning_iterations: 7376
Episode:  140 average score: 148.81720177237895 lea