In [1]:
!pip3 install box2d-py



In [1]:
import os
import gym
import numpy as np
from PPO import PPO

class PpoLearning():
    """Episode loop that trains a PPO agent against a gym-style environment.

    The agent object is expected to expose ``get_action_value(state)``
    returning ``(action, log_prob, value)``, ``optimize()``, and a ``memory``
    attribute providing ``save_memory(...)``, ``reset()`` and an integer ``T``
    (number of environment steps between two optimisation phases).

    Parameters
    ----------
    no_games : int
        Number of episodes to play.
    PPO : object
        The agent instance (note: despite its name this is an instance, not
        the ``PPO`` class).
    env : object, optional
        Environment exposing ``reset()`` and ``step(action)`` where ``step``
        returns ``(next_state, reward, done, info)``. When omitted, the
        module/notebook-level variable ``env`` is looked up at training time,
        which preserves the behaviour of existing two-argument callers.
    """

    def __init__(self, no_games, PPO, env=None):

        self.no_games = no_games
        self.ppo = PPO
        self.env = env

    def train(self):
        """Play ``self.no_games`` episodes, optimising every ``memory.T`` steps.

        Prints the mean total reward of the last 10 episodes (together with
        the number of optimisation phases run so far) every 10 episodes.
        Returns ``None``.
        """

        # Prefer the injected environment; fall back to the notebook global so
        # that `PpoLearning(no_games, ppo)` keeps working unchanged.
        environment = self.env if self.env is not None else env

        game_rew_hist = []  # total reward collected in each finished game
        iters = 0           # number of optimisation phases performed
        step = 0            # environment steps, counted across all games

        # Use the episode count given to the constructor, not a global that
        # merely happens to carry the same name.
        for game in range(1, self.no_games + 1):

            done = False
            game_total_rew = 0
            state = environment.reset()

            while not done:
                action, log_prob, value = self.ppo.get_action_value(state)
                next_state, reward, done, info = environment.step(action)
                step += 1
                game_total_rew += reward
                self.ppo.memory.save_memory(state, value, action, log_prob, reward, done)

                # `step` is never reset per game, so a learning phase is
                # triggered every T steps even when that spans several games.
                if step % self.ppo.memory.T == 0:
                    self.ppo.optimize()
                    iters += 1
                    self.ppo.memory.reset()
                state = next_state

            game_rew_hist.append(game_total_rew)

            if game % 10 == 0:
                avg_score = np.mean(game_rew_hist[-10:])

                print('Episode: ', game, 'average score:', avg_score,
                      'learning_iterations:', iters)

In [3]:
# --- Environment --------------------------------------------------------
env = gym.make('LunarLander-v2')

# --- PPO hyper-parameters -----------------------------------------------
T = 10               # timesteps per each learning
epochs = 4           # number of epochs per learning
no_batches = 2       # number of batches for splitting the timesteps
hidden_dim = 256
gamma = 0.99         # discount factor
gae_lambda = 0.95    # gae smoothing parameter
clip = 0.2           # PPO clipping epsilon parameter
lr_actor = 0.0003
lr_critic = 0.0005

# --- Agent --------------------------------------------------------------
ppo = PPO(
    env=env,
    T=T,
    hidden_dim=hidden_dim,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip=clip,
    no_batches=no_batches,
    epochs=epochs,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
)

# --- Training run -------------------------------------------------------
no_games = 400  # number of games (episodes) to play

training = PpoLearning(no_games, ppo)
training.train()

Episode:  10 average score: -209.53796400402243 learning_iterations: 145
Episode:  20 average score: -278.60772269850116 learning_iterations: 236
Episode:  30 average score: -231.90745944621113 learning_iterations: 480
Episode:  40 average score: -220.6486923304828 learning_iterations: 1214
Episode:  50 average score: -247.24998869328593 learning_iterations: 1780
Episode:  60 average score: -110.4599722604119 learning_iterations: 2390
Episode:  70 average score: -122.00230300250897 learning_iterations: 3210
Episode:  80 average score: -102.48884521698888 learning_iterations: 4051
Episode:  90 average score: -107.05408009912017 learning_iterations: 5051
Episode:  100 average score: -85.48473467150889 learning_iterations: 6051
Episode:  110 average score: -32.89486626684405 learning_iterations: 7037
Episode:  120 average score: -9.609541616276406 learning_iterations: 7865
Episode:  130 average score: -4.804643463310162 learning_iterations: 8651
Episode:  140 average score: 97.96652868860