In [1]:
import sys; sys.path.insert(0, '..')

import gym
import sys
import torch
import tester
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output, Javascript

import pytorch_drl.models.ppo_models as models

from pytorch_drl.algs.ppo import PPO

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Use the first CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

Device: cuda:0


## 1. Define Utils:

### 1.1 Plotting:

In [3]:
def mean_filter(arr, filter_len):
    """Smooth a 1-D sequence with a trailing moving average.

    The input is left-padded with ``filter_len - 1`` zeros so that the output
    has exactly one value per input element. Because of that padding, the
    first ``filter_len - 1`` outputs also average over zeros and are
    therefore pulled towards zero.

    Args:
        arr: 1-D sequence (list or array) of numbers.
        filter_len: Window length in samples; must be at least 1.

    Returns:
        list: The windowed means (NumPy scalars), same length as ``arr``.

    Raises:
        ValueError: If ``filter_len`` is smaller than 1.
    """
    if filter_len < 1:
        # A non-positive window used to yield silent, meaningless zeros.
        raise ValueError("filter_len must be >= 1, got {}".format(filter_len))
    padded = np.concatenate([[0] * (filter_len - 1), arr])
    weights = np.ones(filter_len) / filter_len
    # One window per original element: window k covers padded[k:k + filter_len].
    return [
        np.sum(weights * padded[start:start + filter_len])
        for start in range(len(padded) - filter_len + 1)
    ]

def plot(scores, n=None):
    """Redraw the score-versus-episode curve in the current output area.

    Args:
        scores: Sequence of scores, one per episode.
        n: Optional window length. When given, ``scores`` is passed through
            ``mean_filter`` with this window before plotting.
    """
    curve = scores if n is None else mean_filter(scores, n)
    # wait=True: the old output is only cleared once new output is available.
    clear_output(True)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(np.arange(len(curve)), curve)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode')
    plt.show()

In [4]:
def copy_network(network1, network2):
    """Load the state dict of ``network1`` into ``network2``.

    Args:
        network1: Source object; must provide ``state_dict()``.
        network2: Destination object; must provide ``load_state_dict()``.
    """
    source_state = network1.state_dict()
    network2.load_state_dict(source_state)

## 2. Create environment

In [5]:
# Gym environment id used throughout the notebook.
# "LunarLander-v2" was the other id tried here; swap it in to change tasks.
env_name = "CartPole-v0"
env = gym.make(env_name)
env.seed(0)  # fixed seed for this environment instance

# Sizes handed to the policy network constructor further down.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print("State size:", state_size, "\nAction size:", action_size)

State size: 4 
Action size: 2




## 3. Define networks for different algorithms

In [5]:
# Build the PPO policy network from the environment's state and action sizes.
# NOTE(review): the training cell below assigns a fresh PPOPolicyNetwork to the
# same name, so this instance is replaced before training starts.
ppo_model = models.PPOPolicyNetwork(state_size, action_size)

## 4. PPO Test

In [None]:
# Training-loop settings, passed positionally to agent.train() below.
# Presumably tmax caps the steps per trajectory and n_traj is the number of
# trajectories to collect — TODO confirm against PPO.train.
tmax = 200
n_traj = 2000

# PPO settings forwarded to the PPO constructor below.
n_env = 8
epochs = 4
# Evaluates to 128 with n_env = 8; passed to PPO as mini_batch_size.
batch_size = 16 * n_env #tmax * n_env // 4    #32

# Fresh network so that re-running this cell does not reuse an earlier model.
ppo_model = models.PPOPolicyNetwork(state_size, action_size)
# init agent:
# NOTE(review): device is the literal "cpu", not the `device` selected at the
# top of the notebook — confirm this is intentional.
agent = PPO(ppo_model,
            env_name,
            n_env=n_env,
            epochs=epochs,
            lr=1e-4,
            critic_coef=0.5,
            normalize_rewards=False,
            max_grad_norm=0.5,
            mini_batch_size=batch_size,
            device="cpu"
            )
# train the agent
# train() returns two values, unpacked here as the score and loss histories.
scores, losses = agent.train(tmax, n_traj,  env)

# plot the training:
# x is built from the raw scores; mean_filter returns one value per input
# element, so x and the smoothed scores still have matching lengths.
x = np.arange(len(scores))
# 50-sample trailing mean; `scores` is rebound to the smoothed list here.
scores = mean_filter(scores, 50)
#losses = mean_filter(losses, 50)
plt.plot(x, scores, label = "scores")
#plt.plot(x, losses, label = "losses")
#plt.legend()
plt.show()

Trajectory 0, AVG. Loss 146.69
TEST at 0; score is 23.0
AVG score is 23.0
Trajectory 1, AVG. Loss 149.05
TEST at 1; score is 20.0
AVG score is 21.5
Trajectory 2, AVG. Loss 127.78
TEST at 2; score is 38.0
AVG score is 27.0
Trajectory 3, AVG. Loss 134.75
TEST at 3; score is 30.0
AVG score is 27.75
Trajectory 4, AVG. Loss 133.94
TEST at 4; score is 17.0
AVG score is 25.6
Trajectory 5, AVG. Loss 124.31
TEST at 5; score is 70.0
AVG score is 33.0
Trajectory 6, AVG. Loss 116.06
TEST at 6; score is 24.0
AVG score is 31.714285714285715
Trajectory 7, AVG. Loss 112.83
TEST at 7; score is 34.0
AVG score is 32.0
Trajectory 8, AVG. Loss 105.77
TEST at 8; score is 13.0
AVG score is 29.88888888888889
Trajectory 9, AVG. Loss 101.32
TEST at 9; score is 19.0
AVG score is 28.8
Trajectory 10, AVG. Loss 94.39
TEST at 10; score is 32.0
AVG score is 29.09090909090909
Trajectory 11, AVG. Loss 91.16
TEST at 11; score is 41.0
AVG score is 30.083333333333332
Trajectory 12, AVG. Loss 87.96
TEST at 12; score is 18.

Trajectory 107, AVG. Loss 109.20
TEST at 107; score is 81.0
AVG score is 84.1
Trajectory 108, AVG. Loss 109.82
TEST at 108; score is 200.0
AVG score is 84.8
Trajectory 109, AVG. Loss 109.84
TEST at 109; score is 146.0
AVG score is 89.95
Trajectory 110, AVG. Loss 108.97
TEST at 110; score is 71.0
AVG score is 90.85
Trajectory 111, AVG. Loss 104.82
TEST at 111; score is 88.0
AVG score is 90.3
Trajectory 112, AVG. Loss 105.03
TEST at 112; score is 148.0
AVG score is 94.15
Trajectory 113, AVG. Loss 103.48
TEST at 113; score is 100.0
AVG score is 95.0
Trajectory 114, AVG. Loss 105.79
TEST at 114; score is 100.0
AVG score is 95.25
Trajectory 115, AVG. Loss 102.06
TEST at 115; score is 95.0
AVG score is 95.55
Trajectory 116, AVG. Loss 100.83
TEST at 116; score is 84.0
AVG score is 96.1
Trajectory 117, AVG. Loss 100.12
TEST at 117; score is 103.0
AVG score is 98.4
Trajectory 118, AVG. Loss 98.06
TEST at 118; score is 45.0
AVG score is 95.25
Trajectory 119, AVG. Loss 97.72
TEST at 119; score is

Trajectory 211, AVG. Loss 41.89
TEST at 211; score is 200.0
AVG score is 158.4
Trajectory 212, AVG. Loss 44.48
TEST at 212; score is 189.0
AVG score is 158.9
Trajectory 213, AVG. Loss 44.38
TEST at 213; score is 179.0
AVG score is 157.85
Trajectory 214, AVG. Loss 44.85
TEST at 214; score is 200.0
AVG score is 157.85
Trajectory 215, AVG. Loss 45.02
TEST at 215; score is 151.0
AVG score is 158.25
Trajectory 216, AVG. Loss 50.44
TEST at 216; score is 138.0
AVG score is 156.3
Trajectory 217, AVG. Loss 52.61
TEST at 217; score is 200.0
AVG score is 160.6
Trajectory 218, AVG. Loss 52.86
TEST at 218; score is 135.0
AVG score is 162.25
Trajectory 219, AVG. Loss 52.24
TEST at 219; score is 137.0
AVG score is 163.3
Trajectory 220, AVG. Loss 53.34
TEST at 220; score is 195.0
AVG score is 163.1
Trajectory 221, AVG. Loss 52.91
TEST at 221; score is 181.0
AVG score is 165.9
Trajectory 222, AVG. Loss 50.54
TEST at 222; score is 173.0
AVG score is 164.55
Trajectory 223, AVG. Loss 49.14
TEST at 223; sc

### 4.1 Trained Agent Demonstration

In [70]:
# Evaluate the trained agent with the project's tester helper: 5 episodes,
# max_t=200, with rendering and logging switched on.
# NOTE(review): relies on `agent` and `env` from the cells above; render=True
# presumably needs a display — confirm before running headless.
tester.test_agent(agent, env, max_t=200, render=True, num_of_episodes=5, log=True)

30.0
111.0
72.0
91.0
105.0


## 5. PPO Continuous Test


### 5.1 Trained Agent Demonstration