In [None]:
%matplotlib inline

In [None]:
from env.balancebot_env import BalancebotEnv
from notebook.services.config import ConfigManager

from stable_baselines import PPO2
from stable_baselines.common.policies import FeedForwardPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.bench import Monitor

import papermill as pm
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import torch

In [None]:
# Each run of this cell gets its own log directory, named after the Unix
# timestamp (whole seconds) at which the cell is executed.
run_stamp = int(time.time())
log_dir = "/tmp/gym/{}".format(run_stamp)
os.makedirs(log_dir, exist_ok=True)

In [None]:
# Record this run's log directory with papermill under the key "log_dir".
pm.record(name="log_dir", value=log_dir)

In [None]:
# Build the vectorised training environment
def make_env(rank):
    """
    Return a zero-argument factory that builds one monitored environment.

    :param rank: index of the worker; its string form is the Monitor file
        name inside log_dir
    :return: (callable) function creating a BalancebotEnv wrapped in Monitor
    """
    def _init():
        # log_dir is looked up when the factory runs, not when it is created.
        monitor_file = os.path.join(log_dir, str(rank))
        return Monitor(BalancebotEnv(render=False), monitor_file)

    return _init

num_cpu = 16
env_factories = [make_env(rank=worker) for worker in range(num_cpu)]
env = SubprocVecEnv(env_factories)


In [None]:
# Sizes of the two hidden layers used by the custom policy.
h1_dim = 32
h2_dim = 16
# Total number of training timesteps. This must be an integer: Stable
# Baselines documents total_timesteps as an int, and a float such as 1e4
# breaks the update loop inside PPO2.learn().
ts_num = 10000

In [None]:
# Define the policy network and create the RL agent
class CustomPolicy(FeedForwardPolicy):
    """Feed-forward MLP policy whose layer sizes are h1_dim and h2_dim."""

    def __init__(self, *args, **kwargs):
        hidden_layers = [h1_dim, h2_dim]
        super().__init__(*args,
                         layers=hidden_layers,
                         feature_extraction="mlp",
                         **kwargs)

tensorboard_dir = log_dir+"/tensorboard"
model = PPO2(CustomPolicy, env, verbose=0, tensorboard_log=tensorboard_dir)


## How do I load the encoder's weights into the model?
## I want to load the weights into 'pi_fc0' and 'vf_fc0' and make those layers untrainable (frozen).
 

![Network](assets/network.png)

In [None]:
# Train and save the agent
# total_timesteps is documented as an int; cast here so a float ts_num
# (e.g. 1e4) cannot break the update loop inside PPO2.learn().
model.learn(total_timesteps=int(ts_num), tb_log_name="PPO2")
model.save("ppo_save")

In [None]:
def movingAverage(values, window):
    """
    Smooth values by doing a moving average

    The window is clamped to the number of values, so an input shorter than
    ``window`` yields a single element (its overall mean) instead of a
    wrong-length result. An empty input yields an empty array.

    :param values: (numpy array) 1-D sequence of values to smooth
    :param window: (int) number of consecutive values averaged per output
        point, must be >= 1
    :return: (numpy array) the smoothed values, of length
        ``len(values) - min(window, len(values)) + 1``
    :raises ValueError: if window is smaller than 1
    """
    values = np.asarray(values, dtype=float)
    if window < 1:
        raise ValueError("window must be >= 1, got {}".format(window))
    if values.size == 0:
        return values
    # With window > len(values), np.convolve(..., 'valid') swaps its operands
    # and returns window - len(values) + 1 meaningless elements.
    window = min(window, values.size)
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, title='Learning Curve'):
    """
    plot the results

    Loads the Monitor logs found in ``log_folder``, smooths the per-episode
    rewards with a moving average of up to 50 episodes, plots them against
    the number of timesteps and hands the figure to papermill for display.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    :raises ValueError: if the logs contain no episode yet
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    if len(y) == 0:
        raise ValueError("No episodes recorded in {}".format(log_folder))
    # Never smooth over more episodes than were logged; otherwise the
    # smoothed curve and the x axis end up with different lengths.
    y = movingAverage(y, window=min(50, len(y)))
    # Truncate x so it lines up with the (shorter) smoothed curve
    x = x[len(x) - len(y):]

    plt.ioff()
    fig = plt.figure(title)
    ax = fig.gca()
    ax.plot(x, y)
    ax.set_xlabel('Number of Timesteps')
    ax.set_ylabel('Rewards')
    ax.set_title(title + " Smoothed : " + "h1: " + str(h1_dim) + ", h2: " + str(h2_dim))
    pm.display('matplotlib_plot', fig)


In [None]:
# Plot the smoothed learning curve from this run's log directory
plot_results(log_folder=log_dir)

## You can open TensorBoard from a terminal
## For example (replace `<log_dir>` with the value of `log_dir`):
### `tensorboard --logdir <log_dir>/tensorboard`

In [None]:
# Drop the in-memory agent so that the loading step below is demonstrated
# on a model restored from the saved file
del model

In [None]:
# Create the evaluation env
# Close the previous vectorised env first: rebinding `env` alone would leave
# the SubprocVecEnv worker processes from training running.
env.close()
env = DummyVecEnv([lambda: BalancebotEnv(render=False)])

In [None]:
# Restore the saved agent, passing the custom policy class explicitly and
# attaching it to the evaluation env
model = PPO2.load("ppo_save", policy=CustomPolicy, env=env)

In [None]:

# Roll out the trained agent for a fixed number of episodes
n_episodes = 10
for ep in range(n_episodes):
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # Stop stepping once the env reports the episode as done
        if dones:
            break