# Run PPO on the Ant environment, using the implementation from [pat-coady/trpo](https://github.com/pat-coady/trpo)
#### For more background, see Patrick Coady's blog post on [Gym and PPO](https://learningai.io/projects/2017/07/28/ai-gym-workout.html) and his [notes on the Ant environment](https://gist.github.com/pat-coady/bac60888f011199aad72d2f1e6f5a4fa)

In [1]:
# Simulation / RL dependencies plus the stdlib helpers used throughout the notebook.
import gym,mujoco_py,warnings,time,os,glob,shutil,csv,skvideo.io
# Raise gym's logger threshold; 40 presumably corresponds to ERROR -- TODO confirm against gym.logger.
gym.logger.set_level(40)
# Suppress every Python warning for the rest of the kernel session.
warnings.filterwarnings("ignore") 
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym.envs import mujoco
from datetime import datetime
# Project-local modules: utilities, the customised Ant environment, and the PPO implementation.
from util import PID_class,Scaler,Logger,display_frames_as_gif
from custom_ant import AntEnvCustom
from ppo import NNValueFunction,Policy,run_episode,run_policy,add_value,discount,\
    add_disc_sum_rew,add_gae,build_train_set,log_batch_stats,run_episode_vid
# Compact NumPy printing: two decimals, lines up to 150 characters wide.
np.set_printoptions(precision=2,linewidth=150)
# IPython magics: render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline  
%config InlineBackend.figure_format = 'retina'
# Marker so the cell output shows that every import succeeded.
print ("Packages Loaded") 

Packages Loaded


### Main: build the environment, scaler, value function, and policy

In [2]:
# ---- Tunables (collected up front so they are easy to find) ----
env_name = 'Ant'
hid1_mult = 10        # passed to both networks; the cell output reports h1: 1120 for obs_dim 112
kl_targ = 0.003       # KL target handed to the policy
policy_logvar = -1.0  # initial log-variance setting handed to the policy

# ---- Environment ----
env = AntEnvCustom()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
env.reset()
# Optional sanity check of the renderer:
# render_img = env.render(mode='rgb_array')
print ("obs_dim:[%d] act_dim:[%d]" % (obs_dim, act_dim))

# One extra observation feature is reserved for the time step (see run_episode()).
# It is added after the print above, so the printed obs_dim is the raw environment size.
obs_dim = obs_dim + 1

# ---- Logging ----
# The UTC timestamp makes every run's log name / directory unique.
now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logName=env_name, now=now, _NOTUSE=True)
aigym_path = os.path.join('/tmp', env_name, now)

# ---- Observation scaler, critic and actor ----
scaler = Scaler(obs_dim)
val_func = NNValueFunction(obs_dim, hid1_mult)
policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)

Custom Ant Environment made by SJ.
obs_dim:[111] act_dim:[8]
Value Params -- h1: 1120, h2: 74, h3: 5, lr: 0.00116
Policy Params -- h1: 1120, h2: 299, h3: 80, lr: 5.2e-05, logvar_speed: 16
setting up loss with KL penalty


### Run policy for the first time

In [3]:
# Discounting settings for this sanity-check rollout.
gamma = 0.995  # Discount factor
lam = 0.95  # Lambda for GAE

# Collect a few episodes and push them through the same pipeline the training loop uses:
# critic predictions -> discounted reward sums -> GAE advantages.
trajectories = run_policy(env, policy, scaler, logger, episodes=5)
add_value(trajectories, val_func)
add_disc_sum_rew(trajectories, gamma)
add_gae(trajectories, gamma, lam)

# Report the array shape of every per-step field of the first episode.
first_traj = trajectories[0]
for field in ('observes', 'actions', 'rewards', 'unscaled_obs',
              'values', 'disc_sum_rew', 'advantages'):
    print (field + ' shape:', first_traj[field].shape)

observes shape: (27, 112)
actions shape: (27, 8)
rewards shape: (27,)
unscaled_obs shape: (27, 112)
values shape: (27,)
disc_sum_rew shape: (27,)
advantages shape: (27,)


### Training loop

In [4]:
# Toggles read by the periodic evaluation step of the training loop below.
SAVE_VID = True   # write the evaluation rollout's frames to an .mp4 under vids/
MAKE_GIF = False  # pass the evaluation rollout's frames to display_frames_as_gif

In [None]:
# ---- Training configuration (hoisted out of the loop: none of these change per epoch) ----
maxEpoch  = 10000   # number of collect-and-update iterations
batchSize = 50      # episodes collected per iteration
gamma = 0.995       # Discount factor
lam = 0.95          # Lambda for GAE
PLOT_EVERY = 20     # run (and optionally record) an evaluation episode every this many epochs
DO_ANIMATE = False  # NOTE(review): not read anywhere in this cell; kept in case other cells use it

# Keys of the per-step reward breakdown stored under each trajectory's 'rDetails'.
REWARD_TERMS = ('reward_contact', 'reward_ctrl', 'reward_forward',
                'reward_heading', 'reward_survive')

for _epoch in range(maxEpoch):
    # 1. Run policy
    trajectories = run_policy(env, policy, scaler, logger, episodes=batchSize)
    # 2. Get (predict) value from the critic network
    add_value(trajectories, val_func)  # add estimated values to episodes
    # 3. Get GAE
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    # add various stats to training log:
    # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
    # Update
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function
    # logger.write(display=True)  # write logger results to file and stdout

    # ---- Batch statistics ----
    # One concatenate call instead of growing the array episode by episode
    # (which re-copies everything accumulated so far on every step).
    rTotal = np.concatenate([traj['rewards'] for traj in trajectories])
    sumRwd = rTotal.sum()
    # Flatten the per-step reward breakdowns once, then total each term.
    # Element order matches the original nested loops, so the sums are unchanged.
    stepDetails = [step for traj in trajectories for step in traj['rDetails']]
    (sumReward_contact, sumReward_ctrl, sumReward_forward,
     sumReward_heading, sumReward_survive) = (
        np.asarray([step[term] for step in stepDetails]).sum() for term in REWARD_TERMS)
    print ("[%d/%d](#total:%d) sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f)"%
           (_epoch,maxEpoch,(_epoch+1)*batchSize,sumRwd,
           sumReward_contact,sumReward_ctrl,sumReward_forward,sumReward_heading,sumReward_survive))

    # ---- Periodic evaluation: every PLOT_EVERY epochs and on the very last epoch ----
    if (_epoch % PLOT_EVERY == 0) or (_epoch == maxEpoch - 1):
        ret = run_episode_vid(env, policy, scaler)
        print ("  [^] sumRwd:[%.3f] Xdisp:[%.3f] hDisp:[%.1f]"%
               (np.asarray(ret['rewards']).sum(),ret['xDisp'],ret['hDisp']))
        if MAKE_GIF:
            display_frames_as_gif(ret['frames'])
        if SAVE_VID:
            outputdata = np.asarray(ret['frames']).astype(np.uint8)
            vidName = 'vids/ant_ppo_epoch%03d.mp4'%(_epoch)
            # Make sure the target directory exists; otherwise a fresh checkout
            # fails at the first save, after a whole batch has been collected.
            os.makedirs(os.path.dirname(vidName), exist_ok=True)
            skvideo.io.vwrite(vidName,outputdata)
            print ("[%s] saved."%(vidName))
print ("Done.") 

[0/10000](#total:50) sumRwd:[-10673.377](cntct:-5.155+ctrl:-9244.300+fwd:12.265+head:-6274.187+srv:4838.000)
Creating window glfw
  [^] sumRwd:[-8.423] Xdisp:[-0.034] hDisp:[5.3]
[vids/ant_ppo_epoch000.mp4] saved.
[1/10000](#total:100) sumRwd:[-8664.426](cntct:-4.282+ctrl:-7905.050+fwd:-188.290+head:-4803.805+srv:4237.000)
[2/10000](#total:150) sumRwd:[-7590.451](cntct:-3.927+ctrl:-6976.492+fwd:-120.645+head:-4149.386+srv:3660.000)
[3/10000](#total:200) sumRwd:[-12299.852](cntct:-5.238+ctrl:-9772.762+fwd:-250.157+head:-7309.695+srv:5038.000)
[4/10000](#total:250) sumRwd:[-9932.283](cntct:-4.735+ctrl:-8499.111+fwd:40.085+head:-5941.521+srv:4473.000)
[5/10000](#total:300) sumRwd:[-9368.434](cntct:-4.285+ctrl:-7767.812+fwd:182.308+head:-5866.645+srv:4088.000)
[6/10000](#total:350) sumRwd:[-7363.795](cntct:-4.047+ctrl:-7530.047+fwd:-110.979+head:-3620.723+srv:3902.000)
[7/10000](#total:400) sumRwd:[-10128.814](cntct:-4.892+ctrl:-8762.523+fwd:-76.960+head:-5821.439+srv:4537.000)
[8/10000](#

### Animate final motion

In [None]:
# Toggles read by the final-rollout cell below.
SAVE_VID_FINAL = True   # write each final rollout's frames to an .mp4 under vids/
MAKE_GIF_FINAL = False  # pass each final rollout's frames to display_frames_as_gif

In [None]:
# Roll out the current policy three times and optionally display / record each episode.
for _i in range(3):
    ret = run_episode_vid(env, policy, scaler)
    if MAKE_GIF_FINAL:
        display_frames_as_gif(ret['frames'])
    if SAVE_VID_FINAL:
        outputdata = np.asarray(ret['frames']).astype(np.uint8)
        vidName = 'vids/ant_ppo_final_%d.mp4'%(_i)
        # Create the output directory if needed, so this cell also works when the
        # training loop has not saved a video into vids/ beforehand.
        os.makedirs(os.path.dirname(vidName), exist_ok=True)
        skvideo.io.vwrite(vidName,outputdata)
        print ("[%s] saved."%(vidName))

### Finished

In [None]:
# Safety switch: closing is irreversible for this kernel session, so it stays off
# until flipped to True by hand.
DO_CLOSE = False
if DO_CLOSE:
    # Close the logger first, then the sessions of the policy and the value
    # function, in that order.
    logger.close()
    for net in (policy, val_func):
        net.close_sess()