# Run PPO on Ant, using the implementation from [pat-coady/trpo](https://github.com/pat-coady/trpo)
#### More details can be found in Patrick Coady's blog post on [gym and PPO](https://learningai.io/projects/2017/07/28/ai-gym-workout.html) and in his [description of the Ant env](https://gist.github.com/pat-coady/bac60888f011199aad72d2f1e6f5a4fa)

In [1]:
import gym,mujoco_py,warnings,time,os,glob,shutil,csv,skvideo.io
gym.logger.set_level(40)
warnings.filterwarnings("ignore") 
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym.envs import mujoco
from datetime import datetime
from util import PID_class,Scaler,Logger,display_frames_as_gif
from custom_ant import AntEnvCustom
from ppo import NNValueFunction,Policy,run_episode,run_policy,add_value,discount,\
    add_disc_sum_rew,add_gae,build_train_set,log_batch_stats,run_episode_vid
# Compact NumPy printing: 2 decimal places, lines up to 150 characters wide.
np.set_printoptions(precision=2,linewidth=150)
# IPython magics (not plain Python): select the inline matplotlib backend and
# set the inline backend's figure format to 'retina'.
%matplotlib inline  
%config InlineBackend.figure_format = 'retina'
# Marker printed once the import/config cell has run to the end.
print ("Packages Loaded") 

Packages Loaded


### main

In [2]:
# ---- Tunable constants, grouped up front ----
env_name = 'Ant'        # used as the logger name and in aigym_path
hid1_mult = 10          # passed to both NNValueFunction and Policy
kl_targ = 0.003         # passed to Policy (presumably its KL target -- confirm in ppo.py)
policy_logvar = -1.0    # passed to Policy (presumably initial log-variance -- confirm in ppo.py)

# ---- Environment ----
env = AntEnvCustom()
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
env.reset() # Reset
print ("obs_dim:[%d] act_dim:[%d]"%(obs_dim,act_dim))
# The raw sizes are reported above; everything below uses the widened size.
obs_dim = obs_dim + 1  # add 1 to obs dimension for time step feature (see run_episode())

# ---- Logger ----
now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
logger = Logger(logName=env_name,now=now,_NOTUSE=True)
aigym_path = os.path.join('/tmp', env_name, now)  # not read by any other cell shown in this notebook

# ---- Observation scaler, critic and actor (all sized with the widened obs_dim) ----
scaler = Scaler(obs_dim)
val_func = NNValueFunction(obs_dim, hid1_mult)
policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)

Custom Ant Environment made by SJ.
obs_dim:[111] act_dim:[8]
Value Params -- h1: 1120, h2: 74, h3: 5, lr: 0.00116
Policy Params -- h1: 1120, h2: 299, h3: 80, lr: 5.2e-05, logvar_speed: 16
setting up loss with KL penalty


### Run policy for the first time

In [3]:
# Smoke test: roll out a handful of episodes, run them through the same
# value / return / advantage steps the training loop uses, and print the
# shape of every array stored on the first trajectory.
gamma = 0.995 # Discount factor
lam = 0.95 # Lambda for GAE

trajectories = run_policy(env, policy, scaler, logger, episodes=5)
add_value(trajectories, val_func)  # add estimated values to episodes
add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
add_gae(trajectories, gamma, lam)  # calculate advantage

# (printed label, trajectory key) pairs, in display order.
_shape_report = (
    ('observes shape:', 'observes'),
    ('actions shape:', 'actions'),
    ('rewards shape:', 'rewards'),
    ('unscaled_obs shape:', 'unscaled_obs'),
    ('values shape:', 'values'),
    ('disc_sum_rew shape:', 'disc_sum_rew'),
    ('advantages shape:', 'advantages'),
)
for _label, _key in _shape_report:
    print (_label, trajectories[0][_key].shape)

observes shape: (42, 112)
actions shape: (42, 8)
rewards shape: (42,)
unscaled_obs shape: (42, 112)
values shape: (42,)
disc_sum_rew shape: (42,)
advantages shape: (42,)


### Loop

In [4]:
# Output switches read by the training loop for its periodic evaluation episode:
#   SAVE_VID -- write the episode's frames to an .mp4 with skvideo.io.vwrite
#   MAKE_GIF -- pass the episode's frames to display_frames_as_gif
SAVE_VID, MAKE_GIF = True, False

In [None]:
# ---- Training configuration (loop-invariant, so set once before the loop) ----
maxEpoch  = 10000   # number of collect-and-update iterations
batchSize = 50      # episodes rolled out per iteration
gamma = 0.995       # Discount factor
lam = 0.95          # Lambda for GAE
PLOT_EVERY = 20     # run an evaluation episode every PLOT_EVERY epochs
DO_ANIMATE = False  # not read anywhere in this cell; kept so the name stays defined

# Videos are written to paths under 'vids/'. Create the directory up front so
# the write does not depend on it having been created by hand.
if SAVE_VID:
    os.makedirs('vids', exist_ok=True)

for _epoch in range(maxEpoch):
    # 1. Run policy
    trajectories = run_policy(env, policy, scaler, logger, episodes=batchSize)
    # 2. Get (predict) value from the critic network
    add_value(trajectories, val_func)  # add estimated values to episodes
    # 3. Get GAE
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    # add various stats to training log (left disabled):
    # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
    # Update
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function
    # logger.write(display=True)  # write logger results to file and stdout

    # ---- Per-epoch statistics ----
    # All rewards of the batch in one array (single concatenate instead of
    # growing the array once per trajectory).
    rTotal = np.concatenate([traj['rewards'] for traj in trajectories])
    # Total number of steps across the batch.
    tickSum = sum(traj['rewards'].shape[0] for traj in trajectories)
    # Flatten the per-step reward breakdowns of every trajectory, in order.
    _steps = [step for traj in trajectories for step in traj['rDetails']]
    reward_contacts = [step['reward_contact'] for step in _steps]
    reward_ctrls = [step['reward_ctrl'] for step in _steps]
    reward_forwards = [step['reward_forward'] for step in _steps]
    reward_headings = [step['reward_heading'] for step in _steps]
    reward_survives = [step['reward_survive'] for step in _steps]

    # Every figure below is a batch total divided by batchSize, i.e. a per-episode mean.
    tickAvg = tickSum / batchSize
    sumRwd = rTotal.sum() / batchSize
    sumReward_contact = np.asarray(reward_contacts).sum() / batchSize
    sumReward_ctrl = np.asarray(reward_ctrls).sum() / batchSize
    sumReward_forward = np.asarray(reward_forwards).sum() / batchSize
    sumReward_heading = np.asarray(reward_headings).sum() / batchSize
    sumReward_survive = np.asarray(reward_survives).sum() / batchSize
    print ("[%d/%d](#total:%d) sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f) tickAvg:[%d]"%
           (_epoch,maxEpoch,(_epoch+1)*batchSize,sumRwd,
           sumReward_contact,sumReward_ctrl,sumReward_forward,sumReward_heading,sumReward_survive,tickAvg))

    # ---- Periodic evaluation episode: every PLOT_EVERY epochs and on the last epoch ----
    if (_epoch % PLOT_EVERY == 0) or (_epoch == maxEpoch - 1):
        ret = run_episode_vid(env, policy, scaler)
        print ("  [^] sumRwd:[%.3f] Xdisp:[%.3f] hDisp:[%.1f]"%
               (np.asarray(ret['rewards']).sum(),ret['xDisp'],ret['hDisp']))
        if MAKE_GIF:
            display_frames_as_gif(ret['frames'])
        if SAVE_VID:
            # Frames are cast to uint8 before being handed to the video writer.
            outputdata = np.asarray(ret['frames']).astype(np.uint8)
            vidName = 'vids/ant_ppo_epoch%03d.mp4'%(_epoch)
            skvideo.io.vwrite(vidName,outputdata)
            print ("[%s] saved."%(vidName))
print ("Done.")

[0/10000](#total:50) sumRwd:[-129.968](cntct:-0.083+ctrl:-149.131+fwd:-9.885+head:-47.769+srv:76.900) tickAvg:[76]
Creating window glfw
  [^] sumRwd:[-33.288] Xdisp:[-0.314] hDisp:[-67.1]
[vids/ant_ppo_epoch000.mp4] saved.
[1/10000](#total:100) sumRwd:[-138.597](cntct:-0.093+ctrl:-167.291+fwd:-7.121+head:-51.072+srv:86.980) tickAvg:[86]
[2/10000](#total:150) sumRwd:[-151.172](cntct:-0.090+ctrl:-165.621+fwd:-7.401+head:-63.540+srv:85.480) tickAvg:[85]
[3/10000](#total:200) sumRwd:[-130.932](cntct:-0.085+ctrl:-151.000+fwd:-11.546+head:-46.801+srv:78.500) tickAvg:[78]
[4/10000](#total:250) sumRwd:[-176.840](cntct:-0.100+ctrl:-186.039+fwd:-12.293+head:-72.548+srv:94.140) tickAvg:[94]
[5/10000](#total:300) sumRwd:[-117.027](cntct:-0.081+ctrl:-149.985+fwd:-1.702+head:-42.759+srv:77.500) tickAvg:[77]
[6/10000](#total:350) sumRwd:[-131.626](cntct:-0.081+ctrl:-151.955+fwd:-5.846+head:-51.524+srv:77.780) tickAvg:[77]


### Animate final motion

In [None]:
# Output switches read by the final-rollout cell:
#   SAVE_VID_FINAL -- write each episode's frames to an .mp4 with skvideo.io.vwrite
#   MAKE_GIF_FINAL -- pass each episode's frames to display_frames_as_gif
SAVE_VID_FINAL, MAKE_GIF_FINAL = True, False

In [None]:
# Roll out the current policy three times; each episode is optionally shown
# as a GIF and/or saved as 'vids/ant_ppo_final_<index>.mp4'.

# Videos are written to paths under 'vids/'. Create the directory first so
# the write does not depend on it having been created by hand.
if SAVE_VID_FINAL:
    os.makedirs('vids', exist_ok=True)

for _i in range(3):
    ret = run_episode_vid(env, policy, scaler)
    if MAKE_GIF_FINAL:
        display_frames_as_gif(ret['frames'])
    if SAVE_VID_FINAL:
        # Frames are cast to uint8 before being handed to the video writer.
        outputdata = np.asarray(ret['frames']).astype(np.uint8)
        vidName = 'vids/ant_ppo_final_%d.mp4'%(_i)
        skvideo.io.vwrite(vidName,outputdata)
        print ("[%s] saved."%(vidName))

### Finished

In [None]:
# Optional teardown. The close calls below run only when DO_CLOSE is True;
# it is False here, so running this cell as-is closes nothing.
DO_CLOSE = False # There is no turning back. 
if DO_CLOSE:
    # NOTE(review): presumably releases the logger's resources and the
    # sessions held by the policy and value networks, after which they can
    # no longer be used for rollouts or updates -- confirm in util.py / ppo.py.
    logger.close()
    policy.close_sess()
    val_func.close_sess()