In [2]:
# Silence TensorFlow / Python warning noise for the rest of the notebook.
import logging
import os
import warnings

# Must be set BEFORE tensorflow is imported to take effect.
# https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints/40426709
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

# https://stackoverflow.com/questions/15777951/how-to-suppress-pandas-future-warning
# Ignoring the base `Warning` category also covers FutureWarning, so a
# separate FutureWarning filter is not needed.
warnings.simplefilter(action='ignore', category=Warning)

import tensorflow as tf

tf.autograph.set_verbosity(0)
# Single, final logger level (the earlier 'INFO' setting was immediately
# overridden by this one, so it has been dropped).
tf.get_logger().setLevel(logging.ERROR)

In [3]:
import gym
from stable_baselines.common.policies import CnnPolicy #, MlpPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv 
from stable_baselines import PPO2

from stable_baselines.common.evaluation import evaluate_policy as test
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [4]:
## Choose one agent, see the docs for a description of each.
## Available ids: 'CarRacing-v0', 'CarRacing-v1', 'CarRacing-v2'
agent = 'CarRacing-v2'

# Callback that stops training once the reward threshold is reached.
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=130, verbose=1)

In [5]:
## Environment parameters.
## Changing any of these makes saved models incompatible!!
game_color = 2
indicators = False
fpst = 4    # forwarded to the env as frames_per_state
skip = 1    # forwarded to the env as skip_frames
# Action table, forwarded to the env as `discre`.
actions = [
    [0, 0, 0],
    [-0.4, 0, 0],
    [0.4, 0, 0],
    [0, 0.6, 0],
    [0, 0, 0.8],
]

In [6]:
## Model / track parameters (training stage 1)
use = 5                 # number of times to use the same track [1,100]
ept = 4                 # different starting points on the same track [1,20]
track_complexity = 8
patience = 2.0
seed = 1000

# using follow_centerline for this first leg of training
REWARD = [-0.0, 0.1, 0.0, 0.0, 1.0, 100, -20, -100, -50]

if agent != 'CarRacing-v2':
    # load the stock gym env
    env1 = gym.make(agent)
else:
    # load Pablo's modified env, which receives the extra keyword arguments
    env1 = gym.make(
        agent,
        seed=seed,
        use_track=use,
        episodes_per_track=ept,
        tr_complexity=track_complexity,
        patience=patience,
        game_color=game_color,
        indicators=indicators,
        discre=actions,             # passing custom actions
        frames_per_state=fpst,
        skip_frames=skip,
        f_reward=REWARD,            # passing a custom reward function
    )

# Wrap the single env in a DummyVecEnv; the default argument pins the raw env
# so the factory still returns it after `env1` is rebound to the wrapper.
env1 = DummyVecEnv([lambda raw_env=env1: raw_env])
env1.metadata

{'render.modes': ['human', 'rgb_array', 'state_pixels'],
 'FPS, 1/timebase': 33.333333333333336,
 'Zoom_level': 1.7,
 'Flight start': False,
 'show_track_1st': False,
 'state_pixels frame size': [96, 96]}

In [7]:
# Show the action and observation spaces exposed by the wrapped training env.
print(env1.action_space)
print(env1.observation_space)

Discrete(5)
Box(0, 255, (96, 96, 4), uint8)


In [8]:
## Training parameters
batch_size = 256    # handed to PPO2 as n_steps
updates = 500       # learn() is later asked for updates*batch_size timesteps
epochs = 4          # handed to PPO2 as nminibatches
# NOTE(review): the name `epochs` suggests optimisation epochs, but it is wired
# to `nminibatches`; PPO2 has a separate `noptepochs` argument -- confirm intent.

model = PPO2(
    CnnPolicy,
    env1,
    verbose=1,
    n_steps=batch_size,
    gamma=0.995,
    learning_rate=0.001,
    nminibatches=epochs,
)  # optional extras left disabled: seed=314, n_cpu_tf_sess=1


In [9]:
## Separate evaluation env
test_episodes_per_track = 10
test_freq = 50      # policy updates between evaluations
eval_log = './evals/'

# Guard on the agent id exactly like the training-env cells do: only the
# 'CarRacing-v2' branch passes the custom keyword arguments; for any other
# agent the env is built with a plain gym.make(agent), as elsewhere in this
# notebook.
if agent == 'CarRacing-v2':  # load Pablo's modified env
    # No f_reward is passed here, so the env's own default reward is used.
    env_test = gym.make(agent, seed=int(3.14*seed),
            use_track = 1,
            episodes_per_track = test_episodes_per_track,
            tr_complexity = 12,
            patience = patience,  #1.0,
            game_color=game_color,
            indicators = indicators,
            discre = actions,
            frames_per_state = fpst,
            skip_frames = skip   )
else:  # load the stock gym env
    env_test = gym.make(agent)

env_test = DummyVecEnv([lambda: env_test])

# Evaluate every test_freq policy updates (test_freq*batch_size env steps),
# save the best model and evaluation logs under eval_log, and hand each new
# best result to callback_on_best.
eval_callback = EvalCallback(env_test, callback_on_new_best=callback_on_best,  #None,
                             n_eval_episodes = test_episodes_per_track*2, eval_freq = test_freq*batch_size,
                             best_model_save_path=eval_log, log_path=eval_log,
                             deterministic=True, render=False)


In [10]:
##Independent test routine
#reward_test, epis = test(model, env_test, n_eval_episodes=test_episodes_per_track, 
#                         deterministic=True, render=False, callback=None, reward_threshold=100, 
#                         return_episode_rewards=True)
#reward_test

In [11]:
## Training #1
# Train for updates*batch_size timesteps, logging every update, with
# eval_callback as the training callback.

model.learn(total_timesteps = updates*batch_size, log_interval=1, callback=eval_callback)


Track generation: 1002..1257 -> 255-tiles track, complex 8
-------------------------------------
| approxkl           | 0.012829069  |
| clipfrac           | 0.19335938   |
| explained_variance | -0.0685      |
| fps                | 59           |
| n_updates          | 1            |
| policy_entropy     | 1.5965626    |
| policy_loss        | -0.004141393 |
| serial_timesteps   | 256          |
| time_elapsed       | 0            |
| total_timesteps    | 256          |
| value_loss         | 3.1661496    |
-------------------------------------
1  cut by time without progress. Steps 274  %advance 2.7  played reward -25.33  last penalty -20
2  cut by time without progress. Steps 177  %advance 1.9  played reward -16.41  last penalty -20
-------------------------------------
| approxkl           | 0.011564367  |
| clipfrac           | 0.23242188   |
| explained_variance | -0.0131      |
| fps                | 64           |
| n_updates          | 2            |
| policy_entropy     | 1.

17  cut by time without progress. Steps 184  %advance 7.4  played reward -11.6  last penalty -20
18  cut by time without progress. Steps 98  %advance 1.5  played reward -8.91  last penalty -20
------------------------------------
| approxkl           | 0.012139194 |
| clipfrac           | 0.22558594  |
| explained_variance | 0.494       |
| fps                | 61          |
| n_updates          | 15          |
| policy_entropy     | 1.3264613   |
| policy_loss        | 0.006636446 |
| serial_timesteps   | 3840        |
| time_elapsed       | 56.9        |
| total_timesteps    | 3840        |
| value_loss         | 4.4469924   |
------------------------------------
19  cut by time without progress. Steps 166  %advance 4.3  played reward -12.95  last penalty -20
20  cut by time without progress. Steps 162  %advance 4.3  played reward -12.55  last penalty -20
Track generation: 922..1157 -> 235-tiles track, complex 8
-------------------------------------
| approxkl           | 0.012894209

15  cut by time without progress. Steps 166  %advance 5.9  played reward -11.37  last penalty -20
16  cut by time without progress. Steps 186  %advance 5.5  played reward -13.79  last penalty -20
--------------------------------------
| approxkl           | 0.0027207742  |
| clipfrac           | 0.0107421875  |
| explained_variance | 0.961         |
| fps                | 63            |
| n_updates          | 28            |
| policy_entropy     | 1.3462328     |
| policy_loss        | -0.0029953467 |
| serial_timesteps   | 7168          |
| time_elapsed       | 108           |
| total_timesteps    | 7168          |
| value_loss         | 0.8334972     |
--------------------------------------
17  cut by time without progress. Steps 163  %advance 3.8  played reward -13.2  last penalty -20
--------------------------------------
| approxkl           | 0.0040262747  |
| clipfrac           | 0.03125       |
| explained_variance | 0.948         |
| fps                | 65            |
| n_u

10  cut by time without progress. Steps 290  %advance 5.1  played reward -24.58  last penalty -20
--------------------------------------
| approxkl           | 0.0027194605  |
| clipfrac           | 0.0029296875  |
| explained_variance | 0.988         |
| fps                | 65            |
| n_updates          | 42            |
| policy_entropy     | 1.268517      |
| policy_loss        | -0.0013749272 |
| serial_timesteps   | 10752         |
| time_elapsed       | 164           |
| total_timesteps    | 10752         |
| value_loss         | 0.16681689    |
--------------------------------------
11  cut by time without progress. Steps 257  %advance 3.4  played reward -23.01  last penalty -20
12  cut by time without progress. Steps 182  %advance 2.1  played reward -16.8  last penalty -20
-------------------------------------
| approxkl           | 0.010721558  |
| clipfrac           | 0.16796875   |
| explained_variance | 0.155        |
| fps                | 63           |
| n_update

18  cut by time without progress. Steps 254  %advance 3.8  played reward -22.28  last penalty -20
--------------------------------------
| approxkl           | 0.0027076302  |
| clipfrac           | 0.029296875   |
| explained_variance | 0.665         |
| fps                | 64            |
| n_updates          | 53            |
| policy_entropy     | 0.8867675     |
| policy_loss        | 0.00049843744 |
| serial_timesteps   | 13568         |
| time_elapsed       | 223           |
| total_timesteps    | 13568         |
| value_loss         | 0.490228      |
--------------------------------------
--------------------------------------
| approxkl           | 0.0033561718  |
| clipfrac           | 0.047851562   |
| explained_variance | 0.657         |
| fps                | 64            |
| n_updates          | 54            |
| policy_entropy     | 0.95214504    |
| policy_loss        | -0.0012238858 |
| serial_timesteps   | 13824         |
| time_elapsed       | 227           |
| tot

10  cut by time without progress. Steps 188  %advance 3.0  played reward -16.42  last penalty -20
------------------------------------
| approxkl           | 0.023186078 |
| clipfrac           | 0.09863281  |
| explained_variance | 0.747       |
| fps                | 62          |
| n_updates          | 68          |
| policy_entropy     | 0.655043    |
| policy_loss        | 0.008579869 |
| serial_timesteps   | 17408       |
| time_elapsed       | 285         |
| total_timesteps    | 17408       |
| value_loss         | 0.78429157  |
------------------------------------
11  cut by time without progress. Steps 288  %advance 7.5  played reward -21.88  last penalty -20
-------------------------------------
| approxkl           | 0.007623635  |
| clipfrac           | 0.12402344   |
| explained_variance | 0.897        |
| fps                | 61           |
| n_updates          | 69           |
| policy_entropy     | 0.833593     |
| policy_loss        | 0.0028099301 |
| serial_timesteps 

5  cut by time without progress. Steps 190  %advance 10.7  played reward -9.07  last penalty -20
6  cut by time without progress. Steps 199  %advance 7.5  played reward -13.1  last penalty -20
--------------------------------------
| approxkl           | 0.0063779284  |
| clipfrac           | 0.09082031    |
| explained_variance | 0.969         |
| fps                | 61            |
| n_updates          | 82            |
| policy_entropy     | 0.9386779     |
| policy_loss        | -0.0010704445 |
| serial_timesteps   | 20992         |
| time_elapsed       | 345           |
| total_timesteps    | 20992         |
| value_loss         | 0.5023587     |
--------------------------------------
7  cut by time without progress. Steps 140  %advance 5.3  played reward -9.43  last penalty -20
8  cut by time without progress. Steps 118  %advance 3.5  played reward -9.02  last penalty -20
----------------------------------------
| approxkl           | 0.008295857     |
| clipfrac           | 0.0

-------------------------------------
| approxkl           | 0.0007809719 |
| clipfrac           | 0.0029296875 |
| explained_variance | 0.875        |
| fps                | 62           |
| n_updates          | 94           |
| policy_entropy     | 0.7230726    |
| policy_loss        | -0.000768575 |
| serial_timesteps   | 24064        |
| time_elapsed       | 393          |
| total_timesteps    | 24064        |
| value_loss         | 0.8519595    |
-------------------------------------
6  out of limits. Steps 175  %advance 8.0  played reward -10.51  last penalty -100
7  out of limits. Steps 136  %advance 6.8  played reward -7.42  last penalty -100
-------------------------------------
| approxkl           | 0.03889422   |
| clipfrac           | 0.16601562   |
| explained_variance | -0.155       |
| fps                | 63           |
| n_updates          | 95           |
| policy_entropy     | 0.5930841    |
| policy_loss        | -0.011612862 |
| serial_timesteps   | 24320        |

<stable_baselines.ppo2.ppo2.PPO2 at 0x174fd063f28>

In [12]:
import pickle
root = 'ppo_cnn_gym-mod_'

# Checkpoint name encodes the env/model settings the weights depend on.
file = root + 'c{color:d}_f{frames:d}_s{skip:d}_u{use:d}_e{ept:d}_p{patience}_{indicators}_bs{bs:d}'.format(
    color=game_color,
    frames=fpst,
    skip=skip,
    use=use,
    ept=ept,
    patience=patience,
    indicators=indicators,
    bs=batch_size,
)

model.save(file, cloudpickle=True)
# NOTE(review): get_parameter_list() appears to return the TF variables rather
# than their values; load_parameters() expects values -- confirm before reuse.
param_list = model.get_parameter_list()


In [13]:
#env1.reset_track()
# Close the stage-1 training env now that training #1 has finished.
env1.close()

In [14]:
## Model / track parameters (training stage 2)
## `use`, `ept` and `patience` are inherited unchanged from stage 1:
#use = 5       # number of times to use same track [1,100]
#ept = 4       # different starting points on same track [1,20]
#patience = 2.0
seed = 20000
track_complexity = 12
updates = 500

if agent != 'CarRacing-v2':
    # load the stock gym env
    env2 = gym.make(agent)
else:
    # load Pablo's modified env; no f_reward is passed, so STD_REWARD is used
    env2 = gym.make(
        agent,
        seed=seed,
        use_track=use,
        episodes_per_track=ept,
        tr_complexity=track_complexity,
        patience=patience,
        game_color=game_color,
        indicators=indicators,
        discre=actions,
        frames_per_state=fpst,
        skip_frames=skip,
    )

# The default argument pins the raw env so the factory still returns it after
# `env2` is rebound to the wrapper.
env2 = DummyVecEnv([lambda raw_env=env2: raw_env])

In [15]:
## Training  #2
new_mod = False  # to change batch_size you need a new model !!
updates=500

if new_mod:
    # Build a fresh model with a different rollout length and start it from
    # the stage-1 weights.
    batch_size2 = 512
    model2 = PPO2(CnnPolicy, env2, verbose=1, n_steps=batch_size2,
             gamma=0.995, learning_rate=0.001, nminibatches=epochs) #, seed=314, n_cpu_tf_sess=1)
    # load_parameters() needs parameter VALUES ({variable name: ndarray});
    # get_parameter_list() yields the TF variables of the old model instead,
    # so fetch the values with get_parameters().
    model2.load_parameters(model.get_parameters(), exact_match=True)
    model2.learn(total_timesteps = updates*batch_size2, log_interval=1, callback=eval_callback)
else:
    # Keep training the stage-1 model, just on the new environment.
    model.set_env(env2)
    model.learn(total_timesteps = updates*batch_size, log_interval=1, callback=eval_callback)


Track generation: 1276..1598 -> 322-tiles track, complex 12
1  cut by time without progress. Steps 135  %advance 5.6  played reward -8.41  last penalty -20
2  out of limits. Steps 108  %advance 4.3  played reward -7.27  last penalty -100
-------------------------------------
| approxkl           | 0.032204997  |
| clipfrac           | 0.3564453    |
| explained_variance | -0.118       |
| fps                | 55           |
| n_updates          | 1            |
| policy_entropy     | 0.8654776    |
| policy_loss        | 0.0054762294 |
| serial_timesteps   | 256          |
| time_elapsed       | 0.003        |
| total_timesteps    | 256          |
| value_loss         | 124.800285   |
-------------------------------------
3  cut by time without progress. Steps 157  %advance 9.0  played reward -7.18  last penalty -20
4  cut by time without progress. Steps 90  %advance 1.5  played reward -7.96  last penalty -20
------------------------------------
| approxkl           | 0.01015443  |
| c

------------------------------------
| approxkl           | 0.04437246  |
| clipfrac           | 0.25390625  |
| explained_variance | 0.611       |
| fps                | 58          |
| n_updates          | 14          |
| policy_entropy     | 1.0550444   |
| policy_loss        | 0.017743861 |
| serial_timesteps   | 3584        |
| time_elapsed       | 58          |
| total_timesteps    | 3584        |
| value_loss         | 0.26763102  |
------------------------------------
20  cut by time without progress. Steps 206  %advance 1.5  played reward -19.56  last penalty -20
Track generation: 1090..1366 -> 276-tiles track, complex 12
--------------------------------------
| approxkl           | 0.100758344   |
| clipfrac           | 0.109375      |
| explained_variance | 0.915         |
| fps                | 59            |
| n_updates          | 15            |
| policy_entropy     | 0.49954253    |
| policy_loss        | -0.0030039381 |
| serial_timesteps   | 3840          |
| time_ela

13  cut by time without progress. Steps 67  %advance 0.7  played reward -6.59  last penalty -20
14  cut by time without progress. Steps 216  %advance 2.5  played reward -19.68  last penalty -20
--------------------------------------
| approxkl           | 0.003306784   |
| clipfrac           | 0.01171875    |
| explained_variance | 0.573         |
| fps                | 57            |
| n_updates          | 28            |
| policy_entropy     | 0.18181124    |
| policy_loss        | 0.00024356082 |
| serial_timesteps   | 7168          |
| time_elapsed       | 117           |
| total_timesteps    | 7168          |
| value_loss         | 0.32396385    |
--------------------------------------
15  cut by time without progress. Steps 255  %advance 4.7  played reward -21.4  last penalty -20
-------------------------------------
| approxkl           | 0.0007119389 |
| clipfrac           | 0.013671875  |
| explained_variance | 0.472        |
| fps                | 61           |
| n_updates 

5  cut by time without progress. Steps 287  %advance 7.0  played reward -22.32  last penalty -20
-------------------------------------
| approxkl           | 0.0023516018 |
| clipfrac           | 0.033203125  |
| explained_variance | 0.961        |
| fps                | 62           |
| n_updates          | 42           |
| policy_entropy     | 0.17989947   |
| policy_loss        | 0.0013009903 |
| serial_timesteps   | 10752        |
| time_elapsed       | 176          |
| total_timesteps    | 10752        |
| value_loss         | 0.24222964   |
-------------------------------------
--------------------------------------
| approxkl           | 0.0026700757  |
| clipfrac           | 0.041992188   |
| explained_variance | 0.885         |
| fps                | 55            |
| n_updates          | 43            |
| policy_entropy     | 0.23668519    |
| policy_loss        | -0.0019969468 |
| serial_timesteps   | 11008         |
| time_elapsed       | 181           |
| total_timesteps  

14  cut by time without progress. Steps 336  %advance 9.0  played reward -25.26  last penalty -20
--------------------------------------
| approxkl           | 0.00027525125 |
| clipfrac           | 0.0           |
| explained_variance | 0.866         |
| fps                | 62            |
| n_updates          | 53            |
| policy_entropy     | 0.25270495    |
| policy_loss        | 0.00029649626 |
| serial_timesteps   | 13568         |
| time_elapsed       | 239           |
| total_timesteps    | 13568         |
| value_loss         | 0.24532913    |
--------------------------------------
15  cut by time without progress. Steps 293  %advance 6.2  played reward -23.7  last penalty -20
--------------------------------------
| approxkl           | 0.0030368376  |
| clipfrac           | 0.030273438   |
| explained_variance | 0.932         |
| fps                | 60            |
| n_updates          | 54            |
| policy_entropy     | 0.21023458    |
| policy_loss        | -0

---------------------------------------
| approxkl           | 0.00019427393  |
| clipfrac           | 0.0048828125   |
| explained_variance | 0.942          |
| fps                | 60             |
| n_updates          | 67             |
| policy_entropy     | 0.111802675    |
| policy_loss        | -0.00025101006 |
| serial_timesteps   | 17152          |
| time_elapsed       | 298            |
| total_timesteps    | 17152          |
| value_loss         | 0.2244978      |
---------------------------------------
5  cut by time without progress. Steps 377  %advance 5.7  played reward -32.6  last penalty -20
--------------------------------------
| approxkl           | 0.00026341478 |
| clipfrac           | 0.005859375   |
| explained_variance | 0.752         |
| fps                | 61            |
| n_updates          | 68            |
| policy_entropy     | 0.116367504   |
| policy_loss        | -3.220822e-05 |
| serial_timesteps   | 17408         |
| time_elapsed       | 302       

--------------------------------------
| approxkl           | 0.00032805535 |
| clipfrac           | 0.0048828125  |
| explained_variance | 0.751         |
| fps                | 59            |
| n_updates          | 81            |
| policy_entropy     | 0.036237948   |
| policy_loss        | -0.0002043538 |
| serial_timesteps   | 20736         |
| time_elapsed       | 358           |
| total_timesteps    | 20736         |
| value_loss         | 0.06783548    |
--------------------------------------
-------------------------------------
| approxkl           | 4.874659e-07 |
| clipfrac           | 0.0          |
| explained_variance | 0.971        |
| fps                | 61           |
| n_updates          | 82           |
| policy_entropy     | 0.02366533   |
| policy_loss        | 2.02097e-07  |
| serial_timesteps   | 20992        |
| time_elapsed       | 362          |
| total_timesteps    | 20992        |
| value_loss         | 0.3959296    |
-------------------------------------

17  cut by time without progress. Steps 363  %advance 7.4  played reward -29.5  last penalty -20
---------------------------------------
| approxkl           | 0.0010908282   |
| clipfrac           | 0.02734375     |
| explained_variance | 0.984          |
| fps                | 62             |
| n_updates          | 94             |
| policy_entropy     | 0.18013638     |
| policy_loss        | -0.00056110637 |
| serial_timesteps   | 24064          |
| time_elapsed       | 413            |
| total_timesteps    | 24064          |
| value_loss         | 0.049067397    |
---------------------------------------
18  cut by time without progress. Steps 382  %advance 11.8  played reward -27.07  last penalty -20
---------------------------------------
| approxkl           | 0.0010961685   |
| clipfrac           | 0.0078125      |
| explained_variance | 0.944          |
| fps                | 61             |
| n_updates          | 95             |
| policy_entropy     | 0.12717138     |
| po

In [16]:
# Save the stage-2 model; both branches use the 'II' suffix so the stage-1
# checkpoint is never overwritten.
if new_mod:
    # model2 was built with batch_size2, so the name must record that value
    # (using batch_size reproduced the stage-1 file name exactly).
    file = root+'c{:d}_f{:d}_s{:d}_u{:d}_e{:d}_p{}_{}_bs{:d}'.format(game_color,fpst,skip,use,ept,patience,indicators,batch_size2)
    model2.save(file+'II', cloudpickle=True)
    param_list=model2.get_parameter_list()
else:
    model.save(file+'II', cloudpickle=True)
    param_list=model.get_parameter_list()


In [17]:
# Close the stage-2 training env and the evaluation env.
env2.close()
env_test.close()

In [18]:
## Enjoy last trained policy

if agent=='CarRacing-v2':  # load Pablo's modified env
    env3 = gym.make(agent, seed=None, 
        game_color=game_color,
        use_track = 1,       
        episodes_per_track = 1,  
        tr_complexity = 12, 
        patience = 4.0,
        discre = actions,
        indicators = True,
        frames_per_state = fpst,
        skip_frames = skip   )
else:  # load the stock gym env
    env3 = gym.make(agent)

env3 = DummyVecEnv([lambda: env3])
obs = env3.reset()
print(obs.shape)        

# Stage 2 trains `model2` when new_mod is set and keeps training `model`
# otherwise, so pick whichever one was actually trained last.
last_model = model2 if new_mod else model

done = False
pasos = 0        # step counter
_states=None

# Play one episode, capped at 1002 steps; always close the env, even if
# predict/step/render raises.
try:
    while not done and pasos<1002:
        action, _states = last_model.predict(obs, deterministic=True)
        obs, reward, done, info = env3.step(action)
        env3.render()
        pasos+=1
finally:
    env3.close()
print()
print(reward, done, info, pasos)

Track generation: 927..1162 -> 235-tiles track, complex 12
(96, 96, 4)
1  cut by time without progress. Steps 134  %advance 0.8  played reward -13.29  last penalty -20

-20 True {} 134


In [19]:
## Enjoy best eval_policy

# NOTE(review): env3 was already closed by the previous cell; the recorded
# output shows reset()/render() still work afterwards, but recreating the env
# here would be more robust -- confirm.
obs = env3.reset()
print(obs.shape)        

## Load the best model saved under eval_log by the evaluation callback.
## os.path.join keeps the path valid whether or not eval_log ends with a slash.
model_test = PPO2.load(os.path.join(eval_log, 'best_model'), env3)

done = False
pasos = 0        # step counter
_states=None

# Play one episode, capped at 1002 steps; always close the env, even if
# predict/step/render raises.
try:
    while not done and pasos<1002:
        action, _states = model_test.predict(obs, deterministic=True)
        obs, reward, done, info = env3.step(action)
        env3.render()
        pasos+=1
finally:
    env3.close()
print()
print(reward, done, pasos)
print(action, _states)

Track generation: 1038..1301 -> 263-tiles track, complex 12
(1, 96, 96, 4)
1  cut by time without progress. Steps 134  %advance 0.7  played reward -13.29  last penalty -20
Track generation: 920..1157 -> 237-tiles track, complex 12

[-20.] [ True] 134
[0] None


In [20]:
# Keep a copy of the loaded best-evaluation model next to the other checkpoints.
model_test.save(file+'_evalbest', cloudpickle=True)

In [21]:
# NOTE(review): env2 is already closed in an earlier cell; this repeat looks redundant.
env2.close()

In [22]:
# NOTE(review): env3 is already closed at the end of the playback cells; this repeat looks redundant.
env3.close()

In [23]:
# NOTE(review): env_test is already closed in an earlier cell; this repeat looks redundant.
env_test.close()

In [24]:
# Debug check: last action and state value returned by predict() in the playback loop.
print(action, _states)

[0] None


In [25]:
# Debug check: shape of the last observation returned by the vectorised env.
obs.shape

(1, 96, 96, 4)