In [None]:
# Filter tensorflow version warnings
# Everything in this cell only reduces console noise (TensorFlow log output
# and Python warnings); nothing here touches training itself.
import os
# The environment variable is assigned before `import tensorflow` further down.
# https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints/40426709
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import warnings
# https://stackoverflow.com/questions/15777951/how-to-suppress-pandas-future-warning
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): ignoring the base `Warning` category also covers FutureWarning,
# so the filter on the previous line is redundant.
warnings.simplefilter(action='ignore', category=Warning)
import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)
import logging
# Final level of the TensorFlow logger: this replaces the 'INFO' level set above.
tf.get_logger().setLevel(logging.ERROR)

In [None]:
import gym
from stable_baselines.common.policies import CnnPolicy #, MlpPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv 
from stable_baselines import PPO2

from stable_baselines.common.evaluation import evaluate_policy as test
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [None]:
## Agent selection (see the documentation for a description of each one).
## Alternatives: 'CarRacing-v0', 'CarRacing-v1'
agent = 'CarRacing-v3'

# Seed handed to the first training environment.
seed = 2000

# Stop training when the model reaches the reward threshold.
# This callback is later given to EvalCallback as `callback_on_new_best`.
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=170,
    verbose=1,
)

In [None]:
## SIMULATION parameters
## Changing any of these makes world models incompatible!!
game_color = 2
indicators = True
fpst, skip = 4, 3   # passed to the environment as frames_per_state / skip_frames

# Discrete action table (this is ACT): one 3-component entry per action.
# Only its length is used below (in the checkpoint file names); the `discre=actions`
# environment argument is commented out everywhere in this notebook.
actions = [
    [0, 0, 0],
    [-0.4, 0, 0],
    [0.4, 0, 0],
    [0, 0.6, 0],
    [0, 0, 0.8],
]

# Obstacle positions as track percentage; a negative value puts the obstacle
# on the left-hand side.
obst_loc = [
    6, -12,
    25, -50,
    75, -37,
    62, -87,
    95, -29,
]


In [None]:
## Load the drive-pretrained model

import pickle

# Checkpoint name: fixed prefix followed by a tag built from the simulation
# parameters (color mode, frames per state, skipped frames, indicators flag,
# number of discrete actions).
root = 'ppo_cnn_gym-mod_'
file = root + 'c{:d}_f{:d}_s{:d}_{}_a{:d}'.format(
    game_color,
    fpst,
    skip,
    indicators,
    len(actions),
)

# NOTE(review): loading a saved model deserializes data from disk; only load
# checkpoints from a trusted source.
model = PPO2.load(file)

In [None]:
## Parameters of this training stage
use = 6       # number of times to use same track [1,100]
ept = 10      # different starting points on same track [1,20]
patience = 1.0
track_complexity = 12
# Optional custom reward vector (currently disabled):
#REWARD2 = [-0.05, 0.1, 0.0, 0.0,   2.0, 0.0,   100, -20, -100, -50,   -5, -100]

if agent != 'CarRacing-v3':
    # Any other agent is created without extra keyword arguments.
    env = gym.make(agent)
else:
    env = gym.make(
        agent,
        seed=seed,
        game_color=game_color,
        indicators=indicators,
        frames_per_state=fpst,
        skip_frames=skip,
        # discre=actions,            # would pass the custom actions
        use_track=use,
        episodes_per_track=ept,
        tr_complexity=track_complexity,
        tr_width=45,
        patience=patience,
        off_track=patience,          # same value as `patience`
        end_on_contact=True,         # learning to avoid obstacles the-hard-way
        oily_patch=False,
        num_obstacles=5,             # some obstacles
        obst_location=obst_loc,      # passing fixed obstacle location
        # f_reward=REWARD2,          # would pass a custom reward function
        verbose=2,
    )

# Wrap the single environment in a vectorized environment.
# NOTE(review): the lambda looks up the global `env` only when it is called;
# this relies on DummyVecEnv calling it before `env` is rebound - confirm.
env = DummyVecEnv([lambda: env])

In [None]:
## Training on obstacles (stage 1)
# The training length is computed in the next cell as updates * batch_size.
# NOTE(review): assumes `batch_size` equals the number of timesteps the loaded
# model consumes per policy update - confirm against the saved model.
batch_size, updates = 256, 700

# Attach the obstacle environment to the pretrained model.
model.set_env(env)

In [None]:
# First training stage; no callback is attached here.
model.learn(
    total_timesteps=updates * batch_size,
    log_interval=1,
)

In [None]:
# Save the model as updated by the training run above.

# The file name extends the simulation tag with this stage's parameters:
# track reuse, episodes per track, patience and batch size.
file = root + 'c{:d}_f{:d}_s{:d}_{}_a{:d}__u{:d}_e{:d}_p{}_bs{:d}'.format(
    game_color, fpst, skip, indicators, len(actions),
    use, ept, patience, batch_size,
)

model.save(file, cloudpickle=True)
# Keep the model's parameter list for inspection; it is not used again in the visible cells.
param_list=model.get_parameter_list()


In [None]:
# Release the first training environment; the next stage builds its own (env2).
env.close()

In [None]:
## Parameters of training stage #2
use = 6       # number of times to use same track [1,100]
ept = 10      # different starting points on same track [1,20]
patience = 1.0
track_complexity = 12
# Optional custom reward vector (currently disabled):
#REWARD2 = [-0.05, 0.1, 0.0, 0.0,   2.0, 0.0,   100, -20, -100, -50,   -5, -100]
seed = 25000  # replaces the seed used for the first stage

if agent != 'CarRacing-v3':
    # Any other agent is created without extra keyword arguments.
    env2 = gym.make(agent)
else:
    env2 = gym.make(
        agent,
        seed=seed,
        game_color=game_color,
        indicators=indicators,
        frames_per_state=fpst,
        skip_frames=skip,
        # discre=actions,            # would pass the custom actions
        use_track=use,
        episodes_per_track=ept,
        tr_complexity=track_complexity,
        tr_width=45,
        patience=patience,
        off_track=patience,          # same value as `patience`
        end_on_contact=False,        # CHANGED with respect to the first stage
        oily_patch=False,
        num_obstacles=5,             # some obstacles
        obst_location=0,             # using random obstacle location
        # f_reward=REWARD2,          # would pass a custom reward function
        verbose=3,
    )

# Wrap the single environment in a vectorized environment.
# NOTE(review): the lambda looks up the global `env2` only when it is called;
# this relies on DummyVecEnv calling it before `env2` is rebound - confirm.
env2 = DummyVecEnv([lambda: env2])

In [None]:
## Training on obstacles (stage 2)
# Longer run than the first stage; `batch_size` keeps the value assigned for
# the first stage (an alternative of 384 was considered and left disabled).
updates = 1500
#batch_size = 384

# Switch the model over to the second training environment.
model.set_env(env2)

In [None]:
## Separate evaluation env
test_freq = 100               # policy updates until evaluation
test_episodes_per_track = 5   # number of starting points on test_track
eval_log = './evals/'         # best model and evaluation logs are written here

# Same guard as for the training and demo environments in this notebook: the
# custom keyword arguments are only passed when the agent is 'CarRacing-v3';
# any other agent is created without them.
if agent=='CarRacing-v3':
    env_test = gym.make(agent, seed=int(3.14*seed),   # seed derived from, but different to, the training seed
            game_color=game_color,
            indicators=indicators,
            frames_per_state=fpst,
            skip_frames=skip,
    #        discre=actions,            #passing custom actions
            use_track = 1,           #change test track after 1 ept round
            episodes_per_track = test_episodes_per_track,
            tr_complexity = 12,      #test on a medium complexity track
            tr_width = 45,
            patience = 2.0,
            off_track = 2.0,
            end_on_contact = False,
            oily_patch = False,
            num_obstacles = 5,
            obst_location = obst_loc)  #passing fixed obstacle location
else:
    env_test = gym.make(agent)

env_test = DummyVecEnv([lambda: env_test])

# Periodic evaluation on env_test. `callback_on_best` is triggered on a new best
# result and stops training once its reward threshold is reached.
# NOTE(review): eval_freq is written as test_freq*batch_size, i.e. it assumes
# `batch_size` timesteps per policy update - confirm against the model.
eval_callback = EvalCallback(env_test, callback_on_new_best=callback_on_best,  #None,
                             n_eval_episodes=test_episodes_per_track*3, eval_freq=test_freq*batch_size,
                             best_model_save_path=eval_log, log_path=eval_log, deterministic=True,
                             render = False)


In [None]:
# Second training stage, with periodic evaluation through `eval_callback`.
model.learn(
    total_timesteps=updates * batch_size,
    log_interval=1,
    callback=eval_callback,
)

In [None]:
# Save the model as updated by the second training stage.
#
# `file` is reused from the first save cell and only gets an '_II' suffix.
# Recomputing it would look like this (left disabled):
#file = root+'c{:d}_f{:d}_s{:d}_{}_a{:d}__u{:d}_e{:d}_p{}_bs{:d}'.format(
#    game_color,fpst,skip,indicators,len(actions),use,ept,patience,batch_size)

model.save(
    file + '_II',
    cloudpickle=True,
)
# Parameter list kept for inspection; not used again in the visible cells.
param_list = model.get_parameter_list()


In [None]:
# Second-stage training and evaluation are finished: release both environments.
env2.close()
env_test.close()

In [None]:
## Enjoy last trained policy

# Independent demo environment: apart from the shared simulation parameters,
# almost everything is left at the std/random definition.
if agent=='CarRacing-v3':
    env3 = gym.make(agent, seed=None,
        game_color=game_color,
        indicators=indicators,   # use the shared simulation parameter rather than a hard-coded True
        frames_per_state=fpst,
        skip_frames=skip,
        # discre=actions,
        use_track = 2,
        episodes_per_track = 1,
        patience = 5.0,
        off_track = 3.0    )
else:
    env3 = gym.make(agent)

env3 = DummyVecEnv([lambda: env3])
obs = env3.reset()
print(obs.shape)

done = False
pasos = 0        # number of environment steps taken in this rollout
_states=None

# Roll the policy out deterministically until the environment reports `done`.
while not done: # and pasos<1500:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env3.step(action)
    env3.render()
    pasos+=1

# env3 is intentionally NOT closed here: the following "Enjoy best eval_policy"
# cell resets and steps the same environment and closes it when it is done.
print()
# `reward` is the value returned by the last step, not an episode total.
print(reward, done, pasos) #, info)

In [None]:
## Enjoy the best policy found during evaluation

obs = env3.reset()
print(obs.shape)

## Load the best model stored by the evaluation callback under `eval_log`
#if not isinstance(model_test, PPO2):
model_test = PPO2.load(eval_log + 'best_model', env3)

# Rollout bookkeeping: termination flag, step counter, last policy state.
done, pasos, _states = False, 0, None

# Deterministic rollout until the environment reports `done`
# (an optional step cap `pasos<1500` is left disabled).
while not done:
    action, _states = model_test.predict(obs, deterministic=True)
    obs, reward, done, info = env3.step(action)
    env3.render()
    pasos += 1

env3.close()
print()
print(reward, done, pasos)
print(action, _states)

In [None]:
# Store the best evaluation model under the same base name as the other
# checkpoints, with an '_evalbest' suffix.
model_test.save(file+'_evalbest', cloudpickle=True)

In [None]:
# Repeated cleanup: env2 is also closed right after the second training stage.
env2.close()

In [None]:
# Repeated cleanup: env3 is also closed at the end of the "Enjoy best eval_policy" cell.
env3.close()

In [None]:
# Repeated cleanup: env_test is also closed together with env2 after the second training stage.
env_test.close()

In [None]:
# Scratch inspection: last action and policy state left over from the most
# recently executed rollout loop.
print(action, _states)

In [None]:
# Scratch inspection: shape of the last observation held in `obs`.
obs.shape