In [None]:
import gym
import numpy as np
import pandas as pd
import os
import cv2
import copy
import matplotlib.pyplot as plt
import matplotlib.animation
from IPython.display import display, HTML
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

In [None]:
# --- Training run ---
TOTAL_TIMESTEP_NUMB = 100_000  # total_timesteps handed to model.learn()
CHECK_FREQ_NUMB = 10_000       # not referenced by the cells shown in this notebook

# --- PPO hyper-parameters (forwarded to the PPO constructor) ---
LEARNING_RATE = 5e-05
GAMMA = 0.9
N_STEPS = 2048
N_EPOCHS = 10
BATCH_SIZE = 64

# --- Observation pre-processing knobs (not referenced by the cells shown) ---
SKIP_NUMB = 2
DOWN_SAMPLE_RATE = 3

# --- Evaluation ---
EPISODE_NUMBERS = 10           # number of test episodes rolled out after training

In [None]:
# Output locations, relative to the notebook's working directory.
LOG_DIR = './logs/'          # handed to PPO as its tensorboard_log folder
CHECKPOINT_DIR = './train/'  # folder intended for saved model checkpoints

In [None]:
def make_env():
    """Build a single grayscale CarRacing-v1 environment.

    DummyVecEnv expects a list of zero-argument factories. Using a named
    factory instead of ``lambda: env`` avoids closing over the module-level
    name ``env``, which is rebound to the vectorised wrapper on the very
    statement that defines the lambda and therefore only worked because the
    factory happens to be called immediately.
    """
    base_env = gym.make('CarRacing-v1')
    # keep_dim is a boolean flag; True is equivalent to the previous value 1.
    return GrayScaleObservation(base_env, keep_dim=True)


# Vectorise the single environment, then stack 4 observations with the
# channel axis last.
env = DummyVecEnv([make_env])
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
# Disabled manual rollout loop, kept for reference only (every line is commented out).
# NOTE(review): as written it asks `model` for an action but then steps the
# environment with a random `env.action_space.sample()` instead, and `model`
# is only created in a later cell -- fix both before re-enabling.
# state = env.reset()
# n_step = 0;
# done = False
# while not done:
#     action , _ = model.predict(state)
#     state, reward, done, info = env.step(env.action_space.sample())
#     n_step +=1
#     env.render()

In [None]:
# Fresh PPO agent with a CNN policy on the vectorised, frame-stacked env;
# every hyper-parameter comes from the constants cell.
model = PPO(
    'CnnPolicy',
    env,
    learning_rate=LEARNING_RATE,
    n_steps=N_STEPS,
    batch_size=BATCH_SIZE,
    n_epochs=N_EPOCHS,
    gamma=GAMMA,
    tensorboard_log=LOG_DIR,
    verbose=0,
)

In [None]:
%%time
# Train the agent for TOTAL_TIMESTEP_NUMB environment steps. The %%time cell
# magic on the first line reports how long this cell took to run.
model.learn(total_timesteps=TOTAL_TIMESTEP_NUMB)

In [None]:
# Restore the checkpoint named '300000' from CHECKPOINT_DIR and attach the
# current environment to it. The path is built from the shared constant
# instead of repeating the './train/' literal.
# NOTE(review): when the notebook is run top to bottom this rebinds `model`,
# so the network trained by `model.learn(...)` above is dropped without ever
# being saved -- confirm whether this cell is meant to run before training.
model = PPO.load(os.path.join(CHECKPOINT_DIR, '300000'), env=env)

In [None]:
# Persist the current `model` as checkpoint '400000' inside CHECKPOINT_DIR
# (path built from the shared constant rather than a repeated literal).
model.save(path=os.path.join(CHECKPOINT_DIR, '400000'))

In [None]:
# Reload checkpoint '400000' from CHECKPOINT_DIR as a separate policy object
# used only for the evaluation / video roll-outs below. No env is attached
# here; the evaluation loop steps `env` itself and only calls `predict`.
forvideo = PPO.load(os.path.join(CHECKPOINT_DIR, '400000'))

In [None]:
# Roll out the loaded policy for EPISODE_NUMBERS episodes, recording the
# return and length of each one and keeping the rendered frames of the
# best-scoring episode for the animation cell.
MAX_STEPS_PER_EPISODE = 10000  # safety cap so a never-ending episode cannot hang the cell

total_reward = [0] * EPISODE_NUMBERS
total_time = [0] * EPISODE_NUMBERS
# Start below every reachable return so the first episode always becomes the
# current best. Starting at 0 left `frames_best` empty whenever no episode
# scored above 0, which made the animation cell fail on an empty frame list.
best_reward = float('-inf')
frames_best = []

for i in range(EPISODE_NUMBERS):
    state = env.reset()  # reset for each new trial
    done = False
    frames = []
    while not done and total_time[i] < MAX_STEPS_PER_EPISODE:
        action, _ = forvideo.predict(state)
        state, reward, done, info = env.step(action)
        # `env` is vectorised with a single sub-environment, hence index 0.
        total_reward[i] += reward[0]
        total_time[i] += 1
        # Copy the frame so later renders cannot alias an already stored one.
        frames.append(copy.deepcopy(env.render(mode='rgb_array')))

    if total_reward[i] > best_reward:
        best_reward = total_reward[i]
        # `frames` is rebound to a brand-new list on the next iteration, so
        # keeping a reference is enough; deep-copying every frame again only
        # doubled the memory used.
        frames_best = frames

    print('test episode:', i, 'reward:', total_reward[i], 'time:', total_time[i])

print('average reward:', (sum(total_reward) / EPISODE_NUMBERS),
      'average time:', (sum(total_time) / EPISODE_NUMBERS),
      'best_reward:', best_reward)

In [None]:
# Show the frames of the best evaluation episode as an inline JS animation.
if len(frames_best) == 0:
    # Fail with an explicit message instead of an IndexError further down.
    raise ValueError('frames_best is empty: run the evaluation cell before building the animation')

frames_new = np.array(frames_best)
# Raise matplotlib's cap on the size of the embedded animation so high that
# the output is never truncated.
matplotlib.rcParams['animation.embed_limit'] = 2**128

# Size the figure from the frames actually being displayed. This used to read
# `frames`, the leftover list of whichever episode happened to run last.
frame_height, frame_width = frames_new[0].shape[:2]
fig = plt.figure(figsize=(frame_width / 72.0, frame_height / 72.0), dpi=72)
patch = plt.imshow(frames_new[0])
plt.axis('off')


def animate(i):
    """Put frame `i` of the best episode into the displayed image."""
    patch.set_data(frames_new[i])


ani = matplotlib.animation.FuncAnimation(fig, animate, frames=len(frames_new), interval=50)
display(HTML(ani.to_jshtml()))
# Close the figure so only the HTML animation is shown, not a static image too.
plt.close(fig)

In [None]:
# Disabled alternative to the manual evaluation loop: the `evaluate_policy`
# helper imported at the top, run on `model` for 10 episodes with rendering.
# evaluate_policy(model, env, n_eval_episodes=10, render=True)