In [None]:
# Install the following packages before running this notebook
# (plus any other dependency that an import error asks for):
#
#   torch
#   tmrl
#   stable-baselines3
#   gym           (offered by OpenAI)
#   tensorboard   (to be able to view the stored training logs)

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
from tmrl import get_environment
from time import sleep

In [None]:
import os

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Periodically save the model under training to ``save_path``.

    Every ``check_freq`` calls of the callback, the current model is written to
    ``<save_path>/best_model_<n_calls>``. Despite the file name, no score is
    compared: each checkpoint is saved unconditionally.

    Args:
        check_freq: Save a checkpoint whenever ``n_calls`` is a multiple of
            this value.
        save_path: Directory that receives the checkpoints. ``None`` disables
            saving altogether.
        verbose: Forwarded unchanged to ``BaseCallback.__init__``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``. NOTE(review): this relies on the stable-baselines3
            convention that a ``False`` return asks training to stop.
        """
        # _init_callback accepts save_path=None, so saving must be skipped in
        # that case: os.path.join(None, ...) would raise a TypeError.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
# --- Training schedule -------------------------------------------------------
CHECK_FREQ_NUMB = 10_000        # checkpoint interval handed to the callback
TOTAL_TIMESTEP_NUMB = 100_000   # training budget; raise it to train for longer

# --- Values intended as PPO hyperparameters ----------------------------------
LEARNING_RATE = 3e-4
N_STEPS = 1024
GAMMA = 0.995
BATCH_SIZE = 256
N_EPOCHS = 10

# --- Remaining settings ------------------------------------------------------
# NOTE(review): DOWN_SAMPLE_RATE and SKIP_NUMB are not referenced anywhere in
# the visible part of this notebook.
DOWN_SAMPLE_RATE = 3
SKIP_NUMB = 2
EPISODE_NUMBERS = 10            # presumably the number of evaluation episodes

In [None]:
# Output folders, relative to the directory the notebook is run from:
# 'train' receives the model checkpoints and 'logs' the tensorboard logs.
# Both are created here when missing, so no manual setup is needed.
CHECKPOINT_DIR = './train/'
LOG_DIR = './logs/'

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

In [None]:
# Checkpointing callback: saves the model into CHECKPOINT_DIR every
# CHECK_FREQ_NUMB callback calls.
callback = TrainAndLoggingCallback(
    check_freq=CHECK_FREQ_NUMB,
    save_path=CHECKPOINT_DIR,
)

In [None]:
from gym import Env
from gym.spaces import Box, MultiDiscrete

In [None]:
class TrackMania(Env):
    """Gym wrapper around the tmrl environment that exposes one observation component.

    The wrapped environment (``self.game``) returns a sequence as observation;
    this wrapper forwards only its element at index 2 and declares a
    ``(4, 19)`` float32 ``Box`` as observation space.
    NOTE(review): assumes ``obs[2]`` really has shape ``(4, 19)``; this depends
    on the tmrl configuration in use - TODO confirm.
    """

    def __init__(self):
        super().__init__()
        self.game = get_environment()
        # One-second pause right after the tmrl environment has been created.
        sleep(1.0)
        self.observation_space = Box(low=0.0, high=float('inf'), shape=(4, 19), dtype=np.float32)
        # Actions are passed through untouched, so reuse the wrapped action space.
        self.action_space = self.game.action_space

    def step(self, action):
        """Apply ``action`` to the wrapped environment.

        Returns:
            ``(obs[2], reward, terminated, truncated, info)`` where ``obs`` is
            the observation returned by the wrapped environment.
        """
        obs, rew, terminated, truncated, info = self.game.step(action)

        return obs[2], rew, terminated, truncated, info

    def render(self):
        """No-op; nothing is drawn by this wrapper."""
        # `self` was missing in the original signature, which made
        # `env.render()` raise a TypeError.
        pass

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed, forwarded to the wrapped environment's reset.
            options: Optional reset options, forwarded likewise.

        Returns:
            ``(obs[2], info)`` from the wrapped environment's reset.
        """
        # seed/options are accepted (and default to None) because callers that
        # follow the (obs, info) reset API pass them as keyword arguments.
        obs, info = self.game.reset(seed=seed, options=options)
        state = obs[2]
        return state, info

    def wait(self):
        """Delegate to the wrapped environment's ``wait()``."""
        self.game.wait()

In [None]:
# Build the wrapped environment; the constructor calls tmrl's get_environment()
# and then sleeps for one second.
env = TrackMania()

In [None]:
# Reset the environment. TrackMania.reset() returns a (state, info) pair, so
# unpack it instead of binding the whole tuple to `obs`.
obs, info = env.reset()

In [None]:
# Inspect what the reset cell stored in `obs`.
obs

In [None]:
# Build the PPO agent. The hyperparameter constants defined in the
# configuration cell are passed explicitly; without them PPO silently falls
# back to its library defaults and the constants have no effect.
model = PPO(
    'MlpPolicy',
    env,
    learning_rate=LEARNING_RATE,
    n_steps=N_STEPS,
    batch_size=BATCH_SIZE,
    n_epochs=N_EPOCHS,
    gamma=GAMMA,
    verbose=1,
    tensorboard_log=LOG_DIR,
)

In [None]:
%%time
model.learn(total_timesteps=TOTAL_TIMESTEP_NUMB, callback=callback)

In [None]:
# Evaluate the policy: play EPISODE_NUMBERS episodes, print the return and
# length of each one, then print their means.
# The original ran a single episode but divided the totals by a hard-coded 10;
# the loop count and the divisor now both come from EPISODE_NUMBERS.
mean_score = 0.0
mean_time = 0.0
for i in range(EPISODE_NUMBERS):
    t_st = 0   # number of steps taken in this episode
    score = 0  # sum of rewards collected in this episode
    obs, _ = env.reset()
    while True:  # rtgym ensures this runs at 20Hz by default
        # For a random-policy baseline, use env.action_space.sample() instead.
        act, _ = model.predict(obs)  # compute action
        obs, rew, terminated, truncated, info = env.step(act)  # apply action (rtgym ensures healthy time-steps)
        score += rew
        t_st += 1
        if terminated or truncated:
            break
    print(f"score: {score}, time: {t_st}")
    mean_score += score
    mean_time += t_st
print(f"m_score: {mean_score/EPISODE_NUMBERS}, m_time: {mean_time/EPISODE_NUMBERS}")

In [None]:
# Load a saved model.
# The path is derived from CHECKPOINT_DIR, where TrainAndLoggingCallback writes
# 'best_model_<n_calls>'; the original pointed at './train_2/', a folder this
# notebook never creates. The checkpoint named after TOTAL_TIMESTEP_NUMB only
# exists when that value is a multiple of CHECK_FREQ_NUMB - adjust the file
# name to load a different checkpoint.
model = PPO.load(
    os.path.join(CHECKPOINT_DIR, 'best_model_{}.zip'.format(TOTAL_TIMESTEP_NUMB)),
    env=env,
    verbose=1,
    tensorboard_log=LOG_DIR,
)