# RL Exploration

Done by:  
- Chua Shao Cong
- Lim Sheng Wei
- Png Qun Shen

In [None]:
from AREgym import AREEnv

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

from torch.utils.tensorboard import SummaryWriter
# not used in this file but useful for launching tensorboard session within vscode

## Small World

### Params

In [None]:
# Hyperparameters for the 'small world' environment.

# --- world ---
grid_size = 250
step_distance = 10
num_laserscan = 10
max_steps = 256
save_map = False  # map saving disabled so training runs faster

# --- PPO training ---
gamma = 0.95
lamda = 0.99  # handed to PPO as gae_lambda
learning_rate = 3e-4
n_steps = 256
batch_size = 256
n_epochs = 10
timesteps = 2_000_000

# --- policy network ---
# Actor (pi) and critic (vf) are each an MLP with hidden layers of 128 and
# 64 units; the activation function is left at the default (Tanh).
policy_kwargs = {"net_arch": [{"pi": [128, 64], "vf": [128, 64]}]}

# --- vectorization ---
vectorize = True
n_envs = 4



### Environment

In [None]:
# initialize environment
# The constructor arguments are collected once so the vectorized and the
# single-environment paths are always built from the same settings.
env_kwargs = dict(grid=grid_size,
                  step_distance=step_distance,
                  num_laser_scan=num_laserscan,
                  save_map=save_map,
                  max_steps=max_steps)

if vectorize:
    # n_envs copies of AREEnv wrapped as a single vectorized environment
    env = make_vec_env(AREEnv,
                       n_envs=n_envs,
                       env_kwargs=env_kwargs)
else:
    env = AREEnv(**env_kwargs)

### Model

In [None]:
# initialize PPO training algo
# When the environment is vectorized the minibatch size is scaled by the
# number of environments. The scaled value gets its own name instead of
# overwriting `batch_size`, so re-running this cell does not keep
# multiplying the hyperparameter.
effective_batch_size = batch_size * n_envs if vectorize else batch_size

model = PPO("MlpPolicy",
            env,
            gamma=gamma,
            gae_lambda=lamda,
            learning_rate=learning_rate,
            n_steps=n_steps,
            n_epochs=n_epochs,
            tensorboard_log='sb_runs/256',
            batch_size=effective_batch_size,
            policy_kwargs=policy_kwargs,
            seed=1  # fixed seed for reproducible runs
            )

### Training

In [None]:
# Train the agent for `timesteps` environment steps, then write the
# resulting model to `saved_file`.
saved_file = "stable_baselines_ppo_256_2m"
model.learn(total_timesteps=timesteps)
model.save(path=saved_file)

## Large World

### Params

In [None]:
# Hyperparameters for the 'large world' environment.

# --- world ---
grid_size = 500
step_distance = 20
num_laserscan = 36
max_steps = 256
save_map = False  # map saving disabled so training runs faster

# --- PPO training ---
gamma = 0.95
lamda = 0.99  # handed to PPO as gae_lambda
learning_rate = 3e-4
n_steps = 256
batch_size = 256
n_epochs = 10
# NOTE: the model used for submission was halted at ~600k steps
timesteps = 1_000_000

# --- policy network ---
# Actor (pi) and critic (vf) are each an MLP with hidden layers of 128 and
# 64 units; the activation function is left at the default (Tanh).
policy_kwargs = {"net_arch": [{"pi": [128, 64], "vf": [128, 64]}]}

# --- vectorization ---
vectorize = False
n_envs = 1


### Environment

In [None]:
# initialize environment
# The constructor arguments are collected once so the vectorized and the
# single-environment paths are always built from the same settings.
env_kwargs = dict(grid=grid_size,
                  step_distance=step_distance,
                  num_laser_scan=num_laserscan,
                  save_map=save_map,
                  max_steps=max_steps)

if vectorize:
    # n_envs copies of AREEnv wrapped as a single vectorized environment
    env = make_vec_env(AREEnv,
                       n_envs=n_envs,
                       env_kwargs=env_kwargs)
else:
    env = AREEnv(**env_kwargs)

### Model

In [None]:
# initialize PPO training algo
# When the environment is vectorized the minibatch size is scaled by the
# number of environments. The scaled value gets its own name instead of
# overwriting `batch_size`, so re-running this cell does not keep
# multiplying the hyperparameter.
effective_batch_size = batch_size * n_envs if vectorize else batch_size

model = PPO("MlpPolicy",
            env,
            gamma=gamma,
            gae_lambda=lamda,
            learning_rate=learning_rate,
            n_steps=n_steps,
            n_epochs=n_epochs,
            tensorboard_log='sb_runs/full',
            batch_size=effective_batch_size,
            policy_kwargs=policy_kwargs,
            seed=1  # fixed seed for reproducible runs
            )

### Training

In [None]:
# Train the agent for `timesteps` environment steps, then write the
# resulting model to `saved_file`.
saved_file = "best_policy_full"
model.learn(total_timesteps=timesteps)
model.save(path=saved_file)

## Best Policy

Loads and replays the best trained policy so far (the large-world model saved as `best_policy_full`).

### Params

In [None]:
# Settings for loading and replaying a trained policy.

# --- world ---
grid_size = 500
step_distance = 20
num_laserscan = 36
max_steps = 256
save_map = True

# --- saved model ---
saved_file = "best_policy_full.zip"

### Environment and Model

In [None]:
# initialize env
env = AREEnv(grid=grid_size,
             step_distance=step_distance,
             num_laser_scan=num_laserscan,
             save_map=save_map,
             max_steps=max_steps)

# load model
# Use the `saved_file` path from the params cell instead of repeating the
# file name here, so changing the path in one place is enough.
model = PPO.load(saved_file)

### Rendering

Generates three GIFs from one rollout of the loaded policy: `global_map.gif`, `world_map.gif` and `current_state.gif`.

In [None]:
import imageio
import numpy as np

# Roll out the loaded policy for one episode and collect the three images
# returned by env.render() after every non-terminal step.
obs = env.reset()

global_frames, world_frames, state_frames = [], [], []
for _ in range(max_steps):  # same step cap the environment was built with
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if done:
        # the step that ends the episode is not rendered
        print('terminate')
        break
    img1, img2, img3 = env.render()
    global_frames.append(img1)
    world_frames.append(img2)
    state_frames.append(img3)

# Write one GIF per image stream.
for filename, frames in (("global_map.gif", global_frames),
                         ("world_map.gif", world_frames),
                         ("current_state.gif", state_frames)):
    imageio.mimsave(filename, [np.array(img) for img in frames], fps=15)