# CHANGE THESE CONFIGS, THEN UPDATE MODEL INITIALIZATION

In [16]:
# Artifact locations for the A2C highway run.
# Everything the model itself produces lives under one directory.
_MODEL_DIR = "./models/highway/a2c"

MODEL_PATH = _MODEL_DIR + "/a2c"                    # passed to model.save / A2C.load
TENSORBOARD_LOG_DIR = _MODEL_DIR + "/logs"          # passed as tensorboard_log
RUNS_FILE = _MODEL_DIR + "/a2c_success_runs.pkl"    # pickle of successful_runs

# Handed to RewardTrackingCallback as tag / path_dir.
IMAGE_TAG = "a2c_rewards_highway"
IMAGE_DIR = "./images/highway/a2c"

#### ENV CONFIGS ####
CONFIG = {
    # --- Observation / action spaces ---
    "observation": {"type": "Kinematics"},
    "action": {"type": "DiscreteMetaAction"},

    # --- Scenario layout ---
    "lanes_count": 4,
    "vehicles_count": 50,
    "controlled_vehicles": 1,
    "initial_lane_id": None,
    "duration": 40,  # [s]
    "ego_spacing": 2,
    "vehicles_density": 1,

    # --- Reward shaping ---
    # The reward received when colliding with a vehicle.
    "collision_reward": -1,
    # The reward received when driving on the right-most lanes, linearly
    # mapped to zero for other lanes.
    "right_lane_reward": 0.1,
    # The reward received when driving at full speed, linearly mapped to zero
    # for lower speeds according to config["reward_speed_range"].
    "high_speed_reward": 0.4,
    # The reward received at each lane change action.
    "lane_change_reward": 0,
    "reward_speed_range": [20, 30],
    "normalize_reward": True,

    # --- Termination ---
    "offroad_terminal": False,
}

## Imports

In [18]:
import time
import torch
import pickle
import random
import gymnasium as gym

from fvcore.nn import FlopCountAnalysis

from stable_baselines3 import A2C, PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import ProgressBarCallback

from custom_highway_env import CustomHighwayEnv
from custom_training_callback import RewardTrackingCallback
# import highway_env

## Register Env with Gymnasium

In [19]:
# Make the custom environment available to gym.make() under the id
# 'custom-highway-v0'. The entry point is given as "module:ClassName".
gym.register(id='custom-highway-v0', entry_point='custom_highway_env:CustomHighwayEnv')

## Create and Wrap Env

In [21]:
def _make_custom_highway_env():
    """Build one custom-highway-v0 environment configured from CONFIG."""
    return gym.make("custom-highway-v0", render_mode='rgb_array', config=CONFIG)


# DummyVecEnv takes a list of environment factories; a single factory yields a
# vectorized wrapper around one environment.
env = DummyVecEnv([_make_custom_highway_env])
# env = gym.make("highway-v0", render_mode='rgb_array')

## UPDATE HERE: Set Up Correct Model

In [23]:
# Keyword arguments forwarded to the A2C constructor. Kept in one dict so the
# hyperparameters can be reviewed (or swapped for another algorithm's) in one
# place.
a2c_kwargs = {
    "n_steps": 5,
    "learning_rate": 7e-4,
    "gamma": 0.99,
    "gae_lambda": 1.0,
    "ent_coef": 0.01,
    "vf_coef": 0.5,
    "max_grad_norm": 0.5,
    "rms_prop_eps": 1e-5,
    "use_rms_prop": True,
    "normalize_advantage": False,
    "tensorboard_log": TENSORBOARD_LOG_DIR,
    "verbose": 0,
    "device": 'cpu',
}

# MLP policy trained on the vectorized environment built above.
model = A2C("MlpPolicy", env, **a2c_kwargs)

## Training

In [None]:
# Callback that receives the plot tag and output directory for reward plots.
reward_callback = RewardTrackingCallback(tag=IMAGE_TAG, path_dir=IMAGE_DIR)

steps = 5000
config = CONFIG.copy()

# Build a fresh single-environment vectorized env for this training run and
# attach it to the model.
env = DummyVecEnv([
    lambda: gym.make("custom-highway-v0", render_mode='rgb_array', config=config)
])
model.set_env(env)  # Update the model with the new environment

training_callbacks = [ProgressBarCallback(), reward_callback]
model.learn(total_timesteps=steps, callback=training_callbacks)

# After training: advance the callback to a new phase, then write out its plots.
reward_callback.start_new_phase()
reward_callback.save_all_plot()

Output()

### Save Model

In [10]:
# Write the trained model to MODEL_PATH.
model.save(path=MODEL_PATH)

## Evaluation

### Load Model

In [11]:
# Reload the saved model for evaluation. The device is pinned to CPU to match
# the device the model was constructed with; without it, load() picks the
# device automatically, which can differ from the training setup.
model = A2C.load(MODEL_PATH, device='cpu')

### Run Evaluation

In [12]:
# Evaluate the loaded policy over `episodes` rollouts. Tallies collisions and
# arrivals (read from the final step's `info` dict) and accumulates the policy
# FLOPs spent on episodes that both arrived and did not crash.
collisions = 0
destination_arrivals = 0
success_count = 0
successful_flopcount = 0
episodes = 100

# Store successful runs for rendering
successful_runs = []

# FLOPs of one policy forward pass, keyed by the observation tensor's shape.
# Tracing the policy on every step is expensive; this assumes the traced op
# count depends only on the input shape (expected for a feed-forward
# MlpPolicy) -- NOTE(review): confirm if the policy architecture changes.
flops_by_input_shape = {}

# Iterate over `episodes` so the loop count and the summary printed below
# cannot drift apart.
for eps in range(episodes):
    # config = CONFIG.copy()
    # config["destination"] = "o" + str(random.randint(1, 3))
    # env = gym.make("custom-intersection-v0", render_mode='rgb_array', config=config)
    # NOTE(review): the model was trained on "custom-highway-v0" with CONFIG,
    # but is evaluated on the stock "highway-v0" defaults -- confirm intended.
    env = gym.make("highway-v0", render_mode='rgb_array')
    try:
        seed = random.randint(0, 10000)

        obs, _ = env.reset(seed=seed)
        episode_flops = 0
        done = False
        truncated = False
        episode_reward = 0
        trajectory = []

        while not (done or truncated):
            # Flop Counting
            input_tensor, _ = model.policy.obs_to_tensor(obs)
            shape_key = tuple(input_tensor.shape)
            if shape_key not in flops_by_input_shape:
                analysis = FlopCountAnalysis(model.policy, input_tensor)
                analysis.unsupported_ops_warnings(False)
                flops_by_input_shape[shape_key] = analysis.total()
            episode_flops += flops_by_input_shape[shape_key]

            action, _states = model.predict(obs, deterministic=True)
            trajectory.append((obs, action))  # Save for later render if successful
            obs, reward, done, truncated, info = env.step(action)
            episode_reward += reward
    finally:
        # Release the per-episode environment even if a step raises.
        env.close()

    crashed = info.get("crashed", False)
    # NOTE(review): "arrived" looks carried over from the intersection variant
    # (see the commented-out lines above); if the highway env never reports
    # it, success_count stays 0 -- confirm.
    arrived = info.get("arrived", False)
    if crashed:
        collisions += 1
    if arrived:
        destination_arrivals += 1
    if (not crashed) and arrived:
        success_count += 1
        successful_flopcount += episode_flops
        # successful_runs.append((seed, config.copy(), trajectory))

    # print(f"Episode {eps + 1} finished, total reward: {episode_reward}, destination: {config['destination']}")
    print(f"Episode {eps + 1} finished, total reward: {episode_reward}")
    time.sleep(1)

print(f"Total collisions: {collisions} out of {episodes} episodes")
print(f"Total destination arrivals: {destination_arrivals} out of {episodes} episodes")
if success_count > 0:
    print(f"FLOPS per successful episode: {successful_flopcount / success_count:.2}")
else:
    print("No successful episodes.")



Episode 1 finished, total reward: 26.694148001393476
Episode 2 finished, total reward: 29.360814668060133
Episode 3 finished, total reward: 27.58303689028236
Episode 4 finished, total reward: 26.694148001393476
Episode 5 finished, total reward: 26.694148001393476
Episode 6 finished, total reward: 29.360814668060133
Episode 7 finished, total reward: 28.471925779171247
Episode 8 finished, total reward: 27.58303689028236
Episode 9 finished, total reward: 27.58303689028236
Episode 10 finished, total reward: 26.694148001393476
Episode 11 finished, total reward: 28.471925779171247
Episode 12 finished, total reward: 26.694148001393476
Episode 13 finished, total reward: 28.471925779171247
Episode 14 finished, total reward: 29.360814668060133
Episode 15 finished, total reward: 27.58303689028236
Episode 16 finished, total reward: 28.471925779171247
Episode 17 finished, total reward: 27.58303689028236
Episode 18 finished, total reward: 29.360814668060133
Episode 19 finished, total reward: 26.6941

### Save Successful Runs

In [13]:
# Serialize the recorded successful runs and write them to RUNS_FILE.
with open(RUNS_FILE, mode="wb") as runs_file:
    runs_file.write(pickle.dumps(successful_runs))

### Load Successful Runs File

In [14]:
# Read the pickled successful runs back from RUNS_FILE.
# Unpickling executes arbitrary code: only load a file this notebook produced.
with open(RUNS_FILE, mode="rb") as runs_file:
    successful_runs = pickle.loads(runs_file.read())

### Render Successful Episodes

In [15]:
# Replay each stored successful run: re-seed a fresh environment and feed it
# the recorded actions one by one, rendering after every step.
# Loop variables are underscore-prefixed where unused so the replay does not
# overwrite the notebook-level `config` / `obs`.
for i, (seed, _config, trajectory) in enumerate(successful_runs):
    print(f"\nRendering successful episode {i + 1}")
    # env = gym.make("custom-intersection-v0", render_mode='human', config=config)
    # NOTE(review): with render_mode='rgb_array' the frame returned by
    # env.render() is discarded here -- confirm whether 'human' was intended.
    env = gym.make("highway-v0", render_mode='rgb_array')
    try:
        env.reset(seed=seed)
        for _obs, action in trajectory:
            env.step(action)
            env.render()
            time.sleep(0.05)
    finally:
        # Always release the environment, even if a step or render call fails.
        env.close()