In [None]:
import os
import time

import numpy as np
import torch
import wandb

from agents.DistillPPO import DistillPPOAgent
from common.env import make_env
from common.hyperparameters import HYPERPARAMS
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Authenticate with Weights & Biases before any run is created.
wandb.login()

# Seed every RNG this notebook has in scope so a Restart & Run All is as
# repeatable as possible. NumPy's global generator is seeded as well because
# project code may draw from it (assumption -- confirm against common.env).
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# manual_seed_all covers every visible GPU, not only the current device; it is
# silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable the autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Training Agent notebook

# Subset of procgen Games

- ## heist: 
 The player must steal the gem hidden behind a network of locks. Each lock comes in one of three colors, and the necessary keys to open these locks are scattered throughout the level. The level layout takes the form of a maze generated by Kruskal's algorithm. Once the player collects a key of a certain color, the player may open the lock of that color. All keys in the player's possession are shown in the top right corner of the screen.
- ## Bossfight
The player controls a small starship and must destroy a much bigger boss starship. The boss randomly selects from a set of possible attacks when engaging the player. The player must dodge the incoming projectiles or be destroyed. The player can also use randomly scattered meteors for cover. After a set timeout, the boss becomes vulnerable and its shields go down. At this point, the player's projectile attacks will damage the boss. Once the boss receives a certain amount of damage, the player receives a reward, and the boss re-raises its shields. If the player damages the boss several times in this way, the boss is destroyed, the player receives a large reward, and the episode ends.
- ## leaper
Inspired by the classic game “Frogger”. The player must cross several lanes to reach the finish line and earn a reward. The first group of lanes contains cars which must be avoided. The second group of lanes contains logs on a river. The player must hop from log to log to cross the river. If the player falls in the river, the episode ends.

Descriptions from [OpenAI](https://openai.com/index/procgen-benchmark/).

# Distilled PPO Model

## Hyperparameters initialization

In [None]:
# Hyperparameter set registered under the 'distill' key; every later cell
# reads its settings (num_envs, traj_steps, lr, ...) from this object.
params = HYPERPARAMS["distill"]

## Environment Initialization

In [None]:
# Procgen game to train on.
game = "heist"

# Provisional environment: the training cell only reads its observation and
# action spaces to size the agent, then builds the real environment.
env = make_env(game, n_envs=params.num_envs)

# One W&B project per (game, hyperparameter set); the hyperparameters are
# attached to the run as its config.
log = wandb.init(
    project=f"procgen-ppo-explorations-{game}-{params.name}",
    config=params,
)

## Training Loop

In [None]:
# Size the agent from the spaces of the provisional environment created in the
# environment-initialization cell, then release that environment before it is
# replaced: rebinding `env` without closing it would leak its resources for
# the whole training run.
agent = DistillPPOAgent(env.observation_space.shape, env.action_space.n, params=params)
env.close()

# The training environment is built with the agent's student/teacher networks
# and the hyperparameters (how make_env uses them is defined in common.env).
env = make_env(game, n_envs=params.num_envs, params=params, student_model=agent.student, teacher_model=agent.teacher)

test_reward = []     # result of every periodic evaluation, in order
best_reward = None   # best evaluation result seen so far (None until the first test)
mean_rewards = []    # per-rollout reward statistic logged to W&B
steps = 0            # environment steps consumed so far (traj_steps * num_envs per rollout)
test_count = 0       # number of evaluations already performed

# The loop is wrapped in try/finally so the environment is closed on normal
# completion, on early stopping, and also if training raises or is interrupted.
try:
    while steps < params.total_epochs:
        # ---- Collect one rollout of `traj_steps` transitions ----
        tot_reward_episode = []
        state = env.reset()
        for _ in range(params.traj_steps):
            action, log_prob, value_ext, value_int = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.trajectories.log_transition(state, action, reward, done, log_prob, value_ext, value_int)
            state = next_state
            tot_reward_episode.append(reward[0])

        # Value estimates of the state reached after the last step of the rollout.
        _, _, last_ext_value, last_int_value = agent.select_action(state)
        agent.trajectories.save_final_state(state, last_ext_value, last_int_value)

        # Log the reward of only one environment.
        # NOTE(review): the value is the SUM over the rollout of reward[0][0]
        # (np.mean of a scalar is a no-op), despite the "mean_reward" name. The
        # double index presumably selects one reward stream and then one
        # environment -- confirm against what common.env returns from step().
        mean_reward = np.mean(np.sum([step_reward[0] for step_reward in tot_reward_episode]))
        wandb.log({"mean_reward": mean_reward})
        print(f"Episode: {steps}")
        mean_rewards.append(mean_reward)

        # ---- Advantage / return estimation ----
        # The discount factor is computed once and shared by both calls so they
        # can never drift apart.
        # NOTE(review): (total_epochs - gamma) / total_epochs is close to 1 when
        # total_epochs is large, so this grows roughly as 0.1 * steps and is
        # clamped to 0.999 almost immediately. The previous comment spoke of
        # *decreasing* gamma during training -- confirm the intended schedule.
        discount_factor = min(
            0.999,
            params.gamma + steps * (params.total_epochs - params.gamma) / params.total_epochs * 0.1,
        )
        agent.trajectories.calculate_advantages_and_returns(discount_factor=discount_factor)
        agent.trajectories.compute_reference_values(discount_factor=discount_factor)

        # ---- Optimisation ----
        print("TRAINING!!!!")
        agent.train(steps)
        steps += params.traj_steps * params.num_envs

        # Refresh both optimizers' learning rates from the updated step count.
        agent.optimizer = agent.improv_lr(agent.optimizer, params.lr, steps, params.total_epochs)
        agent.distillation_optimizer = agent.improv_lr(agent.distillation_optimizer, params.lr_distillation, steps, params.total_epochs)

        # ---- Periodic evaluation: `tests_to_do` tests spread over training ----
        if steps > ((test_count + 1) * (params.total_epochs // params.tests_to_do)):
            print("TESTING!!!!!!!!!!")
            agent.net.train(False)  # evaluation mode while testing
            ts = time.time()
            rewards = agent.testing(game)
            print(f"Test finished in {time.time() - ts:.2f}s, Test Rewards {rewards}")
            test_reward.append(rewards)
            wandb.log({"test_reward": rewards})

            if best_reward is None or best_reward < rewards:
                print(f"New Best Reward: {rewards}")
                best_reward = rewards
                # Create the checkpoint directory on demand; torch.save does
                # not create missing parent directories.
                os.makedirs("checkpoints", exist_ok=True)
                name = f"checkpoints/best_{params.name}_{game}.dat"
                torch.save(agent.net.state_dict(), name)
                if best_reward >= params.stop_reward:
                    # Early stop: the network is intentionally left in
                    # evaluation mode, as in the original flow.
                    print("Stopped Training!")
                    break

            agent.net.train(True)  # back to training mode
            test_count += 1
finally:
    env.close()