# Import

In [1]:
#import gym
import math
# Import os to deal with filepaths
import os

import gymnasium as gym
from gymnasium import spaces
import numpy as np

# Importing the optimzation frame - HPO
import optuna

import torch as th
import torch.nn as nn

from stable_baselines3 import PPO
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_checker import check_env
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Base class for custom feature extractors (required by CustomCNN)
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

from snake_game import Snake
from env import SnakeCnnEnv

  from .autonotebook import tqdm as notebook_tqdm


# Dirs

In [2]:
# Output locations, all relative to the notebook's working directory.
OPT_DIR = './opt/'            # models saved by the hyperparameter search
LOG_DIR = './logs/'           # Monitor / TensorBoard logs
TRAIN_DIR = './train/'        # checkpoints loaded in the draft cells
CHECKPOINT_DIR = './train/'   # where the training callback writes checkpoints

# Path of the model saved for trial 1 of the search.
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

# Check Environment

In [3]:
# Check environment: run stable-baselines3's `check_env` on a fresh SnakeCnnEnv instance
env = SnakeCnnEnv()
check_env(env)

# Hyperparameter tuning

In [4]:
# Search space: draws one candidate set of PPO hyperparameters from a trial
def optimize_ppo(trial):
    """Sample a PPO hyperparameter configuration from an Optuna trial.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_float``
            (an Optuna trial).

    Returns:
        dict mapping PPO keyword-argument names to the sampled values, in the
        order n_steps, gamma, learning_rate, clip_range, gae_lambda,
        batch_size, ent_coef.
    """
    sampled = {}
    # Discrete choices for the rollout length (previously tried: suggest_int 512..4096).
    sampled['n_steps'] = trial.suggest_categorical("n_steps", [512, 1024, 2048])
    sampled['gamma'] = trial.suggest_float('gamma', 0.8, 0.9999)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 1e-5, 1e-4)
    sampled['clip_range'] = trial.suggest_float('clip_range', 0.1, 0.4)
    sampled['gae_lambda'] = trial.suggest_float('gae_lambda', 0.8, 0.99)
    sampled['batch_size'] = trial.suggest_categorical("batch_size", [16, 32, 64])
    sampled['ent_coef'] = trial.suggest_categorical('ent_coef', [0.0, 0.01, 0.05])
    return sampled

In [5]:
# Run a training loop and return mean reward
def optimize_agent(trial, total_timesteps=250000, n_eval_episodes=5):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Samples hyperparameters via ``optimize_ppo``, trains a PPO model on a
    monitored, vectorized ``SnakeCnnEnv``, evaluates it, and saves the model to
    ``OPT_DIR`` as ``trial_<trial.number>_best_model``.

    Args:
        trial: Optuna trial; must provide the ``suggest_*`` methods used by
            ``optimize_ppo`` and a ``number`` attribute.
        total_timesteps: training budget passed to ``model.learn``.
        n_eval_episodes: number of episodes passed to ``evaluate_policy``.

    Returns:
        The mean evaluation reward, or -1000 if anything raised during the
        trial (best-effort: the error is printed and the study continues).
    """
    env = None
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = SnakeCnnEnv()
        env = Monitor(env, LOG_DIR)
        # Bind the monitored env to its own name so the factory lambda does not
        # depend on `env`, which is rebound on the next line.
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        #env = VecFrameStack(env, 4, channels_order='first')

        # Create algo
        # NOTE(review): other cells in this notebook build the model with
        # 'CnnPolicy' -- confirm 'MlpPolicy' is intended for the search.
        model = PPO('MlpPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

        model.learn(total_timesteps=total_timesteps)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)

        # Local name on purpose: avoids shadowing the notebook-level SAVE_PATH.
        trial_model_path = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(trial_model_path)

        return mean_reward

    except Exception as e:
        # Deliberate best-effort: a failed trial is reported with a very low
        # score instead of aborting the whole study.
        print(e)
        return -1000

    finally:
        # Release the environment even when training or evaluation failed.
        if env is not None:
            try:
                env.close()
            except Exception as e:
                print(e)

In [None]:
# Create the Optuna study (maximizing the value returned by `optimize_agent`)
# and run 10 trials with n_jobs=1.
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

In [None]:
{'n_steps': 512,
 'gamma': 0.9907046080717223,
 'learning_rate': 3.1170077903648384e-05,
 'clip_range': 0.18200075440617328,
 'gae_lambda': 0.9554746351940955,
 'batch_size': 32,
 'ent_coef': 0.01}

In [7]:
# Display the hyperparameters of the study's best trial
study.best_params

{'n_steps': 512,
 'gamma': 0.9907046080717223,
 'learning_rate': 3.1170077903648384e-05,
 'clip_range': 0.18200075440617328,
 'gae_lambda': 0.9554746351940955,
 'batch_size': 32,
 'ent_coef': 0.01}

In [None]:
# Display the study's best trial
study.best_trial

# Setup Callback

In [4]:
# Periodically save model checkpoints during training
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` callback calls.

    Checkpoints are written to ``save_path`` as ``best_model_<n_calls>``.
    Despite the file name, no reward comparison is made: saving is purely
    periodic.

    Args:
        check_freq: save whenever ``n_calls`` is a multiple of this value.
        save_path: directory for the checkpoints; ``None`` disables saving.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call; always continue training."""
        # Same None guard as `_init_callback`: without it, os.path.join(None, ...)
        # would raise on the first checkpoint step.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Returning True tells the training loop to keep going.
        return True

In [5]:
# Save a checkpoint into CHECKPOINT_DIR every 50000 callback calls
callback = TrainAndLoggingCallback(check_freq=50000, save_path=CHECKPOINT_DIR)

# Train Model

In [3]:
# Load the model saved by HPO trial 5 from OPT_DIR (the trial number is hard-coded)
model = PPO.load(os.path.join(OPT_DIR, 'trial_5_best_model'))

In [8]:
# Build the training environment: SnakeCnnEnv -> Monitor (logs to LOG_DIR) -> DummyVecEnv
monitored_env = Monitor(SnakeCnnEnv(), LOG_DIR)
env = DummyVecEnv([lambda: monitored_env])
# Frame stacking is currently disabled:
#env = VecFrameStack(env, 4, channels_order='first')

In [9]:
# Attach the vectorized environment to the loaded model
model.set_env(env)

In [None]:
# Init new model with best opt params
#model_params = study.best_params

# Create algo 
#model = PPO('MlpPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

In [None]:
# Kick off training: 100000 timesteps, with `callback` saving periodic checkpoints
model.learn(total_timesteps=100000, callback=callback)

# Draft

In [None]:
# Reload previous weights from HPO.
# `load` builds and returns a new model instead of updating `model` in place,
# so the result must be bound to a name; otherwise the call has no effect.
model = PPO.load(os.path.join(OPT_DIR, 'trial_4_best_model'))

In [None]:
# Load the checkpoint named 'best_model_990000' from CHECKPOINT_DIR
model = PPO.load(os.path.join(CHECKPOINT_DIR, 'best_model_990000'))

In [None]:
# Load the model stored as 'best36_02' (path relative to the working directory)
model = PPO.load('best36_02')

In [None]:
# Save the current model as 'best36_02' (path relative to the working directory)
model.save('best36_02')

In [None]:
# Warm-start a fresh PPO model from an earlier checkpoint:
#   1. load the checkpoint and copy its parameters out,
#   2. build a new model with the hyperparameters below,
#   3. copy the saved parameters into the new model.
# Requires `env` to have been created by an earlier cell.
#model.load(os.path.join(TRAIN_DIR, 'best_model_510000'))
model = PPO.load(os.path.join(TRAIN_DIR, 'best_model_510000'))
params = model.get_parameters()

# Custom feature extractor is currently disabled:
#policy_kwargs = dict(
#    features_extractor_class=CustomCNN#,
#    #normalize_images=False
#)

# Hyperparameters for the new model (no 'ent_coef' entry here, unlike the HPO search space)
model_params = {
    'n_steps': 2048,
    'gamma': 0.9144123932704298,
    'learning_rate': 3.585320520367126e-05,
    'clip_range': 0.2692047740205521,
    'gae_lambda': 0.8326092523979186,
    'batch_size': 32
}


# Create algo 
# NOTE(review): the HPO cell builds its models with 'MlpPolicy'; `set_parameters`
# presumably needs the checkpoint's network to match 'CnnPolicy' -- confirm which
# policy 'best_model_510000' was trained with.
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params) # policy_kwargs=policy_kwargs,
model.set_parameters(params)

In [None]:
# Attach the current `env` to the model
model.set_env(env)

In [None]:
# Fresh monitored, vectorized environment for the long training run
monitored_env = Monitor(SnakeCnnEnv(), LOG_DIR)
env = DummyVecEnv([lambda: monitored_env])
model.set_env(env)

# This run checkpoints into its own directory, every 25000 callback calls
CHECKPOINT_DIR = './train_05/'
callback = TrainAndLoggingCallback(check_freq=25000, save_path=CHECKPOINT_DIR)

# Kick off training
model.learn(total_timesteps=5000000, callback=callback)

# Custom Policy (Draft)

In [None]:
# Custom CNN feature extractor (maps image observations to a feature vector)
class CustomCNN(BaseFeaturesExtractor):
    """Convolutional feature extractor for channels-first image observations.

    Three 3x3 convolutions with ReLU (the last one without padding) are
    flattened and projected by a linear layer + ReLU to ``features_dim``
    features. Meant to be passed as ``features_extractor_class`` in
    ``policy_kwargs``.

    Requires ``BaseFeaturesExtractor`` from
    ``stable_baselines3.common.torch_layers`` to be imported.

    Args:
        observation_space: Box space whose shape is read as (C, H, W).
        features_dim: size of the feature vector produced by ``forward``.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 128):
        super().__init__(observation_space, features_dim)
        # CxHxW images (channels first)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # No padding here: this layer shrinks H and W by 2 each.
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute the flattened size by doing one forward pass on a sampled
        # observation ([None] adds the batch dimension).
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the ``features_dim``-sized features for a batch of observations."""
        return self.linear(self.cnn(observations))

# PPO keyword arguments for the custom-policy experiment. Only these two are
# overridden; values tried earlier for the others are kept for reference:
#   n_steps=1024, gamma=0.9076135656146878, clip_range=0.12,
#   gae_lambda=0.9362730857937813, batch_size=32
model_params = dict(
    learning_rate=3.0e-05,
    ent_coef=0.01,
)

# Policy options: use CustomCNN as the feature extractor.
# (`normalize_images=False` was tried and is currently left out.)
policy_kwargs = {'features_extractor_class': CustomCNN}

#env = SnakeCnnEnv()
#model = PPO("CnnPolicy", env, verbose=1, policy_kwargs=policy_kwargs, **model_params)