# Self-Critic experiments

### Imports

In [1]:
%matplotlib inline
import sys
from time import time

import gym
import numpy as np
import torch
from torch import optim
from tqdm import tqdm as _tqdm
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from envs.gridworld import GridworldEnv
from envs.windy_gridworld import WindyGridworldEnv
from models import PolicyNetwork, ValueNetwork
from utils import (compare_baselines_plot, get_running_time, plot_episodes_durations_losses, run_episode,
                   sample_greedy_return, select_action, set_seeds, smooth)

def tqdm(*args, **kwargs):
    """Wrap ``tqdm.tqdm`` so the progress bar refreshes at most once per second.

    Args:
        *args: Positional arguments forwarded unchanged to ``tqdm.tqdm``.
        **kwargs: Keyword arguments forwarded to ``tqdm.tqdm``. A caller-supplied
            ``mininterval`` is honoured only if it is at least one second;
            shorter intervals are raised to one second.

    Returns:
        The progress-bar object created by ``tqdm.tqdm``.
    """
    # Safety, do not overflow buffer: enforce a floor of one second. Merging into
    # kwargs (instead of passing a second `mininterval=` keyword) also avoids the
    # "multiple values for keyword argument" TypeError when the caller sets it.
    kwargs['mininterval'] = max(1, kwargs.get('mininterval', 1))
    return _tqdm(*args, **kwargs)

# float32 machine epsilon as a plain Python float; not referenced in the cells shown here.
EPS = float(np.finfo(np.float32).eps)

# Fail fast on interpreters older than 3.6 (the experiment cells use f-strings).
assert sys.version_info[:3] >= (3, 6, 0), "Make sure you have Python 3.6 installed!"

### Experimental settings

In [2]:
# Number of independently seeded training runs per method in the final experiment cells.
num_runs = 50
# Episodes per training run; also read as a global by the hyperopt objective functions.
num_episodes = 1000
# Notebook-level defaults. NOTE: the final experiment cells further down rebind
# `discount_factor` (and, for two of the methods, `learn_rate`) to tuned values, so the
# value seen after running those cells is not the one assigned here.
discount_factor = 0.99
learn_rate = 0.001
# Not referenced in the cells shown here.
grid_shape = [10, 10]
# Default for the `init_temp` parameter of the run_episodes_* functions; the default is
# captured when those functions are defined, so re-run their cells after changing it.
init_temperature = 1.1
# Read as a global inside the run_episodes_* functions and forwarded to run_episode.
stochasticity = 0.0

## REINFORCE w/ No Baseline

In [3]:
def compute_reinforce_loss_no_baseline(episode, discount_factor):
    """REINFORCE loss for one episode, without any baseline.

    Args:
        episode: Sequence of 5-tuples ``(s, a, log_p, s_next, reward)``; only
            ``log_p`` (a tensor) and ``reward`` are used.
        discount_factor: Multiplier applied to the return of the following step.

    Returns:
        Scalar tensor ``-sum(log_p_t * G_t)`` where ``G_t`` is the discounted
        return from step ``t`` to the end of the episode.
    """
    running_return = 0
    returns_backwards = []
    log_probs_backwards = []

    # Walk the episode from the last step to the first so every return can be
    # built from the one that follows it in time.
    for _, _, step_log_p, _, step_reward in reversed(episode):
        running_return = step_reward + discount_factor * running_return
        returns_backwards.append(running_return)
        log_probs_backwards.append(step_log_p)

    # Both sequences are in the same (reversed) order, so entries still pair up.
    stacked_log_probs = torch.stack(log_probs_backwards)
    returns = torch.FloatTensor(returns_backwards)

    weighted = stacked_log_probs * returns
    return -weighted.sum()

def run_episodes_no_baseline(model, env, num_episodes, discount_factor, learn_rate, init_temp = init_temperature): 
    """Train a policy with baseline-free REINFORCE, one Adam step per episode.

    Args:
        model: Policy network whose parameters are optimised.
        env: Environment handed to ``run_episode``.
        num_episodes: Number of episodes (and optimiser steps) to perform.
        discount_factor: Discount used when computing returns.
        learn_rate: Learning rate for the Adam optimiser.
        init_temp: Forwarded to ``run_episode`` together with the episode index
            and the notebook-level ``stochasticity``.

    Returns:
        Tuple ``(episode_durations, losses)`` of numpy arrays with one entry per
        episode: the episode length and the detached loss value.
    """
    optimizer = optim.Adam(model.parameters(), learn_rate)
    durations, loss_history = [], []

    for episode_idx in range(num_episodes):
        optimizer.zero_grad()

        trajectory = run_episode(env, model, episode_idx, init_temp, stochasticity)
        policy_loss = compute_reinforce_loss_no_baseline(trajectory, discount_factor)

        policy_loss.backward()
        optimizer.step()

        durations.append(len(trajectory))
        loss_history.append(policy_loss.detach().numpy())

    return np.asanyarray(durations), np.asanyarray(loss_history)

In [None]:
def opt_nobaseline(param_dict):
    """Hyperopt objective for REINFORCE without a baseline on CartPole-v1.

    Trains ``opt_num_runs`` freshly initialised policies (seeds 40, 41, ...) for
    the notebook-level ``num_episodes`` episodes each and averages the mean
    episode duration over the runs.

    Args:
        param_dict: Sampled hyperparameters with keys ``'policy_hiddens'``
            (cast to int), ``'discount_factor'``, ``'learn_rate'`` and
            ``'init_temp'``.

    Returns:
        The NEGATED average episode duration. ``fmin`` minimises its objective,
        while on CartPole a longer episode is a better one, so the sign flip is
        what makes the search prefer policies that survive longer.
    """
    print(param_dict)
    env = gym.make('CartPole-v1')
    opt_num_runs = 5
    result = 0
    for i in range(opt_num_runs):
        model = PolicyNetwork(input_dim=4, hidden_dim=int(param_dict['policy_hiddens']), output_dim=2)
        seed = 40 + i
        set_seeds(env, seed)

        episode_durations, _ = run_episodes_no_baseline(model, 
                                                        env, 
                                                        num_episodes, 
                                                        param_dict['discount_factor'], 
                                                        param_dict['learn_rate'],
                                                        init_temp = param_dict['init_temp'])
        # Mean episode length of this run.
        result += sum(episode_durations)/len(episode_durations)

        del model
    
    result /= opt_num_runs
    # The printed number is the (positive) mean episode duration, despite the label.
    print("average number of episodes:", result)
    # Negate: hyperopt minimises, we want to maximise episode duration.
    return -result

# Hyperparameter search space consumed by opt_nobaseline.
# NOTE: `space`, `trials` and `best` are rebound by the tuning cells of the other methods.
space = {
    'discount_factor': hp.uniform('discount_factor', 0.000001, 0.9999999),
    # hp.loguniform draws exp(uniform(low, high)), i.e. roughly 0.0067 .. 0.37 here.
    'learn_rate': hp.loguniform('learn_rate', -5, -1),
    # Sampled as a float in steps of 2; opt_nobaseline casts it to int.
    'policy_hiddens': hp.quniform('policy_hiddens', 16, 512, 2),
    'init_temp': hp.uniform('init_temp', 1, 2),
}

trials = Trials()
# fmin MINIMISES the value returned by the objective function.
best = fmin(fn=opt_nobaseline, space=space, algo=tpe.suggest, max_evals=25, trials=trials)
print("best run:", best)

In [None]:
# Final experiment: REINFORCE without a baseline on CartPole-v1, `num_runs` seeded runs.
env = gym.make('CartPole-v1')

# One numpy array per run, appended in run order.
no_episode_durations = []
no_policy_losses = []

# hyper parameter settings found through tuning
# NOTE: `discount_factor` and `learn_rate` rebind the notebook-level settings of the same name.
# NOTE(review): confirm these values come from a search that maximised episode duration;
# fmin minimises whatever the objective returns.
discount_factor = 0.7919018685923426
init_temp = 1.018128600121523
learn_rate = 0.21449972356736796
hidden_dim = 28


for i in range(num_runs):
    start_time = time()

    # Fresh policy per run; the seed is offset by the run index.
    model = PolicyNetwork(input_dim=4, hidden_dim=hidden_dim, output_dim=2)
    seed = 40 + i
    set_seeds(env, seed)

    episode_durations, policy_losses = run_episodes_no_baseline(model, 
                                                                env, 
                                                                num_episodes, 
                                                                discount_factor, 
                                                                learn_rate,
                                                                init_temp)
    
    no_episode_durations.append(episode_durations)
    no_policy_losses.append(policy_losses)
    
    del model
    
    # Report the wall-clock time of this run.
    end_time = time()
    h, m, s = get_running_time(end_time - start_time)
    
    print(f'Done with run {i+1}/{num_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds')

In [None]:
# Plot the no-baseline runs; the third positional argument is None here (the learned-baseline
# call passes its value losses in that slot).
plot_episodes_durations_losses(no_episode_durations, no_policy_losses, None, 'No baseline')

## REINFORCE w/ Learned Baseline (Value Network)

In [4]:
def compute_reinforce_loss_with_learned_baseline(value_model, episode, discount_factor, env):
    """REINFORCE loss for one episode using a value network as baseline.

    Args:
        value_model: Callable mapping a float tensor built from a state to a
            single-element tensor (the state-value estimate).
        episode: Sequence of 5-tuples ``(s, a, log_p, s_next, reward)``.
        discount_factor: Multiplier applied to the return of the following step.
        env: Unused; kept so existing callers keep working.

    Returns:
        Scalar tensor ``-sum(log_p_t * (G_t - V(s_t)))``. No gradient flows into
        ``value_model`` through this loss.
    """
    advantage_list = []
    log_p_list = []
    G = 0

    for s, a, log_p, s_next, reward in reversed(episode):
        G = reward + discount_factor * G

        # The baseline is a constant for the policy update. Evaluate it without an
        # autograd graph and reduce it to a Python float, so the advantages always
        # form a flat vector no matter whether the network returns a 0-d or a
        # shape-(1,) tensor. Otherwise the product below can silently broadcast.
        with torch.no_grad():
            baseline = value_model(torch.FloatTensor(s)).item()

        advantage_list.append(G - baseline)
        log_p_list.append(log_p)

    # Both lists are in reversed time order, so entries pair up elementwise.
    log_p_tensor = torch.stack(log_p_list)
    advantage_tensor = torch.FloatTensor(advantage_list)

    loss = - torch.sum(log_p_tensor * advantage_tensor)

    return loss

def compute_value_loss(value_model, episode, discount_factor, env):
    """L1 regression loss of the value network against Monte-Carlo returns.

    Args:
        value_model: Callable mapping a float tensor built from a state to a
            single-element tensor (the state-value estimate).
        episode: Sequence of 5-tuples ``(s, a, log_p, s_next, reward)``; only
            ``s`` and ``reward`` are used.
        discount_factor: Multiplier applied to the return of the following step.
        env: Unused; kept so existing callers keep working.

    Returns:
        Scalar tensor ``sum_t |G_t - V(s_t)|`` over the steps of the episode.
    """
    returns = []
    value_estimates = []
    G = 0

    for s, a, log_p, s_next, reward in reversed(episode):
        G = reward + discount_factor * G
        returns.append(G)

        value_estimates.append(value_model(torch.FloatTensor(s)))

    # Flatten to shape (N,). If the network returns shape-(1,) tensors, stacking
    # yields (N, 1), which would broadcast against the (N,) returns into an N x N
    # matrix and sum every return against every estimate.
    value_estimates_tensor = torch.stack(value_estimates).reshape(-1)
    returns_tensor = torch.FloatTensor(returns)

    loss = torch.sum(torch.abs(returns_tensor - value_estimates_tensor))

    return loss

def run_episodes_with_learned_baseline(policy_model, value_model, env, num_episodes, discount_factor, 
                                       learn_rate_policy, learn_rate_value, init_temp = init_temperature):
    """Train a policy and a value-network baseline, one Adam step each per episode.

    Args:
        policy_model: Policy network used to generate episodes.
        value_model: Value network used as baseline and trained alongside.
        env: Environment handed to ``run_episode`` and to the loss functions.
        num_episodes: Number of episodes to run.
        discount_factor: Discount used when computing returns.
        learn_rate_policy: Adam learning rate for ``policy_model``.
        learn_rate_value: Adam learning rate for ``value_model``.
        init_temp: Forwarded to ``run_episode`` together with the episode index
            and the notebook-level ``stochasticity``.

    Returns:
        Tuple ``(episode_durations, reinforce_losses, value_losses)`` of numpy
        arrays with one entry per episode.
    """
    policy_optimizer = optim.Adam(policy_model.parameters(), learn_rate_policy)
    value_optimizer = optim.Adam(value_model.parameters(), learn_rate_value)

    durations, policy_loss_log, value_loss_log = [], [], []

    for episode_idx in range(num_episodes):
        for optimizer in (policy_optimizer, value_optimizer):
            optimizer.zero_grad()

        trajectory = run_episode(env, policy_model, episode_idx, init_temp, stochasticity)
        # Both losses are computed before either network is updated.
        policy_loss = compute_reinforce_loss_with_learned_baseline(value_model, trajectory, discount_factor, env)
        critic_loss = compute_value_loss(value_model, trajectory, discount_factor, env)

        # Policy update first, then the value-network update.
        for step_loss, optimizer in ((policy_loss, policy_optimizer), (critic_loss, value_optimizer)):
            step_loss.backward()
            optimizer.step()

        durations.append(len(trajectory))
        policy_loss_log.append(policy_loss.detach().numpy())
        value_loss_log.append(critic_loss.detach().numpy())

    return np.asanyarray(durations), np.asanyarray(policy_loss_log), np.asanyarray(value_loss_log)

In [None]:
def opt_learned_baseline(param_dict):
    """Hyperopt objective for REINFORCE with a learned value baseline on CartPole-v1.

    Trains ``opt_num_runs`` freshly initialised policy/value network pairs
    (seeds 40, 41, ...) for the notebook-level ``num_episodes`` episodes each
    and averages the mean episode duration over the runs.

    Args:
        param_dict: Sampled hyperparameters with keys ``'policy_hiddens'`` and
            ``'value_hiddens'`` (both cast to int), ``'discount_factor'``,
            ``'learn_rate_policy'``, ``'learn_rate_value'`` and ``'init_temp'``.

    Returns:
        The NEGATED average episode duration. ``fmin`` minimises its objective,
        while on CartPole a longer episode is a better one, so the sign flip is
        what makes the search prefer policies that survive longer.
    """
    print(param_dict)
    env = gym.make('CartPole-v1')
    opt_num_runs = 5
    result = 0
    for i in range(opt_num_runs):
        policy_model = PolicyNetwork(input_dim=4, hidden_dim = int(param_dict['policy_hiddens']), output_dim=2)
        value_model = ValueNetwork(input_dim=4, hidden_dim = int(param_dict['value_hiddens']))
        seed = 40 + i
        set_seeds(env, seed)

        episode_durations, _, _ = run_episodes_with_learned_baseline(policy_model,
                                                                     value_model,
                                                                     env,
                                                                     num_episodes,
                                                                     param_dict['discount_factor'], 
                                                                     param_dict['learn_rate_policy'],
                                                                     param_dict['learn_rate_value'],
                                                                     init_temp = param_dict['init_temp'],)
        # Mean episode length of this run.
        result += sum(episode_durations)/len(episode_durations)

        del policy_model
        del value_model
    
    result /= opt_num_runs
    # The printed number is the (positive) mean episode duration, despite the label.
    print("average number of episodes:", result)
    # Negate: hyperopt minimises, we want to maximise episode duration.
    return -result

# Hyperparameter search space consumed by opt_learned_baseline.
# NOTE: rebinds `space`, `trials` and `best`, which the other tuning cells also assign.
space = {
    'discount_factor': hp.uniform('discount_factor', 0.000001, 0.9999999),
    # hp.loguniform draws exp(uniform(low, high)), i.e. roughly 0.0067 .. 0.37 here.
    'learn_rate_policy': hp.loguniform('learn_rate_policy', -5, -1),
    'learn_rate_value': hp.loguniform('learn_rate_value', -5, -1),
    # Sampled as floats in steps of 2; opt_learned_baseline casts them to int.
    'policy_hiddens': hp.quniform('policy_hiddens', 16, 512, 2),
    'value_hiddens': hp.quniform('value_hiddens', 16, 512, 2),
    'init_temp': hp.uniform('init_temp', 1, 2),
}

trials = Trials()
# fmin MINIMISES the value returned by the objective function.
best = fmin(fn=opt_learned_baseline, space=space, algo=tpe.suggest, max_evals=25, trials=trials)
print(best)

In [None]:
# Final experiment: REINFORCE with a learned value baseline on CartPole-v1, `num_runs` seeded runs.
env = gym.make('CartPole-v1')

# One numpy array per run, appended in run order.
learned_baseline_episode_durations = []
learned_baseline_policy_losses = []
learned_baseline_value_losses = []

# hyper parameter settings found through tuning
# NOTE: `discount_factor` and `init_temp` rebind names that other cells also assign.
# NOTE(review): confirm these values come from a search that maximised episode duration;
# fmin minimises whatever the objective returns.
discount_factor = 0.9067307370442468
init_temp = 1.9831668926885457
learn_rate_policy = 0.3266448140250817
learn_rate_value = 0.13129023566325262
hidden_dim_policy = 510
hidden_dim_value = 432

for i in range(num_runs):
    start_time = time()
    
    # Fresh networks per run; the seed is offset by the run index.
    policy_model = PolicyNetwork(input_dim=4, hidden_dim=hidden_dim_policy, output_dim=2)
    value_model = ValueNetwork(input_dim=4, hidden_dim=hidden_dim_value)
    seed = 40 + i
    set_seeds(env, seed)

    episode_durations, policy_losses, value_losses = run_episodes_with_learned_baseline(policy_model,
                                                                                        value_model,
                                                                                        env,
                                                                                        num_episodes,
                                                                                        discount_factor,
                                                                                        learn_rate_policy,
                                                                                        learn_rate_value,
                                                                                        init_temp)
    
    learned_baseline_episode_durations.append(episode_durations)
    learned_baseline_policy_losses.append(policy_losses)
    learned_baseline_value_losses.append(value_losses)
    
    del policy_model
    del value_model
    
    # Report the wall-clock time of this run.
    end_time = time()
    h, m, s = get_running_time(end_time - start_time)
    
    print(f'Done with run {i+1}/{num_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds')

In [None]:
# Plot the learned-baseline runs; this is the only plot call that also passes value losses.
plot_episodes_durations_losses(learned_baseline_episode_durations, learned_baseline_policy_losses, learned_baseline_value_losses, 'Learned baseline')

## REINFORCE w/ Self-Critic Baseline

In [5]:
def compute_reinforce_loss_with_SC_baseline(model, episode, discount_factor, env):    
    """REINFORCE loss for one episode with a self-critic baseline.

    For every step the baseline is ``sample_greedy_return(model, env,
    discount_factor, s)`` (presumably the return of a greedy rollout from
    ``s``; see ``utils`` to confirm).

    Args:
        model: Policy network, passed on to ``sample_greedy_return``.
        episode: Sequence of 5-tuples ``(s, a, log_p, s_next, reward)``.
        discount_factor: Multiplier applied to the return of the following step.
        env: Environment passed on to ``sample_greedy_return``.

    Returns:
        Scalar tensor ``-sum(log_p_t * (G_t - baseline_t))``.
    """
    running_return = 0
    advantages_backwards = []
    log_probs_backwards = []

    # Last step first, so each return builds on the one that follows it in time.
    # The baseline helper is called once per step, in this reversed order.
    for state, _, step_log_p, _, step_reward in reversed(episode):
        running_return = step_reward + discount_factor * running_return
        greedy_baseline = sample_greedy_return(model, env, discount_factor, state)

        advantages_backwards.append(running_return - greedy_baseline)
        log_probs_backwards.append(step_log_p)

    stacked_log_probs = torch.stack(log_probs_backwards)
    advantages = torch.FloatTensor(advantages_backwards)

    weighted = stacked_log_probs * advantages
    return - torch.sum(weighted)


def run_episodes_with_SC_baseline(model, env, num_episodes, discount_factor, learn_rate, init_temp = init_temperature):
    """Train a policy with the self-critic baseline, one Adam step per episode.

    Args:
        model: Policy network whose parameters are optimised.
        env: Environment handed to ``run_episode`` and to the loss function.
        num_episodes: Number of episodes (and optimiser steps) to perform.
        discount_factor: Discount used when computing returns.
        learn_rate: Learning rate for the Adam optimiser.
        init_temp: Forwarded to ``run_episode`` together with the episode index
            and the notebook-level ``stochasticity``.

    Returns:
        Tuple ``(episode_durations, policy_losses)`` of numpy arrays with one
        entry per episode.
    """
    optimizer = optim.Adam(model.parameters(), learn_rate)
    durations, loss_history = [], []

    for episode_idx in range(num_episodes):
        optimizer.zero_grad()

        trajectory = run_episode(env, model, episode_idx, init_temp, stochasticity)
        policy_loss = compute_reinforce_loss_with_SC_baseline(model, trajectory, discount_factor, env)

        policy_loss.backward()
        optimizer.step()

        durations.append(len(trajectory))
        loss_history.append(policy_loss.detach().numpy())

    return np.asanyarray(durations), np.asanyarray(loss_history)

In [None]:
def opt_SC_baseline(param_dict):
    """Hyperopt objective for REINFORCE with the self-critic baseline on CartPole-v1.

    Trains ``opt_num_runs`` freshly initialised policies (seeds 40, 41, ...) for
    the notebook-level ``num_episodes`` episodes each and averages the mean
    episode duration over the runs.

    Args:
        param_dict: Sampled hyperparameters with keys ``'policy_hiddens'``
            (cast to int), ``'discount_factor'``, ``'learn_rate_policy'`` and
            ``'init_temp'``.

    Returns:
        The NEGATED average episode duration. ``fmin`` minimises its objective,
        while on CartPole a longer episode is a better one, so the sign flip is
        what makes the search prefer policies that survive longer.
    """
    print(param_dict)
    env = gym.make('CartPole-v1')
    opt_num_runs = 5
    result = 0
        
    for i in range(opt_num_runs):
        policy_model = PolicyNetwork(input_dim=4, hidden_dim = int(param_dict['policy_hiddens']), output_dim=2)
        seed = 40 + i
        set_seeds(env, seed)

        episode_durations, _ = run_episodes_with_SC_baseline(policy_model,
                                                            env,
                                                            num_episodes,
                                                            param_dict['discount_factor'], 
                                                            param_dict['learn_rate_policy'],
                                                            init_temp = param_dict['init_temp'],)    
        # Mean episode length of this run.
        result += sum(episode_durations)/len(episode_durations)

        del policy_model
    
    result /= opt_num_runs
    # The printed number is the (positive) mean episode duration, despite the label.
    print("average number of episodes:", result)
    # Negate: hyperopt minimises, we want to maximise episode duration.
    return -result

# Hyperparameter search space consumed by opt_SC_baseline. The discount-factor and
# temperature ranges are narrower than in the other two tuning cells.
# NOTE: rebinds `space`, `trials` and `best`, which the other tuning cells also assign.
space = {
    'discount_factor': hp.uniform('discount_factor', 0.97, 0.9999999),
    # hp.loguniform draws exp(uniform(low, high)), i.e. roughly 0.0067 .. 0.37 here.
    'learn_rate_policy': hp.loguniform('learn_rate_policy', -5, -1),
    # Sampled as a float in steps of 2; opt_SC_baseline casts it to int.
    'policy_hiddens': hp.quniform('policy_hiddens', 16, 512, 2),
    'init_temp': hp.uniform('init_temp', 1, 1.3),
}

trials = Trials()
# fmin MINIMISES the value returned by the objective function.
best = fmin(fn=opt_SC_baseline, space=space, algo=tpe.suggest, max_evals=25, trials=trials)
print(best)

{'discount_factor': 0.9990030370482987, 'init_temp': 1.0623639255845339, 'learn_rate_policy': 0.04699806551504531, 'policy_hiddens': 50.0}
average number of episodes:                                                                                            
36.6546                                                                                                                
{'discount_factor': 0.9813725603397145, 'init_temp': 1.1173355838842431, 'learn_rate_policy': 0.05530423927049258, 'policy_hiddens': 400.0}
average number of episodes:                                                                                            
10.345600000000001                                                                                                     
{'discount_factor': 0.9822410077379976, 'init_temp': 1.250339814041185, 'learn_rate_policy': 0.2031167875849609, 'policy_hiddens': 86.0}
average number of episodes:                                                                             

In [None]:
# Final experiment: REINFORCE with the self-critic baseline on CartPole-v1, `num_runs` seeded runs.
env = gym.make('CartPole-v1')

# One numpy array per run, appended in run order.
sc_baseline_episode_durations = []
sc_baseline_policy_losses = []

# hyper parameter settings found through tuning
# NOTE(review): these four values are identical to the learned-baseline settings and lie
# outside the self-critic search space (discount_factor in [0.97, 1), init_temp in [1, 1.3]);
# confirm they really come from the self-critic tuning run.
discount_factor = 0.9067307370442468
init_temp = 1.9831668926885457
learn_rate = 0.3266448140250817
hidden_dim = 510

for i in range(num_runs):
    start_time = time()
    
    # Fresh policy per run; the seed is offset by the run index.
    policy_model = PolicyNetwork(input_dim=4, hidden_dim =hidden_dim, output_dim=2)
    seed = 40 + i
    set_seeds(env, seed)

    # Pass `init_temp` explicitly, as the other experiment cells do; without it the
    # function falls back to the default `init_temperature` and the value set above
    # is never used.
    episode_durations, policy_losses = run_episodes_with_SC_baseline(policy_model, 
                                                                     env, 
                                                                     num_episodes, 
                                                                     discount_factor, 
                                                                     learn_rate,
                                                                     init_temp)
    
    sc_baseline_episode_durations.append(episode_durations)
    sc_baseline_policy_losses.append(policy_losses)
    
    del policy_model
    
    # Report the wall-clock time of this run.
    end_time = time()
    h, m, s = get_running_time(end_time - start_time)
    
    print(f'Done with run {i+1}/{num_runs} in {f"{h} hours, " if h else ""}{f"{m} minutes and " if m else ""}{s} seconds')

In [None]:
# Plot the self-critic runs; the third positional argument is None here (the learned-baseline
# call passes its value losses in that slot).
plot_episodes_durations_losses(sc_baseline_episode_durations, sc_baseline_policy_losses, None, 'Self-Critic baseline')

## Comparison of baselines

In [None]:
# Collect the per-run results of the three methods for a side-by-side plot.
# Requires all three experiment cells above to have been run in this kernel session.
# The learned baseline's value losses are not included here.
baselines_dict = {
    'No baseline': {
        'episode_durations': no_episode_durations,
        'policy_losses': no_policy_losses,
        'color': 'red'
    }, 'Learned baseline': {
        'episode_durations': learned_baseline_episode_durations,
        'policy_losses': learned_baseline_policy_losses,
        'color': 'green'
    }, 'Self-Critic baseline': {
        'episode_durations': sc_baseline_episode_durations,
        'policy_losses': sc_baseline_policy_losses,
        'color': 'blue'
    }
}

compare_baselines_plot(baselines_dict)