# Advantage Actor Critic for Wazuhl Environment

In this notebook we implement A2C algorithm for Wazuhl Environment.

In [10]:
import logging
logging.getLogger().setLevel(logging.WARNING)

In [11]:
import random
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
import utils

In [16]:
from models.a2c import ActorCritic

In [13]:
env = gym.make('wazuhl_gym:wazuhl-v0')

In [14]:
actions = utils.get_possible_actions(env)

In [17]:
actions = actions + ["terminal"] # we need this for A2C

Apply Actor-Critic for Wazuhl environment:

In [22]:
max_episodes = 3
num_steps = 10000

In [25]:
num_inputs = len(actions)
num_outputs = env.action_space.n
   
actor_critic = ActorCritic(num_inputs, num_outputs)
ac_optimizer = optim.Adam(actor_critic.parameters(), lr=1e-4)

all_lengths = []
average_lengths = []
all_rewards = []
entropy_term = 0

for episode in range(max_episodes):
    log_probs = []
    values = []
    rewards = []

    state, reward, done, _ = env.reset()
    for steps in range(num_steps):
        value, policy_dist = actor_critic.forward(torch.tensor(state, dtype=torch.float32))
        value = value.detach().numpy()[0,0]
        dist = policy_dist.detach().numpy() 

        action = np.random.choice(num_outputs, p=np.squeeze(dist))
        log_prob = torch.log(policy_dist.squeeze(0)[action])
        entropy = -np.sum(np.mean(dist) * np.log(dist))
        new_state, reward, done, _ = env.step(action)

        rewards.append(reward)
        values.append(value)
        log_probs.append(log_prob)
        entropy_term += entropy
        state = new_state
            
        if done or steps == num_steps-1:
            Qval, _ = actor_critic.forward(new_state)
            Qval = Qval.detach().numpy()[0,0]
            all_rewards.append(np.sum(rewards))
            all_lengths.append(steps)
            average_lengths.append(np.mean(all_lengths[-10:]))
            if episode % 10 == 0:                    
                sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
            break
        
    # compute Q values
    Qvals = np.zeros_like(values)
    for t in reversed(range(len(rewards))):
        Qval = rewards[t] + GAMMA * Qval
        Qvals[t] = Qval
  
    #update actor critic
    values = torch.FloatTensor(values)
    Qvals = torch.FloatTensor(Qvals)
    log_probs = torch.stack(log_probs)
        
    advantage = Qvals - values
    actor_loss = (-log_probs * advantage).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

    ac_optimizer.zero_grad()
    ac_loss.backward()
    ac_optimizer.step()

smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean()
smoothed_rewards = [elem for elem in smoothed_rewards]
plt.plot(all_rewards)
plt.plot(smoothend_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

plt.plot(all_lengths)
plt.plot(average_lengths)
plt.xlabel('Episode')
plt.ylabel('Episode length')
plt.show()

RuntimeError: size mismatch, m1: [1 x 4853], m2: [102 x 256] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:961