In [1]:
from unityagents import UnityEnvironment
from collections import deque

import numpy as np
import progressbar as pb

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.distributions import normal

In [2]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
import gym

# Gym id of the environment this notebook trains on.
ENV_ID = 'LunarLanderContinuous-v2'
env = gym.make(ENV_ID)

In [4]:
# Problem dimensions: the first axis length of the observation / action spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
# A single Gym environment is driven here, so there is exactly one agent.
num_agents = 1

In [5]:
# Normal
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py
# FixedNormal = torch.distributions.Normal

# log_prob_normal = FixedNormal.log_prob
# FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True)

# normal_entropy = FixedNormal.entropy
# FixedNormal.entropy = lambda self: normal_entropy(self).sum(-1)
# FixedNormal.mode = lambda self: self.mean

class Policy(nn.Module):
    """Gaussian policy: maps a state to a ``Normal`` distribution over actions.

    Two ReLU hidden layers feed two linear heads. The ``mu`` head is squashed
    with ``tanh`` (mean in (-1, 1)); the ``sigma`` head is squashed with
    ``sigmoid`` (std in (0, 1)) and is only used when ``fixed_sigma`` is None.

    Args:
        state_size: number of input features per state.
        action_size: number of action dimensions.
        n_agents: accepted for interface compatibility; not used by the network.
        fc1_size: width of the first hidden layer.
        fc2_size: width of the second hidden layer.
        fixed_sigma: constant standard deviation of the returned distribution.
            Defaults to 0.01. Pass ``None`` to use the learned ``fc3_sigma``
            head instead.
    """

    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=256, fc2_size=256,
                 fixed_sigma=0.01):
        super(Policy, self).__init__()

        self.fixed_sigma = fixed_sigma

        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3_mu = nn.Linear(fc2_size, action_size)
        self.fc3_sigma = nn.Linear(fc2_size, action_size)

        # Both output heads start from constant weights/biases rather than the
        # default random initialisation.
        self.fc3_mu.bias.data.fill_(0.3)
        self.fc3_mu.weight.data.fill_(0.1)

        self.fc3_sigma.bias.data.fill_(0.5)
        self.fc3_sigma.weight.data.fill_(0.3)

    def forward(self, state):
        """Return the action distribution for ``state``.

        Args:
            state: float tensor whose last dimension is ``state_size``.

        Returns:
            torch.distributions.normal.Normal with mean ``tanh(fc3_mu(h))`` and
            standard deviation ``fixed_sigma`` (or the learned sigma when
            ``fixed_sigma`` is None).
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))

        mu = torch.tanh(self.fc3_mu(x))

        # Only the `normal` module is imported at the top of the notebook, so
        # the distribution class must be referenced as `normal.Normal`.
        if self.fixed_sigma is not None:
            return normal.Normal(mu, self.fixed_sigma)

        # Clamp away from zero: Normal requires a strictly positive scale and
        # sigmoid can underflow to 0 for very negative inputs.
        sigma = torch.sigmoid(self.fc3_sigma(x)).clamp(min=1e-6)
        return normal.Normal(mu, sigma)
    
class Value(nn.Module):
    """State-value network: two ReLU hidden layers followed by a scalar head.

    ``action_size`` and ``n_agents`` are accepted so the constructor matches
    the policy network's signature; neither is used to build the layers.
    """

    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=400, fc2_size=300):
        super().__init__()

        # Layers are created input -> output so the attribute names
        # (fc1, fc2, fc3) line up with the data flow in forward().
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)

    def forward(self, x):
        """Return the value estimate for ``x`` with a trailing dimension of 1."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

In [6]:
def interact(action):
    """Apply ``action`` to the global ``env`` for one step.

    The action is reshaped to ``(action_size,)`` before being passed to
    ``env.step``. The fourth value returned by ``env.step`` is discarded.

    Returns:
        tuple: ``(next_state, reward, done)``, each reshaped to
        ``(num_agents, -1)``.
    """
    flat_action = action.reshape(action_size)
    observation, step_reward, finished, _ = env.step(flat_action)
    # env.render()
    per_agent = (num_agents, -1)
    return (
        observation.reshape(per_agent),
        np.array(step_reward).reshape(per_agent),
        np.array(finished).reshape(per_agent),
    )

def reset():
    """Reset the global ``env`` and return its first observation.

    Returns:
        The value returned by ``env.reset()``, reshaped to ``(num_agents, -1)``.
    """
    return env.reset().reshape(num_agents, -1)

In [7]:
from agent import Agent

# Build the learning agent. The Policy and Value *classes* (not instances) are
# handed over together with the environment dimensions and the torch device.
# NOTE(review): the output saved with this cell reads
# "TypeError: __init__() missing 1 required positional argument: 'q_network'",
# presumably raised by this call -- confirm Agent's signature in agent.py and
# pass whatever it expects for `q_network`.
agent = Agent(
    state_size=state_size, 
    action_size=action_size,
    policy_network=Policy,
    value_network=Value,
    n_agents=num_agents, 
    device=device,
)

def run(fn_interact=None, fn_reset=None, agent=None, n_episodes=1000, t_max=300,
        print_every=100, timer=None):
    """Roll out training episodes and return the score of each one.

    Every step asks ``agent.act`` for actions, applies them through
    ``fn_interact``, and reports the transition to ``agent.step``. An episode
    ends after ``t_max`` steps or as soon as any entry of ``dones`` is truthy.

    Args:
        fn_interact: callable ``actions -> (next_states, rewards, dones)``.
            When None, the notebook-level ``interact`` is used.
        fn_reset: callable returning the initial states of an episode.
            When None, the notebook-level ``reset`` is used.
        agent: object exposing ``act(states)`` and
            ``step(states, actions, rewards, next_states, dones)``.
            When None, the notebook-level ``agent`` is used.
        n_episodes: number of episodes to run.
        t_max: maximum number of steps per episode.
        print_every: print the rolling average every this many episodes;
            0 or None disables printing.
        timer: optional progress object; ``timer.update(i_episode)`` is called
            after each episode when it is provided.

    Returns:
        list of float: one score per episode, where a score is the sum over
        steps of the mean of that step's ``rewards``.
    """
    # Fall back to the notebook globals so that a call which only passes the
    # keyword settings, e.g. run(t_max=300, n_episodes=1000), still works.
    notebook_globals = globals()
    if fn_interact is None:
        fn_interact = notebook_globals["interact"]
    if fn_reset is None:
        fn_reset = notebook_globals["reset"]
    if agent is None:
        agent = notebook_globals["agent"]

    scores = []                        # score of every episode, in order
    scores_window = deque(maxlen=200)  # rolling window of the last 200 scores
    for i_episode in range(1, n_episodes + 1):
        states = fn_reset()
        score = 0.0
        for _ in range(t_max):
            actions = agent.act(states)
            next_states, rewards, dones = fn_interact(actions)
            agent.step(states, actions, rewards, next_states, dones)

            # Advance to the new states so the next action is not chosen from
            # the stale initial observation.
            states = next_states
            # Collapse the per-agent rewards to a scalar so `scores` is a flat
            # list of floats.
            score += float(np.mean(rewards))
            if np.any(dones):
                break

        scores_window.append(score)
        scores.append(score)
        if print_every and i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))

        if timer is not None:
            timer.update(i_episode)
    return scores

TypeError: __init__() missing 1 required positional argument: 'q_network'

In [None]:
%time scores = run(t_max=300, n_episodes=int(1000), print_every=100)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()