In [2]:
import gym 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical, Normal
import time
import numpy as np

from policy_gradients import PolicyGradients

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')

env = gym.make('Pendulum-v1')
#env = gym.make('CartPole-v1')

continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_states = env.observation_space.shape[0]
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

class Policy(nn.Module):

    def __init__(self, dim_states, dim_actions, continuous_control):
        super(Policy, self).__init__()
        # MLP, fully connected layers, ReLU activations, linear ouput activation
        # dim_states -> 64 -> 64 -> dim_actions

        self._continuous_control = continuous_control

        self.fc1 = nn.Linear(dim_states, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, dim_actions)

        if continuous_control:
            # trainable parameter
            self._log_std = torch.tensor([-0.5] * dim_actions)
            self._log_std = nn.Parameter(self._log_std)

    def forward(self, input):
        
        input = F.relu(self.fc1(input))
        input = F.relu(self.fc2(input))
        
        if self._continuous_control:
            mean = self.fc3(input)
            std = torch.exp(self._log_std)
            return mean, std
        else:
            probs = F.softmax(self.fc3(input), dim = 1)
            return probs
        

policy = Policy(dim_states=dim_states, dim_actions=dim_actions, continuous_control=continuous_control).to(device)

obs_t = env.reset()
obs_t = torch.from_numpy(obs_t).float().unsqueeze(0).to(device)

if continuous_control:
    mean, std = policy(obs_t)
    distr = Normal(mean, std)
    
else:
    probs = policy(obs_t)
    distr = Categorical(probs)

action = distr.sample()
log_prob = distr.log_prob(action) # log probability of action

action

device: cpu


tensor([[-0.2833]])

In [44]:
def perform_single_rollout(env, agent, episode_nb, render=False):

    # Modify this function to return a tuple of numpy arrays containing (observations, actions, rewards).
    # (np.array(obs), np.array(acs), np.array(rws))
    # np.array(obs) -> shape: (time_steps, nb_obs)
    # np.array(acs) -> shape: (time_steps, nb_acs) if actions are continuous, (time_steps,) if actions are discrete
    # np.array(rws) -> shape: (time_steps,)

    ob_t = env.reset()
    
    done = False
    episode_reward = 0
    nb_steps = 0
    
    obs, acs, rws = [], [], []

    while not done:

        if render:
            env.render()
            time.sleep(1. / 60)

        action, pr = agent.select_action(ob_t) # cambiar a que solo reciba la accion?

        ob_t1, reward, done, _ = env.step(action)

        ob_t = np.squeeze(ob_t1) # <-- may not be needed depending on gym version
        episode_reward += reward
        
        nb_steps += 1

        obs.append(ob_t)
        acs.append(action)
        rws.append(reward)

        if done:
            
            obs = np.array(obs)
            acs = np.array(acs)
            rws = np.array(rws)
            
            assert obs.shape == (nb_steps, dim_states), 'shape of np.array(obs) is not (time_steps, nb_obs)'
            if continuous_control:
                assert acs.shape == (nb_steps, dim_actions), 'shape of np.array(acs) is not (time_steps, nb_acs)'
            else:
                assert acs.shape == (nb_steps,), 'shape of np.array(acs) is not (time_steps,)'
            assert rws.shape == (nb_steps,), 'shape of np.array(rws) is not (time_steps,)'
            
            return obs, acs, rws
        
        
env = gym.make('Pendulum-v1')
#env = gym.make('CartPole-v1')
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_states = env.observation_space.shape[0]
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

agent = PolicyGradients(dim_states = dim_states, dim_actions = dim_actions, lr = 1e-3, gamma = .99, 
                        continuous_control = continuous_control)

perform_single_rollout(env, agent, 1)

(array([[ 0.85372853, -0.5207184 , -1.3495443 ],
        [ 0.8104147 , -0.5858567 , -1.5648906 ],
        [ 0.75089324, -0.66042364, -1.9089212 ],
        [ 0.6659998 , -0.7459519 , -2.4116044 ],
        [ 0.54512584, -0.83835423, -3.0458825 ],
        [ 0.39306608, -0.91951025, -3.4515097 ],
        [ 0.20258002, -0.9792657 , -3.999437  ],
        [-0.0291538 , -0.99957496, -4.662996  ],
        [-0.29380226, -0.9558662 , -5.3808866 ],
        [-0.5702182 , -0.82149327, -6.171384  ],
        [-0.8139288 , -0.5809646 , -6.8822303 ],
        [-0.9661921 , -0.2578232 , -7.182896  ],
        [-0.9955992 ,  0.09371343, -7.092394  ],
        [-0.901505  ,  0.43276867, -7.074211  ],
        [-0.7148371 ,  0.69929105, -6.5368705 ],
        [-0.4816986 ,  0.87633693, -5.8759828 ],
        [-0.24440828,  0.96967244, -5.1136513 ],
        [-0.02434369,  0.99970365, -4.4512663 ],
        [ 0.15560737,  0.98781896, -3.6117675 ],
        [ 0.30180037,  0.95337117, -3.006765  ],
        [ 0.40891096

In [82]:
def sample_rollouts(env, agent, training_iter, min_batch_steps):

    sampled_rollouts = []
    total_nb_steps = 0
    episode_nb = 0
    
    while total_nb_steps < min_batch_steps:

        episode_nb += 1
        render = training_iter%10 == 0 and len(sampled_rollouts) == 0 # Change training_iter%10 to any number you want

        # Use perform_single_rollout to get data 
        # Uncomment once perform_single_rollout works.
        # Return sampled_rollouts
        
        """
        sample_rollout = perform_single_rollout(env, agent, episode_nb, render=render)
        total_nb_steps += len(sample_rollout[0])

        sampled_rollouts.append(sample_rollout)
        """
        
        sample_rollout = perform_single_rollout(env, agent, episode_nb, render=render)
        total_nb_steps += len(sample_rollout[0])

        sampled_rollouts.append(sample_rollout)
        
    return sampled_rollouts

In [288]:
_use_reward_to_go = False
_use_baseline = False
_gamma = 0.99

def estimate_returns(rollouts_rew):
    estimated_returns = []
    for rollout_rew in rollouts_rew:
            
        if _use_reward_to_go:
            # only for part 2
            estimated_return = None
        else:
            estimated_return = [rollout_rew[t] * (_gamma ** t) for t in range(len(rollout_rew))]
        
        estimated_returns = np.concatenate([estimated_returns, estimated_return])

    if _use_baseline:
        # only for part 2
        average_return_baseline = None
        # Use the baseline:
        #estimated_returns -= average_return_baseline

    return np.array(estimated_returns, dtype=np.float32)

In [292]:
env = gym.make('Pendulum-v1')
#env = gym.make('CartPole-v1')
continuous_control = isinstance(env.action_space, gym.spaces.Box)
dim_states = env.observation_space.shape[0]
dim_actions = env.action_space.shape[0] if continuous_control else env.action_space.n

training_iterations = 1000
min_batch_steps = 5000

policy = Policy(dim_states=dim_states, dim_actions=dim_actions, continuous_control=continuous_control).to(device)

agent = PolicyGradients(dim_states = dim_states, dim_actions = dim_actions, lr = 1e-3, gamma = .99, 
                        continuous_control = continuous_control)

optimizer = torch.optim.Adam(params = policy.parameters(), lr = 1e-3)

output = sample_rollouts(env, agent, training_iter=training_iterations, min_batch_steps=min_batch_steps)

sampled_obs = [output[i][0] for i in range(len(output))]
sampled_acs = [output[i][1] for i in range(len(output))]
sampled_rew = [output[i][2] for i in range(len(output))]

rewards = sampled_rew[0]
gamma = 0.99

estimated_returns = estimate_returns(sampled_rew)
estimated_returns.shape

(5000,)

: 

In [4]:
device

device(type='cpu')

In [6]:
a = np.array([1, 2])

torch.from_numpy(a).to(device)

tensor([1, 2])

In [287]:
rollout_obs = np.concatenate(sampled_obs)
rollout_acs = np.concatenate(sampled_acs)

rollout_obs = torch.from_numpy(rollout_obs)
rollout_acs = torch.from_numpy(rollout_acs)

if continuous_control:
    mean, std = policy(rollout_obs)
    distr = Normal(mean, std) # 200 mean, 1 std
else:
    probs = policy(rollout_obs) #.cpu()
    distr = Categorical(probs)

log_probs = distr.log_prob(rollout_acs) # return log_prob for each pair mean-action
log_probs = log_probs.squeeze()

loss = log_probs * torch.from_numpy(estimated_returns)

loss = loss.mean()

optimizer.zero_grad()
loss.backward()
optimizer.step()

In [12]:
c

tensor([-3, -8])

In [17]:
a = torch.tensor([1, 2]).to(device)
b = torch.tensor([3, 4]).to(device)

c = torch.multiply(-a, b).float()

torch.mean(c)

tensor(-5.5000)