In [None]:
import gym
import pybullet_envs
import torch as T
import torch.nn as nn
import pickle
from torch.optim import Adam
import itertools
import numpy as np
import scipy.signal

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.normal import Normal

import dmc2gym
import os
import pandas as pd

In [None]:
def combined_shape(length, shape=None):
    """Return a shape tuple whose leading dimension is ``length``.

    Args:
        length: Size of the leading dimension.
        shape: ``None``, a scalar, or an iterable of trailing dimensions.

    Returns:
        ``(length,)`` when ``shape`` is ``None``, ``(length, shape)`` when
        ``shape`` is a scalar, otherwise ``(length, *shape)``.
    """
    if shape is None:
        return (length,)
    if np.isscalar(shape):
        return (length, shape)
    return (length,) + tuple(shape)

def mlp(sizes, activation, output_activation=nn.Identity):
    """Build a fully connected network as an ``nn.Sequential``.

    Args:
        sizes: Layer widths, input width first and output width last.
        activation: Module class instantiated after every hidden linear layer.
        output_activation: Module class instantiated after the final linear
            layer (defaults to ``nn.Identity``).

    Returns:
        ``nn.Sequential`` alternating ``nn.Linear`` layers and activations.
    """
    final_idx = len(sizes) - 2
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        act_cls = activation if idx < final_idx else output_activation
        modules.append(act_cls())
    return nn.Sequential(*modules)

def count_vars(module):
    """Return the total number of scalar entries across ``module.parameters()``."""
    total = 0
    for param in module.parameters():
        total += np.prod(param.shape)
    return total


# Bounds applied to the predicted log standard deviation before exponentiating.
LOG_STD_MAX = 2
LOG_STD_MIN = -20

class SquashedGaussianMLPActor(nn.Module):
    """Stochastic policy: an MLP-parameterised Gaussian squashed through tanh.

    The shared trunk ``net`` feeds two linear heads that produce the mean and
    the log standard deviation of a diagonal Gaussian. Samples are passed
    through ``tanh`` and scaled by ``act_limit``.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit):
        """
        Args:
            obs_dim: Width of the observation vector.
            act_dim: Width of the action vector.
            hidden_sizes: Widths of the hidden layers of the trunk.
            activation: Module class used after every trunk layer
                (including the last one).
            act_limit: Scale applied to the tanh-squashed action.
        """
        super().__init__()
        self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation)
        self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim)
        self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim)
        self.act_limit = act_limit

    def forward(self, obs, deterministic=False, with_logprob=True):
        """Compute an action and, optionally, its log-probability.

        Args:
            obs: Observation tensor whose last dimension is ``obs_dim``;
                leading batch dimensions are optional.
            deterministic: If true, use the Gaussian mean instead of sampling.
            with_logprob: If true, also return the log-probability of the
                squashed action; otherwise ``None`` is returned in its place.

        Returns:
            ``(pi_action, logp_pi)`` where ``pi_action`` lies in
            ``[-act_limit, act_limit]`` and ``logp_pi`` has the shape of
            ``obs`` without its last dimension (or is ``None``).
        """
        net_out = self.net(obs)
        mu = self.mu_layer(net_out)
        log_std = self.log_std_layer(net_out)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)

        # Pre-squash distribution and sample
        pi_distribution = Normal(mu, std)
        if deterministic:
            # Only used for evaluating policy at test time.
            pi_action = mu
        else:
            pi_action = pi_distribution.rsample()

        if with_logprob:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            # NOTE: The correction formula is a numerically stable equivalent of
            # Eq. 21 in appendix C of the original SAC paper (arXiv 1801.01290).
            logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
            # Reduce over the LAST axis, like the line above, so that unbatched
            # (1-D) observations work as well as batched ones. Reducing over
            # axis 1 raised an IndexError for 1-D inputs.
            logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=-1)
        else:
            logp_pi = None

        pi_action = torch.tanh(pi_action)
        pi_action = self.act_limit * pi_action

        return pi_action, logp_pi


class MLPQFunction(nn.Module):
    """Action-value network: maps a concatenated (obs, act) pair to one value."""

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        """
        Args:
            obs_dim: Width of the observation vector.
            act_dim: Width of the action vector.
            hidden_sizes: Widths of the hidden layers.
            activation: Module class used after every hidden layer.
        """
        super().__init__()
        layer_sizes = [obs_dim + act_dim, *hidden_sizes, 1]
        self.q = mlp(layer_sizes, activation)

    def forward(self, obs, act):
        """Return Q(obs, act) with the trailing size-1 dimension removed."""
        joint_input = torch.cat((obs, act), dim=-1)
        value = self.q(joint_input)
        # Dropping the last dimension is essential: callers expect one scalar
        # per input row, not a column vector.
        return value.squeeze(-1)

class MLPActorCritic(nn.Module):
    """Bundle of one squashed-Gaussian policy (``pi``) and two Q-networks
    (``q1`` and ``q2``) that share the same hidden layout."""

    def __init__(self, observation_space, action_space, hidden_sizes=(256,256),
                 activation=nn.ReLU):
        """
        Args:
            observation_space: Object exposing ``.shape``; its first entry is
                used as the observation width.
            action_space: Object exposing ``.shape`` and ``.high``; the first
                entries give the action width and the action scale.
            hidden_sizes: Hidden layer widths used by all three networks.
            activation: Module class used as hidden activation.
        """
        super().__init__()

        n_obs = observation_space.shape[0]
        n_act = action_space.shape[0]
        # Only the first component of ``high`` is read, so the same bound is
        # applied to every action dimension.
        max_action = action_space.high[0]

        # Policy first, then the two critics (keeps registration order).
        shared_args = (n_obs, n_act, hidden_sizes, activation)
        self.pi = SquashedGaussianMLPActor(*shared_args, max_action)
        self.q1 = MLPQFunction(*shared_args)
        self.q2 = MLPQFunction(*shared_args)

    def act(self, obs, deterministic=False):
        """Return the policy's action for ``obs`` as a NumPy array.

        Runs without gradient tracking and skips the log-prob computation.
        """
        with torch.no_grad():
            action, _ = self.pi(obs, deterministic, False)
        return action.numpy()

In [None]:
# Seeds of the training runs to evaluate, kept as strings because they are
# concatenated directly onto the run-directory path.
seeds = [
    '123',
    '666',
    '742',
    '637',
    '4637',
]

In [None]:
# ---------------------------------------------------------------------------
# For every training seed and every saved checkpoint, roll out the
# deterministic policy, compute Monte-Carlo estimates of the discounted,
# undiscounted and soft returns, compare them with the Q-values predicted at
# the first state of each episode, and write one CSV per seed.
# ---------------------------------------------------------------------------
GAMMA = 0.99                  # discount factor for the discounted and soft returns
ALPHA = 0.2                   # weight of -log pi in the soft return; presumably the
                              # entropy temperature used in training -- TODO confirm
NUM_EVAL_EPISODES = 20        # episodes rolled out per checkpoint
MAX_EP_LEN = 1000             # hard cap on the number of steps per episode
CHECKPOINT_INTERVAL = 10000   # step distance between consecutive checkpoints
FINAL_STEP = 2000000          # step whose model is read from pyt_save/model.pt
RESULT_COLUMNS = ['Checkpoint', 'Return', 'Undiscounted Return', 'Soft Return', 'Q1',
                  'Q2', 'AvgQ', 'MSE', 'MAE']


def _load_actor_critic(run_dir, step, env, ac_kwargs):
    """Load the actor-critic saved at ``step`` inside ``run_dir``.

    The model for ``FINAL_STEP`` is stored as a whole pickled object; all
    other steps are stored as checkpoints holding a state dict under the key
    ``'ac_state_dict'``. Only the network weights are restored: optimizer
    state is irrelevant for evaluation.

    NOTE: ``T.load`` unpickles the file, so only point this at run
    directories you trust.
    """
    if step == FINAL_STEP:
        return T.load(run_dir + "/pyt_save/model.pt")

    model = MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs)
    checkpoint = T.load(run_dir + "/checkpoints/model_checkpoint_{}.tar".format(step))
    model.load_state_dict(checkpoint['ac_state_dict'])
    model.eval()
    return model


def _run_eval_episode(ac, env):
    """Roll out one episode with the deterministic policy.

    Returns:
        ``(discounted_return, undiscounted_return, soft_return, q1, q2)``
        where ``q1``/``q2`` are the critics' predictions for the first
        state-action pair of the episode.
    """
    rewards, log_probs = [], []
    q1_start = q2_start = None
    o, d, ep_len = env.reset(), False, 0

    # Nothing here is trained, so no autograd graph is needed. Without
    # no_grad every stored log-prob would keep its graph alive.
    with T.no_grad():
        while not (d or ep_len == MAX_EP_LEN):
            obs = T.as_tensor(o, dtype=T.float32)
            obs_batch = obs.view(1, -1)

            # Take deterministic actions at test time
            action = ac.act(obs, deterministic=True)

            if ep_len == 0:
                act_batch = T.as_tensor(action, dtype=T.float32).view(1, -1)
                q1_start = ac.q1(obs_batch, act_batch).item()
                q2_start = ac.q2(obs_batch, act_batch).item()

            # Log-prob of a freshly SAMPLED action at this state (not of the
            # deterministic action that is executed); it feeds the entropy
            # term of the soft return.
            _, log_prob = ac.pi(obs_batch)
            o, r, d, _ = env.step(action)

            rewards.append(r)
            log_probs.append(log_prob.item())
            ep_len += 1

    # Forward pass: discounted and undiscounted returns.
    ep_ret, undisc_ep_ret, scale = 0, 0, 1.0
    for r in rewards:
        ep_ret += r * scale
        scale *= GAMMA
        undisc_ep_ret += r

    # Backward pass: soft return Q_t = r_t + GAMMA * (Q_{t+1} - ALPHA * logp_{t+1}).
    soft_ret, bootstrap = 0.0, 0.0
    for r, logp in zip(reversed(rewards), reversed(log_probs)):
        soft_ret = r + GAMMA * bootstrap
        bootstrap = soft_ret - ALPHA * logp

    return ep_ret, undisc_ep_ret, float(soft_ret), q1_start, q2_start


test_env = dmc2gym.make(domain_name='walker', task_name='stand')
ac_kwargs = {}

for seed in seeds:
    dir_path = "data/sac_dmWalkerStand_256/sac_dmWalkerStand_256_s" + seed

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when used inside a loop.
    records = []

    for i in range(CHECKPOINT_INTERVAL, FINAL_STEP + 1, CHECKPOINT_INTERVAL):
        print(i)

        ac2 = _load_actor_critic(dir_path, i, test_env, ac_kwargs)

        episodes = [_run_eval_episode(ac2, test_env) for _ in range(NUM_EVAL_EPISODES)]
        ep_rets, undisc_ep_rets, soft_ep_rets, q1s, q2s = (list(col) for col in zip(*episodes))
        aveqs = [(q1 + q2) / 2 for q1, q2 in zip(q1s, q2s)]

        # NOTE(review): 'MSE' and 'MAE' are the squared / absolute gap between
        # the MEAN soft return and the MEAN Q-value, not per-episode errors
        # averaged afterwards. Kept as in the original analysis.
        mean_gap = np.mean(soft_ep_rets) - np.mean(aveqs)
        records.append({'Checkpoint': i // CHECKPOINT_INTERVAL, 'Return': np.mean(ep_rets),
                        'Undiscounted Return': np.mean(undisc_ep_rets),
                        'Soft Return': np.mean(soft_ep_rets), 'Q1': np.mean(q1s), 'Q2': np.mean(q2s),
                        'AvgQ': np.mean(aveqs), 'MSE': mean_gap**2,
                        'MAE': np.absolute(mean_gap)})

    # Cast to float so the CSV matches what the row-wise append produced
    # (each row was a float Series, so 'Checkpoint' was written as e.g. 1.0).
    df = pd.DataFrame(records, columns=RESULT_COLUMNS).astype(float)
    df.to_csv(dir_path +'/q_error_wrt_MonteCarlo_estimates.csv', index=False)