Import All Libraries

In [12]:
import random
import yaml, argparse, os, json
#from multiprocessing import Process

import numpy as np
import pandas as pd

import scipy
from scipy import signal

import gym
from gym.spaces import Box, Discrete

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter
from torch.distributions.categorical import Categorical

Set Run Configurations

In [13]:
# Notebook-level state initialised before any experiment code runs.
# NOTE(review): neither name is read by the code visible in this notebook — confirm they are still needed.
figure_count = 0  # presumably a running index for generated figures — TODO confirm
tb = None  # presumably a placeholder for a TensorBoard writer — TODO confirm

In [14]:

# env = gym.make("CartPole-v1")
# env_obs_size = env.observation_space.shape[0]
# env_obs_size
# dummy_input = torch.randn(1, env_obs_size, requires_grad=True)
# dummy_input

Set Run Seed

In [15]:
# seed = 0
# np.random.seed(seed)
# random.seed(seed)
# torch.manual_seed(seed)

Torch Dataset Creation

In [16]:
class RegDataset(Dataset):
    """In-memory regression dataset built from a feature frame and a target frame.

    Both frames are converted to ``float32`` tensors once, at construction
    time, and moved to ``device``; indexing afterwards is a plain tensor lookup.

    Note: constructing the dataset re-seeds the global ``numpy``, ``random``
    and ``torch`` generators with ``seed``.

    Args:
        df_x: object exposing ``.values`` with the input features.
        df_y: object exposing ``.values`` with the regression targets.
        device: device the tensors are moved to.
        seed: value used to seed the three global generators.
    """

    def __init__(self, df_x, df_y, device="cpu", seed=0):
        # Seed every global generator in the same order: numpy, random, torch.
        for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
            seed_fn(seed)
        self.seed = seed
        self.device = device
        features, targets = df_x.values, df_y.values
        self.x_train = torch.tensor(features, dtype=torch.float32).to(device=device)
        self.y_train = torch.tensor(targets, dtype=torch.float32).to(device=device)

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        sample_x = self.x_train[idx]
        sample_y = self.y_train[idx]
        return sample_x, sample_y

Util Functions

In [17]:
def combined_shape(length, shape=None):
    """Prefix ``shape`` with a leading dimension of size ``length``.

    Args:
        length: size of the leading dimension.
        shape: ``None``, a scalar, or an iterable of trailing dimensions.

    Returns:
        ``(length,)`` when ``shape`` is ``None``, ``(length, shape)`` for a
        scalar ``shape`` and ``(length, *shape)`` otherwise.
    """
    if shape is None:
        return (length,)
    if np.isscalar(shape):
        return (length, shape)
    return (length, *shape)

def count_vars(module):
    """Return the total number of scalar parameters held by ``module``.

    Args:
        module: object exposing ``parameters()`` whose items have a ``shape``.

    Returns:
        Sum over all parameters of the product of their shape.
    """
    total = 0
    for param in module.parameters():
        total += np.prod(param.shape)
    return total

def discount_cumsum(x, discount):
    """Compute the discounted cumulative sum of ``x`` along axis 0.

    Element ``i`` of the result is ``x[i] + discount * x[i+1] +
    discount**2 * x[i+2] + ...``.  The recurrence is evaluated with a
    first-order IIR filter applied to the reversed sequence.

    Args:
        x: array of values (sliceable with ``[::-1]``).
        discount: discount factor.

    Returns:
        Array of the same length as ``x`` holding the discounted sums.
    """
    reversed_x = x[::-1]
    filtered = signal.lfilter([1], [1, float(-discount)], reversed_x, axis=0)
    return filtered[::-1]

def statistics_scalar(x, with_min_and_max=False):
    """Return mean and (population) standard deviation of ``x``.

    The input is converted to a ``float32`` array first.  The standard
    deviation divides by ``len(x)`` (no Bessel correction).

    Args:
        x: sequence of scalars.
        with_min_and_max: when true, also return the minimum and maximum.

    Returns:
        ``(mean, std)`` or ``(mean, std, min, max)``.  For an empty input the
        min/max fall back to ``+inf`` / ``-inf``.
    """
    values = np.array(x, dtype=np.float32)
    count = len(values)
    mean = np.sum(values) / count
    std = np.sqrt(np.sum((values - mean) ** 2) / count)

    if not with_min_and_max:
        return mean, std

    if count > 0:
        lowest, highest = np.min(values), np.max(values)
    else:
        lowest, highest = np.inf, -np.inf
    return mean, std, lowest, highest

class Configs(object):
    """Attribute-style view of a configuration mapping.

    Every key of ``confg`` becomes an attribute of the instance holding the
    corresponding value.
    """

    def __init__(self, confg):
        for name in confg:
            value = confg[name]
            setattr(self, name, value)

def get_exec_device():
    """Pick the torch device to run on and report it on stdout.

    Returns:
        ``torch.device('cuda')`` when CUDA is available, otherwise
        ``torch.device('cpu')``.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(backend)
    print('Using device:', device)
    return device

def calc_accuracy(lst_y, lst_y_h):
    """Fraction of positions at which the two sequences hold equal values.

    Args:
        lst_y: reference labels.
        lst_y_h: predicted labels, compared pairwise with ``lst_y``.

    Returns:
        Number of equal pairs divided by ``len(lst_y)``, as a ``float``.
    """
    total_count = len(lst_y)
    matches = sum(1 for truth, guess in zip(lst_y, lst_y_h) if truth == guess)
    return float(matches / total_count)

def calc_mse(lst_y, lst_y_h):
    """Mean squared error between two (possibly nested) sequences.

    Both inputs are flattened before the element-wise difference is taken.

    Args:
        lst_y: reference values.
        lst_y_h: predicted values.

    Returns:
        Sum of squared differences divided by the number of predictions.
    """
    targets = np.array(lst_y).flatten()
    predictions = np.array(lst_y_h).flatten()
    squared_error = (targets - predictions) ** 2
    return np.sum(squared_error) / len(predictions)


def evaluate_ppo(env, ppo, evl_epochs, evl_max_epoch_steps, device="cpu"):
    """Roll out the greedy PPO policy and collect per-episode statistics.

    Each episode starts from ``env.reset()`` and runs until the environment
    reports ``done`` or ``evl_max_epoch_steps`` steps have been taken.  Actions
    come from ``ppo.ac.evaluate``.

    Args:
        env: environment exposing ``reset()`` and a 4-tuple ``step(action)``.
        ppo: object whose ``ac.evaluate(obs_tensor)`` returns an action.
        evl_epochs: number of evaluation episodes.
        evl_max_epoch_steps: step cap per episode.
        device: device the observation tensor is moved to.

    Returns:
        ``(lst_ep_ret, lst_ep_len)``: episode returns and episode lengths.
        Both lists are empty when ``evl_epochs`` is 0.
    """
    lst_ep_ret = []
    lst_ep_len = []
    for epoch in range(evl_epochs):
        o, ep_ret, ep_len = env.reset(), 0, 0
        for _ in range(evl_max_epoch_steps):
            obs_tensor = torch.as_tensor(o, dtype=torch.float32).to(device=device)
            a_h = ppo.ac.evaluate(obs_tensor)
            o, r, d, _info = env.step(a_h)
            ep_ret += float(r)
            ep_len += 1
            if d:
                break
        lst_ep_ret.append(ep_ret)
        lst_ep_len.append(ep_len)
        if epoch%100==0:
            print(epoch, "ppo ep ret", ep_ret, ep_len)
        # NOTE(review): this reset is immediately followed by the reset at the
        # top of the next episode.  It is kept so seeded runs see the same
        # sequence of initial states as before — confirm whether it can go.
        env.reset()
    # Guard the average: with zero episodes the original raised ZeroDivisionError.
    avg_ep_ret = sum(lst_ep_ret)/len(lst_ep_ret) if lst_ep_ret else float("nan")
    print("Average ppo ep ret for ", evl_epochs, avg_ep_ret)
    return lst_ep_ret, lst_ep_len




def save_list_as_csv(lst_x, path, name, columns=None):
    """Write a list of equally long series to ``path/name`` as a CSV file.

    ``lst_x`` is converted to an array and transposed, so each inner list
    becomes one column of the output.  The row index is written as well.

    Args:
        lst_x: list of series (one per output column).
        path: target directory.
        name: file name inside ``path``.
        columns: optional column labels.
    """
    table = np.transpose(np.array(lst_x))
    frame = pd.DataFrame(table, columns=columns)
    target = path + "/" + name
    frame.to_csv(path_or_buf=target, sep=',')

def write_to_tensorboard(lst_x, lst_y, name):
    """Placeholder: accepts its arguments and does nothing.

    Args:
        lst_x: unused.
        lst_y: unused.
        name: unused.
    """
    return None

def read_config(file_name):
    """Load a YAML file and return the parsed object.

    Args:
        file_name: path of the YAML file to read.

    Returns:
        Whatever ``yaml.load`` produces for the file contents.
    """
    # NOTE(review): FullLoader is more permissive than yaml.safe_load; if the
    # config can ever come from an untrusted source, switch to safe_load.
    with open(file_name) as config_file:
        return yaml.load(config_file, Loader=yaml.FullLoader)

def update_config(config, file_name):
    """Serialise ``config`` as YAML into ``file_name``, replacing its contents.

    Args:
        config: object to dump.
        file_name: path of the file to (over)write.
    """
    with open(file_name, 'w') as config_file:
        yaml.dump(config, config_file)

def set_seed(seed):
    """Seed the global ``numpy``, ``random`` and ``torch`` generators.

    Args:
        seed: value passed to each seeding call.
    """
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)
    # NOTE(review): the value returned by np_random is discarded here, so this
    # call presumably has no lasting effect on gym — confirm and seed the
    # environment itself if that was the intent.
    gym.utils.seeding.np_random(seed)

def create_dir(dir_name):
    """Create ``dir_name`` (including missing parents) on a best-effort basis.

    Only filesystem errors are treated as "could not create"; anything else
    (wrong argument type, KeyboardInterrupt, ...) propagates instead of being
    silently swallowed as it was by the previous bare ``except``.

    Args:
        dir_name: path of the directory to create.

    Returns:
        ``True`` when the directory was created, ``False`` when
        ``os.makedirs`` raised an ``OSError`` (for example because the
        directory already exists).
    """
    try:
        os.makedirs(dir_name)
    except OSError:
        return False
    return True

def get_env_action_space_type(action_space):
    """Classify a gym action space.

    Args:
        action_space: the space to classify.

    Returns:
        ``"box"`` for a ``Box`` space, ``"discrete"`` for a ``Discrete``
        space and ``None`` for anything else.
    """
    if isinstance(action_space, Box):
        return "box"
    if isinstance(action_space, Discrete):
        return "discrete"
    return None

def get_env_dims(env):
    """Return the observation size and the action size of ``env``.

    The observation size is the first entry of the observation-space shape.
    The action size is the first entry of the action-space shape for a box
    space, ``n`` for a discrete space, and 0 for any other space type.

    Args:
        env: environment exposing ``observation_space`` and ``action_space``.

    Returns:
        ``(env_obs_size, env_act_size)``.
    """
    env_obs_size = env.observation_space.shape[0]
    space_kind = get_env_action_space_type(env.action_space)
    if space_kind == "box":
        env_act_size = env.action_space.shape[0]
    elif space_kind == "discrete":
        env_act_size = env.action_space.n
    else:
        env_act_size = 0
    return env_obs_size, env_act_size



Neural Network Models (MLP, Actors, Critic)

In [18]:
class MLP(nn.Module):
    """Feed-forward network built from alternating layer / activation modules.

    ``layers`` is a flat list ``[layer0, act0, layer1, act1, ...]``.  After
    each forward pass two intermediate tensors are kept on the instance:

    * ``x_m_1`` - the input that was fed to the last layer/activation pair,
    * ``x_w_r`` - the output of the last layer before its activation.
    """

    def __init__(self, layers):
        super().__init__()
        self.nn_layers = nn.ModuleList()
        self.nn_layers_act = []
        self.x_m_1 = None
        self.x_w_r = None
        self.layers_size = len(layers)

        # Register the modules pair by pair, preserving their order.
        for start in range(0, len(layers), 2):
            layer, activation = layers[start], layers[start + 1]
            self.nn_layers.extend([layer, activation])

    def forward(self, x):
        """Apply every layer/activation pair in order and return the result."""
        last_input = None
        for start in range(0, self.layers_size, 2):
            layer, activation = self.nn_layers[start], self.nn_layers[start + 1]
            last_input = x
            pre_activation = layer(x)
            self.x_w_r = pre_activation
            x = activation(pre_activation)
        self.x_m_1 = last_input
        return x


def construct_mlp(sizes, activation, output_activation=nn.Identity, device="cpu"):
    """Build an ``MLP`` of ``nn.Linear`` layers from a list of layer widths.

    Args:
        sizes: widths ``[in, hidden..., out]``; consecutive entries define one
            linear layer each.
        activation: activation class instantiated after every hidden layer.
        output_activation: activation class instantiated after the last layer.
        device: device the resulting module is moved to.

    Returns:
        The constructed ``MLP`` (also printed to stdout).
    """
    print("MLP_Sizes", sizes)
    last_layer = len(sizes) - 2
    layers = []
    for j in range(len(sizes) - 1):
        act_cls = output_activation if j >= last_layer else activation
        layers.append(nn.Linear(sizes[j], sizes[j + 1]))
        layers.append(act_cls())
    m__ = MLP(layers).to(device=device)
    print("NN Model:", m__, activation, output_activation)
    return  m__


class Actor(nn.Module):
    """Base class for policies.

    Subclasses provide ``_distribution`` (observation -> action distribution)
    and ``_log_prob_from_distribution`` (distribution, action -> log-prob).
    """

    def _distribution(self, obs):
        """Return the action distribution for ``obs`` (abstract)."""
        raise NotImplementedError

    def _log_prob_from_distribution(self, pi, act):
        """Return the log-probability of ``act`` under ``pi`` (abstract)."""
        raise NotImplementedError

    def forward(self, obs, act=None):
        """Return the distribution for ``obs`` and, optionally, a log-prob.

        Returns:
            ``(pi, logp_a)`` where ``logp_a`` is ``None`` unless ``act`` is
            given.
        """
        pi = self._distribution(obs)
        logp_a = None if act is None else self._log_prob_from_distribution(pi, act)
        return pi, logp_a


class MLPCategoricalActor(Actor):
    """Policy over discrete actions: an MLP produces the categorical logits."""

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation, device="cpu"):
        super().__init__()
        layer_sizes = [obs_dim] + list(hidden_sizes) + [act_dim]
        self.logits_net = construct_mlp(layer_sizes, activation, device=device)
        # Common alias so callers can reach the policy network of any actor type.
        self.nn_net = self.logits_net

    def _distribution(self, obs):
        """Return the categorical distribution defined by the network logits."""
        return Categorical(logits=self.logits_net(obs))

    def arg_max(self, obs):
        """Return the index of the largest logit along the last dimension."""
        return self.logits_net(obs).argmax(dim=-1)

    def _log_prob_from_distribution(self, pi, act):
        """Return the log-probability of ``act`` under ``pi``."""
        return pi.log_prob(act)


class MLPGaussianActor(Actor):
    """Policy over continuous actions: diagonal Gaussian with an MLP mean.

    The mean comes from ``mu_net``; the log standard deviation is a learnable
    parameter, one entry per action dimension, initialised to -0.5 and shared
    across observations.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation, device="cpu"):
        super().__init__()
        log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
        self.mu_net = construct_mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation, device=device)
        # Common alias so callers can reach the policy network of any actor type.
        self.nn_net = self.mu_net

    def _distribution(self, obs):
        """Return ``Normal(mu(obs), exp(log_std))``."""
        mu = self.mu_net(obs)
        # Follow the device of the network output.  The previous code read a
        # notebook-level global named `device`, which is undefined on a fresh
        # kernel and may differ from the device this actor was built for.
        std = torch.exp(self.log_std.to(device=mu.device))
        return Normal(mu, std)

    def arg_max(self, obs):
        """Return the deterministic action, i.e. the mean of the Gaussian."""
        mu = self.mu_net(obs)
        return mu

    def _log_prob_from_distribution(self, pi, act):
        """Return the joint log-probability of ``act`` under ``pi``."""
        # Normal.log_prob is per dimension; sum over the last axis.
        return pi.log_prob(act).sum(axis=-1)


class MLPCritic(nn.Module):
    """State-value function approximated by an MLP with a single output unit."""

    def __init__(self, obs_dim, hidden_sizes, activation, device="cpu"):
        super().__init__()
        layer_sizes = [obs_dim] + list(hidden_sizes) + [1]
        self.v_net = construct_mlp(layer_sizes, activation, device=device)

    def forward(self, obs):
        """Return the value estimate with the trailing unit dimension removed."""
        value = self.v_net(obs)
        # Dropping the last axis keeps the value shape aligned with the returns.
        return torch.squeeze(value, -1)


class MLPActorCritic(nn.Module):
    """Bundle of a policy (``pi``) and a value function (``v``).

    The policy is a Gaussian actor for box action spaces and a categorical
    actor for discrete ones.  Constructing the module re-seeds the global
    ``numpy``, ``random`` and ``torch`` generators with ``seed``.
    """

    def __init__(self, observation_space, action_space, device="cpu", hidden_sizes=[64,64], activation=nn.ReLU, seed=100):
        super().__init__()

        for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
            seed_fn(seed)
        self.seed = seed
        obs_dim = observation_space.shape[0]

        # The policy class depends on the kind of action space.
        space_kind = get_env_action_space_type(action_space)
        if space_kind == "box":
            self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation, device=device)
        elif space_kind == "discrete":
            self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation, device=device)

        # Value function.
        self.v  = MLPCritic(obs_dim, hidden_sizes, activation, device=device)

    def step(self, obs):
        """Sample an action for ``obs`` without tracking gradients.

        Returns:
            ``(action, value, log_prob)`` as numpy arrays.
        """
        with torch.no_grad():
            dist = self.pi._distribution(obs)
            action = dist.sample()
            log_prob = self.pi._log_prob_from_distribution(dist, action)
            value = self.v(obs)
        return action.cpu().numpy(), value.cpu().numpy(), log_prob.cpu().numpy()

    def get_prev_layer_val(self):
        """Return the input of the policy network's last layer from its latest forward pass."""
        return self.pi.nn_net.x_m_1

    def evaluate(self, obs):
        """Return the deterministic (``arg_max``) action for ``obs`` as a numpy array."""
        with torch.no_grad():
            action = self.pi.arg_max(obs)
        return action.cpu().numpy()

    def act(self, obs):
        """Return only the sampled action of ``step(obs)``."""
        action, _value, _log_prob = self.step(obs)
        return action


In [19]:
class PPOBuffer:
    """Fixed-size store for the trajectories collected during one PPO epoch.

    Observations, actions, rewards, value estimates and log-probabilities are
    written step by step with ``store``.  ``finish_path`` closes a trajectory
    and fills in GAE-lambda advantages and rewards-to-go.  ``get`` returns the
    whole buffer as tensors once it is full.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95, device="cpu"):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # ptr: next free slot; path_start_idx: first slot of the open trajectory.
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size
        self.device=device

    def store(self, obs, act, rew, val, logp):
        """Append one environment step; the buffer must not be full."""
        assert self.ptr < self.max_size     # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        """Close the open trajectory and compute its advantages and returns.

        Args:
            last_val: value used to bootstrap beyond the last stored step
                (0 for a trajectory that ended in a terminal state).
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # GAE-lambda advantage: discounted sum of the TD residuals.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)

        # Rewards-to-go, used as targets for the value function.
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr

    def get(self):
        """Return the full buffer as ``float32`` tensors and reset the pointers.

        Advantages are normalised to zero mean and unit standard deviation.

        Returns:
            Dict with keys ``obs``, ``act``, ``ret``, ``adv`` and ``logp``,
            each a tensor on ``self.device``.
        """
        assert self.ptr == self.max_size    # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # Advantage normalisation trick.
        adv_mean, adv_std = statistics_scalar(self.adv_buf)
        if adv_std == 0:
            # All advantages are identical: dividing by zero would turn every
            # entry into NaN.  Skip the rescaling so the advantages centre to 0.
            adv_std = 1.0
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    adv=self.adv_buf, logp=self.logp_buf)
        return {k: torch.as_tensor(v, dtype=torch.float32).to(self.device) for k,v in data.items()}

In [20]:
class PPO():
    """Clipped-objective PPO trainer bound to a single gym-style environment.

    The constructor seeds the global generators and the environment, then
    builds the actor-critic, the rollout buffer and one Adam optimiser each for
    the policy and the value function.  ``ppo_train`` runs the collect/update
    loop, ``ppo_evaluation_dataset`` records deterministic rollouts, and the
    save/load helpers persist the two state dicts under ``exp_dir``.
    """

    def __init__(self, env, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0, 
            steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
            vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
            target_kl=0.01, logger_kwargs=dict(), save_freq=10, exp_name=".", exp_dir=".", device="cpu", act_fn=nn.ReLU, tb=None):
        """Store the hyper-parameters and build every training component.

        Args:
            env: environment (old gym API: ``seed``, ``reset``, 4-tuple ``step``).
            actor_critic: callable building the actor-critic module.
            ac_kwargs: extra keyword arguments for ``actor_critic``.
            seed: seed for the global generators and the environment.
            steps_per_epoch: environment steps collected per epoch.
            epochs: number of collect/update epochs.
            gamma: discount factor.
            clip_ratio: PPO clipping range.
            pi_lr: policy learning rate.
            vf_lr: value-function learning rate.
            train_pi_iters: maximum policy gradient steps per epoch.
            train_v_iters: value-function gradient steps per epoch.
            lam: GAE lambda.
            max_ep_len: episode length after which a trajectory is cut.
            target_kl: KL threshold (scaled by 1.5) for early stopping.
            logger_kwargs: stored, not otherwise used in this class.
            save_freq: checkpoint every ``save_freq`` epochs.
            exp_name: stored, not otherwise used in this class.
            exp_dir: directory for checkpoints and recorded observations.
            device: torch device for the networks and the training tensors.
            act_fn: activation class passed to ``actor_critic``.
            tb: optional object exposing a TensorBoard ``writer``.
        """
        self.lst_avg_episode_ret = []
        self.lst_test_traj = []
        self.env = env
        self.actor_critic = actor_critic
        self.ac_kwargs = ac_kwargs
        self.seed = seed
        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.gamma = gamma
        self.clip_ratio = clip_ratio
        self.pi_lr = pi_lr
        self.vf_lr = vf_lr
        self.train_pi_iters = train_pi_iters
        self.train_v_iters = train_v_iters
        self.lam = lam
        self.max_ep_len = max_ep_len
        self.target_kl = target_kl
        self.logger_kwargs = logger_kwargs
        self.save_freq = save_freq
        self.exp_name = exp_name
        self.exp_dir = exp_dir
        self.device = device
        self.act_fn = act_fn
        self.tb = tb
        self.lst_train_obs = []

        # Random seed
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

        # Seed the environment and record its space shapes.
        self.env.seed(seed)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape

        # Create actor-critic module.
        # NOTE(review): `seed` is not forwarded here, and MLPActorCritic
        # re-seeds the global generators with its own default (100) unless
        # ac_kwargs carries a "seed" entry — confirm this is intended.
        self.ac = self.actor_critic(self.env.observation_space, self.env.action_space, self.device, activation=self.act_fn, **ac_kwargs)

        # Count variables
        self.var_counts = tuple(count_vars(module) for module in [self.ac.pi, self.ac.v])

        # Set up experience buffer
        self.local_steps_per_epoch = self.steps_per_epoch
        self.buf = PPOBuffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch, self.gamma, self.lam, device=self.device)

        # Set up optimizers for policy and value function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr)
        self.vf_optimizer = Adam(self.ac.v.parameters(), lr=vf_lr)

    def compute_loss_pi(self, data):
        """Return the clipped PPO policy loss and a dict of diagnostics.

        Args:
            data: dict with keys ``obs``, ``act``, ``adv`` and ``logp``.

        Returns:
            ``(loss_pi, pi_info)`` where ``pi_info`` holds the approximate KL
            (``kl``), the mean entropy (``ent``) and the clip fraction (``cf``).
        """
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = self.ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1-self.clip_ratio, 1+self.clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1+self.clip_ratio) | ratio.lt(1-self.clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    def compute_loss_v(self, data):
        """Return the mean squared error between value estimates and returns."""
        obs, ret = data['obs'], data['ret']
        return ((self.ac.v(obs) - ret)**2).mean()

    def update(self, epoch):
        """Run one PPO update on the data currently held by the buffer.

        Args:
            epoch: epoch index, used as the step of the TensorBoard scalars.
        """
        data = self.buf.get()

        pi_l_old, pi_info_old = self.compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = self.compute_loss_v(data).item()

        pi_upd_iter = 0
        total_loss_pi = 0.0
        v_upd_iter = 0
        total_loss_v = 0.0
        # Fall back to the pre-update diagnostics so logging still works when
        # train_pi_iters is 0 (pi_info used to be undefined in that case).
        pi_info = pi_info_old
        # Train policy with multiple steps of gradient descent
        for i in range(self.train_pi_iters):
            pi_upd_iter += 1
            self.pi_optimizer.zero_grad()
            loss_pi, pi_info = self.compute_loss_pi(data)
            total_loss_pi += loss_pi.item()
            kl = pi_info['kl']
            if kl > 1.5 * self.target_kl:
                # Early stopping: the policy moved too far from the old one.
                break
            loss_pi.backward()
            self.pi_optimizer.step()

        # Value function learning
        for i in range(self.train_v_iters):
            v_upd_iter += 1
            self.vf_optimizer.zero_grad()
            loss_v = self.compute_loss_v(data)
            total_loss_v += loss_v.item()
            loss_v.backward()
            self.vf_optimizer.step()

        if self.tb is not None:
            self.tb.writer.add_scalar('Train/KL', pi_info['kl'], epoch)    
            self.tb.writer.add_scalar("Train/ENT", pi_info_old['ent'], epoch)
            self.tb.writer.add_scalar("Train/CF", pi_info['cf'], epoch)
            self.tb.writer.add_scalar("Train/pi_l_old", pi_l_old, epoch)
            self.tb.writer.add_scalar("Train/v_l_old", v_l_old, epoch)
            # max(..., 1) avoids a division by zero when an iteration count is 0.
            self.tb.writer.add_scalar("Train/pi_l_avg", total_loss_pi/max(pi_upd_iter, 1), epoch)
            self.tb.writer.add_scalar("Train/v_l_avg", total_loss_v/max(v_upd_iter, 1), epoch)

    def ppo_train(self):
        """Collect experience and update the networks for ``self.epochs`` epochs.

        Checkpoints are written every ``save_freq`` epochs and at the end; all
        observations seen during training are saved to
        ``lst_ppo_train_obs.csv`` in ``exp_dir``.

        Returns:
            List with the average return of the episodes completed in each epoch.
        """
        o, ep_ret, ep_len = self.env.reset(), 0, 0

        # Main loop: collect experience in env and update/log each epoch
        lst_epochs_avg_ep_ret = []
        episodes = 0
        for epoch in range(self.epochs):
            total_epoch_ret = 0.0
            epoch_ep_count = 0
            for t in range(self.local_steps_per_epoch):
                self.lst_train_obs.append(o)
                a, v, logp = self.ac.step(torch.as_tensor(o, dtype=torch.float32).to(device=self.device))
                next_o, r, d, _ = self.env.step(a)
                ep_ret += r
                ep_len += 1

                # save and log
                self.buf.store(o, a, r, v, logp)

                # Update obs (critical!)
                o = next_o

                timeout = ep_len == self.max_ep_len
                terminal = d or timeout
                epoch_ended = t==self.local_steps_per_epoch-1

                if terminal or epoch_ended:
                    if epoch_ended and not(terminal):
                        print(epoch, 'Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True, end="")
                    else:
                        # Only completed episodes count towards the epoch average.
                        total_epoch_ret += ep_ret
                        epoch_ep_count += 1
                        if self.tb is not None:
                            self.tb.writer.add_scalar('Train/Episode_Return', ep_ret, episodes)
                            self.tb.writer.add_scalar('Train/Episode_Length', ep_len, episodes)
                        episodes += 1
                    # if trajectory didn't reach terminal state, bootstrap value target
                    if timeout or epoch_ended:
                        _, v, _ = self.ac.step(torch.as_tensor(o, dtype=torch.float32).to(device=self.device))
                    else:
                        v = 0
                    self.buf.finish_path(v)
                    o, ep_ret, ep_len = self.env.reset(), 0, 0

            # Perform PPO update!
            self.update(epoch)

            # Avoid dividing by zero when no episode finished in this epoch.
            if epoch_ep_count == 0:
                epoch_ep_count = 1
            avg_ep_ret = total_epoch_ret/epoch_ep_count
            lst_epochs_avg_ep_ret.append(avg_ep_ret)
            print(" Average EP return :", avg_ep_ret, total_epoch_ret, epoch_ep_count)
            if self.tb is not None:
                last_weights, last_bias, _ = self.get_ppo_actor_weights_std()
                self.tb.writer.add_scalar('Train/Epoch_Return_Avg', avg_ep_ret, epoch)    
                self.tb.writer.add_histogram("Train/pi_model_weight_last_layer", last_weights, epoch)
                self.tb.writer.add_histogram("Train/pi_model_bias_last_layer", last_bias, epoch)
            if epoch%self.save_freq == 0:
                self.save_ppo_model()        
        self.save_ppo_model_final()
        np.savetxt(self.exp_dir+"/"+"lst_ppo_train_obs.csv", np.array(self.lst_train_obs), delimiter =", ", fmt ='% s')
        return lst_epochs_avg_ep_ret

    def save_ppo_model(self):
        """Write the policy and value state dicts to ``exp_dir``."""
        torch.save(self.ac.pi.state_dict(), self.exp_dir+"/"+"ac_pi.mld")
        torch.save(self.ac.v.state_dict(), self.exp_dir+"/"+"ac_v.mld")

    def save_ppo_model_final(self):
        """Write the final checkpoint (same files as ``save_ppo_model``)."""
        self.save_ppo_model()

    def load_ppo_model(self):
        """Load the policy and value state dicts from ``exp_dir``."""
        self.ac.pi.load_state_dict(torch.load(self.exp_dir+"/"+"ac_pi.mld"))
        self.ac.v.load_state_dict(torch.load(self.exp_dir+"/"+"ac_v.mld"))

    def get_ppo_actor_weights_std(self, layer_indx=-2):
        """Return weights, bias and action std of one policy-network layer.

        Args:
            layer_indx: index into the policy network's module list; the
                default ``-2`` is the last layer before the output activation.

        Returns:
            ``(weights, bias, std)`` as numpy arrays; ``std`` is ``None`` unless
            the action space is a box space.
        """
        layer = self.ac.pi.nn_net.nn_layers[layer_indx]
        weights = layer.weight.detach().cpu().numpy()
        bias = layer.bias.detach().cpu().numpy()
        # Use this trainer's own environment; the previous code read a
        # notebook-level global named `env`.
        if get_env_action_space_type(self.env.action_space)=="box":
            std = torch.exp(self.ac.pi.log_std).detach().cpu().numpy()
        else:
            std = None
        return weights, bias, std

    def _evaluation_row(self, o, prev_layer_val, a, tail):
        """Concatenate observation, last hidden activations, action and ``tail`` into one float row."""
        if get_env_action_space_type(self.env.action_space) == "box":
            action_part = a.tolist()
        else:
            # Discrete actions arrive as 0-d arrays; a[()] extracts the scalar.
            action_part = [a[()]]
        return np.array(o.tolist()+prev_layer_val.tolist()+action_part+tail,dtype=float)

    def ppo_evaluation_dataset(self, epochs_count=0, max_epoch_steps=0):
        """Record deterministic policy decisions as rows of a dataset.

        Each row is ``observation + last hidden activations + action +
        [epoch, ep_ret, ep_len]``.  If ``lst_ppo_train_obs.csv`` exists in
        ``exp_dir``, the stored training observations are replayed first; those
        rows carry ``[-1, -1, -1]`` as the trailing triple.  Afterwards
        ``epochs_count`` fresh episodes of at most ``max_epoch_steps`` steps
        are rolled out.

        Returns:
            List of 1-D float arrays, one per recorded decision.
        """
        print("Generating PPO Evaluation Dataset.....")
        lst_test_trajs = []
        if os.path.exists(self.exp_dir+"/"+"lst_ppo_train_obs.csv"):
            self.lst_train_obs = np.loadtxt(self.exp_dir+"/"+"lst_ppo_train_obs.csv", delimiter=",", dtype=float)
            for o in (self.lst_train_obs):
                a = self.ac.evaluate(torch.as_tensor(o, dtype=torch.float32).to(device=self.device))
                prev_layer_val = self.ac.get_prev_layer_val().detach().cpu().numpy()
                lst_test_trajs.append(self._evaluation_row(o, prev_layer_val, a, [-1, -1, -1]))

        o, ep_ret, ep_len = self.env.reset(), 0, 0
        # Keeps the summary print below valid when no epoch runs (epochs_count
        # == 0, the default); it used to raise NameError.
        epoch = -1
        for epoch in range(epochs_count):
            for t in range(max_epoch_steps):
                a = self.ac.evaluate(torch.as_tensor(o, dtype=torch.float32).to(device=self.device))
                # Capture the hidden activations before stepping the environment.
                prev_layer_val = self.ac.get_prev_layer_val().detach().cpu().numpy()
                next_o, r, d, _ = self.env.step(a)
                ep_ret += r
                ep_len += 1
                lst_test_trajs.append(self._evaluation_row(o, prev_layer_val, a, [epoch, ep_ret, ep_len]))
                o = next_o
                if d:
                    break
            o, ep_ret, ep_len = self.env.reset(), 0, 0
        print("epochs :", epoch, "ep_ret :", ep_ret, "ep_len :", ep_len)
        return lst_test_trajs

In [21]:
def fn_train_ppo(env, ppo_hid_nodes, seed, ppo_dir, ppo_act_fn, ppo_epochs, ppo_epoch_steps, ppo_gamma):
    """Build a PPO agent and either train it or load its checkpoint.

    Training is skipped when ``ac_pi.mld`` already exists in ``ppo_dir``; in
    that case the stored weights are loaded instead.  After a training run the
    per-epoch average returns are written to
    ``ppo_original_train_epoch_ret.csv`` in ``ppo_dir``.

    Returns:
        ``(ppo, ppo_weights, ppo_biases, ppo_std)`` - the agent and the
        parameters of its last policy layer.
    """
    device = get_exec_device()
    env.seed(seed)
    # The sizes themselves are not used below; the call is kept as in the original.
    get_env_dims(env)

    ppo = PPO(
        env,
        actor_critic=MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=ppo_hid_nodes),
        gamma=ppo_gamma,
        seed=seed,
        steps_per_epoch=ppo_epoch_steps,
        epochs=ppo_epochs,
        exp_dir=ppo_dir,
        device=device,
        act_fn=ppo_act_fn,
        tb=None,
    )

    checkpoint = ppo_dir+"/"+"ac_pi.mld"
    if os.path.exists(checkpoint):
        ppo.load_ppo_model()
    else:
        avg_returns = ppo.ppo_train()
        epoch_ids = list(range(0, len(avg_returns)))
        save_list_as_csv([epoch_ids, avg_returns], ppo_dir, "ppo_original_train_epoch_ret.csv", columns=["epoch", "avg ep ret"])

    ppo_weights, ppo_biases, ppo_std = ppo.get_ppo_actor_weights_std()
    return ppo, ppo_weights, ppo_biases, ppo_std


def fn_load_ppo(env, ppo_hid_nodes, seed, run_dir, ppo_dir, ppo_act_fn, ppo_epochs, ppo_epoch_steps, ppo_gamma):
    """Build a PPO agent and load its checkpoint from ``ppo_dir``.

    ``run_dir`` is accepted but not used.  The parameters of the last policy
    layer are printed before they are returned.

    Returns:
        ``(ppo, ppo_weights, ppo_biases, ppo_std)``.
    """
    device = get_exec_device()
    env.seed(seed)
    # The sizes themselves are not used below; the call is kept as in the original.
    get_env_dims(env)

    ppo = PPO(
        env,
        actor_critic=MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=ppo_hid_nodes),
        gamma=ppo_gamma,
        seed=seed,
        steps_per_epoch=ppo_epoch_steps,
        epochs=ppo_epochs,
        exp_dir=ppo_dir,
        device=device,
        act_fn=ppo_act_fn,
        tb=None,
    )
    ppo.load_ppo_model()

    ppo_weights, ppo_biases, ppo_std = ppo.get_ppo_actor_weights_std()
    for label, value in (("ppo_weights", ppo_weights), ("ppo_biases", ppo_biases), ("ppo_std", ppo_std)):
        print(label, value)
    return ppo, ppo_weights, ppo_biases, ppo_std
    

        
def fn_evaluate_ppo(env, ppo, ppo_dir, evl_epochs,evl_max_epoch_steps, ppo_tb, device):
    """Evaluate a PPO agent, caching the per-episode results as CSV files.

    When ``lst_ppo_eval_ep_ret.csv`` already exists in ``ppo_dir`` the stored
    returns and lengths are loaded instead of running a new evaluation.  If a
    TensorBoard wrapper is supplied, every episode is logged and the writer is
    closed afterwards.
    """
    print("Evaluating PPO... Epochs:", evl_epochs, "  Steps per Epoch:", evl_max_epoch_steps)
    ret_path = ppo_dir+"/"+"lst_ppo_eval_ep_ret.csv"
    len_path = ppo_dir+"/"+"lst_ppo_eval_ep_len.csv"

    if os.path.exists(ret_path):
        lst_ep_ret_ppo = np.loadtxt(ret_path, delimiter=",", dtype=float)
        lst_ep_len_ppo = np.loadtxt(len_path, delimiter=",", dtype=float)
    else:
        lst_ep_ret_ppo, lst_ep_len_ppo = evaluate_ppo(env=env, ppo=ppo, evl_epochs=evl_epochs, evl_max_epoch_steps=evl_max_epoch_steps, device=device)
        np.savetxt(ret_path, np.array(lst_ep_ret_ppo), delimiter =", ", fmt ='% s')
        np.savetxt(len_path, np.array(lst_ep_len_ppo), delimiter =", ", fmt ='% s')

    if ppo_tb is None:
        return

    for ep, (ppo_ret, ppo_len) in enumerate(zip(lst_ep_ret_ppo, lst_ep_len_ppo)):
        ppo_tb.writer.add_scalar('Evaluation/Episode_Return', ppo_ret, ep)
        ppo_tb.writer.add_scalar('Evaluation/Episode_Length', ppo_len, ep)
    ppo_tb.writer.close()


In [22]:
# Experiment driver: read the YAML configuration, build the environment, then
# train (or load) the PPO agent and evaluate it.
parser = argparse.ArgumentParser()
parser.add_argument('--conf_file', type=str, default='config_NCartPole_1.yaml')
# An empty argument list is parsed on purpose, so the default config file is always used.
args = parser.parse_args("")
print("Config_File_Name : ", args.conf_file)

#Reading Experiment Configurations
config_yaml = read_config(args.conf_file)
config = Configs(config_yaml)
# NOTE(review): eval() executes whatever string the config file holds in
# `ppo_act_fn`; only use trusted config files, or map names to classes explicitly.
ppo_act_fn = eval(config.ppo_act_fn)



# Run bookkeeping. NOTE(review): run_count, run_dir and exp_name are not read
# by the code visible in this notebook — confirm they are still needed.
run_count = 1
run_dir = ""
ppo_dir = ""
exp_name = ""

# Only the first entry of each configured list is used for this run.
seed = config.aa_exp_seeds[0]
ppo_hid_nodes = config.ppo_hid_nodes[0]
env_name = config.env_name[0]

# Seed the global generators before the environment and the agent are created.
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
overwrite_flag = True

# Extra keyword arguments forwarded to gym.make for this environment.
kwargs={"ncart":4}
env = gym.make(env_name, **kwargs)
# presumably `env.env` is the environment inside gym's wrapper — TODO confirm
obs_dim = env.env.observation_space.shape
act_dim = env.env.action_space.shape

# Output directory for this (environment, network size, seed) combination.
ppo_dir = "runs/" + str(env_name) + "/ppo_" + str(ppo_hid_nodes) + "/seed_" + str(seed)
create_dir(ppo_dir)

device = get_exec_device()

#Original PPO
org_ppo, ppo_weights, ppo_biases, ppo_std = fn_train_ppo(env, ppo_hid_nodes, seed, ppo_dir, ppo_act_fn, config.ppo_epochs, config.ppo_epoch_steps, config.ppo_gamma)
fn_evaluate_ppo(env, org_ppo, ppo_dir, config.evl_epochs, config.evl_max_epoch_steps, None, device)

Config_File_Name :  config_NCartPole_1.yaml
Using device: cpu
Using device: cpu
MLP_Sizes [16, 64, 64, 16]
NN Model: MLP(
  (nn_layers): ModuleList(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=16, bias=True)
    (5): Identity()
  )
) <class 'torch.nn.modules.activation.ReLU'> <class 'torch.nn.modules.linear.Identity'>
MLP_Sizes [16, 64, 64, 1]
NN Model: MLP(
  (nn_layers): ModuleList(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Identity()
  )
) <class 'torch.nn.modules.activation.ReLU'> <class 'torch.nn.modules.linear.Identity'>


 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8


 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4000.0 8
 Average EP return : 500.0 4