In [5]:
import gym
import numpy as np
import torch
import torchkit.pytorch_utils as ptu
import torchsde
from torch.nn import functional as F
import random as rnd
import copy as cp
# import environments
import envs.pomdp
import pdb
# import recurrent model-free RL (separate architecture)
from policies.models.policy_rnn import ModelFreeOffPolicy_Separate_RNN as Policy_RNN
from policies.models.policy_rnn_shared import ModelFreeOffPolicy_Shared_RNN as Policy_Shared_RNN
from policies.models.policy_mlp import ModelFreeOffPolicy_MLP as Policy_MLP
from tqdm import tqdm
# import the replay buffer
from buffers.seq_replay_buffer_vanilla import SeqReplayBuffer
from buffers.simple_replay_buffer import SimpleReplayBuffer 
from utils import helpers as utl
from typing import Sequence
from read_ini import read_ini
# Location of the experiment .ini file. The original absolute path is kept as
# the default so existing runs behave the same; set the POMDP_CONFIG
# environment variable to point the notebook at a different file.
import os

CONFIG_PATH = os.environ.get(
    "POMDP_CONFIG",
    "C:/Users/alexander.vasilyev/pomdp-baselines-main/configfile.ini",
)
conf = read_ini(CONFIG_PATH)

## Build a POMDP environment: Pendulum-V (only observe the velocity)

In [6]:
# --- Device -------------------------------------------------------------------
cuda_id = 0  # -1 if using cpu
ptu.set_gpu_mode(torch.cuda.is_available() and cuda_id >= 0, cuda_id)

# --- Environment --------------------------------------------------------------
env = gym.make(conf["env_name"])
max_trajectory_len = env._max_episode_steps
act_dim = env.action_space.shape[0]
# +1: collect_rollouts appends the reward to every observation it hands to the agent.
obs_dim = env.observation_space.shape[0]+1

# --- Architecture switches ----------------------------------------------------
# NOTE(review): `shared` only changes where create_ncde_row looks up the
# embedders; the agent built below is always Policy_RNN (Policy_Shared_RNN is
# imported but never instantiated) -- confirm before setting shared = True.
shared = False
markov = False
regularization = True

# Parsed once and reused for the agent and for the training cell.
lr=float(conf["lr"])

if markov:
    agent = Policy_MLP(
        obs_dim=obs_dim,
        action_dim=act_dim,
        algo_name=conf["algo_name"],
        dqn_layers=[128, 128],
        policy_layers=[128, 128],
        lr=3e-4,
        gamma=0.99,
        tau=5e-3,
    ).to(ptu.device)
    # Sentinel: the MLP policy has no sequence encoder.
    encoder="Nan"
else:
    agent = Policy_RNN(
        obs_dim=obs_dim,
        action_dim=act_dim,
        encoder=conf["encoder"],
        algo_name=conf["algo_name"],
        action_embedding_size=int(conf["action_embedding_size"]),
        observ_embedding_size=int(conf["observ_embedding_size"]),
        reward_embedding_size=int(conf["reward_embedding_size"]),
        rnn_hidden_size=int(conf["hidden_size"]),
        dqn_layers=[128, 128],
        policy_layers=[128, 128],
        lr=lr,
        gamma=0.9,
        tau=0.005,
        radii=60,
        activation = conf["activation"],
        ini_regularization = regularization,
    ).to(ptu.device)
    # Set per branch so the Markov sentinel above is no longer overwritten.
    encoder=conf["encoder"]

print(agent)

# --- Training hyper-parameters (all read from the .ini file) ------------------
num_updates_per_iter = int(conf["num_updates_per_iter"])  # training frequency
sampled_seq_len = int(conf["sampled_seq_len"])  # context length
buffer_size = int(float(conf["buffer_size"]))  # float() first: accepts values such as "1e6"
batch_size = int(conf["batch_size"])
# NOTE(review): dropout_rate is read here but the training cell sweeps its own
# `dropouts` variable instead -- confirm which one is meant to be used.
dropout_rate=float(conf["dropout_rate"])
num_iters = int(conf["num_iters"])
num_init_rollouts_pool = int(conf["num_init_rollouts_pool"])
num_rollouts_per_iter = int(conf["num_rollouts_per_iter"])

# Environment-step budget: initial pool plus num_iters rounds of rollouts,
# each rollout counted at the environment's maximum episode length.
total_rollouts = num_init_rollouts_pool + num_iters * num_rollouts_per_iter
n_env_steps_total = max_trajectory_len * total_rollouts
_n_env_steps_total = 0  # running counter of collected environment steps
print("total env episodes", total_rollouts, "total env steps", n_env_steps_total)


ModelFreeOffPolicy_Separate_RNN(
  (critic): Critic_RNN(
    (observ_embedder): FeatureExtractor(
      (fc): Linear(in_features=4, out_features=2, bias=True)
    )
    (action_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=2, bias=True)
    )
    (reward_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=2, bias=True)
    )
    (rnn): NeuralCDE(
      (func): CDEFunc(
        (linear0): Linear(in_features=32, out_features=32, bias=True)
        (linear1): Linear(in_features=32, out_features=32, bias=True)
        (linear2): Linear(in_features=32, out_features=384, bias=True)
      )
      (realini): MLP(
        (_model): Sequential(
          (0): Linear(in_features=12, out_features=32, bias=True)
          (1): ReLU()
          (2): Linear(in_features=32, out_features=32, bias=True)
          (3): ReLU()
          (4): Linear(in_features=32, out_features=32, bias=True)
        )
      )
      (readout): Linear(in_features=32, out

In [3]:
# Inspect the action dimensionality derived from env.action_space.
act_dim

1

## Build a recurrent model-free RL agent: separate architecture, `lstm` encoder, `oar` policy input space, `td3` RL algorithm (context length set later)

## Define other training parameters such as context length and training frequency

## Define key functions: collect rollouts and policy update

In [7]:
def create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps,non_drops,init, drop=True):
    """Assemble the input rows handed to the NCDE encoder for one step.

    Reads the notebook globals ``agent``, ``shared`` and ``ptu``.

    :param obs: current observation tensor
    :param next_obs: following observation tensor (ignored when ``init``)
    :param prev_action: previous action tensor
    :param action: current action tensor (ignored when ``init``)
    :param prev_reward: previous reward tensor
    :param reward: current reward tensor (ignored when ``init``)
    :param steps: time index of the first row; the second row uses ``steps + 1``
    :param non_drops: counter value written into the first row
    :param init: if truthy, build a single row from ``obs`` / ``prev_action``
        only; otherwise stack the previous and current quantities into two rows
    :param drop: only used when ``init`` is falsy; if truthy the second row
        carries ``non_drops + 1``, otherwise it repeats ``non_drops``
    :return: ``[time, counter, action embedding, observation embedding]``
        concatenated along dim 1, with one extra leading axis
    """
    if init:
        raw_obs, raw_rew, raw_act = obs, prev_reward, prev_action
        time_values = [[steps]]
        counter_values = [[non_drops]]
    else:
        raw_obs = torch.cat((obs, next_obs), 0)
        raw_rew = torch.cat((prev_reward, reward), 0)
        raw_act = torch.cat((prev_action, action), 0)
        time_values = [[steps], [steps + 1]]
        if drop:
            counter_values = [[non_drops], [non_drops + 1]]
        else:
            counter_values = [[non_drops], [non_drops]]

    # The shared architecture keeps its embedders on the agent itself, the
    # separate architecture on the actor.
    embedders = agent if shared else agent.actor
    obs_emb = embedders.observ_embedder(raw_obs)
    # The reward embedding is evaluated but is not part of the returned row.
    rew_emb = embedders.reward_embedder(raw_rew)
    act_emb = embedders.action_embedder(raw_act)

    time_col = torch.tensor(time_values).to(ptu.device)
    counter_col = torch.tensor(counter_values).to(ptu.device)

    stacked = torch.cat((time_col, counter_col, act_emb, obs_emb), 1)
    return stacked[None, :]



@torch.no_grad()
def collect_rollouts(
    num_rollouts, dropout_rate=0.0, random_actions=False, deterministic=True, train_mode=True
):
    """Run ``num_rollouts`` episodes in ``env`` and, in train mode, store them in ``policy_storage``.

    Reads the notebook globals ``env``, ``agent``, ``markov``, ``encoder``,
    ``max_trajectory_len`` and (train mode only) ``policy_storage``.

    :param num_rollouts: number of episodes to run
    :param dropout_rate: per-step probability that the new observation is
        withheld and the previous observation is repeated instead; ``0``
        disables this
    :param random_actions: if True, sample actions from ``env.action_space``
        instead of querying the policy
    :param deterministic: forwarded to the policy's action selection
    :param train_mode: if True, episodes are added to ``policy_storage``;
        if False this is an evaluation run and nothing is stored
    :return: train mode: total number of environment steps taken;
        test mode: ``(mean episode return, std of episode returns)``
    """
    if not train_mode:
        assert random_actions == False and deterministic == True

    total_steps = 0
    total_rewards = 0.0
    trewards = []
    for _ in range(num_rollouts):
        steps = 0
        rewards = 0.0
        energy = 0.0
        non_drops = 0  # number of steps whose observation was NOT withheld
        obs = ptu.from_numpy(env.reset())
        obs = obs.reshape(1, obs.shape[-1])
        done_rollout = False
        init = True
        # NB: drop == True means the latest observation was delivered,
        # drop == False means it was withheld.
        drop = True

        if markov:
            # Every later observation carries the reward as an extra column
            # and the agent is built with obs_dim including it, so the first
            # observation is padded with a zero reward to match.
            obs = torch.cat(
                (obs, torch.zeros((1, 1), dtype=obs.dtype, device=obs.device)), dim=1
            )
        else:
            # initial action / reward / hidden state for the recurrent policy
            action, reward, internal_state = agent.get_initial_info()
            obs = torch.cat((obs, reward), dim=1)
            if encoder == "ncde":
                internal_state = None
                ncde_row = create_ncde_row(obs, obs, action, action, reward, reward, steps, non_drops, init, drop)
                prev_action = action.clone()
                prev_reward = reward.clone()

        if train_mode:
            # temporary per-episode storage
            obs_list, act_list, rew_list, next_obs_list, term_list = [], [], [], [], []

        while not done_rollout:
            if random_actions:
                # uniform exploration; the policy (and its hidden state) is not queried
                action = ptu.from_numpy(env.action_space.sample()).reshape(1, -1)
            elif markov:
                action = agent.act(obs=obs, deterministic=deterministic)[0]
            elif encoder == "ncde":
                (action, _, _, _), internal_state = agent.ncde_act(
                    ncde_row=ncde_row,
                    prev_internal_state=internal_state,
                    obs=obs,
                    deterministic=deterministic,
                )
            else:
                (action, _, _, _), internal_state = agent.act(
                    prev_internal_state=internal_state,
                    prev_action=action,
                    reward=reward,
                    obs=obs,
                    drop=drop,
                    deterministic=deterministic,
                )

            # observe reward and next obs (B=1, dim)
            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
            next_obs = torch.cat((next_obs, reward), dim=1)
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0.0 else True
            init = False

            # observation dropout: with probability dropout_rate the agent
            # sees the previous observation again instead of the new one
            if dropout_rate > 0:
                drop_trigger = rnd.uniform(0, 1)
                if drop_trigger < dropout_rate:
                    next_obs = obs.clone()
                    drop = False
                else:
                    drop = True

            if not markov:
                if encoder == "ncde":
                    ncde_row = create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps, non_drops, init, drop)

            # update statistics (kept as plain floats so the log prints numbers)
            rewards += reward.item()
            energy += (action * action).sum().item()

            # early stopping env: such as rmdp, pomdp, generalize tasks. term ignores timeout
            term = (
                False
                if "TimeLimit.truncated" in info or steps >= max_trajectory_len
                else done_rollout
            )

            if train_mode:
                # append tensors to temporary storage
                obs_list.append(obs)  # (1, dim)
                act_list.append(action)  # (1, dim)
                rew_list.append(reward)  # (1, dim)
                term_list.append(term)  # bool
                next_obs_list.append(next_obs)  # (1, dim)
            steps += 1
            # set: obs <- next_obs
            obs = next_obs.clone()
            prev_reward = reward.clone()
            prev_action = action.clone()
            if drop:
                non_drops += 1

        if train_mode:
            # add collected sequence to buffer
            policy_storage.add_episode(
                observations=ptu.get_numpy(torch.cat(obs_list, dim=0)),  # (L, dim)
                actions=ptu.get_numpy(torch.cat(act_list, dim=0)),  # (L, dim)
                rewards=ptu.get_numpy(torch.cat(rew_list, dim=0)),  # (L, dim)
                terminals=np.array(term_list).reshape(-1, 1),  # (L, 1)
                next_observations=ptu.get_numpy(
                    torch.cat(next_obs_list, dim=0)
                ),  # (L, dim)
            )
        print(
            "Mode:",
            "Train" if train_mode else "Test",
            "env_steps",
            steps,
            "total rewards",
            rewards,
            "total energy",
            energy,
        )
        total_steps += steps
        total_rewards += rewards
        trewards.append(rewards)

    if train_mode:
        return total_steps
    else:
        return total_rewards / num_rollouts, np.std(trewards)


def update(num_updates, factor):
    """Run ``num_updates`` agent updates on random replay batches and average the losses.

    Reads the notebook globals ``agent``, ``policy_storage`` and ``batch_size``.

    :param num_updates: number of ``agent.update`` calls to perform
    :param factor: forwarded unchanged as the second argument of ``agent.update``
    :return: dict mapping every loss name reported by ``agent.update`` to its
        mean over the updates in which it was reported (empty if
        ``num_updates`` is 0)
    """
    rl_losses_agg = {}
    # `_` instead of `update`: the loop variable used to shadow this function's name.
    for _ in tqdm(range(num_updates), leave=True):
        # sample random RL batch: in transitions
        batch = ptu.np_to_pytorch_batch(policy_storage.random_episodes(batch_size))
        # RL update
        rl_losses = agent.update(batch, factor)

        # setdefault also handles a loss key that first shows up after iteration 0
        for k, v in rl_losses.items():
            rl_losses_agg.setdefault(k, []).append(v)

    # statistics
    return {k: np.mean(values) for k, values in rl_losses_agg.items()}

In [8]:
# Train and periodically evaluate the agent once per observation-dropout rate,
# collecting one learning curve per rate in `learning_curves`.
# NOTE(review): `agent` is built in an earlier cell and is not re-created here,
# so a second sweep iteration would continue training the same agent.
dropouts=0.0
learning_curves=[]
while dropouts<1:
    # restart the environment-step counter for this dropout rate
    _n_env_steps_total =0 
    
    # fresh replay buffer for this dropout rate
    policy_storage = SeqReplayBuffer(
        max_replay_buffer_size=buffer_size,
        observation_dim=obs_dim,
        action_dim=act_dim,
        sampled_seq_len=sampled_seq_len,
        sample_weight_baseline=0.0,
    )

    # fill the buffer with the initial pool of rollouts
    # (collected with random_actions=False)
    env_steps = collect_rollouts(
        num_rollouts=num_init_rollouts_pool, dropout_rate=dropouts, random_actions=False, train_mode=True
    )
    _n_env_steps_total += env_steps

    # evaluation parameters
    # NOTE(review): starting at 10 means an evaluation at iteration 10 is skipped
    # by the `!=` check below -- confirm this is intended.
    last_eval_num_iters = 10
    log_interval = 5
    eval_num_rollouts = 10
    # x: env steps at evaluation time, y: mean eval return, z: std of eval returns
    learning_curve = {
        "x": [],
        "y": [],
        "z": [],
    }
    epoch=0
    lambda_pat = 0.65
    
    while _n_env_steps_total < n_env_steps_total:

        env_steps = collect_rollouts(num_rollouts=num_rollouts_per_iter, dropout_rate=dropouts, train_mode=True)
        _n_env_steps_total += env_steps

        # NOTE(review): `factor` is computed but never used; `update` is called
        # with a fixed 25 updates and `lr` as its second argument -- confirm
        # whether `factor` (or num_updates_per_iter) was meant to be passed.
        factor= lambda_pat **(epoch )
        # disabled alternative that scales updates with collected steps:
        #train_stats = update(int(num_updates_per_iter * env_steps))
        train_stats = update(25, lr)

        epoch += 1
        # number of completed training iterations, derived from the step counter
        current_num_iters = _n_env_steps_total // (
            num_rollouts_per_iter * max_trajectory_len
        )
        # evaluate every `log_interval` iterations, at most once per iteration count
        if (
            current_num_iters != last_eval_num_iters
            and current_num_iters % log_interval == 0
        ):
            last_eval_num_iters = current_num_iters
            average_returns, std_returns = collect_rollouts(
                num_rollouts=eval_num_rollouts, dropout_rate=dropouts,
                train_mode=False,
                random_actions=False,
                deterministic=True,
            )
            learning_curve["x"].append(_n_env_steps_total)
            learning_curve["y"].append(average_returns)
            learning_curve["z"].append(std_returns)
            print(_n_env_steps_total, average_returns)
    learning_curves.append(learning_curve)
    # NOTE(review): with a step of 2 the `while dropouts<1` loop runs only for
    # dropouts == 0.0, yet later plotting cells index learning_curves[1..5] --
    # confirm the intended step.
    dropouts+=2

buffer RAM usage: 0.04 GB
Mode: Train env_steps 200 total rewards -1386.3829669952393 total energy tensor([[0.0005]])
Mode: Train env_steps 200 total rewards -1828.3410663604736 total energy tensor([[0.0005]])


KeyboardInterrupt: 

## Train and evaluate the agent (takes less than 20 minutes)

## Draw the learning curve

In [None]:
import matplotlib.pyplot as plt

# Raw numbers of the `learning_curve` dict left over from the training cell.
print(learning_curve)

# Mean evaluation return of the first run with a +/- one-std band.
first_curve = learning_curves[0]
curve_steps = np.array(first_curve["x"])
curve_mean = np.array(first_curve["y"])
curve_std = np.array(first_curve["z"])

fig, ax = plt.subplots()
(mean_line,) = ax.plot(curve_steps, curve_mean)
# Semi-transparent band in the line's colour; the default opaque fill used the
# same colour as the line and made the mean curve indistinguishable.
ax.fill_between(
    curve_steps,
    curve_mean - curve_std,
    curve_mean + curve_std,
    color=mean_line.get_color(),
    alpha=0.2,
)
ax.set_xlabel("env steps")
ax.set_ylabel("return")
plt.show()

In [None]:
# Keep a named reference to the current learning curve (same dict object, not a copy).
# NOTE(review): "leaning" looks like a typo for "learning" -- name left as is.
leaning_ncde_05= learning_curve

In [None]:
# Stored under its own name: assigning this to `lr` overwrote the learning rate
# that the training cell passes to update(), which broke re-running that cell.
# NOTE(review): np.array of a dict yields a 0-d object array -- confirm this is
# the intended conversion.
learning_curve_arr = np.array(learning_curve)


In [None]:
# Display every learning curve collected by the training cell.
learning_curves

In [None]:
# 65 evenly spaced time stamps running from 0 to 64 inclusive.
timess = torch.linspace(start=0, end=64, steps=65)

In [None]:
# Persist the configuration and the collected learning curves as plain-text
# reprs. `with` closes each file even if the write raises.
with open('config.txt', 'w') as file1:
    file1.write(str(conf))

with open('results.txt', 'w') as file2:
    file2.write(str(learning_curves))

In [None]:
import matplotlib.pyplot as plt

# Raw numbers of the `learning_curve` dict left over from the training cell.
print(learning_curve)

# Mean evaluation return with a +/- one-std band for runs 0 and 1.
fig, ax = plt.subplots()
for curve_idx, curve_label in ((0, "lstm_0drop"), (1, "lstm_02drop")):
    curve = learning_curves[curve_idx]
    curve_steps = np.array(curve["x"])
    curve_mean = np.array(curve["y"])
    curve_std = np.array(curve["z"])
    (mean_line,) = ax.plot(curve_steps, curve_mean, label=curve_label)
    # Semi-transparent band in the line's colour so that the mean curves and
    # overlapping bands stay visible (the default fill is opaque).
    ax.fill_between(
        curve_steps,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=mean_line.get_color(),
        alpha=0.2,
    )
ax.legend(loc = "lower right", fontsize = "large")
ax.set_xlabel("env steps", fontsize = "x-large")
ax.set_ylabel("return", fontsize = "x-large")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Raw numbers of the `learning_curve` dict left over from the training cell.
print(learning_curve)

# Mean evaluation return with a +/- one-std band for runs 2 and 3.
fig, ax = plt.subplots()
for curve_idx, curve_label in ((2, "lstm_04drop"), (3, "lstm_06drop")):
    curve = learning_curves[curve_idx]
    curve_steps = np.array(curve["x"])
    curve_mean = np.array(curve["y"])
    curve_std = np.array(curve["z"])
    (mean_line,) = ax.plot(curve_steps, curve_mean, label=curve_label)
    # Semi-transparent band in the line's colour so that the mean curves and
    # overlapping bands stay visible (the default fill is opaque).
    ax.fill_between(
        curve_steps,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=mean_line.get_color(),
        alpha=0.2,
    )
ax.legend(loc = "lower right", fontsize = "large")
ax.set_xlabel("env steps", fontsize = "x-large")
ax.set_ylabel("return", fontsize = "x-large")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Raw numbers of the `learning_curve` dict left over from the training cell.
print(learning_curve)

# Mean evaluation return with a +/- one-std band for runs 4 and 5.
# The labels used to be copied verbatim from the cell plotting runs 2 and 3
# ("lstm_04drop" / "lstm_06drop"); they now continue the 0.2-per-index naming
# of the earlier cells -- TODO confirm against the dropout rates actually run.
fig, ax = plt.subplots()
for curve_idx, curve_label in ((4, "lstm_08drop"), (5, "lstm_10drop")):
    curve = learning_curves[curve_idx]
    curve_steps = np.array(curve["x"])
    curve_mean = np.array(curve["y"])
    curve_std = np.array(curve["z"])
    (mean_line,) = ax.plot(curve_steps, curve_mean, label=curve_label)
    # Semi-transparent band in the line's colour so that the mean curves and
    # overlapping bands stay visible (the default fill is opaque).
    ax.fill_between(
        curve_steps,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=mean_line.get_color(),
        alpha=0.2,
    )
ax.legend(loc = "lower right", fontsize = "large")
ax.set_xlabel("env steps", fontsize = "x-large")
ax.set_ylabel("return", fontsize = "x-large")
plt.show()