<a href="https://colab.research.google.com/github/ppfenninger/Sensorimotor_Learning_Final/blob/main/Changes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports Added
from google.colab import files, drive
import pickle

In [1]:
# New door Key Env
class SizedDoorKeyEnv(DoorKeyEnv):
    """``DoorKeyEnv`` subclass with a selectable size and a fixed success reward.

    The ``size`` argument is forwarded unchanged to ``DoorKeyEnv.__init__``
    (presumably the grid side length -- confirm against the parent class).
    """

    def __init__(self, size=5):
        # Hand the requested size to the parent constructor by keyword.
        super().__init__(size=size)

    def _reward(self):
        """Return the reward granted upon success, which is always 1."""
        return 1

In [2]:
# New Config
class Config:
    """Plain container for the settings of a single experiment run.

    Each constructor argument is stored, unmodified, on the instance under
    the same name. Fields read by code visible in this file:

    * ``score_threshold`` -- smoothed reward at which ``run_experiment``
      stops early.
    * ``lr`` -- learning rate given to the Adam optimizer.
    * ``max_episodes`` -- length of the training loop in ``run_experiment``.
    * ``num_critics`` -- passed to ``ACModel``; ``value_loss`` is only
      logged when it is greater than zero.
    * ``exploration_update_freq`` -- the target model is refreshed on loop
      iterations that are a multiple of this value.
    * ``use_soft_target_update`` / ``tau`` -- select and parameterise the
      soft target update.
    * ``env_size`` -- size passed to ``SizedDoorKeyEnv``.

    The remaining fields are only stored here; they are presumably consumed
    by ``collect_experiences`` / the parameter-update function, which are
    defined outside this view.
    """

    def __init__(
        self,
        score_threshold=0.93,
        discount=0.995,
        lr=1e-3,
        max_grad_norm=0.5,
        log_interval=10,
        max_episodes=2000,
        bernoulli_param=0.5,
        gae_lambda=0.95,
        num_critics=0,
        clip_ratio=0.2,
        target_kl=0.01,
        train_ac_iters=5,
        use_discounted_reward=False,
        exploration_beta=0.1,
        entropy_coef=0.01,
        use_gae=False,
        exploration_update_freq=10,
        tau=.95,
        use_soft_target_update=False,
        seed=0,
        env_size=5,
    ):
        # Optimisation and stopping settings.
        self.score_threshold = score_threshold
        self.discount = discount
        self.lr = lr
        self.max_grad_norm = max_grad_norm
        self.log_interval = log_interval
        self.bernoulli_param = bernoulli_param
        self.max_episodes = max_episodes

        # Model and update-rule settings.
        self.num_critics = num_critics
        self.clip_ratio = clip_ratio
        self.target_kl = target_kl
        self.train_ac_iters = train_ac_iters
        self.gae_lambda = gae_lambda
        self.use_discounted_reward = use_discounted_reward

        # Exploration and target-network settings.
        self.exploration_beta = exploration_beta
        self.entropy_coef = entropy_coef
        self.use_gae = use_gae
        self.exploration_update_freq = exploration_update_freq
        self.tau = tau
        self.use_soft_target_update = use_soft_target_update

        # Sweep bookkeeping: which seed and environment size this run uses.
        self.seed = seed
        self.env_size = env_size

In [3]:
# Changed environment used in run_experiment - line 8 is the only one I changed
def run_experiment(args, parameter_update, seed=0):
    """Train an actor-critic model on ``SizedDoorKeyEnv`` and return its logs.

    Args:
        args: Config-like object. This function reads ``env_size``,
            ``num_critics``, ``lr``, ``max_episodes``,
            ``exploration_update_freq``, ``use_soft_target_update``, ``tau``
            and ``score_threshold`` from it, and also forwards it to
            ``collect_experiences`` and ``parameter_update``.
        parameter_update: Callable invoked as
            ``parameter_update(optimizer, acmodel, exps, args)``; it must
            return a dict of log values.
        seed: Seed applied to the ``random``, NumPy and PyTorch generators.
            Note that ``args.seed`` is not read here.

    Returns:
        A ``pandas.DataFrame`` indexed by ``episode`` with one row per loop
        iteration (columns: ``num_frames``, ``smooth_reward``, ``reward``,
        ``policy_loss``, ``avg_std_dev`` and, when ``args.num_critics > 0``,
        ``value_loss``).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = SizedDoorKeyEnv(args.env_size)  # environment size comes from the config

    acmodel = ACModel(env.action_space.n, num_critics=args.num_critics)
    target_acmodel = ACModel(env.action_space.n, num_critics=args.num_critics)
    acmodel.to(device)
    target_acmodel.to(device)

    is_solved = False

    SMOOTH_REWARD_WINDOW = 50

    # The reward history is pre-filled with zeros, so the smoothed reward is
    # averaged over a full window from the very first iteration.
    pd_logs, rewards = [], [0] * SMOOTH_REWARD_WINDOW

    optimizer = torch.optim.Adam(acmodel.parameters(), lr=args.lr)
    num_frames = 0

    pbar = tqdm(range(args.max_episodes))
    for update in pbar:
        exps, logs1 = collect_experiences(env, acmodel, args, device, target_acmodel=target_acmodel)
        logs2 = parameter_update(optimizer, acmodel, exps, args)

        # TODO: they do it as steps but that's a little unclear what that is rn
        # TODO: this works for the hard update but the soft update should update
        # like every step and right now it's not updating often enough
        # `update` counts completed loop iterations, so the target model is
        # refreshed on iteration 0 and then every `exploration_update_freq`.
        if update % args.exploration_update_freq == 0:
            update_target_vnet(
                acmodel,
                target_acmodel,
                soft=args.use_soft_target_update,
                tau=args.tau,
            )

        # Use this function to calculate stats on the experience (one episode)
        logs_stats = calculate_logging_stats(exps)

        # Later dicts win on duplicate keys.
        logs = {**logs1, **logs2, **logs_stats}

        num_frames += logs["num_frames"]

        rewards.append(logs["return_per_episode"])

        smooth_reward = np.mean(rewards[-SMOOTH_REWARD_WINDOW:])

        data = {'episode': update, 'num_frames': num_frames, 'smooth_reward': smooth_reward,
                'reward': logs["return_per_episode"], 'policy_loss': logs["policy_loss"],
                'avg_std_dev': logs["avg_std_dev"]}

        if args.num_critics > 0:
            # TODO MR: Consider logging value loss of each critic
            data['value_loss'] = logs["value_loss"]

        pd_logs.append(data)

        pbar.set_postfix(data)

        # Early terminate
        if smooth_reward >= args.score_threshold:
            is_solved = True
            break

    if is_solved:
        print('Solved!')

    return pd.DataFrame(pd_logs).set_index('episode')


def update_target_vnet(acmodel, target_acmodel, soft=False, tau=0.95):
    """Bring ``target_acmodel`` in line with ``acmodel``, in place.

    Args:
        acmodel: Model whose weights are the source of the update.
        target_acmodel: Model that is modified in place.
        soft: When false, copy the full state dict ("hard" update). When
            true, blend each parameter towards the source ("soft" update).
        tau: Weight kept on the target's current parameters during a soft
            update; the source contributes ``1 - tau``. The default equals
            the default ``tau`` of ``Config``. Ignored for hard updates.
    """
    if not soft:
        # Hard update: overwrite the target with an exact copy of the source.
        target_acmodel.load_state_dict(acmodel.state_dict())
        return

    # Soft update, similar to Polyak averaging:
    #   theta_target = tau * theta_target + (1 - tau) * theta_source
    # `tau` is an explicit argument because no `args` object is in scope here.
    for param, new_param in zip(target_acmodel.parameters(), acmodel.parameters()):
        param.data.copy_(tau * param.data + (1.0 - tau) * new_param.data)

In [None]:
# Param Loop Code
##### ADDED CODE
def param_loop(folder_location="/content/gdrive/My Drive/Sensorimotor_Final/",
               file_information_path="/content/gdrive/My Drive/Sensorimotor_Final/file_infos.pckl",
               exploration_betas=(0.1,),
               nums_critics=(3,),
               env_sizes=(5,),
               seeds=(0,),
               use_soft_target_update=True):
    """Run one experiment per parameter combination and pickle the results.

    Google Drive is mounted at ``/content/gdrive`` first. For every
    combination of seed, exploration beta, number of critics and environment
    size, a ``Config`` is built (always with ``use_gae=True``),
    ``run_experiment`` is called with ``update_parameters_with_baseline``,
    and the returned logs are pickled to their own file. An index of all
    runs finished so far is written to ``file_information_path``.

    Args:
        folder_location: Prefix prepended to each result file name. It is
            concatenated as-is, so it should end with a path separator.
        file_information_path: Path of the pickled index (a list of dicts
            describing each run, including its ``file_name``).
        exploration_betas: Iterable of ``exploration_beta`` values to try.
        nums_critics: Iterable of ``num_critics`` values to try.
        env_sizes: Iterable of ``env_size`` values to try.
        seeds: Iterable of seeds to try.
        use_soft_target_update: Forwarded to ``Config`` for every run.

    Returns:
        None. Results are only written to disk.
    """
    file_infos = []
    drive.mount('/content/gdrive')

    for seed in seeds:
        print("seed", seed)
        for exploration_beta in exploration_betas:
            print("exploration_beta", exploration_beta)
            for num_critics in nums_critics:
                print("num_critics", num_critics)
                for env_size in env_sizes:
                    print("env_size", env_size)
                    args = Config(
                        use_gae=True,
                        num_critics=num_critics,
                        seed=seed,
                        exploration_beta=exploration_beta,
                        env_size=env_size,
                        use_soft_target_update=use_soft_target_update,
                    )

                    logs = run_experiment(args, update_parameters_with_baseline, seed=seed)

                    file_name = (
                        f"{folder_location}_beta-{exploration_beta}"
                        f"_critic-{num_critics}_env-{env_size}_seed-{seed}"
                        f"_soft-{use_soft_target_update}.pckl"
                    )
                    # `with` closes the file even if pickling fails.
                    with open(file_name, "wb") as f:
                        pickle.dump(logs, f)

                    file_infos.append({
                        "exploration_beta": exploration_beta,
                        "num_critics": num_critics,
                        "env_size": env_size,
                        "seed": seed,
                        "use_soft_target_update": use_soft_target_update,
                        "file_name": file_name,
                    })

                    # Rewrite the index after every run in case something
                    # dies mid run; finished runs stay discoverable.
                    with open(file_information_path, "wb") as f:
                        pickle.dump(file_infos, f)

In [None]:
# code to read data from a file_info file
def read_files(file_infos_path):
    """Load a pickled run index and attach each run's pickled data to it.

    Args:
        file_infos_path: Path to a pickle file holding a list of dicts, each
            with a ``"file_name"`` key that points to another pickle file.

    Returns:
        The list of dicts from the index, where every dict has gained a
        ``"data"`` key holding the object unpickled from its ``"file_name"``.

    Raises:
        OSError: If the index or any referenced file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # use this on files produced by a trusted source.
    with open(file_infos_path, "rb") as f:
        file_infos = pickle.load(f)

    for file_info in file_infos:
        # `with` guarantees the handle is closed after each load.
        with open(file_info["file_name"], "rb") as f:
            file_info["data"] = pickle.load(f)

    return file_infos

In [None]:
# Example: run the default parameter sweep, which writes its results to Drive.
param_loop()

# Reload the index written by the sweep together with each run's logs.
file_infos = read_files(
    "/content/gdrive/My Drive/Sensorimotor_Final/file_infos.pckl"
)

# Plot raw and smoothed reward against frames for the first run.
file_infos[0]["data"].plot(
    x='num_frames',
    y=['reward', 'smooth_reward'],
)