## Setup environment

Goal: Learn the optimal threshold for a classification problem. The objective function used here is the F1 score.

1. There is a real optimal threshold that is passed as a parameter to the environment, this will be unknown to the agent, but is used to generate feedback such as TP, FP, FN and F1 score (used as the objective here).
2. The agent starts each episode at a threshold drawn uniformly from (0, 1), i.e. centred on 0.5, and takes continuous actions between (-0.5, 0.5). The agent state traverses the space (0,1).
3. The agent samples 'draws_per_objective_calculation' number of samples from this feedback data and computes the F1 score
4. The agent uses a linear-decay exploration strategy and performs single-step episodes (superfluous RL).
5. Reward is characterized as the objective function here.
6. Steps 4 and 5 turn this into an optimization problem.
7. Run several episodes
8. The state, action pair with the highest reward is noted (This is very different from an RL problem)
9. The target optimal threshold is the state calculated using the state action pair (s,a) such that Q(s,a) is the maximum value (1.0 here). This gives target threshold state = s + a
10. This can run continually and identify new thresholds given enough data

In [441]:
import gym
import numpy as np
import random
from gym.wrappers import TimeLimit
from gym.spaces import Discrete, Box
from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy, ConstantEpsilonGreedy


class Continuous_states(gym.Env):
    """Environment for tuning a classification threshold via the F1 score.

    The observation is the current candidate threshold, held as a 1-element
    array, and the action is a shift added to that threshold. The reward is
    the F1 score obtained when random draws are classified with the current
    threshold and compared against labels produced by the hidden
    ``config['real_threshold']``.

    An episode ends after ``max_episode_samples`` steps (1 by default), or
    earlier if the threshold leaves the range [0.001, 0.99].

    NOTE(review): the reward is computed from the threshold *before* the
    action is applied, so within a step it does not depend on the action.
    Confirm this is intended.
    """

    def __init__(self, config):
        """Build the environment.

        Args:
            config: mapping with key ``'real_threshold'``, the ground-truth
                threshold used to label the random draws.
        """
        # end_pos and total_time are not read anywhere in this class; they
        # are kept because external code may still access them.
        self.end_pos = 1
        self.start_pos = 0.5
        self.cur_pos = np.array([self.start_pos])
        self.episode_samples = 0
        self.max_episode_samples = 1

        self.total_time = 0
        # Number of random draws used for each F1 estimate.
        self.draws_per_objective_calculation = 100
        self.data_dist = np.random.uniform
        self.real_threshold = config['real_threshold']

        self.action_space = Box(-0.5, 0.5, shape=(1,), dtype=np.float32)
        self.observation_space = Box(0, 1, shape=(1,), dtype=np.float32)
        # Draw the initial threshold.
        self.reset()

    def reset(self, *, seed=None, options=None):
        """Start a new episode at a uniformly random threshold.

        Args:
            seed: optional seed for the ``random`` module. When omitted the
                global generator is left untouched, so seeding done by the
                caller elsewhere is not discarded on every reset.
            options: accepted for API compatibility; unused.

        Returns:
            The initial observation, a 1-element array holding the threshold.
        """
        if seed is not None:
            random.seed(seed)
        pos = random.uniform(self.start_pos - 0.5, self.start_pos + 0.5)
        self.cur_pos = np.array([pos])
        self.episode_samples = 0
        return self.cur_pos

    def step(self, action):
        """Score the current threshold, then shift it by ``action``.

        Args:
            action: shift to apply to the threshold; must lie in [-0.5, 0.5].

        Returns:
            A 4-tuple ``(observation, reward, done, info)`` where
            ``observation`` is the threshold after the shift, ``reward`` is
            the F1 score of the threshold before the shift, and ``info`` is
            an empty dict.
        """
        assert ((action <= 0.5) and  (action >= -0.5))

        # The reward is the raw objective, evaluated at the pre-action state.
        objective = self.objective()
        reward = objective

        print(f"Current state: {self.cur_pos}, action: {action}, objective: {objective}, reward: {reward}")

        self.cur_pos = self.cur_pos + action
        self.episode_samples += 1

        # The action-range checks below can only trigger when assertions are
        # disabled (python -O); they then end the episode instead of raising.
        finished = False
        if (
            (self.episode_samples == self.max_episode_samples)
            or (action > 0.5)
            or (action < -0.5)
            or (self.cur_pos[0] > 0.99)
            or (self.cur_pos[0] < 0.001)
        ):
            finished = True
            print(finished)

        return (
            self.cur_pos,
            reward,
            finished,
            {},
        )

    def objective(self):
        """Estimate the F1 score of the current threshold on fresh draws.

        ``draws_per_objective_calculation`` values are sampled from
        ``data_dist``. A draw is predicted positive when it exceeds
        ``cur_pos`` and is truly positive when it exceeds ``real_threshold``.

        Returns:
            F1 = TP / (TP + 0.5 * (FP + FN)). When no draw is positive under
            either threshold the ratio is undefined and 0.0 is returned
            rather than NaN.
        """
        draws = self.data_dist(size=self.draws_per_objective_calculation)
        predicted_positive = draws > self.cur_pos
        actual_positive = draws > self.real_threshold

        tp = (predicted_positive & actual_positive).sum()
        fp = (predicted_positive & ~actual_positive).sum()
        fn = (~predicted_positive & actual_positive).sum()

        denominator = tp + 0.5 * (fp + fn)
        if denominator == 0:
            # No true positives, false positives or false negatives.
            return np.float64(0.0)
        return tp / denominator

### Run CQL to learn

In [447]:
from gym.wrappers import TimeLimit
from d3rlpy.algos import DQN, CQL, SAC, DDPG
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy, ConstantEpsilonGreedy

config = {'real_threshold': 0.45}
env = Continuous_states(config)
# Evaluation gets its own environment instance. The environment keeps mutable
# state (its current threshold), so evaluating on the training instance would
# overwrite that state between training steps.
eval_env = Continuous_states(config)

# setup algorithm
# NOTE(review): a logged params dump from this notebook shows actor/critic
# learning rates of 0.0003, so `learning_rate` (and possibly
# `target_update_interval`) may not be taking effect here -- confirm against
# the CQL constructor signature.
cql = CQL(batch_size=32,
          scaler='min_max',
          learning_rate=2.5e-7,
          target_update_interval=100,
          use_gpu=False)

# setup replay buffer
buffer = ReplayBuffer(maxlen=100000, env=env)

# setup explorers - if you increase steps, increase the duration for
# exploration as well; otherwise it simply adds more episodes without
# exploration, which is not helpful unless the agent has explored well.
# NOTE(review): the logged exploratory steps all show action [0.], which
# suggests this epsilon-greedy explorer is not sampling the continuous action
# range -- confirm it is appropriate for a Box action space.
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=200)

# start training
cql.fit_online(env,
               buffer,
               explorer=explorer, # you don't need this with probabilistic policy algorithms
               eval_env=eval_env,
               n_steps=100, # the number of total steps to train.
               n_steps_per_epoch=25,
               update_interval=10) # update parameters every 10 steps.

# export online dataset as MDPDataset
dataset_online = buffer.to_mdp_dataset()

2023-03-17 17:56:37 [info     ] Directory is created at d3rlpy_logs/SAC_online_20230317175637
2023-03-17 17:56:37 [debug    ] Fitting scaler...              scler=min_max
2023-03-17 17:56:37 [debug    ] Building model...
2023-03-17 17:56:37 [debug    ] Model has been built.
2023-03-17 17:56:37 [info     ] Parameters are saved to d3rlpy_logs/SAC_online_20230317175637/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0003, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 32, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'gamma': 0.

  0%|          | 0/100 [00:00<?, ?it/s]

Current state: [0.73571322], action: [0.], objective: 0.6666666666666666, reward: 0.6666666666666666
True
Current state: [0.68837905], action: [0.], objective: 0.6976744186046512, reward: 0.6976744186046512
True
Current state: [0.45711476], action: [0.], objective: 1.0, reward: 1.0
True
Current state: [0.46656059], action: [0.], objective: 0.9904761904761905, reward: 0.9904761904761905
True
Current state: [0.96853198], action: [0.], objective: 0.11764705882352941, reward: 0.11764705882352941
True
Current state: [0.90804456], action: [0.], objective: 0.2153846153846154, reward: 0.2153846153846154
True
Current state: [0.23567273], action: [0.], objective: 0.8031496062992126, reward: 0.8031496062992126
True
Current state: [0.96269357], action: [0.], objective: 0.2222222222222222, reward: 0.2222222222222222
True
Current state: [0.31779343], action: [0.], objective: 0.8947368421052632, reward: 0.8947368421052632
True
Current state: [0.653259], action: [0.], objective: 0.7575757575757576, re

Current state: [0.77103242], action: [0.], objective: 0.6933333333333334, reward: 0.6933333333333334
Current state: [0.77103242], action: [0.], objective: 0.5526315789473685, reward: 0.5526315789473685
Current state: [0.77103242], action: [0.], objective: 0.6285714285714286, reward: 0.6285714285714286
Current state: [0.77103242], action: [-0.12214544], objective: 0.5789473684210527, reward: 0.5789473684210527
Current state: [0.64888697], action: [0.], objective: 0.7160493827160493, reward: 0.7160493827160493
Current state: [0.64888697], action: [0.], objective: 0.8444444444444444, reward: 0.8444444444444444
Current state: [0.64888697], action: [-0.10833202], objective: 0.8041237113402062, reward: 0.8041237113402062
Current state: [0.54055495], action: [0.], objective: 0.9072164948453608, reward: 0.9072164948453608
Current state: [0.54055495], action: [-0.09843021], objective: 0.9666666666666667, reward: 0.9666666666666667
Current state: [0.44212474], action: [-0.08868174], objective: 1

### Print only the highest reward state and action pairs

In [448]:
def get_episode_actions(dataset):
    """Print every episode's actions followed by that episode's return.

    Args:
        dataset: object exposing an ``episodes`` iterable whose items have an
            ``actions`` attribute and a ``compute_return()`` method.
    """
    for ep in dataset.episodes:
        ep_actions = ep.actions
        ep_return = ep.compute_return()
        print(ep_actions, ep_return)
        
# Show only episodes whose return exceeds 0.9. The blank-line separator is
# printed inside the filter so that skipped episodes produce no output.
for episode in dataset_online.episodes:
    episode_return = episode.compute_return()
    if episode_return > 0.9:
        print("\n")
        print(list(zip(episode.observations, episode.actions, episode.rewards)), episode_return)
#get_episode_actions(dataset_online)







[(array([0.45711476], dtype=float32), array([0.], dtype=float32), 1.0)] 1.0


[(array([0.4665606], dtype=float32), array([0.], dtype=float32), 0.9904762)] 0.9904762
















[(array([0.52942276], dtype=float32), array([0.], dtype=float32), 0.95454544)] 0.95454544




[(array([0.34988758], dtype=float32), array([0.], dtype=float32), 0.90163934)] 0.90163934






[(array([0.28140953], dtype=float32), array([0.], dtype=float32), 0.9076923)] 0.9076923






[(array([0.42825067], dtype=float32), array([0.], dtype=float32), 0.9904762)] 0.9904762














[(array([0.48660102], dtype=float32), array([0.], dtype=float32), 0.991453)] 0.991453


















[(array([0.49816376], dtype=float32), array([0.], dtype=float32), 0.9433962)] 0.9433962


[(array([0.5565454], dtype=float32), array([0.], dtype=float32), 0.90909094)] 0.90909094








[(array([0.4785633], dtype=float32), array([0.], dtype=float32), 0.991453)] 0.991453


[(array([0.4772746], dtype=float32), array([0.], 

### Test on new environment with different threshold parameters but with the same RL agent

In [449]:
config = {'real_threshold': 0.75}
env = Continuous_states(config)
# Evaluation gets its own environment instance. The environment keeps mutable
# state (its current threshold), so evaluating on the training instance would
# overwrite that state between training steps.
eval_env = Continuous_states(config)

# start training
# The same agent, replay buffer and explorer objects are reused, so
# transitions already stored in the buffer remain available for sampling.
cql.fit_online(env,
               buffer,
               explorer=explorer, # you don't need this with probabilistic policy algorithms
               eval_env=eval_env,
               n_steps=100, # the number of total steps to train.
               n_steps_per_epoch=25,
               update_interval=10) # update parameters every 10 steps.

2023-03-17 18:05:23 [info     ] Directory is created at d3rlpy_logs/SAC_online_20230317180523
2023-03-17 18:05:23 [debug    ] Fitting scaler...              scler=min_max
2023-03-17 18:05:23 [info     ] Parameters are saved to d3rlpy_logs/SAC_online_20230317180523/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0003, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 32, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'gamma': 0.99, 'generated_maxlen': 100000, 'initial_temperature': 1.0, 'n_critics': 2, 'n_frames': 1, 'n_steps': 1,

  0%|          | 0/100 [00:00<?, ?it/s]

Current state: [0.22309883], action: [0.], objective: 0.3838383838383838, reward: 0.3838383838383838
True
Current state: [0.1988061], action: [0.], objective: 0.44680851063829785, reward: 0.44680851063829785
True
Current state: [0.49959992], action: [0.], objective: 0.6376811594202898, reward: 0.6376811594202898
True
Current state: [0.08641219], action: [0.], objective: 0.4107142857142857, reward: 0.4107142857142857
True
Current state: [0.88786594], action: [0.], objective: 0.5454545454545454, reward: 0.5454545454545454
True
Current state: [0.31530217], action: [0.], objective: 0.5714285714285714, reward: 0.5714285714285714
True
Current state: [0.82251001], action: [0.], objective: 0.8717948717948718, reward: 0.8717948717948718
True
Current state: [0.744394], action: [0.], objective: 0.975609756097561, reward: 0.975609756097561
True
Current state: [0.57131803], action: [0.], objective: 0.75, reward: 0.75
True
Current state: [0.59213506], action: [0.], objective: 0.72, reward: 0.72
True

Current state: [0.2183966], action: [0.04793759], objective: 0.40425531914893614, reward: 0.40425531914893614
Current state: [0.26633419], action: [0.], objective: 0.43010752688172044, reward: 0.43010752688172044
Current state: [0.26633419], action: [0.], objective: 0.4842105263157895, reward: 0.4842105263157895
Current state: [0.26633419], action: [0.], objective: 0.5531914893617021, reward: 0.5531914893617021
Current state: [0.26633419], action: [0.06762098], objective: 0.5048543689320388, reward: 0.5048543689320388
Current state: [0.33395516], action: [0.], objective: 0.5050505050505051, reward: 0.5050505050505051
Current state: [0.33395516], action: [0.06131301], objective: 0.5161290322580645, reward: 0.5161290322580645
Current state: [0.39526818], action: [0.], objective: 0.6428571428571429, reward: 0.6428571428571429
Current state: [0.39526818], action: [0.], objective: 0.5617977528089888, reward: 0.5617977528089888
Current state: [0.39526818], action: [0.], objective: 0.68965517