In [100]:
from collections import deque
import sys
import math
import numpy as np

def interact(env, agent, num_episodes=20000, window=100):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v1 environment
    - agent: instance of class Agent (see Agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes+1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        exploration_rate  = np.max([1.0 / i_episode, agent.epsilon_cut])
    
        while True:
            # agent selects an action
            action = agent.select_action(state,exploration_rate)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || exploration_rate {} || Best average reward {}".format(i_episode, num_episodes,exploration_rate, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n Episode Ended \n')
    return avg_rewards, best_avg_reward

In [None]:
    """ 
        exploration_rate = agent.epsilon_start
        if i_episode % 100 == 0:
            print("\n Before :{} ".format(exploration_rate))
            print("Update the exploration_rate")
            
             # Exploration rate decay once the current window ends
            decay_by = np.exp(-agent.epsilon_decay *i_episode)
            epsilon_diff = (agent.epsilon_start - agent.epsilon_cut)
            rhs = epsilon_diff * decay_by
            print("epsilon_diff: {} || decay by: {} || rhs : {}".format(epsilon_diff,decay_by, rhs))
            
            exploration_rate = agent.epsilon_cut + rhs
            print("After :{}".format(exploration_rate))
            
    """

In [101]:
import numpy as np
from collections import defaultdict
import random

class Agent:

    def __init__(self, alpha, gamma, nA=6,epsilon_start=1, epsilon_decay=0.9, epsilon_cut=0.1):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_decay = epsilon_decay
        self.epsilon_cut = epsilon_cut
        
        print("\n Testing with alpha {} || gamma {} ".format(alpha, gamma), end="")
        

    def select_action(self, state, exploration_rate):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        
        if random.random() > exploration_rate:
            return np.argmax(self.Q[state])
        else:
            return np.random.choice(self.nA)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        
        current_SAR = self.Q[state][action]
        next_SAR = np.max(self.Q[next_state])
        target = reward + self.gamma * next_SAR
        self.Q[state][action] = current_SAR + self.alpha * (target - current_SAR)


In [92]:
from bayes_opt import BayesianOptimization

import gym
import numpy as np

env = gym.make('Taxi-v3')

agent = Agent(alpha=0.1, gamma=1)
avg_rewards, best_avg_reward = interact(env, agent)

        



Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.76rage reward 8.766
 Episode Ended 



In [102]:
from bayes_opt import BayesianOptimization

import gym
import numpy as np

env = gym.make('Taxi-v3')
alpha = [0.1, 0.05, 0.01]
gamma =  [0.5, 0.9, 1]

for a in alpha:
    for g in gamma: 
        agent = Agent(alpha=a, gamma=g)
        avg_rewards, best_avg_reward = interact(env, agent)
        print(" alpha ({}) || gamma ({}) || best_avg_reward ({})".format(a, g, best_avg_reward), end="")
        



Episode 20000/20000 || exploration_rate 0.1 || Best average reward 4.655eward -inff
 Episode Ended 

 alpha (0.1) || gamma (0.5) || best_avg_reward (4.65)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward 4.831eward -inff
 Episode Ended 

 alpha (0.1) || gamma (0.9) || best_avg_reward (4.83)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward 4.914eward -inff
 Episode Ended 

 alpha (0.1) || gamma (1) || best_avg_reward (4.91)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward 3.9228ward -inff
 Episode Ended 

 alpha (0.05) || gamma (0.5) || best_avg_reward (3.92)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward 4.465eward -inff
 Episode Ended 

 alpha (0.05) || gamma (0.9) || best_avg_reward (4.46)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward 5.225eward -inff
 Episode Ended 

 alpha (0.05) || gamma (1) || best_avg_reward (5.22)
Episode 20000/20000 || exploration_rate 0.1 || Best average reward -5.

# Results 
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.76rage reward 8.766
 Episode Ended 

 alpha (0.1) || gamma (0.5) || best_avg_reward (8.76)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.79rage reward 8.799
 Episode Ended 

 alpha (0.1) || gamma (0.9) || best_avg_reward (8.79)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.92rage reward 8.922
 Episode Ended 

 alpha (0.1) || gamma (1) || best_avg_reward (8.92)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.48rage reward 8.4883
 Episode Ended 

 alpha (0.05) || gamma (0.5) || best_avg_reward (8.48)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.71rage reward 8.711
 Episode Ended 

 alpha (0.05) || gamma (0.9) || best_avg_reward (8.71)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.92rage reward 8.922
 Episode Ended 

 alpha (0.05) || gamma (1) || best_avg_reward (8.92)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 0.51rage reward 0.51185
 Episode Ended 

 alpha (0.01) || gamma (0.5) || best_avg_reward (0.51)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.21rage reward 8.21133
 Episode Ended 

 alpha (0.01) || gamma (0.9) || best_avg_reward (8.21)
Episode 20000/20000 || exploration_rate 5e-05 || Best average reward 8.57rage reward 8.57713
 Episode Ended 

 alpha (0.01) || gamma (1) || best_avg_reward (8.57)

In [3]:
#from agent import Agent
#from monitor import interact
from bayes_opt import BayesianOptimization

import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()


def taxi_best_params(alpha, gamma):
    agent = Agent(alpha=alpha, gamma=gamma)
    avg_rewards, best_avg_reward = interact(env, agent, num_episodes)
    return best_avg_reward


optimizer = BayesianOptimization(
    taxi_best_params,
    { 
        'alpha': (0.1, 0.05, 0.01), 
        'gamma': (0.5, 0.9, 1)
    }
)


optimizer.maximize(3, 2)

print("The optimizer params : {0}".format(optimizer.max['params']))
print("The optimaizer result: {0}".format(optimizer.max['target']))

|   iter    |  target   |   alpha   |   gamma   |
-------------------------------------------------


ValueError: too many values to unpack (expected 2)