In [1]:
import time
import numpy as np
import gym
import arlenvs


### Environment Creation
    
* uncomment the desired environment
* init env class

In [2]:
# env_name = "MexicanWorld-v0"
# env_name = "ContinuousMexicanWorld-v0"
# env_name = "MarsLander-v0"  # gym.make accepts level=1, level=2 or level=3 as kwarg, to control the difficultly
# env_name = "GettingOverIt-v0"
# env_name = "PlanetWorld-v0"
# env_name = 'GameOfDrones-v0'
# env_name = "FlappyBird-v0"
# env_name = "DronePathFinding-v0"
env_name = "DronePathTracking-v0"
# env_name = "HaxBall-v0"
# unwrapped to get rid of this TimeLimitWrapper, which might reset the environment twice and thus breaks ergodicity
env = gym.make(env_name).unwrapped

### Random agent for environment testing

In [73]:

state = env.reset()
print(state)

counter = 0
time_start = time.time()

#while True:
for i in range(100):
  env.render(mode="human")

  action = env.action_space.sample()

  state, reward, done, _ = env.step(action)

  print(reward)

  # For very fast running environments ...
  time.sleep(0.02)

  if done:
    print("Resetting")
    env.reset()
  #end
#end

env.close()

print("time consumed", time.time() - time_start)

None
-0.06017930052651508
-0.12035861114643576
-0.1805364966839424
-0.24070898360208454
-0.30086984043252507
-0.3610107990110926
-0.42112172347819127
-0.4811919766467913
-0.5412086156590697
-0.6011642344268622
-0.6610366134162022
-0.7208116102427736
-0.7804730806051021
-0.8400049825276887
-0.8993890771471409
-0.9586092760248904
-1.0176495646344104
-1.0764904973287734
-1.1351100321976297
-1.1934907382086457
-1.2516180454041317
-1.309470546489618
-1.367035124381341
-1.4242944738320589
-1.4812305636029488
-1.5378190881832663
-1.5940486359345927
-1.6499005471427632
-1.7053396155780434
-1.7603601006772276
-1.8149596304427253
-1.869119267860552
-1.9228036382468368
-1.9760190549817371
-2.028732619880643
-2.080945302827958
-2.1326328956355525
-2.1837618161753993
-2.2343472760773353
-2.2843822588295293
-2.333848016302879
-2.382678517359694
-2.430927369841689
-2.478535548486596
-2.5255148679829063
-2.5718629515535105
-2.617534673714502
-2.662520996975122
-2.7068528919016197
-2.750454933225098
-2

#### Init qtable and policy table

- main idea: discretize simple performing a rounding operation

In [64]:


action_size = env.action_space.shape
print("Action size ", action_size)
print("Action bounds: ", env.action_space.low, env.action_space.high)
action_dim = env.action_space.high -env.action_space.low +1
print("Action dim: ", action_dim )

a_sample = env.action_space.sample()
print("action sample: ", np.round(a_sample), a_sample, "\n \n")


state_size = env.observation_space.shape
print("State size ", state_size)
print("State bounds: ", env.observation_space.low, env.observation_space.high)
state_dim = env.observation_space.high -env.observation_space.low +1
print("State dim: ", state_dim )

sample = env.observation_space.sample()
print("state sample: ", np.round(sample), sample)
state_dim_real = np.array([20, 20 , 2])

Action size  (3,)
Action bounds:  [0. 0. 0.] [50.  5. 50.]
Action dim:  [51.  6. 51.]
action sample:  [28.  2. 28.] [28.161287   2.0833988 28.354046 ] 
 

State size  (3,)
State bounds:  [0. 0. 0.] [200. 200.  40.]
State dim:  [201. 201.  41.]
state sample:  [154.  24.   3.] [153.82169   23.722378   2.995409]


In [65]:
#Initializing the Q-matrix 
print(type(state_dim))
dim = np.concatenate((state_dim_real, action_dim)).astype(int)
print(dim)
Q = np.zeros(dim)
print(Q.size)

<class 'numpy.ndarray'>
[20 20  2 51  6 51]
12484800


In [66]:
policy = np.zeros(state_dim_real, dtype=type(a_sample))
action = np.argmax(Q[5, 5, 1])
print(action)

0


## Init agent parameters

In [67]:
epsilon = 0.7
total_episodes = 10000000
alpha = 0.001
gamma = 0.93
number_of_actions = action_dim[0] * action_dim[1] * action_dim[2]
render_every = 5000
max_steps = 10000
dec_eps = True

if dec_eps:
    # Exploration parameters
    epsilon = 1.0                 # Exploration rate
    epsilon_max = 1.0             # Exploration probability at start
    epsilon_min = 0.01            # Minimum exploration probability 
    epsilon_decay_rate = 0.00001             # Exponential decay rate for exploration prob

In [68]:
def epsilon_greedy(state, number_of_actions):
    if np.random.rand() > epsilon:
        action = policy[tuple(state)]
        return action
    else:
        action = np.random.randint(0, number_of_actions)
        return action

In [69]:
def get_epsilon(episode):
        return epsilon_min + (epsilon_max - epsilon_min) * np.exp(-epsilon_decay_rate * episode)



## Defining agents

In [70]:
def train_sarsa():

    # Start the training
    for i in range(total_episodes):
        epsilon = get_epsilon(i)
        state = env.reset() # reseting the environment
        state_idx = np.round(state)
        # Set a flag for when environment to be rendered
        if i % render_every is 0:
            print(i)
            do_rendering = True
        else:
            do_rendering = False
        episode_done = False
        while not episode_done:
            # Getting an action
            action = epsilon_greedy(state, number_of_actions)
            action_idx = np.round(action)

            # Taking the action in the environment
            next_states, reward, episode_done, _ = env.step(action)
            new_state_idx = np.round(new_state)
            
            old_idx = np.concatenate((state_idx, action_idx))
            new_action = np.round(policy(new_state_idx))
            new_idx = np.concatenate((new_state_idx, new_action))
            

            qtable[old_idx] += alpha * (
                    reward + (gamma * qtable[new_idx] 
                              - qtable[old_idx]))
            # Update policy
            policy[tuple(state)] = action
            temp_max_Q = np.abs(qtable[old_idx])
            
            for pid_p in range(action_dim[0]):
                for pid_i in range(action_dim[1]):
                    for pid_d in range(action_dim[3]):
                        a_pol_idx = np.array([pid_p, pid_i, pid_d])
                        pol_idx = np.concatenate((state_idx, a_pol_idx))
                        if qtable[pol_idx] > temp_max_Q:
                            policy[state_idx] = action
                            temp_max_Q = qtable[pol_idx]

            # print("New best policy for state: ",self.discrete_states, self.policy[self.discrete_states], episode_done, self.take_off_procedure - current_step)
            if do_rendering:
                env.render()

            if episode_done:
                env.close()
                break

            states = next_states
        # Reduce epsilon (because we need less and less exploration)
        #epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * i)


In [71]:

def qlearn():
    # 2 For life or until learning is stopped
    for episode in range(total_episodes):
        # Reset the environment
        state = env.reset()
        state_idx = tuple(state)
        step = 0
        done = False
        
        if episode % render_every is 0:
            print(episode)
            do_rendering = True
        else:
            do_rendering = False

        for step in range(max_steps):
            # 3. Choose an action a in the current world state (s)
            # Getting an action
            action = epsilon_greedy(state, number_of_actions)

            # Take the action (a) and observe the outcome state(s') and reward (r)
            new_state, reward, done, _ = env.step(action)
            new_state_idx = tuple(new_state)

            # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            qtable[action][state_idx] = qtable[action][state_idx] + alpha * (reward + gamma * 
                                        np.max([tab[new_state_idx] for tab in qtable.values()]) - qtable[action][state_idx])

            # Our new state is state
            state = new_state
            state_idx = tuple(state)
            
            
            if do_rendering:
                env.render()
            # If done : finish episode
            if done == True:
                env.close()
                break

        episode += 1

        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)



## Training

In [None]:
train_sarsa()

### Monte Carlo

In [4]:
from collections import defaultdict
from time import perf_counter
import sys
import matplotlib.pyplot as plt

In [15]:
env.reset().values() # to be added in review

dict_values([(5, 5, 15), (0, 0, 0), (0, 0, 0), (0, 0, 0)])

In [20]:
env.action_space.sample()

array([41.396454 ,  1.5858614, 30.788624 ], dtype=float32)

In [9]:
train_episodes = 100000
# nA = env.action_space.n # to be added in review
nA = 3
Q = defaultdict(lambda: np.zeros(nA, dtype=float))
discount_factor = 0.9  
epsilon = 0.1
env_max_steps = 1000
render = False
test_episodes = 1000
train_rewards = []

In [10]:
def make_epsilon_greedy_policy():
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.
    Returns:
        A function that takes the observation (state) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    """

    def policy_fn(observation):
        prob = np.ones(nA, dtype=float) * epsilon / nA
        optimal_action = np.argmax(Q[tuple(observation)])
        prob[optimal_action] += (1.0 - epsilon)
        return prob

    return policy_fn

In [17]:
def mc_control_train():
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities
    """

    # Keeps track of sum and count of returns for each state
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # A nested dictionary that maps state -> (action -> action-value).
    policy_fn = make_epsilon_greedy_policy()
    counter = perf_counter()

    for i_episode in range(1, train_episodes + 1):
        episode = []
        state = env.reset().values()
        
        if i_episode % 5000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, train_episodes), end="")
#             print('no of states explored:', len(Q.keys()))
            print('Time taken:', (perf_counter() - counter) / 60, 'minutes')
            counter = perf_counter()
#             render = True
            sys.stdout.flush()
        else:
            render = False

        reward_episode = 0.0
        for step in range(env_max_steps):
            if state in Q:
                prob_values = policy_fn(state)
                action = np.random.choice(np.arange(nA), p=prob_values)
            else:
                action = env.action_space.sample()

            next_state, reward, done, _ = env.step(action)
            episode.append((next_state, action, reward))
            reward_episode += reward

            if render:
                env.render(mode='rgb_array')

            if done:
                env.close()
                break
                
            state = next_state

        train_rewards.append(reward_episode)
        sa_in_episode = set([(x[0], x[1]) for x in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0] == state and x[1] == action)
            # Sum up all rewards since the first occurrence
            G = sum([x[2] * (discount_factor ** i) for i, x in enumerate(episode[first_occurrence_idx:])])
            # Calculate average return for this state over all sampled episodes
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

    return train_rewards

In [18]:
rewards = mc_control_train()

TypeError: unhashable type: 'dict'