# FROZENLAKE - 3

In [16]:
import numpy as np
from frozen_lake import SlipperyFrozenLake, FrozenLakeState, a_few_tests
from pprint import pprint

In [17]:
def pretty_print_qpi(Q, lake_environment):
    print("actions:", lake_environment.actions)
    print()

    print("STATE ACTION VALUE TABLE (Q)")
    print("+-------+---------+---------+---------+---------+")
    print("| STATE | LEFT    | DOWN    | RIGHT   | UP      |")
    print("+-------+---------+---------+---------+---------+")

    for state_id in range(lake_environment.number_of_states):
        print("| {:2d}    ".format(state_id), end="")
        for action in lake_environment.actions:
            print('| {:6.5f} '.format(Q[state_id][action]), end="")
        print(end="|\n")
        print("+-------+---------+---------+---------+---------+")
        
def pretty_print_v(V, env):
    print("STATE VALUES GRID ")
    print("+---------+---------+---------+---------+")

    for r in range(lake_environment.rows):
        for c in range(lake_environment.columns):
            n = lake_environment.location_to_n(r, c)
            print('| {:6.5f} '.format(V[n]), end="")
        print(end="|\n")
        print("+---------+---------+---------+---------+")

In [18]:
frozen_lake_map = [
    ['S', 'F', 'F', 'F'], 
    ['F', 'H', 'F', 'H'],
    ['F', 'F', 'F', 'H'],
    ['H', 'F', 'F', 'G']]

lake_environment = SlipperyFrozenLake(frozen_lake_map)

# RECAP 

### EQUIPROBABLE POLICY 
- First, we created a random policy given this environment

### POLICY EVALUATION
- Then, we computed the value of each state given our random policy 
- This can be called the `state_value` or `V[s]` under a given policy and environment

### COMPUTE `STATE_ACTION_VALUE` FROM `STATE_VALUE`
- Then we compute the value of doing an action given that we are on a certain state based on the value of that state
- This value of doing an action given a state can also be called a `state_action_value` or `q`


### POLICY IMPROVEMENT
- Given state-action values for each state-action pair, we can improve our initial policy by always selecting the best possible action given a state. The best possibile action for each state is the one with the largest state-action value. 
- More formally `policy(state, action) = 1.0` if `max(state_action_value[state]` looking at all possible actions given a particular state
- An intuitive example could be like `policy(state = hungry, action = eat) = 1.0`
- If there is a a few actions with a equal values we can split the probability among them equaly. 
- For example:

```
policy(state=hungry, action=eat_oatmeal) = 0.3333, 
policy(state=hungry, action=eat_tuna) = 0.3333, 
policy(state=hungry, action=eat_chicken) = 0.0
```

In [4]:
def create_equiprobable_policy(env):

    p = 1.0 / len(env.actions)
    
    policy_for_any_state = {}
    policy = {}
    for action in env.actions:
        policy_for_any_state[action] = p
    
    for state_id in range(env.number_of_states):
        policy[state_id] = policy_for_any_state
    
    return policy

In [5]:
def get_state_action_value(state_id, action, env, state_values, gamma):
    expected_return = 0.0    

    possibilities = env.get_possibilities(state_id, action)

    for state_info in possibilities:
        reward = state_info.reward
        next_state_id = state_info.n 
        p = state_info.probability 
        v = (reward + gamma * state_values[next_state_id])
        expected_return += (p * v)
    return expected_return

In [6]:
def update_state_value(V, state_id, policy, env, gamma):
    new_v = 0.0
    for action, action_probability in policy[state_id].items():
        state_action_value = get_state_action_value(state_id, action, env, V, gamma)
        new_v += (action_probability * state_action_value)
    return new_v

In [7]:
def evaluate_policy(env, policy, gamma=1, theta=1e-8):
    
    V = np.zeros(env.number_of_states)
    print("Evaluating policy...")
    i = 0
    while True:
        i += 1
        delta = 0.0
        for state_id in range(env.number_of_states):
            old_v = V[state_id]
            new_v = update_state_value(V, state_id, policy, env, gamma)
            value_difference = np.abs(old_v - new_v)
            V[state_id] = new_v
            delta = max(delta, value_difference)
            
        if delta < theta: break
    
    print("... Evaluation done. Number of iterations:", i)
    return V

In [8]:
def create_state_action_value_dictionary(env, state_values, gamma=1):
    Q = {}
    for state_id in range(env.number_of_states):
        q = {}        
        for action in env.actions: 
            state_action_value = get_state_action_value(
                state_id, action, env, state_values, gamma)
            q[action] = state_action_value
        
        Q[state_id] = q
    return Q

In [9]:
def get_best_actions(action_values):
    max_val = float('-inf')
    best_actions = []
    for action, value in action_values.items(): 
        if value > max_val:
            best_actions = [action]
            max_val = value
        elif value == max_val:
            best_actions.append(action)
    return best_actions

In [10]:
def improve_policy(policy_pi, V, env):  
    Q = create_state_action_value_dictionary(
        env=lake_environment, 
        state_values=V, 
        gamma=1)
    
    improved_policy = {}
    
    for state_id, action_values in Q.items():
        best_actions = get_best_actions(action_values)
        
        action_probabilities = {} 
        action_probability = 1.0 / len(best_actions)
        
        for action in action_values.keys():
            if action in best_actions: 
                action_probabilities[action] = action_probability
            else: 
                action_probabilities[action] = 0.0
        
        improved_policy[state_id] = action_probabilities 
            
    return improved_policy

# DYNAMIC PROGRAMMING ALGORITHMS
- Remember policy improvement
    - Given state-action values for each state-action pair, we can improve our initial policy by always selecting the best possible action given a state. The best possibile action for each state is the one with the largest state-action value.
    - We can get the state-action value given the state values of the environment. 
     - Always remember that the state value and the state action value are linked. And that the state value also depends on the values of the possible next states it could land to given an action. 


- **Policy Iteration**
  ```
  We start with a random policy
  Repeat until the policy no longer has significant improvements:
    -> evaluate the policy (we get the state_values)
    -> improve the policy 
    -> evaluate the new policy if it's better than the previous policy 
        (compare the state values)
    ```

- **Value Iteration** 
    ```
    We start with state values equal to zero 
    Repeat the ff until there are no more improvements in the state value:
        -> for each state get the maximum state_action value q
            -> store q as new the state value v of this state
    
    Given the best state values we have gotten compute the best policy
    ```
    - Note that the state values changes every iteration because the computing the state action value depends on other state values
    - Basically the idea is, we get the best value of each state-action pair and we use that as the value of our state. Then after getting the best possible state values (when the values don't improve anymore), we use these state values to derive our policy from this information.
   
- **Truncated Policy Iteration (not implemented)** 
  - Means that instead of waiting for the evaluation or the policy to converge at some point, we just specify how many iterations and evaluations such that the policy is good enough

In [11]:
def iterate_policy(env):
    # discount factor gamma is 1.0
    policy = create_equiprobable_policy(env)
    print("Iterating policy...")
    i = 0
    while True:
        i+=1
        state_values = evaluate_policy(env, policy)
        updated_policy = improve_policy(policy, state_values, env)
        updated_state_values = evaluate_policy(env, updated_policy)
        
        # Continue doing this until policy doesn't change 
        if policy == updated_policy: break 
        
        # ALTERNATIVE VIEW/OPTION:
        #delta = np.max(np.abs(updated_state_values - state_values))
        #if delta < 1e-8: break
        policy = updated_policy
                    
    print("... Iteration done. Number of iterations:", i)
    return updated_policy, updated_state_values

In [12]:
def iterate_value(env):
    print("Performing value iteration...")
    i = 0
    V = np.zeros(env.number_of_states)
    while True:
        i += 1
        delta = 0.0
        
        for i in range(env.number_of_states):
            old_v = V[i]
            
            # ------------------------------
            # get max state-action value of a given a state
            values = []
            for a in env.actions:
                v = get_state_action_value(i, a, env, V, gamma=1.0)
                values.append(v)      
            V[i] = max(values)  
            # ------------------------------
            
            delta = max(delta, abs(V[i] - old_v))
            
        if delta < 1e-8: break
        print(".", end="")
        
    policy = create_equiprobable_policy(env)
    policy = improve_policy(policy, V, env)
    print("... Done. Number of iterations:", i)
    
    return policy, V

In [13]:
#---------------------
# Evaluating a Random Policy
#---------------------
random_policy = create_equiprobable_policy(lake_environment)
V = evaluate_policy(lake_environment, random_policy)

pretty_print_qpi(random_policy, lake_environment)

print()
pretty_print_v(V, lake_environment)

Evaluating policy...
... Evaluation done. Number of iterations: 57
actions: ['left', 'down', 'right', 'up']

STATE ACTION VALUE TABLE (Q)
+-------+---------+---------+---------+---------+
| STATE | LEFT    | DOWN    | RIGHT   | UP      |
+-------+---------+---------+---------+---------+
|  0    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  1    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  2    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  3    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  4    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  5    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  6    | 0.25000 | 0.25000 | 0.25000 | 0.25000 |
+-------+---------+---------+---------+---------+
|  7    | 0.

In [14]:
#---------------------
# Finding the best policy through policy iteration
#---------------------
policy, state_values = iterate_policy(env=lake_environment)

pretty_print_qpi(policy, lake_environment)
print()
pretty_print_v(state_values, lake_environment)

Iterating policy...
Evaluating policy...
... Evaluation done. Number of iterations: 57
Evaluating policy...
... Evaluation done. Number of iterations: 362
Evaluating policy...
... Evaluation done. Number of iterations: 362
Evaluating policy...
... Evaluation done. Number of iterations: 458
Evaluating policy...
... Evaluation done. Number of iterations: 458
Evaluating policy...
... Evaluation done. Number of iterations: 458
... Iteration done. Number of iterations: 3
actions: ['left', 'down', 'right', 'up']

STATE ACTION VALUE TABLE (Q)
+-------+---------+---------+---------+---------+
| STATE | LEFT    | DOWN    | RIGHT   | UP      |
+-------+---------+---------+---------+---------+
|  0    | 1.00000 | 0.00000 | 0.00000 | 0.00000 |
+-------+---------+---------+---------+---------+
|  1    | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
+-------+---------+---------+---------+---------+
|  2    | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
+-------+---------+---------+---------+---------+
|  3    

In [15]:
#---------------------
# Finding the best policy through policy iteration
#---------------------
policy, state_values = iterate_value(env=lake_environment)

pretty_print_qpi(policy, lake_environment)
print()
pretty_print_v(state_values, lake_environment)
    

Performing value iteration...
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... Done. Number of iterations: 15
actions: ['left', 'down', 'right', 'up']

STATE ACTION VALUE TABLE (Q)
+-------+---------+---------+---------+---------+
| STATE | LEFT    | DOWN    | RIGHT   | UP      |
+-------+---------+---------+---------+---------+
|  0    | 1.00000 | 0.00000 | 0.00000 | 0.00000 |
+-------+---------+---------+---------+---------+
|  1    | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
+-------+---------+---------+---------+---------+
|  2    | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
+-------+