In [None]:
import warnings
warnings.filterwarnings('ignore')

### Run in Colab
<a href="https://colab.research.google.com/github/racousin/rl_introduction/blob/master/notebooks/2_Dynamic_Programming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for a hosted runtime. All command output (stdout and stderr)
# is redirected to /dev/null, so a failing install is silent here.
# System packages: swig, compiler toolchain and Python headers.
!apt-get install swig build-essential python-dev python3-dev > /dev/null 2>&1
# Python packages, pinned to the versions this notebook was written against
# (note: pyvirtualdisplay is the only one left unpinned).
!pip install pygame==2.1.0 > /dev/null 2>&1
!pip install gym==0.23.1 > /dev/null 2>&1
!pip install pyvirtualdisplay > /dev/null 2>&1
# System packages: xvfb, python-opengl and ffmpeg (presumably needed by the
# rendering helper for off-screen display and video output -- confirm).
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install colabgymrender==1.0.2 > /dev/null 2>&1
!pip install imageio==2.4.1 > /dev/null 2>&1
# Course repository: provides the rl_introduction package imported further down.
!git clone https://github.com/racousin/rl_introduction.git > /dev/null 2>&1

In [None]:
# Course repository: provides the rl_introduction package imported further down.
# NOTE(review): the install cell above already runs this exact command; when both
# cells are executed this second clone presumably fails because the directory
# exists, and the error is hidden by the redirect to /dev/null.
!git clone https://github.com/racousin/rl_introduction.git > /dev/null 2>&1

# 2_Dynamic_Programming

### Objective
Before going any further into RL, we will compute the best agent when the model is perfectly known (MDP). As an example, we will solve the FrozenLake problem.

**Complete the TODO steps! Good luck!**

In [None]:
import numpy as np
import gym
import copy
import matplotlib.pyplot as plt
import seaborn as sns
from time import sleep
from rl_introduction.rl_introduction.tools import Agent, plot_values_lake
from rl_introduction.rl_introduction.render_colab import gym_render
env = gym.make('FrozenLake-v1')

### FrozenLake environment
The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

The surface is described using a grid like the following:
    
SFFF       (S: starting point, safe)

FHFH       (F: frozen surface, safe)

FFFH       (H: hole, fall to your doom)

HFFG       (G: goal, where the frisbee is located)

In [None]:
#TODO: Get the state, action size

In [None]:
def run_experiment_episode(env, agent, nb_episode):
    """Run ``agent`` on ``env`` for ``nb_episode`` episodes and collect the returns.

    NOTE(review): this function is defined again, identically, in later cells of
    this notebook; the last executed definition is the one in effect.

    Args:
        env: environment exposing the classic gym API used here, i.e.
            ``reset() -> state`` and ``step(action) -> (state, reward, done, info)``.
        agent: object with an ``act(state)`` method returning the action to play.
        nb_episode: number of episodes to run.

    Returns:
        numpy array of length ``nb_episode``; entry ``i`` is the undiscounted sum
        of the rewards collected during episode ``i``.
    """
    rewards = np.zeros(nb_episode)
    for i in range(nb_episode):
        state = env.reset()
        done = False
        episode_return = 0.0
        # Test truthiness rather than `done is False`: if the environment returns
        # a numpy.bool_ flag the identity test is never true, which would cut
        # every episode after its first step.
        while not done:
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            episode_return += reward
        rewards[i] = episode_return
    return rewards

In [None]:
# Render FrozenLake-v1 with the helper imported from rl_introduction.render_colab.
# Presumably this plays a random agent and writes a video under ./video, with
# slow_coeff slowing the playback -- see gym_render for the exact behaviour.
gym_render(env_name='FrozenLake-v1', directory='./video', agent = 'random', slow_coeff=10)

## 1) env transition model

We have access to the transition model, $P(S_{t+1}=s'|S_t = s, A_t=a)$, and the associated reward through <b>env.P[s][a]</b>. For example, below are the probabilities of each possible next state and reward when the agent is in state 1 of the gridworld and decides to go left.

In [None]:
state = 1
action = 0 #left

In [None]:
# P(s'|state,action), s', reward, done
env.P[state][action]

We see here that there is a $1/3$ probability of falling into the hole in state 5 (which ends the episode).

To summarize, we have

**States:** $S = \{0,...,15\}$

**Actions:** $A = \{0,1,2,3\}$

**Transition model:** $P_{ss'}^a = \mathbb{P} [S_{t+1} = s' \vert S_t = s, A_t = a]$ -> env.P[state][action]

**Reward function:**
$R(s, a) = \mathbb{E} [R_{t+1} \vert S_t = s, A_t = a]$

$\forall a \in \{1,2,3\} : R(14,a) = 1/3$
    
$R(14,0) = 0$

$\forall a \in A \forall s \in \{0,...,13\} : R(s,a) = 0$

## 2) agent policy
We add to our agent its <b>policy</b> $\pi(a|s)$.

In [None]:
#TODO: write Random Policy (ex uniformly random policy)
class MyRandomAgent(Agent):
    """Agent that samples its actions from a tabular stochastic policy.

    Exercise stub: ``self.policy`` is left as ``None`` and has to be filled in
    (the TODO above suggests a uniformly random policy) before ``act`` can run.
    """
    def __init__(self, env):
        """Call the base ``Agent`` constructor with ``env`` and set the policy placeholder.

        ``act`` reads ``self.env``, so the base constructor presumably stores
        ``env`` under that attribute -- confirm against ``Agent``.
        """
        super().__init__(env)
        # `act` indexes the policy as self.policy[state] and passes the result as
        # the probability vector `p` of np.random.choice, so each state's entry
        # must hold one probability per action.
        self.policy = None#complete here
    def act(self, state):
        """Sample an action index for ``state`` with probabilities ``self.policy[state]``."""
        action = np.random.choice(np.arange(self.env.action_space.n),p=self.policy[state])
        return action

In [None]:
agent = MyRandomAgent(env)
agent.policy

In [None]:
def run_experiment_episode(env, agent, nb_episode):
    """Run ``agent`` on ``env`` for ``nb_episode`` episodes and collect the returns.

    NOTE(review): this function is also defined, identically, in other cells of
    this notebook; the last executed definition is the one in effect.

    Args:
        env: environment exposing the classic gym API used here, i.e.
            ``reset() -> state`` and ``step(action) -> (state, reward, done, info)``.
        agent: object with an ``act(state)`` method returning the action to play.
        nb_episode: number of episodes to run.

    Returns:
        numpy array of length ``nb_episode``; entry ``i`` is the undiscounted sum
        of the rewards collected during episode ``i``.
    """
    rewards = np.zeros(nb_episode)
    for i in range(nb_episode):
        state = env.reset()
        done = False
        episode_return = 0.0
        # Test truthiness rather than `done is False`: if the environment returns
        # a numpy.bool_ flag the identity test is never true, which would cut
        # every episode after its first step.
        while not done:
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            episode_return += reward
        rewards[i] = episode_return
    return rewards

In [None]:
#TODO: eval Random Policy with run_experiment_episode

## 2) Policy Evaluation

The Value Function $V_\pi(s)$ is the expected return in state $s$, according to $\pi$.
$V_\pi(s) 
= \mathbb{E}_\pi [r + \gamma V_\pi(s_{t+1}) | S_t = s]
= \sum_a \pi(a \vert s) \sum_{s', r} P(s', r \vert s, a) (r + \gamma V_\pi(s'))$

We have all the information needed to solve the linear system for $V_\pi$:
\begin{equation}
V(s_0) =  \sum_a \pi(a \vert s_0) \sum_{s', r} P(s', r \vert s_0, a) (r + \gamma V(s'))\\
V(s_1) =  \sum_a \pi(a \vert s_1) \sum_{s', r} P(s', r \vert s_1, a) (r + \gamma V(s'))\\
...\\
V(s_{15}) = \sum_a \pi(a \vert s_{15}) \sum_{s', r} P(s', r \vert s_{15}, a) (r + \gamma V(s'))
\end{equation}
Even with only 16 states, solving this system directly can be cumbersome.

So we use an iterative approach.


We initialize $V_0$ arbitrarily and update it using:


$V_{k+1}(s) = \mathbb{E}_\pi [r + \gamma V_k(s_{t+1}) | S_t = s] $ (1).

$\forall s$, $V_\pi(s)$ is a fixed point of (1), so if $(V_k)_{k\in \mathbb{N}}$ converges, it converges to $V_\pi$.

In [None]:
#TODO: write the value evaluation from Policy, reward and transition model
def policy_evaluation(env, policy, gamma=1, theta=1e-8):
    """Estimate the state-value function of ``policy`` on ``env`` (exercise stub).

    As given, the function only allocates ``V`` and returns it, i.e. an array of
    zeros; the iterative update described in the text above is still to be written.

    Args:
        env: environment; only ``env.observation_space.n`` is read by the stub.
        policy: policy to evaluate (unused until the stub is completed).
        gamma: discount factor (unused until the stub is completed).
        theta: presumably the stopping threshold on the largest per-sweep change,
            as used by ``value_iteration`` further down -- confirm when implementing.

    Returns:
        numpy array with one value per state.
    """
    V = np.zeros(env.observation_space.n) # initialization
    #complete here
    return V

In [None]:
my_rand_agent = MyRandomAgent(env)

In [None]:
# evaluate the policy 
V = policy_evaluation(env, my_rand_agent.policy)

plot_values_lake(V)

In [None]:
V.sum()

Knowing the transition $\mathbb{P}(S_{t+1},R_{t+1}|S_t,A_t)$, it is natural to compute the q function from the value function.

\begin{aligned}
Q(s, a) 
&= \mathbb{E} [R_{t+1} + \gamma V(S_{t+1}) \mid S_t = s, A_t = a] \\
&= \sum_{s'} [r_{t+1} + \gamma V(s')] P(S_{t+1}=s'|S_t=s,A_t=a)
\end{aligned}

In [None]:
#TODO: write the q evaluation from the value function, reward and transition model
def q_from_v(env, V, s, gamma=1):
    """Compute the action values Q(s, .) of state ``s`` from ``V`` (exercise stub).

    Callers in this notebook assign the result to a row ``Q[s]`` of an array with
    one column per action and take ``max`` over it, so the result should hold one
    value per action.

    NOTE: ``q`` is not defined until the stub is completed; calling the function
    as given raises ``NameError``.
    """
    #complete here 
    return q

In [None]:
# Action-value table with one row per state and one column per action, filled
# row by row from the current state values V via q_from_v.
Q = np.zeros([env.observation_space.n, env.action_space.n])
for s in range(env.observation_space.n):
    Q[s] = q_from_v(env, V, s)
print("Action-Value Function:")
print(Q)

## 3) Policy Improvement

With the q function, we update our policy from $\pi$ to $\pi'$ by acting greedily, i.e. $\pi'(s) = \arg\max_a Q_\pi(s,a)$.

This is an improvement because: $\forall s, V_\pi(s) = \sum_a\pi(a|s)Q_\pi(s,a) \leq \max_a Q_\pi(s,a) = Q_\pi(s,\pi'(s)) \leq V_{\pi'}(s)$

In [None]:
#TODO: choose the best action in a state s from Q, What the best direction/action on state 1?
def best_action_from_Q(env, Q, s):
  """Return the best action in state ``s`` according to the table ``Q`` (exercise stub).

  NOTE: ``best_a`` is not defined until the stub is completed; calling the
  function as given raises ``NameError``.
  """
  # Complete
  return best_a
# The function is called right away, so this cell fails until the stub is completed.
print(f"best direction/action on state 1: {best_action_from_Q(env, Q, 1)}")

In [None]:
#TODO: write the policy improvement update step
def policy_improvement(env, V, gamma=1):
    """Build an improved policy from the value function ``V`` (exercise stub).

    As given, the function returns the table allocated below unchanged, i.e. all
    zeros. The agents in this notebook pass ``policy[state]`` as the probability
    vector of ``np.random.choice``, so the completed version must fill every row
    with probabilities over the actions.

    Args:
        env: environment; ``env.observation_space.n`` and ``env.action_space.n``
            give the table dimensions.
        V: state values (unused until the stub is completed).
        gamma: discount factor (unused until the stub is completed).

    Returns:
        numpy array of shape (number of states, number of actions).
    """
    policy = np.zeros([env.observation_space.n, env.action_space.n])
    #complete here    
    return policy

In [None]:
# evaluate the policy 
V = policy_evaluation(env, my_rand_agent.policy)

plot_values_lake(V)

In [None]:
V.sum()

In [None]:
#TODO: Improve and evaluate the policy

In [None]:
new_V.sum()

## 4) Policy iteration 

$\pi_0 \xrightarrow[]{\text{evaluation}} V_{\pi_0} \xrightarrow[]{\text{improve}}
\pi_1 \xrightarrow[]{\text{evaluation}} V_{\pi_1} \xrightarrow[]{\text{improve}}
\pi_2 \xrightarrow[]{\text{evaluation}} \dots \xrightarrow[]{\text{improve}}
\pi_* \xrightarrow[]{\text{evaluation}} V_*$

In [None]:
#TODO: write the policy iteration
def policy_iteration(env):
    """Alternate policy evaluation and policy improvement on ``env`` (exercise stub).

    As given, only the initial uniform policy is built.

    NOTE: ``V`` is not defined until the stub is completed; calling the function
    as given raises ``NameError``.

    Returns:
        A ``(policy, V)`` pair; ``policy`` has one row per state and one column
        per action.
    """
    policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n # init a random policy
    # complete here
    return policy, V

In [None]:

# obtain the optimal policy and optimal state-value function
# (requires policy_iteration to be completed first)
policy_pi, V_pi = policy_iteration(env)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_pi,"\n")

# plot the optimal state-value function
plot_values_lake(V_pi)

In [None]:
policy_pi

In [None]:
V_pi.sum()

## 5) Value iteration 

Value iteration directly computes the optimal value function, without evaluating intermediate policies.
We initialize $V_0$ arbitrarily and update it using:

$V_{k+1}(s) = \max_a \mathbb{E} [r + \gamma V_k(s_{t+1}) | S_t = s, A_t = a] = \max_a Q_k(s,a)$ (2).

$\forall s$, $V_{\pi^*}(s)$ is a fixed point of (2), so if $(V_k)_{k\in \mathbb{N}}$ converges, it converges to $V_{\pi^*}$.

In [None]:
def value_iteration(env, gamma=1, theta=1e-8):
    """Compute state values by repeated greedy backups, then derive a policy.

    Each sweep updates the states in place, in index order, replacing ``V[s]``
    with the largest action value returned by ``q_from_v``. Sweeps repeat until
    the largest absolute change within a sweep falls below ``theta``.

    Args:
        env: environment providing ``observation_space.n``; it is also forwarded
            to ``q_from_v`` and ``policy_improvement``.
        gamma: discount factor forwarded to ``q_from_v`` and ``policy_improvement``.
        theta: stopping threshold on the largest per-sweep change.

    Returns:
        ``(policy, V)``: the result of ``policy_improvement`` on the final values,
        and the final values themselves.
    """
    n_states = env.observation_space.n
    V = np.zeros(n_states)
    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            previous_value = V[state]
            # In-place update: later states in this sweep already see the new value.
            V[state] = max(q_from_v(env, V, state, gamma))
            largest_change = max(largest_change, abs(V[state] - previous_value))
        converged = largest_change < theta
    greedy_policy = policy_improvement(env, V, gamma)
    return greedy_policy, V

In [None]:
# obtain the optimal policy and optimal state-value function with value iteration
# (requires q_from_v and policy_improvement to be completed first)
policy_vi, V_vi = value_iteration(env)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_vi,"\n")

# plot the optimal state-value function
plot_values_lake(V_vi)

In [None]:
V_vi.sum()

## Train agent and Run experiments

In [None]:
class MyMDPAgent(Agent):
    """Agent acting from a tabular stochastic policy obtained by value iteration.

    Until ``train`` is called, the policy is uniform over the actions in every state.
    """

    def __init__(self, env):
        """Call the base ``Agent`` constructor and start from the uniform policy."""
        super().__init__(env)
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        # One row per state, one column per action, every action equally likely.
        self.policy = np.ones([n_states, n_actions]) / n_actions

    def act(self, state):
        """Sample an action index with probabilities ``self.policy[state]``."""
        candidate_actions = np.arange(self.env.action_space.n)
        return np.random.choice(candidate_actions, p=self.policy[state])

    def train(self):
        """Replace the policy with the one returned by ``value_iteration`` on ``self.env``."""
        learned_policy, _values = value_iteration(self.env)
        self.policy = learned_policy

In [None]:
def run_experiment_episode(env, agent, nb_episode):
    """Run ``agent`` on ``env`` for ``nb_episode`` episodes and collect the returns.

    NOTE(review): this function is already defined, identically, in earlier cells
    of this notebook; the last executed definition is the one in effect.

    Args:
        env: environment exposing the classic gym API used here, i.e.
            ``reset() -> state`` and ``step(action) -> (state, reward, done, info)``.
        agent: object with an ``act(state)`` method returning the action to play.
        nb_episode: number of episodes to run.

    Returns:
        numpy array of length ``nb_episode``; entry ``i`` is the undiscounted sum
        of the rewards collected during episode ``i``.
    """
    rewards = np.zeros(nb_episode)
    for i in range(nb_episode):
        state = env.reset()
        done = False
        episode_return = 0.0
        # Test truthiness rather than `done is False`: if the environment returns
        # a numpy.bool_ flag the identity test is never true, which would cut
        # every episode after its first step.
        while not done:
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            episode_return += reward
        rewards[i] = episode_return
    return rewards

In [None]:
#TODO: eval best Policy with run_experiment_episode

In [None]:
# Render the trained agent with the helper from rl_introduction.render_colab.
# NOTE(review): `mdp_agent` is not defined in any of the cells shown; it is
# presumably created and trained in the TODO cell above -- this cell fails until then.
# NOTE(review): this call passes `env=` while the earlier gym_render call passes
# `env_name=`; confirm that gym_render accepts both keywords.
gym_render(env=env, directory='./video', agent = mdp_agent, slow_coeff=10)