# Exercise II: Cart Pole preliminaries and Monte Carlo
Open Files/A2_... and study it in Colab by running it.

Observe how all the facets of a reinforcement learning coupled machine/environment system are present.

The notebook includes some code to show how the behaviour of the agent can be rendered, using a random policy that exploits the .sample() method.
## A2_CartPoleWithRendering.ipynb

In [1]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [2]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
import collections

In [4]:
# Roll out a single episode with a uniformly random policy: every action is
# drawn with env.action_space.sample() and the transition is printed.
env = gym.make("CartPole-v0")
env.reset()

# Count completed steps explicitly; the loop index `i` is zero-based, so
# reporting it directly would under-count the iterations by one.
steps_run = 0
for i in range(50000):
  action = env.action_space.sample()
  print("step i",i,"action=",action)
  obs, reward, done, info = env.step(action)
  print("obs=",obs,"reward=",reward,"done=",done,"info=",info)
  steps_run = i + 1

  # Stop as soon as the environment signals the end of the episode.
  if done:
    break

env.close()
print("Iterations that were run:",steps_run)

step i 0 action= 0
obs= [ 0.00546224 -0.24165428 -0.02821972  0.26101427] reward= 1.0 done= False info= {}
step i 1 action= 1
obs= [ 0.00062915 -0.0461411  -0.02299944 -0.04043424] reward= 1.0 done= False info= {}
step i 2 action= 0
obs= [-0.00029367 -0.24092582 -0.02380812  0.24490433] reward= 1.0 done= False info= {}
step i 3 action= 1
obs= [-0.00511219 -0.04547205 -0.01891004 -0.05519216] reward= 1.0 done= False info= {}
step i 4 action= 1
obs= [-0.00602163  0.14991587 -0.02001388 -0.35378086] reward= 1.0 done= False info= {}
step i 5 action= 1
obs= [-0.00302331  0.3453166  -0.0270895  -0.65270695] reward= 1.0 done= False info= {}
step i 6 action= 1
obs= [ 0.00388302  0.54080511 -0.04014364 -0.95379546] reward= 1.0 done= False info= {}
step i 7 action= 1
obs= [ 0.01469912  0.73644349 -0.05921955 -1.25881566] reward= 1.0 done= False info= {}
step i 8 action= 0
obs= [ 0.02942799  0.54212715 -0.08439586 -0.98525248] reward= 1.0 done= False info= {}
step i 9 action= 1
obs= [ 0.04027054 

## Exercise 1:
> Can you design a dynamic programming based policy for the agent as in assignment 1? If so, design it and demonstrate that it solves the cart pole problem.

No. Dynamic programming requires complete knowledge of the environment dynamics, i.e. the transition probabilities *p(s',r|s,a)*. For the Cart Pole environment, however, these transition probabilities are not available to the agent.



## Exercise 2:

>Can you design a Monte Carlo based policy for the agent? What ingredients do you require? Explain the design flow, and execute it. Show that it works, or indicate why you can't proceed.

Yes. To design a Monte Carlo based policy we need to generate episodes that explore the environment. Because the state observation of the environment consists of four continuous variables, we first need to discretize it into a finite number of discrete states. Then we can implement on-policy first-visit Monte Carlo control (for $\epsilon$-*soft* policies).

In [81]:
'''
  Type: Box(4)
  Num     Observation               Min                     Max
  0       Cart Position             -4.8                    4.8
  1       Cart Velocity             -Inf                    Inf
  2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
  3       Pole Angular Velocity     -Inf                    Inf
'''

def discretize_state(obs):
  """Map a continuous 4-component observation to one discrete state index.

  Each component is assigned to one of 8 bins and the four bin indices are
  combined as the digits of a base-8 number, so the result lies in
  ``[0, 8**4)``.

  Args:
    obs: sequence of four numbers (cart position, cart velocity, pole angle,
      pole angular velocity).

  Returns:
    int: the discrete state index.
  """
  obs = np.asarray(obs, dtype=float)
  # Seven edges per component give eight bins (np.digitize returns 0..7).
  # Position and angle are bounded, so only the interior edges of their range
  # are used; the two velocities are unbounded, so everything outside
  # [-4, 4] falls into the outermost bins.
  bin_edges = (
    np.linspace(-4.8, 4.8, 9)[1:-1],
    np.linspace(-4, 4, 7),
    np.linspace(-0.418, 0.418, 9)[1:-1],
    np.linspace(-4, 4, 7),
  )
  digits = [int(np.digitize(value, edges)) for value, edges in zip(obs, bin_edges)]
  # Previously the bin indices were computed but ignored and only the sign of
  # each component was encoded; use the bin indices as base-8 digits instead.
  return sum(digit * 8 ** position for position, digit in enumerate(digits))

def init_policy(epsilon,S,A):
  """Build a random epsilon-soft policy table.

  For every state one action is picked at random as the greedy action; it
  gets probability ``1 - epsilon + epsilon / A`` and every other action gets
  ``epsilon / A``.

  Args:
    epsilon: exploration rate of the epsilon-soft policy.
    S: number of discrete states.
    A: number of actions.

  Returns:
    Float array of shape [S, A] whose rows are probability distributions.
  """
  # The argmax of uniform random scores selects a random greedy action per state.
  greedy_actions = np.random.random([S,A]).argmax(axis=1)
  # Builtin float replaces the np.float alias, which recent NumPy removed.
  pi = np.full((S, A), epsilon / A, dtype=float)
  pi[np.arange(S), greedy_actions] += 1 - epsilon
  return pi

In [82]:
def generate_episode(env, policy, actions):
  """Run one episode in `env`, sampling actions from `policy`.

  Each observation is discretized with discretize_state and an action is
  drawn from `actions` using the probabilities in policy[state].

  Returns:
    (trajectory, length): the list of (state, action, reward) tuples in the
    order they occurred, and the number of steps taken.
  """
  trajectory = []
  observation = env.reset()
  finished = False
  while not finished:
    s = discretize_state(observation)
    a = np.random.choice(actions, p=policy[s])
    observation, r, finished, _ = env.step(a)
    trajectory.append((s, a, r))
  # One tuple is recorded per step, so the length is the step count.
  return trajectory, len(trajectory)


def Monte_Carlo(env,gamma,epsilon,num_episodes=1000):
  """On-policy first-visit Monte Carlo control for an epsilon-soft policy.

  Args:
    env: environment providing observation_space, action_space and the
      reset()/step() calls used by generate_episode.
    gamma: discount factor applied to future rewards.
    epsilon: exploration rate of the epsilon-soft policy.
    num_episodes: number of episodes to sample; defaults to 1000, the value
      that used to be hard-coded.

  Returns:
    (Q, pi): the action-value table and the epsilon-soft policy, both arrays
    of shape [S, A].
  """
  # Size the tables from the environment itself rather than from a
  # notebook-global `obs` left over from an earlier cell.
  S = 8 ** env.observation_space.shape[0]
  A = env.action_space.n
  #initialize
  pi = init_policy(epsilon,S,A)
  actions = np.arange(A)
  Q = np.random.random([S,A])
  # All first-visit returns observed so far for each (state, action) pair.
  returns = collections.defaultdict(list)

  for _ in range(num_episodes):
    #generate episodes
    episode, length = generate_episode(env,pi,actions)

    # Index of the first occurrence of every (state, action) pair, so the
    # first-visit test below is a dictionary lookup instead of a list scan.
    first_visit = {}
    for t, (state, action, _reward) in enumerate(episode):
      first_visit.setdefault((state, action), t)

    G = 0
    for t in range(length-1,-1,-1):
      state, action, reward = episode[t]
      G = gamma * G + reward
      if first_visit[(state, action)] != t:
        continue
      # Append (not overwrite) so that Q is the mean over all episodes.
      returns[(state, action)].append(G)
      Q[state, action] = np.mean(returns[(state, action)])
      # Make the policy epsilon-greedy with respect to the updated Q.
      best_action = np.argmax(Q[state])
      pi[state] = epsilon / A
      pi[state, best_action] = 1 - epsilon + epsilon / A
  return Q, pi

In [93]:
# Train with discount factor gamma=0.99 and exploration rate epsilon=0.01.
Q, policy = Monte_Carlo(env,0.99,0.01)

In [94]:
# Convert epsilon-soft policy to a greedy policy
# Take the table dimensions from the trained policy itself instead of a
# global `obs`, and use builtin float (the np.float alias no longer exists).
S, A = policy.shape
pi_greedy = np.zeros_like(policy, dtype=float)
pi_greedy[np.arange(S), np.argmax(policy, axis=1)] = 1

# Test
done = False
obs = env.reset()
for i in range(50000):
    state = discretize_state(obs)
    # Act with the greedy policy built above, not the epsilon-soft one.
    action = np.random.choice(np.arange(A), p=pi_greedy[state])

    print("step i",i,"action=",action)
    obs, reward, done, info = env.step(action)
    print("obs=",obs,"reward=",reward,"done=",done,"info=",info)

    if done:
        break

step i 0 action= 0
obs= [ 0.04037584 -0.18818377  0.02776897  0.26090951] reward= 1.0 done= False info= {}
step i 1 action= 1
obs= [ 0.03661217  0.006531    0.03298716 -0.02288701] reward= 1.0 done= False info= {}
step i 2 action= 0
obs= [ 0.03674279 -0.18904811  0.03252942  0.28001852] reward= 1.0 done= False info= {}
step i 3 action= 1
obs= [ 0.03296182  0.00559508  0.03812979 -0.00222984] reward= 1.0 done= False info= {}
step i 4 action= 0
obs= [ 0.03307373 -0.19005238  0.03808519  0.30223548] reward= 1.0 done= False info= {}
step i 5 action= 1
obs= [0.02927268 0.00450666 0.0441299  0.02180289] reward= 1.0 done= False info= {}
step i 6 action= 1
obs= [ 0.02936281  0.19896887  0.04456596 -0.25663647] reward= 1.0 done= False info= {}
step i 7 action= 0
obs= [0.03334219 0.00323991 0.03943323 0.04976352] reward= 1.0 done= False info= {}
step i 8 action= 1
obs= [ 0.03340699  0.19777489  0.0404285  -0.23022174] reward= 1.0 done= False info= {}
step i 9 action= 0
obs= [0.03736248 0.0020992