# Install dependencies (takes around 45 seconds)

Rendering dependencies


In [None]:
# Refresh the apt package index before installing system packages.
!sudo apt-get update
# Install the Python packages gym and pyvirtualdisplay; all output is discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# Install the system packages xvfb, python-opengl and ffmpeg; all output is discarded,
# so a failed install is silent here (presumably these back the rendering/video cells below).
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [None]:
# Refresh the apt package index again and install cmake (output discarded).
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# Upgrade setuptools; stderr is merged into stdout so the output stays visible.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
# Install gym together with its classic_control extra; output is left visible.
!pip install gym[classic_control]

In [None]:
# import some helper functions and tools
import gym
from gym import logger as gymlogger
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import numpy as np
import time, math, random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

The Cart-Pole consists of a pole, which is connected to a horizontally moving cart. To solve the task, the pole has to be balanced by applying a force F to the cart. The system is nonlinear, since the rotation of the pole introduces trigonometric functions into the force balance equations.

 # State and Action
 On reset, every component of the observation is assigned a uniformly random value in (-0.05, 0.05).

In [None]:
# Create the CartPole-v0 environment; `env` is reused by the following cells.
env = gym.make("CartPole-v0")
# The environment is old, so there will be a warning. There is no need to change the environment to a newer version.
# Reset the environment; as the last expression of the cell, the returned value is displayed.
env.reset()
# This will give you the initial state: [position, velocity, angle, angular velocity]

In [None]:
# Print the upper and lower limits of the observation space.
print("upper bounds", env.observation_space.high)
print("lower bounds", env.observation_space.low)

The **states** of the Cart-Pole are the position $s$ of the cart, the velocity $\dot{s}$ of the cart, the angle $\theta$ of the pole and the angular velocity $\dot{\theta}$ of the pole. In the environment, the observation is $\text{obs}=[s,\dot{s},\theta,\dot{\theta}]$.

In [None]:
# Display the environment's action space (shown as the cell output).
env.action_space

The **action space** of the Cart-Pole environment is discrete and consists of two actions, 0 and 1: 0 means pushing the cart to the left, and 1 means pushing the cart to the right.

In [None]:
# Take up to ten randomly sampled actions, printing the reward returned by
# each step, and stop early as soon as the environment reports `done`.
for i in range(10):
  action = env.action_space.sample()
  # env.step returns a 4-tuple here: observation, reward, done flag, info.
  obs, reward, done, info = env.step(action)
  print(reward)
  if done:
    break

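**Reward:** Since the goal is to keep the pole upright for as long as possible, a reward of +1 is given for every step taken, including the termination step. The reward threshold for considering the task solved is 475 for CartPole-v1 and 195 for CartPole-v0 (the version used in this notebook).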

For more details, please refer to the documentation of [OpenAI gym](https://www.gymlibrary.dev/environments/classic_control/cart_pole/). 

# Episode termination conditions

In [None]:
# Run random actions (at most 100 steps) until the episode ends, then print
# the final observation and which termination condition was violated.
env.reset()
for i in range(100):
  action = env.action_space.sample()
  obs, reward, done, info = env.step(action)
  if done:
    print(obs)
    # abs() must wrap the value, not the comparison: the limits are
    # symmetric, so excursions in the negative direction count as well.
    if np.abs(obs[0]) > 2.4:
      print('Cart Position is greater than 2.4!')
    elif np.abs(obs[2]) > 0.2095:
      print('Pole angle is greater than 12 degree!')
    break

The episode ends if any one of the following occurs:

1. Termination: Pole Angle is greater than ±12°

2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)

3. Truncation: Episode length is greater than 500 (200 for CartPole-v0, the version used in this notebook).


# Helper functions for rendering

In [None]:
# Start a virtual display of size 1400x900 with visible=0.
# NOTE(review): the global name `display` likely shadows IPython's own
# display() helper in this notebook; the IPython module is imported above as
# `ipythondisplay`, which is what the video helper below uses.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "video = wrap_env(env)"
"""

def show_video():
  """Embed the first ``*.mp4`` file of the working directory in the notebook.

  The file is read in binary mode, base64-encoded and rendered as an inline
  HTML ``<video>`` element. If no ``*.mp4`` file exists, the message
  "Could not find video" is printed instead.
  """
  mp4list = glob.glob('*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is sufficient, and the context manager makes
    # sure the file handle is closed again after reading.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return a VideoRecorder for ``env`` that writes to 'demo.mp4'."""
  return VideoRecorder(env, 'demo.mp4')

In [None]:
# Record one episode driven by random actions and show the resulting video.
import gym
env = gym.make("CartPole-v0")
# wrap_env returns a video recorder for the environment, not a new environment.
video = wrap_env(env)
obs = env.reset()
i = 0
while True:
    i+=1
    env.render()
    video.capture_frame()
    #your agent goes here
    action = env.action_space.sample() 
         
    obs, reward, done, info = env.step(action)   
    if done: 
      # abs() must wrap the value, not the comparison: the limits are
      # symmetric, so excursions in the negative direction count as well.
      if np.abs(obs[0]) > 2.4:
        print('Cart Position is greater than 2.4!')
      elif np.abs(obs[2]) > 0.2095:
        print('Pole angle is greater than 12 degree!')
      break
video.close()            
env.close()
show_video()
#right click the video, you can download it!

# Questions begin here
Now that we have a basic understanding of the environment, let's have more fun!

In [None]:
# Define the Q agent
class Cart_Pole_Q_agent():
  """Tabular Q-learning agent for CartPole-v0 (exercise template).

  Observations are mapped onto a fixed grid of bins (see discretize_obs) so
  that the Q values can be stored in a NumPy array of shape
  discretization_bin + (number of actions,).

  The sections between the "Your Code starts there" / "Your Code ends there"
  markers are intentionally left for the reader. Until they are filled in
  they raise NotImplementedError, so that the cell still defines the class
  and the provided methods can be used.
  """

  def __init__(self, discretization_bin=(1,1,8,16),
                min_lr=0.1,
                lr=0.2,
                discount_factor = 0.99,
                exploration_decay_rate =0.99,
                exploration_rate =0.5,
                num_episodes=1000):
    """Store the hyper-parameters, create the environment and the Q table.

    Args:
      discretization_bin: number of bins per observation component.
      min_lr: stored as self.min_lr (not used by the provided code).
      lr: learning rate (alpha in the update rule below).
      discount_factor: discount factor (gamma in the update rule below).
      exploration_decay_rate: factor applied by get_exploration_rate.
      exploration_rate: probability of picking a random action in
        select_action.
      num_episodes: number of iterations of the training loop in train.
    """
    # lr is short for learning rate, recall the update rule of Q learning, 
    #Q(s,a) += alpha*[R+gamma*max_aQ(s',a)-Q(s,a)]
    #lr = alpha
    #gamma = discount_factor
    self.min_lr = min_lr
    self.lr = lr
    self.discount_factor = discount_factor
    self.exploration_decay_rate = exploration_decay_rate
    self.exploration_rate = exploration_rate
    self.num_episodes = num_episodes
    self.env = gym.make('CartPole-v0')
    # Set the upper and lower bound
    # Discretize the state space
    # Coerce to a tuple so that a list of bin counts is accepted as well
    # (the Q-table shape below is built by tuple concatenation).
    self.discretization_bin = tuple(discretization_bin)
    self.upperbound = [2.4,3.0,0.5,2.0]
    self.lowerbound = [-2.4,-3.0,-0.5,-2.0]
    self.action_space_len = self.env.action_space.n
    Q_table_size = self.discretization_bin+(self.action_space_len,)
    #Initilize the Q value for all state-action pairs as 0
    self.Q_tabular = np.zeros(Q_table_size)
  
  #Discretize the observations
  def discretize_obs(self, obs):
    """Map an observation onto bin indices.

    Each component is scaled linearly from [lowerbound, upperbound] to
    [0, bins - 1], rounded to the nearest integer and clipped to the valid
    index range, so out-of-range values land in the first or last bin.

    Args:
      obs: sequence of observation components, indexed like
        self.lowerbound / self.upperbound / self.discretization_bin.

    Returns:
      Tuple of integer bin indices, one per component of obs.
    """
    discretized = list()
    for i in range(len(obs)):
      low = self.lowerbound[i]
      high = self.upperbound[i]
      # Offset by the lower bound itself (not its absolute value) so the
      # scaling is also correct for non-negative lower bounds.
      scaling = (obs[i] - low) / (high - low)
      new_obs = int(round((self.discretization_bin[i] - 1) * scaling))
      new_obs = min(self.discretization_bin[i] - 1, max(0, new_obs))
      discretized.append(new_obs)
    return tuple(discretized)
  
  # Choose the action
  def select_action(self, obs):
    """Pick an action: random with probability self.exploration_rate.

    The non-random branch is left as an exercise and raises
    NotImplementedError until it is implemented.
    """
    #####explain why we need the first part to randomly pick actions
    if (np.random.random() < self.exploration_rate):
      return self.env.action_space.sample() 
    else:
      #choose the action
      ###### Your Code starts there
      raise NotImplementedError("select_action: action choice is not implemented yet")
      ###### Your Code ends there

  # Update Q table
  def update_the_Q_table(self, state, action, reward, new_state):
    """Update self.Q_tabular for one transition (left as an exercise).

    Raises:
      NotImplementedError: until the exercise code is filled in.
    """
    ###### Your Code starts there
    raise NotImplementedError("update_the_Q_table is not implemented yet")
    ###### Your Code ends there

  def get_exploration_rate(self):
    """Return exploration_rate * exploration_decay_rate.

    The stored self.exploration_rate is not modified by this call.
    """
    return self.exploration_rate*self.exploration_decay_rate

  def train(self):
    """Run the training loop for self.num_episodes iterations.

    The loop body is left as an exercise and raises NotImplementedError
    until it is implemented.

    Returns:
      reward_traj: list intended to hold the collected episodic rewards.
    """
    # you should collect all episodic rewards during training
    #
    reward_traj = []
    for e in range(self.num_episodes):
      ###### Your Code starts there
      raise NotImplementedError("train: the episode loop is not implemented yet")
      ###### Your Code ends there

    print('Finished training!')
    return reward_traj

In [None]:
# Train the agent and plot the reward collected in each episode.
agent = Cart_Pole_Q_agent()
rwd_traj = agent.train()
plt.plot(range(len(rwd_traj)),rwd_traj)
# pyplot's axis-label functions are spelled xlabel / ylabel.
plt.xlabel('Episodes')
plt.ylabel('Rewards')

In [None]:
# test the policy
import gym
env = gym.make("CartPole-v0")
# wrap_env returns a video recorder for the environment, not a new environment.
video = wrap_env(env)
obs = env.reset()
###### Test the performance of your trained agent 
###### Your Code starts there
      
###### Your Code ends there
# Finish the recording and release the environment before displaying the video.
video.close()            
env.close()
show_video()
#right click the video, you can download it!

In [None]:
###### Your Code starts there
      
###### Your Code ends there