In [None]:
!pip install -y python-numpy python-dev cmake zlib1g-dev libjpeg- dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig

In [None]:
!pip install pyvirtualdisplay
#pip install piglet

In [None]:
!pip install piglet

## The Cart-Pole Problem

Cart-Pole also known as Inverted Pendulum with a center of gravity aobve its pivot point. It is unstable and falls over but can be controlled by moving the cart. The goal of the problem is to keep the pole balanced by moving the cart left or right by applying appropriate forces to the pivot point.

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

In [None]:
import os

import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gym import wrappers
import warnings
warnings.filterwarnings('ignore')

In [2]:
class CartPoleQLearningAgent:
    def __init__(self,
                 learning_rate=0.2,
                 discount_factor=1.0,
                 exploration_rate=0.5,
                 exploration_decay_rate=0.99):

        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay_rate = exploration_decay_rate
        self.state = None
        self.action = None
        ##is discretizing of the state space as the environment is a continous flow of states,
        ##we need to discretize the states to make the Q-Table
        # Discretize the continuous state space for each of the 4 features.
        num_discretization_bins = 7
        self._state_bins = [
            # Cart position.
            self._discretize_range(-2.4, 2.4, num_discretization_bins),
            # Cart velocity.
            self._discretize_range(-3.0, 3.0, num_discretization_bins),
            # Pole angle.
            self._discretize_range(-0.5, 0.5, num_discretization_bins),
            # Tip velocity.
            self._discretize_range(-2.0, 2.0, num_discretization_bins)
        ]

        # Create a clean Q-Table.
        self._num_actions = 2
        self._max_bins = max(len(bin) for bin in self._state_bins)
        num_states = (self._max_bins + 1) ** len(self._state_bins)
        ### Initialize the Q-table by all zeros.
        self.q = np.zeros(shape=(num_states, self._num_actions))

    @staticmethod
    def _discretize_range(lower_bound, upper_bound, num_bins):
        return np.linspace(lower_bound, upper_bound, num_bins + 1)[1:-1]

    @staticmethod
    def _discretize_value(value, bins):
        return np.digitize(x=value, bins=bins)

    def _build_state(self, observation):
        # Discretize the observation features and reduce them to a single integer.
        state = sum(
            self._discretize_value(feature, self._state_bins[i]) * ((self._max_bins + 1) ** i)
            for i, feature in enumerate(observation)
        )
        return state

    def begin_episode(self, observation):
        # Reduce exploration over time.
        self.exploration_rate *= self.exploration_decay_rate

        # Get the action for the initial state.
        self.state = self._build_state(observation)
        return np.argmax(self.q[self.state])

    def act(self, observation, reward):
        #Travel to the next state (S') as a result of that action (a).
        next_state = self._build_state(observation)
        ##Start exploring actions: For each state, select any one among all possible actions for the current state (S).
        # Exploration/exploitation: choose a random action or select the best one.
        enable_exploration = (1 - self.exploration_rate) <= np.random.uniform(0, 1)
        if enable_exploration:
            # To explore Explore: select a random action
            next_action = np.random.randint(0, self._num_actions)
        else:
            #Exploit: select the action with max value (future reward)
            ##For all possible actions from the state (S') select the one with the highest Q-value.
            next_action = np.argmax(self.q[next_state])
        # defining  to define the (sub)-probability transition operator given in the section 3.2 of the research paper
        aStar = np.argmax(self.q[next_state])
        probabilities = [self.exploration_rate / self._num_actions] * self._num_actions
        probabilities[aStar] += 1 - self.exploration_rate

        exp = 0

        for i in range(self._num_actions):
        	exp += probabilities[i] * self.q[next_state, i]

        # Learn: update Q-Table based on current reward and future action.
        #Update Q-table values using the equation (3) in the research paper: "Safe and Efficient off-policy reinforcement learning"
        self.q[self.state, self.action] += self.learning_rate * \
            (reward + self.discount_factor * exp - self.q[self.state, self.action])
        ##Set the next state as the current state.
        self.state = next_state
        self.action = next_action
        return next_action

In [3]:
class EpisodeHistory:
    def __init__(self,
                 capacity,
                 plot_episode_count=200,
                 max_timesteps_per_episode=200,
                 goal_avg_episode_length=195,
                 goal_consecutive_episodes=100):

        self.lengths = np.zeros(capacity, dtype=int)
        self.plot_episode_count = plot_episode_count
        self.max_timesteps_per_episode = max_timesteps_per_episode
        self.goal_avg_episode_length = goal_avg_episode_length
        self.goal_consecutive_episodes = goal_consecutive_episodes

        self.point_plot = None
        self.mean_plot = None
        self.fig = None
        self.ax = None

    def __getitem__(self, episode_index):
        return self.lengths[episode_index]

    def __setitem__(self, episode_index, episode_length):
        self.lengths[episode_index] = episode_length
    def is_goal_reached(self, episode_index):
        avg = np.average(self.lengths[episode_index - self.goal_consecutive_episodes + 1:episode_index + 1])
        return avg >= self.goal_avg_episode_length


In [4]:
def log_timestep(index, action, reward, observation):
    format_string = "   ".join([
        "Timestep: {0:3d}",
        "Action: {1:2d}",
        "Reward: {2:5.1f}",
        "Cart Position: {3:6.3f}",
        "Cart Velocity: {4:6.3f}",
        "Angle: {5:6.3f}",
        "Tip Velocity: {6:6.3f}"
    ])
    print(format_string.format(index, action, reward, *observation))

In [5]:
def run_agent(env, verbose=False):
    max_episodes_to_run = 5000
    max_timesteps_per_episode = 200

    goal_avg_episode_length = 195
    goal_consecutive_episodes = 100

    plot_episode_count = 200
    plot_redraw_frequency = 10

    agent = CartPoleQLearningAgent(
        learning_rate=0.05,
        discount_factor=0.95,
        exploration_rate=0.5,
        exploration_decay_rate=0.99
    )

    episode_history = EpisodeHistory(
        capacity=max_episodes_to_run,
        plot_episode_count=plot_episode_count,
        max_timesteps_per_episode=max_timesteps_per_episode,
        goal_avg_episode_length=goal_avg_episode_length,
        goal_consecutive_episodes=goal_consecutive_episodes
    )
   

    for episode_index in range(max_episodes_to_run):
        observation = env.reset()
        action = agent.begin_episode(observation)

        for timestep_index in range(max_timesteps_per_episode):
            # Perform the action and observe the new state.
            observation, reward, done, info = env.step(action)

            # Update the display and log the current state.
            if verbose:
                env.render()
                log_timestep(timestep_index, action, reward, observation)

            # If the episode has ended prematurely, penalize the agent.
            if done and timestep_index < max_timesteps_per_episode - 1:
                reward = -max_episodes_to_run

            # Get the next action from the learner, given our new state.
            action = agent.act(observation, reward)

            # Record this episode to the history and check if the goal has been reached.
            if done or timestep_index == max_timesteps_per_episode - 1:
                print("Episode {} finished after {} timesteps.".format(episode_index + 1, timestep_index + 1))

                episode_history[episode_index] = timestep_index + 1
                
                if episode_history.is_goal_reached(episode_index):
                    print()
                    print("Goal reached after {} episodes!".format(episode_index + 1))
                    return episode_history

                break

    print("Goal not reached after {} episodes.".format(max_episodes_to_run))
    return episode_history

In [6]:
def save_history(history, experiment_dir):
    # Save the episode lengths to CSV.
    filename = os.path.join(experiment_dir, "episode_history.csv")
    dataframe = pd.DataFrame(history.lengths, columns=["length"])
    dataframe.to_csv(filename, header=True, index_label="episode")

In [7]:
def main():
    random_state = 0
    experiment_dir = "cartpole-retrace"

    env = gym.make("CartPole-v0")
    env.seed(random_state)
    np.random.seed(random_state)

    env = wrappers.Monitor(env, experiment_dir, force=True, resume=False)
    episode_history = run_agent(env, verbose=False)   # Set verbose=False to greatly speed up the process.
    save_history(episode_history, experiment_dir)
    env.close()

In [8]:
main()

Episode 1 finished after 22 timesteps.
Episode 2 finished after 13 timesteps.
Episode 3 finished after 24 timesteps.
Episode 4 finished after 11 timesteps.
Episode 5 finished after 19 timesteps.
Episode 6 finished after 30 timesteps.
Episode 7 finished after 27 timesteps.
Episode 8 finished after 34 timesteps.
Episode 9 finished after 36 timesteps.
Episode 10 finished after 13 timesteps.
Episode 11 finished after 42 timesteps.
Episode 12 finished after 12 timesteps.
Episode 13 finished after 11 timesteps.
Episode 14 finished after 39 timesteps.
Episode 15 finished after 19 timesteps.
Episode 16 finished after 12 timesteps.
Episode 17 finished after 17 timesteps.
Episode 18 finished after 12 timesteps.
Episode 19 finished after 20 timesteps.
Episode 20 finished after 16 timesteps.
Episode 21 finished after 11 timesteps.
Episode 22 finished after 27 timesteps.
Episode 23 finished after 24 timesteps.
Episode 24 finished after 42 timesteps.
Episode 25 finished after 52 timesteps.
Episode 2

Episode 200 finished after 200 timesteps.
Episode 201 finished after 200 timesteps.
Episode 202 finished after 200 timesteps.
Episode 203 finished after 200 timesteps.
Episode 204 finished after 154 timesteps.
Episode 205 finished after 200 timesteps.
Episode 206 finished after 200 timesteps.
Episode 207 finished after 200 timesteps.
Episode 208 finished after 200 timesteps.
Episode 209 finished after 200 timesteps.
Episode 210 finished after 200 timesteps.
Episode 211 finished after 200 timesteps.
Episode 212 finished after 200 timesteps.
Episode 213 finished after 200 timesteps.
Episode 214 finished after 200 timesteps.
Episode 215 finished after 200 timesteps.
Episode 216 finished after 69 timesteps.
Episode 217 finished after 200 timesteps.
Episode 218 finished after 200 timesteps.
Episode 219 finished after 200 timesteps.
Episode 220 finished after 200 timesteps.
Episode 221 finished after 200 timesteps.
Episode 222 finished after 200 timesteps.
Episode 223 finished after 190 time

**Goal reached after 325 episodes**

In [9]:
data = pd.read_csv('cartpole-retrace/episode_history.csv')

In [10]:
data.head()

Unnamed: 0,episode,length
0,0,22
1,1,13
2,2,24
3,3,11
4,4,19
