In [1]:
import gym
env = gym.make('CartPole-v0')

[2017-05-09 01:09:30,340] Making new env: CartPole-v0


In [13]:
env.reset()

for _ in range(1000):
        env.render()
        observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
        print(info,observation, reward,done)
        

[2017-05-09 01:16:13,181] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


{} [-0.02373025 -0.18916158  0.02563683  0.25310956] 1.0 False
{} [-0.02751348  0.0055851   0.03069902 -0.03137807] 1.0 False
{} [-0.02740178 -0.18996334  0.03007146  0.27083047] 1.0 False
{} [-0.03120105  0.00471688  0.03548807 -0.01221815] 1.0 False
{} [-0.03110671  0.19931239  0.03524371 -0.29349632] 1.0 False
{} [-0.02712046  0.00370614  0.02937378  0.01009035] 1.0 False
{} [-0.02704634 -0.19182449  0.02957559  0.31189442] 1.0 False
{} [-0.03088283  0.0028639   0.03581348  0.0286835 ] 1.0 False
{} [-0.03082555  0.19745445  0.03638715 -0.25248824] 1.0 False
{} [-0.02687646  0.00183232  0.03133738  0.05144618] 1.0 False
{} [-0.02683982  0.19649124  0.0323663  -0.23118716] 1.0 False
{} [-0.02290999  0.39113611  0.02774256 -0.51348752] 1.0 False
{} [-0.01508727  0.58585657  0.01747281 -0.79730055] 1.0 False
{} [-0.00337014  0.78073447  0.0015268  -1.08443603] 1.0 False
{} [ 0.01224455  0.5855924  -0.02016192 -0.7912744 ] 1.0 False
{} [ 0.0239564   0.7809853  -0.03598741 -1.09023138] 1.

In [None]:
import gym
env = gym.make('CartPole-v0')
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

In [4]:
env = gym.make('SpaceInvaders-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample())

[2017-05-09 01:03:49,507] Making new env: SpaceInvaders-v0


In [2]:
import gym
import pandas as pd
import numpy as np
import random

# https://gym.openai.com/envs/CartPole-v0
# Carlos Aguayo - carlos.aguayo@gmail.com


class QLearner(object):
    def __init__(self,
                 num_states=100,
                 num_actions=4,
                 alpha=0.2,
                 gamma=0.9,
                 random_action_rate=0.5,
                 random_action_decay_rate=0.99):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.state = 0
        self.action = 0
        self.qtable = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))

    def set_initial_state(self, state):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = state
        self.action = self.qtable[state].argsort()[-1]
        return self.action

    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        state = self.state
        action = self.action
        qtable = self.qtable

        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = random.randint(0, self.num_actions - 1)
        else:
            action_prime = self.qtable[state_prime].argsort()[-1]

        self.random_action_rate *= self.random_action_decay_rate

        qtable[state, action] = (1 - alpha) * qtable[state, action] + alpha * (reward + gamma * qtable[state_prime, action_prime])

        self.state = state_prime
        self.action = action_prime

        return self.action


def cart_pole_with_qlearning():
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    #env.monitor.start(experiment_filename, force=True)
    env = gym.wrappers.Monitor(env, experiment_filename,force=True)

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    def build_state(features):
        return int("".join(map(lambda feature: str(int(feature)), features)))

    def to_bin(value, bins):
        return np.digitize(x=[value], bins=bins)[0]

    learner = QLearner(num_states=10 ** number_of_features,
                       num_actions=env.action_space.n,
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        observation = env.reset()
        cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation
        state = build_state([to_bin(cart_position, cart_position_bins),
                             to_bin(pole_angle, pole_angle_bins),
                             to_bin(cart_velocity, cart_velocity_bins),
                             to_bin(angle_rate_of_change, angle_rate_bins)])
        action = learner.set_initial_state(state)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation

            state_prime = build_state([to_bin(cart_position, cart_position_bins),
                                       to_bin(pole_angle, pole_angle_bins),
                                       to_bin(cart_velocity, cart_velocity_bins),
                                       to_bin(angle_rate_of_change, angle_rate_bins)])

            if done:
                reward = -200

            action = learner.move(state_prime, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print ("Goal reached!")
            print ("Episodes before solve: ", episode + 1)
            print ("Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus minus sign
                                                                  last_time_steps.std()))
            break

    env.monitor.close()

if __name__ == "__main__":
    random.seed(0)
    cart_pole_with_qlearning()

[2017-05-09 01:22:38,814] Making new env: CartPole-v0
[2017-05-09 01:22:38,824] Clearing 16 monitor files from previous run (because force=True was provided)
[2017-05-09 01:22:38,855] Starting new video recorder writing to /Users/syunyo/Desktop/OpenGym/cartpole-experiment-1/openaigym.video.0.63560.video000000.mp4
[2017-05-09 01:22:40,253] Starting new video recorder writing to /Users/syunyo/Desktop/OpenGym/cartpole-experiment-1/openaigym.video.0.63560.video000001.mp4
[2017-05-09 01:22:41,248] Starting new video recorder writing to /Users/syunyo/Desktop/OpenGym/cartpole-experiment-1/openaigym.video.0.63560.video000008.mp4
[2017-05-09 01:22:41,586] Starting new video recorder writing to /Users/syunyo/Desktop/OpenGym/cartpole-experiment-1/openaigym.video.0.63560.video000027.mp4
[2017-05-09 01:22:41,918] Starting new video recorder writing to /Users/syunyo/Desktop/OpenGym/cartpole-experiment-1/openaigym.video.0.63560.video000064.mp4


Error: Tried to reset environment which is not done. While the monitor is active for CartPole-v0, you cannot call reset() unless the episode is over.