In [1]:
import gym
# Create the Copy-v0 environment used by the rest of this cell.
env = gym.make('Copy-v0')

m = dict()  # not referenced by any code visible in this cell

print('Observation space:', env.observation_space)
print('Action space:', env.action_space)
print('---')
print()

# Layout of an action tuple (author's notes, one entry per tuple index):
#   idx 0 -- head movement
#       0: move head left
#       1: move head right
#   idx 1 -- tape operation
#       0: read
#       1: write
#   idx 2 -- symbol to write
#       0: write 'A'
#       1: write 'B'
#       2: write 'C'
#       3: write 'D'
#       4: write 'E'
#
# Algorithm:
#   read and move left until observation = 5
#   read and move right
#   if 5: done
#   otherwise: write the observed symbol

# Complete read-only actions (the third entry is a placeholder symbol).
STEP_LEFT_READ = (0, 0, 0)
STEP_RIGHT_READ = (1, 0, 0)
# Two-element prefix only: the symbol to write is appended at call time.
STEP_RIGHT_WRITE = (1, 1)

def act_render(env, action):
    """Step the environment once, render it, and hand back the step result.

    args:
    env: object exposing `step(action)` and `render()`.
    action: value forwarded unchanged to `env.step`.
    returns:
    the 4-tuple (observation, reward, done, info) produced by `env.step`.
    """
    obs, rew, finished, extra = env.step(action)
    # Render only after the step so the display reflects the new state.
    env.render()
    return (obs, rew, finished, extra)

env.reset()
env.render()
print('BEGIN')
print()

# Phase 1: keep stepping left (read-only) until the observation is no longer
# below 5. NOTE(review): 5 is presumably the "blank cell" symbol -- confirm.
while True:
    observation, reward, done, info = act_render(env, STEP_LEFT_READ)
    if not (observation < 5):
        break

# Phase 2: one read-only step to the right.
observation, reward, done, info = act_render(env, STEP_RIGHT_READ)

# Phase 3: write the symbol just observed while moving right, until the
# environment reports the episode as done.
while not done:
    observation, reward, done, info = act_render(env, (*STEP_RIGHT_WRITE, observation))

print('END')

env.close()

Observation space: Discrete(6)
Action space: Tuple(Discrete(2), Discrete(2), Discrete(5))
---

Total length of input instance: 2, step: 0
Observation Tape    :   [42mB[0mA  
Output Tape         :   
Targets             :   BA  






BEGIN

Total length of input instance: 2, step: 1
Observation Tape    :  [42m [0mBA  
Output Tape         :   
Targets             :   BA  

Current reward      :   0.000
Cumulative reward   :   0.000
Action              :   Tuple(move over input: left,
                              write to the output tape: False,
                              prediction: A)
Total length of input instance: 2, step: 2
Observation Tape    :   [42mB[0mA  
Output Tape         :   
Targets             :   BA  

Current reward      :   0.000
Cumulative reward   :   0.000
Action              :   Tuple(move over input: right,
                              write to the output tape: False,
                              prediction: A)
Total length of input instance: 2, step: 3

In [8]:
import gym
# Run one CartPole-v0 episode with uniformly sampled actions, rendering each step.
env = gym.make('CartPole-v0')
env.reset()

while True:
    observation, reward, done, info = env.step(env.action_space.sample())
    env.render()
    if done:
        break
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [59]:
"""
Solving the FrozenLake8x8 environment using value iteration.
Author : Moustafa Alzantot (malzantot@ucla.edu)
"""
import numpy as np
import gym
from gym import wrappers
# Print floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

# Rendering switch that evaluate_policy forwards to run_episode as `render`.
RENDER=False

def run_episode(env, policy, gamma = 1.0, render = False):
    """ Play a single episode following `policy` and return its discounted reward.
    args:
    env: environment exposing reset(), step(action) and render().
    policy: indexable by observation; the entry is cast to int and used as the action.
    gamma: discount factor applied as gamma ** step_index.
    render: when true, render before every step and once more after the episode ends.
    returns:
    total reward: sum over steps of gamma ** step_index * reward.
    """
    state = env.reset()
    discounted_return = 0
    t = 0
    while True:
        if render:
            env.render()
        action = int(policy[state])
        state, reward, finished, _ = env.step(action)
        discounted_return += (gamma ** t) * reward
        t += 1
        if finished:
            break

    # Show the terminal state as well.
    if render:
        env.render()
    return discounted_return


def evaluate_policy(env, policy, gamma = 1.0,  n = 100):
    """ Score a policy by averaging the return of n independent episodes.
    Rendering is controlled by the module-level RENDER flag.
    returns:
    mean of the n episode returns
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(
            run_episode(env, policy, gamma = gamma, render = RENDER))
    return np.mean(episode_returns)

def extract_policy(v, gamma = 1.0):
    """ Derive the greedy policy for value-function `v`.

    Reads the transition model from the module-level `env` (not a parameter):
    env.env.nS, env.env.P and env.action_space.n.
    returns:
    float array of length nS holding the arg-max action index per state.
    """
    mdp = env.env
    n_actions = env.action_space.n
    greedy = np.zeros(mdp.nS)
    for state in range(mdp.nS):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            # Expected one-step return over the (prob, next_state, reward, done) entries.
            action_values[action] = sum(
                prob * (reward + gamma * v[nxt])
                for prob, nxt, reward, _ in mdp.P[state][action])
        greedy[state] = np.argmax(action_values)
    return greedy


def value_iteration(env, gamma = 1.0):
    """ Value-iteration algorithm.

    Repeatedly applies the Bellman optimality backup
        v[s] = max_a sum_{s'} p * (r + gamma * v_prev[s'])
    using a synchronous sweep (every state is updated from the previous
    sweep's values) until the summed absolute change is at most 1e-10 or
    100000 sweeps have been performed.

    args:
    env: wrapper whose `env` attribute exposes nS, nA and P, where P[s][a] is
         an iterable of (probability, next_state, reward, done) tuples.
    gamma: discount factor. It is now applied in the backup; previously the
           argument was accepted but ignored, which equals gamma = 1.0.
    returns:
    array of length nS with the estimated optimal state values.
    """
    mdp = env.env
    v = np.zeros(mdp.nS)  # initialize value-function to zeros
    max_iterations = 100000
    eps = 1e-10
    for i in range(max_iterations):
        prev_v = np.copy(v)
        for s in range(mdp.nS):
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in mdp.P[s][a])
                for a in range(mdp.nA)
            ]
            v[s] = max(q_sa)
        if (np.sum(np.fabs(prev_v - v)) <= eps):
            print ('Value-iteration converged at iteration %d.' %(i+1))
            break
    return v


if __name__ == '__main__':
    # Solve FrozenLake8x8 with value iteration, print the value grid and the
    # greedy policy, then score that policy over 1000 episodes.
    env_name = 'FrozenLake8x8-v0'
    gamma = 1.0
    # Must be bound to the module-level name `env`: extract_policy reads it as a global.
    env = gym.make(env_name)

    optimal_v = value_iteration(env, gamma)
    print(np.around(optimal_v.reshape((8, 8)), decimals=2))

    policy = extract_policy(optimal_v, gamma)
    # Legend for the action indices printed below (whitespace kept exactly).
    print("\nLEFT = 0\nDOWN = 1\nRIGHT = 2\nUP = 3    \n    ")
    print(policy.reshape((8, 8)).astype(int))

    policy_score = evaluate_policy(env, policy, gamma, n=1000)
    print('Policy average score = ', policy_score)
    env.close()

Value-iteration converged at iteration# 2357.
[[1.   1.   1.   1.   1.   1.   1.   1.  ]
 [1.   1.   1.   1.   1.   1.   1.   1.  ]
 [1.   0.98 0.93 0.   0.86 0.95 0.98 1.  ]
 [1.   0.93 0.8  0.47 0.62 0.   0.94 1.  ]
 [1.   0.83 0.54 0.   0.54 0.61 0.85 1.  ]
 [1.   0.   0.   0.17 0.38 0.44 0.   1.  ]
 [1.   0.   0.19 0.12 0.   0.33 0.   1.  ]
 [1.   0.73 0.46 0.   0.28 0.55 0.78 0.  ]]

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3    
    
[[1 2 2 1 2 2 2 2]
 [3 3 3 3 3 3 3 2]
 [0 0 0 0 2 3 3 2]
 [0 0 0 1 0 0 2 2]
 [0 3 0 0 2 1 3 2]
 [0 0 0 1 3 0 0 2]
 [0 0 1 0 0 0 0 2]
 [0 1 0 0 1 2 1 0]]
Policy average score =  0.876


In [75]:
"""
Solving FrozenLake8x8 environment using Policy iteration.
Author : Moustafa Alzantot (malzantot@ucla.edu)
"""
import numpy as np
import gym
from gym import wrappers


def run_episode(env, policy, gamma = 1.0, render = False):
    """ Play one episode under `policy` and return the discounted total reward.

    The action for each step is int(policy[observation]); the reward of step k
    is weighted by gamma ** k. When `render` is true the environment is
    rendered before every step.
    """
    state = env.reset()
    discounted_return = 0
    t = 0
    finished = False
    while not finished:
        if render:
            env.render()
        state, reward, finished, _ = env.step(int(policy[state]))
        discounted_return += (gamma ** t * reward)
        t += 1
    return discounted_return


def evaluate_policy(env, policy, gamma = 1.0, n = 100):
    """ Average the return of n episodes played under `policy` (never rendered). """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(run_episode(env, policy, gamma, False))
    return np.mean(episode_returns)

def extract_policy(v, gamma = 1.0):
    """ Derive the greedy policy for value-function `v`.

    The transition model comes from the module-level `env` (env.env.nS,
    env.env.nA, env.env.P); it is not passed in as an argument.
    Returns a float array of length nS with the arg-max action per state.
    """
    mdp = env.env
    greedy = np.zeros(mdp.nS)
    for state in range(mdp.nS):
        action_values = np.zeros(mdp.nA)
        for action in range(mdp.nA):
            # Accumulate the expected one-step return of this action.
            expected = 0
            for prob, nxt, reward, _ in mdp.P[state][action]:
                expected += prob * (reward + gamma * v[nxt])
            action_values[action] = expected
        greedy[state] = np.argmax(action_values)
    return greedy

def compute_policy_v(env, policy, gamma=1.0):
    """ Iteratively evaluate the value-function of a fixed policy.

    Each sweep recomputes every state's value from the previous sweep's values
    using the action policy[s]; sweeps repeat until the summed absolute change
    is at most 1e-10. There is no cap on the number of sweeps.
    Alternatively, one could formulate a set of linear equations in terms of
    v[s] and solve them to find the value function.
    """
    mdp = env.env
    tolerance = 1e-10
    values = np.zeros(mdp.nS)
    converged = False
    while not converged:
        previous = np.copy(values)
        for state in range(mdp.nS):
            backup = 0
            for prob, nxt, reward, _ in mdp.P[state][policy[state]]:
                backup += prob * (reward + gamma * previous[nxt])
            values[state] = backup
        converged = np.sum(np.fabs(previous - values)) <= tolerance
    return values

def policy_iteration(env, gamma = 1.0):
    """ Policy-Iteration algorithm.

    Starts from a uniformly random policy, then alternates policy evaluation
    (compute_policy_v) and greedy improvement (extract_policy) until the
    policy stops changing or 200000 rounds have run. Returns the policy that
    was current when the loop ended.
    """
    current = np.random.choice(env.env.nA, size=(env.env.nS))  # random initial policy
    rounds = 0
    while rounds < 200000:
        rounds += 1
        values = compute_policy_v(env, current, gamma)
        improved = extract_policy(values, gamma)
        if np.all(current == improved):
            print ('Policy-Iteration converged at iteration %d.' %(rounds))
            return current
        current = improved
    return current


# Solve FrozenLake8x8 with policy iteration and report the resulting policy's
# average return over 1000 episodes.
env_name  = 'FrozenLake8x8-v0'
# Must be the module-level name `env`: extract_policy reads it as a global.
env = gym.make(env_name)
optimal_policy = policy_iteration(env, gamma = 1.0)
# evaluate_policy already returns the mean, so `scores` is a single number.
scores = evaluate_policy(env, optimal_policy, gamma = 1.0, n=1000)
# np.mean of that single number leaves it unchanged.
print('Average scores = ', np.mean(scores))

Policy-Iteration converged at iteration 9.
Average scores =  0.888


In [102]:
# Close whatever environment the global `env` currently refers to.
env.close()

In [100]:
"""
Hand-crafted controller for MountainCar.
Push in the direction of the current velocity while the car is moving,
otherwise stay neutral.
May waste energy climbing higher than necessary on the left bank.
"""
import gym
from gym import wrappers
from time import sleep

# Imported here so the cell no longer depends on an earlier cell having
# brought `np` into the kernel namespace.
import numpy as np

env_name = 'MountainCar-v0'
env = gym.make(env_name)
env.seed(0)
np.random.seed(0)

# Action indices used by the controller below.
left, neutral, right = 0, 1, 2

# try/finally so the environment is closed even if the rollout raises.
try:
    pos, vel = env.reset()
    print(pos, vel)
    print(env.observation_space)
    print(env.action_space)
    print('Begin sim')
    done = False
    total_reward = 0
    while not done:
        env.render()
        # Push with the current motion; stay neutral at exactly zero velocity.
        if vel < 0:
            action = left
        elif vel > 0:
            action = right
        else:
            action = neutral
        (pos, vel), r, done, _ = env.step(action)
        total_reward += r
        sleep(.1)  # slow the animation down so it can be watched

    print('Total reward:', total_reward)
finally:
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
-0.5891279887498433 0.0
Box(2,)
Discrete(3)
Begin sim
Total reward: -114.0


In [101]:
"""
Q-Learning example using the OpenAI Gym MountainCar environment
Author: Moustafa Alzantot (malzantot@ucla.edu)
"""
import numpy as np

import gym
from gym import wrappers

# Number of bins per observation dimension (see obs_to_state and the q_table shape).
n_states = 40
# Number of training episodes.
iter_max = 10000

initial_lr = 1.0 # Learning rate at the start of training
# Floor below which the decayed learning rate is not allowed to drop.
min_lr = 0.003
# Discount factor used in the Q-update and when scoring episodes.
gamma = 1.0
# Upper bound on the number of steps taken in one episode loop.
t_max = 10000
# Probability of picking a uniformly random action during training.
eps = 0.02

def run_episode(env, policy=None, render=False):
    """ Play one episode of at most t_max steps and return the discounted reward.

    With policy=None actions are sampled from env.action_space; otherwise the
    observation is discretised with obs_to_state and the action is looked up
    as policy[a][b]. The reward of step k is weighted by gamma ** k, where
    gamma and t_max are module-level settings.
    """
    observation = env.reset()
    discounted_return = 0
    for t in range(t_max):
        if render:
            env.render()
        if policy is not None:
            row, col = obs_to_state(env, observation)
            action = policy[row][col]
        else:
            action = env.action_space.sample()
        observation, reward, finished, _ = env.step(action)
        discounted_return += gamma ** t * reward
        if finished:
            break
    return discounted_return

def obs_to_state(env, obs):
    """ Maps a 2-component observation to a pair of discrete bin indices.

    Each dimension of the observation space (bounds taken from
    env.observation_space.low / .high) is split into n_states equal-width
    bins. Indices are clamped to [0, n_states - 1]: an observation exactly on
    the upper bound would otherwise map to n_states (one past the last bin),
    and one below the lower bound to a negative index that wraps around when
    used on an array.

    returns:
    (a, b): bin index of obs[0] and of obs[1].
    """
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_states
    last_bin = n_states - 1
    a = int((obs[0] - env_low[0])/env_dx[0])
    b = int((obs[1] - env_low[1])/env_dx[1])
    return min(max(a, 0), last_bin), min(max(b, 0), last_bin)

env_name = 'MountainCar-v0'
env = gym.make(env_name)
env.seed(0)
np.random.seed(0)
print ('----- using Q Learning -----')
# One Q-value per (bin a, bin b, action); the action count comes from the
# environment instead of being hard-coded.
q_table = np.zeros((n_states, n_states, env.action_space.n))
for i in range(iter_max):
    obs = env.reset()
    total_reward = 0
    # eta: learning rate, multiplied by 0.85 every 100 episodes and floored at min_lr
    eta = max(min_lr, initial_lr * (0.85 ** (i//100)))
    for j in range(t_max):
        a, b = obs_to_state(env, obs)
        if np.random.uniform(0, 1) < eps:
            # Exploration: uniformly random action.
            action = np.random.choice(env.action_space.n)
        else:
            # Otherwise sample from a softmax over the Q-values of this state.
            # Subtracting the max is mathematically a no-op but keeps exp()
            # from underflowing to all zeros (0/0 = NaN probabilities) when
            # the Q-values are very negative.
            logits = q_table[a][b]
            logits_exp = np.exp(logits - np.max(logits))
            probs = logits_exp / np.sum(logits_exp)
            action = np.random.choice(env.action_space.n, p=probs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        # update q table
        a_, b_ = obs_to_state(env, obs)
        q_table[a][b][action] = q_table[a][b][action] + eta * (reward + gamma *  np.max(q_table[a_][b_]) - q_table[a][b][action])
        if done:
            break
    if i % 100 == 0:
        print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))
# Greedy policy: best action per (a, b) cell.
solution_policy = np.argmax(q_table, axis=2)
solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
print("Average score of solution = ", np.mean(solution_policy_scores))
# Animate it
run_episode(env, solution_policy, True)
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
----- using Q Learning -----
Iteration #1 -- Total reward = -200.
Iteration #101 -- Total reward = -200.
Iteration #201 -- Total reward = -200.
Iteration #301 -- Total reward = -200.
Iteration #401 -- Total reward = -200.
Iteration #501 -- Total reward = -200.
Iteration #601 -- Total reward = -200.
Iteration #701 -- Total reward = -200.
Iteration #801 -- Total reward = -200.
Iteration #901 -- Total reward = -200.
Iteration #1001 -- Total reward = -200.
Iteration #1101 -- Total reward = -200.
Iteration #1201 -- Total reward = -200.
Iteration #1301 -- Total reward = -200.
Iteration #1401 -- Total reward = -200.
Iteration #1501 -- Total reward = -200.
Iteration #1601 -- Total reward = -200.
Iteration #1701 -- Total reward = -200.
Iteration #1801 -- Total reward = -200.
Iteration #1901 -- Total reward = -200.
Iteration #2001 -- Total reward = -200.
Iteration #2101 -- Total reward = -

In [109]:
"""
Q-Learning for MountainCar
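Experiment with an altered reward (|velocity|); the override is currently commented out in the training loop, so the environment's own reward is used.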
Author: Pimin Konstantin Kefaloukos
"""
import numpy as np

import gym
from gym import wrappers

# Number of bins per observation dimension (see obs_to_state and the q_table shape).
n_states = 40
# Number of training episodes.
iter_max = 5000

initial_lr = 1.0 # Learning rate at the start of training
# Floor below which the decayed learning rate is not allowed to drop.
min_lr = 0.003
# Discount factor used in the Q-update and when scoring episodes.
gamma = 1.0
# Upper bound on the number of steps taken in one episode loop.
t_max = 10000
# Probability of picking a uniformly random action during training.
eps = 0.02

def run_episode(env, policy=None, render=False):
    """ Play one episode of at most t_max steps and return the discounted reward.

    With policy=None actions are sampled from env.action_space; otherwise the
    observation is discretised with obs_to_state and the action is looked up
    as policy[a][b]. The reward of step k is weighted by gamma ** k, where
    gamma and t_max are module-level settings.
    """
    observation = env.reset()
    discounted_return = 0
    for t in range(t_max):
        if render:
            env.render()
        if policy is not None:
            row, col = obs_to_state(env, observation)
            action = policy[row][col]
        else:
            action = env.action_space.sample()
        observation, reward, finished, _ = env.step(action)
        discounted_return += gamma ** t * reward
        if finished:
            break
    return discounted_return

def obs_to_state(env, obs):
    """ Maps a 2-component observation to a pair of discrete bin indices.

    Each dimension of the observation space (bounds taken from
    env.observation_space.low / .high) is split into n_states equal-width
    bins. Indices are clamped to [0, n_states - 1]: an observation exactly on
    the upper bound would otherwise map to n_states (one past the last bin),
    and one below the lower bound to a negative index that wraps around when
    used on an array.

    returns:
    (a, b): bin index of obs[0] and of obs[1].
    """
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_states
    last_bin = n_states - 1
    a = int((obs[0] - env_low[0])/env_dx[0])
    b = int((obs[1] - env_low[1])/env_dx[1])
    return min(max(a, 0), last_bin), min(max(b, 0), last_bin)

env_name = 'MountainCar-v0'
env = gym.make(env_name)
env.seed(0)
np.random.seed(0)
print ('----- using Q Learning -----')
# One Q-value per (bin a, bin b, action); the action count comes from the
# environment instead of being hard-coded.
q_table = np.zeros((n_states, n_states, env.action_space.n))
for i in range(iter_max):
    obs = env.reset()
    total_reward = 0
    # eta: learning rate, multiplied by 0.85 every 100 episodes and floored at min_lr
    eta = max(min_lr, initial_lr * (0.85 ** (i//100)))
    for j in range(t_max):
        a, b = obs_to_state(env, obs)
        if np.random.uniform(0, 1) < eps:
            # Exploration: uniformly random action.
            action = np.random.choice(env.action_space.n)
        else:
            # Otherwise sample from a softmax over the Q-values of this state.
            # Subtracting the max is mathematically a no-op but keeps exp()
            # from underflowing to all zeros (0/0 = NaN probabilities) when
            # the Q-values are very negative.
            logits = q_table[a][b]
            logits_exp = np.exp(logits - np.max(logits))
            probs = logits_exp / np.sum(logits_exp)
            action = np.random.choice(env.action_space.n, p=probs)
        obs, reward, done, _ = env.step(action)
        # Disabled experiment: replace the reward with the absolute velocity.
        #reward = abs(obs[1])
        #print(reward)
        total_reward += reward
        # update q table
        a_, b_ = obs_to_state(env, obs)
        q_table[a][b][action] = q_table[a][b][action] + eta * (reward + gamma *  np.max(q_table[a_][b_]) - q_table[a][b][action])
        if done:
            break
    if i % 100 == 0:
        print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))
# Greedy policy: best action per (a, b) cell.
solution_policy = np.argmax(q_table, axis=2)
solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
print("Average score of solution = ", np.mean(solution_policy_scores))
# Animate it
run_episode(env, solution_policy, True)
env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
----- using Q Learning -----
Iteration #1 -- Total reward = -200.
Iteration #101 -- Total reward = -200.
Iteration #201 -- Total reward = -200.
Iteration #301 -- Total reward = -200.
Iteration #401 -- Total reward = -200.
Iteration #501 -- Total reward = -200.
Iteration #601 -- Total reward = -200.
Iteration #701 -- Total reward = -200.
Iteration #801 -- Total reward = -200.
Iteration #901 -- Total reward = -200.
Iteration #1001 -- Total reward = -200.
Iteration #1101 -- Total reward = -200.
Iteration #1201 -- Total reward = -200.
Iteration #1301 -- Total reward = -200.
Iteration #1401 -- Total reward = -200.
Iteration #1501 -- Total reward = -200.
Iteration #1601 -- Total reward = -200.
Iteration #1701 -- Total reward = -200.
Iteration #1801 -- Total reward = -200.
Iteration #1901 -- Total reward = -200.
Iteration #2001 -- Total reward = -200.
Iteration #2101 -- Total reward = -