## Frozen Lake

In [2]:
import gym
import numpy as np

In [3]:
# Build the FrozenLake-v0 environment; `env` is used by the Q-table setup and training cells below.
env = gym.make('FrozenLake-v0')

$$Q(s,a) = r + \gamma \max_{a'} Q(s',a')$$

This says that the Q-value for a given state (s) and action (a) should represent the current reward (r) plus the maximum discounted (γ) future reward expected according to our own table for the next state (s’) we would end up in. The discount variable allows us to decide how important the possible future rewards are compared to the present reward. By updating in this way, the table slowly begins to obtain accurate measures of the expected future reward for a given action in a given state.

In [15]:
# Hyperparameters for tabular Q-learning
num_episodes = 2000  # number of training episodes
lr = 0.8             # step size applied to every Q-table update
y = 0.95             # discount factor (gamma) on the next state's best Q-value

# Q-table: one row per state, one column per action, all entries start at zero
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Total reward collected in each episode (step counts per episode are not recorded)
rList = list()

In [16]:
# Tabular Q-learning: play `num_episodes` episodes, each capped at 99 steps,
# updating Q after every transition and logging the episode's total reward.
for i in range(num_episodes):
    # Begin a fresh episode from the environment's initial observation
    s = env.reset()
    rAll = 0
    d = False
    # j counts the steps taken so far; it keeps its value after the loop ends
    for j in range(1, 100):
        # Greedy action over the Q-row perturbed by Gaussian noise whose
        # scale shrinks as 1/(i+1), so later episodes explore less
        a = np.argmax(Q[s] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Apply the action and observe the transition
        s1, r, d, _ = env.step(a)
        # Move Q[s, a] toward the one-step target r + y * max_a' Q[s1, a']
        Q[s, a] += lr * (r + y * Q[s1].max() - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

In [17]:
# Average of the per-episode total rewards across the whole training run
print("Score over time: " + str(sum(rList) / num_episodes))

Score over time: 0.4895


In [18]:
# Show the learned table: heading on one line, then the array itself
print("Final Q-Table Values", Q, sep="\n")

Final Q-Table Values
[[8.17798474e-02 3.57501328e-03 3.67419579e-03 6.78326790e-03]
 [7.22838775e-04 1.66094851e-04 7.26340854e-04 9.20627993e-02]
 [5.25946114e-03 4.92073634e-03 5.62991342e-03 6.15117454e-02]
 [7.99329770e-06 1.13712217e-03 4.95138091e-04 3.60054616e-02]
 [3.74942701e-02 3.84489581e-06 2.79617155e-04 3.87779385e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.17154627e-01 1.60833577e-04 2.12444102e-04 6.01938728e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.51181519e-03 5.39413709e-04 2.94383332e-03 1.54227208e-02]
 [0.00000000e+00 4.34718246e-02 7.39328000e-04 0.00000000e+00]
 [1.18890338e-01 6.44022296e-04 1.03772155e-03 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.31258580e-03 0.00000000e+00 1.53647518e-01 1.21507794e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 8.28420531e-01]
 [0.00000000e+00 0.00000000e+00 0.

<hr>

## Mountain Car

Q-Learning example using the OpenAI Gym MountainCar environment

Author: Moustafa Alzantot (malzantot@ucla.edu)

In [1]:
import numpy as np

import gym
from gym import wrappers

In [2]:
# --- Problem size ---
n_states = 40      # bins per observation dimension when discretizing states
iter_max = 10000   # number of training episodes
t_max = 10000      # upper bound on steps within a single episode

# --- Learning-rate schedule (decays from initial_lr, never below min_lr) ---
initial_lr = 1.0
min_lr = 0.003

# --- Q-learning behaviour ---
gamma = 1.0        # discount factor applied to future rewards
eps = 0.02         # probability of taking a uniformly random action

In [3]:
def run_episode(env, policy=None, render=False):
    """Play a single episode and return its discounted total reward.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``action_space.sample()``.
        policy: Optional table indexed as ``policy[a][b]``, where ``(a, b)``
            is the discretized state from ``obs_to_state``. When ``None``,
            actions are sampled from ``env.action_space``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        Sum over steps of ``gamma ** step * reward``. The episode stops when
        the environment reports ``done`` or after ``t_max`` steps.
    """
    obs = env.reset()
    total_reward = 0
    for step_idx in range(t_max):
        if render:
            env.render()
        if policy is not None:
            # Look up the action for the current discretized state
            a, b = obs_to_state(env, obs)
            action = policy[a][b]
        else:
            action = env.action_space.sample()
        obs, reward, done, _info = env.step(action)
        total_reward += gamma ** step_idx * reward
        if done:
            break
    return total_reward

In [4]:
def obs_to_state(env, obs, n_bins=None):
    """Map a continuous 2-D observation to a pair of discrete bin indices.

    Each observation dimension is split into ``n_bins`` equal-width bins
    spanning ``env.observation_space.low`` to ``env.observation_space.high``.

    Args:
        env: Environment whose ``observation_space`` provides ``low`` and
            ``high`` bounds (indexable per dimension).
        obs: Observation; only ``obs[0]`` and ``obs[1]`` are read.
        n_bins: Number of bins per dimension. Defaults to the module-level
            ``n_states`` when omitted.

    Returns:
        Tuple ``(a, b)`` of ints, each guaranteed to lie in
        ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins
    a = int((obs[0] - env_low[0]) / env_dx[0])
    b = int((obs[1] - env_low[1]) / env_dx[1])
    # An observation exactly at the upper bound would otherwise land in bin
    # `n_bins` (one past the end of the table), and one below the lower bound
    # would give a negative index that silently wraps around. Clamp both.
    last = n_bins - 1
    a = min(max(a, 0), last)
    b = min(max(b, 0), last)
    return a, b

In [9]:
def softmax_probs(logits):
    """Return softmax probabilities for a 1-D array of logits.

    The largest logit is subtracted before exponentiating. This leaves the
    probabilities mathematically unchanged, but stops ``np.exp`` from
    underflowing to all zeros when every logit is very negative. An all-zero
    result would make the normalisation 0/0 = NaN, and ``np.random.choice``
    rejects NaN probabilities.
    """
    shifted_exp = np.exp(logits - np.max(logits))
    return shifted_exp / np.sum(shifted_exp)


if __name__ == '__main__':
    env_name = 'MountainCar-v0'
    env = gym.make(env_name)
    # Seed both the environment and NumPy so runs are repeatable
    env.seed(0)
    np.random.seed(0)
    print('----- using Q Learning -----')
    # One Q-value per (bin of obs[0], bin of obs[1], action)
    q_table = np.zeros((n_states, n_states, 3))
    for i in range(iter_max):
        obs = env.reset()
        total_reward = 0
        # eta: learning rate, multiplied by 0.85 every 100 episodes and
        # never allowed to fall below min_lr
        eta = max(min_lr, initial_lr * (0.85 ** (i // 100)))
        for j in range(t_max):
            a, b = obs_to_state(env, obs)
            if np.random.uniform(0, 1) < eps:
                # Explore: uniformly random action with probability eps
                action = np.random.choice(env.action_space.n)
            else:
                # Otherwise sample an action from the softmax of the Q-values
                probs = softmax_probs(q_table[a][b])
                action = np.random.choice(env.action_space.n, p=probs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            # Update the Q-table toward reward + gamma * max_a' Q(next state, a')
            a_, b_ = obs_to_state(env, obs)
            q_table[a][b][action] = q_table[a][b][action] + eta * (
                reward + gamma * np.max(q_table[a_][b_]) - q_table[a][b][action]
            )
            if done:
                break
        if i % 100 == 0:
            print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))
    # Greedy policy: best action for every discretized state
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))
    # Animate it
    run_episode(env, solution_policy, True)



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
----- using Q Learning -----
> [1;32m<ipython-input-9-b94a016daa4e>[0m(22)[0;36m<module>[1;34m()[0m
[1;32m     20 [1;33m                [0mprobs[0m [1;33m=[0m [0mlogits_exp[0m [1;33m/[0m [0mnp[0m[1;33m.[0m[0msum[0m[1;33m([0m[0mlogits_exp[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m     21 [1;33m                [1;32mfrom[0m [0mIPython[0m[1;33m.[0m[0mcore[0m[1;33m.[0m[0mdebugger[0m [1;32mimport[0m [0mTracer[0m[1;33m;[0m [0mTracer[0m[1;33m([0m[1;33m)[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m---> 22 [1;33m                [0maction[0m [1;33m=[0m [0mnp[0m[1;33m.[0m[0mrandom[0m[1;33m.[0m[0mchoice[0m[1;33m([0m[0menv[0m[1;33m.[0m[0maction_space[0m[1;33m.[0m[0mn[0m[1;33m,[0m [0mp[0m[1;33m=[0m[0mprobs[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m     23 [1;33m            [0mobs[0m[1;33m,[0m [0mr