# sample examples of gym library
https://gym.openai.com/docs/

## MountainCar-v0

In [1]:
import gym
env = gym.make('MountainCar-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

<img src="MountainCar-v0.png">

## CartPole-v0

In [2]:
env = gym.make('CartPole-v0')
for i_episode in range(1):
    observation = env.reset()
    for t in range(10):
        env.render()
        #print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("observation:", observation)
        print("reward:", reward)
        print("done:", done)
        print("info:", info)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

observation: [ -2.35925599e-04  -1.76082211e-01  -1.43783232e-02   2.76483898e-01]
reward: 1.0
done: False
info: {}
observation: [-0.00375757 -0.37099611 -0.00884865  0.56459744]
reward: 1.0
done: False
info: {}
observation: [-0.01117749 -0.17575113  0.0024433   0.26913997]
reward: 1.0
done: False
info: {}
observation: [-0.01469251 -0.37090787  0.0078261   0.56259253]
reward: 1.0
done: False
info: {}
observation: [-0.02211067 -0.56613876  0.01907795  0.85773075]
reward: 1.0
done: False
info: {}
observation: [-0.03343345 -0.76151535  0.03623257  1.15635089]
reward: 1.0
done: False
info: {}
observation: [-0.04866375 -0.56688401  0.05935959  0.87524553]
reward: 1.0
done: False
info: {}
observation: [-0.06000143 -0.76276048  0.0768645   1.18598423]
reward: 1.0
done: False
info: {}
observation: [-0.07525664 -0.56871489  0.10058418  0.91835118]
reward: 1.0
done: False
info: {}
observation: [-0.08663094 -0.76504219  0.11895121  1.24087558]
reward: 1.0
done: False
info: {}


<img src="CartPole-v0.png">

In [3]:
print(env.observation_space)
print(env.action_space)

Box(4,)
Discrete(2)


In [4]:
print(env.observation_space.high)

[  4.80000019e+00   3.40282347e+38   4.18879032e-01   3.40282347e+38]


In [5]:
print(env.observation_space.low)

[ -4.80000019e+00  -3.40282347e+38  -4.18879032e-01  -3.40282347e+38]


In [6]:
from gym import spaces
space = spaces.Discrete(8) # Set with 8 elements {0, 1, 2, ..., 7}
x = space.sample()
assert space.contains(x)
assert space.n == 8

## Taxi-v2

In [7]:
import gym
env = gym.make('Taxi-v2')
env.reset()

for i_episode in range(1):
    observation = env.reset()
    for t in range(5):
        env.render()
        print("state:", observation)
        action = env.action_space.sample()
        print("action:", action)
        observation, reward, done, info = env.step(action)
        print("state_prime:", observation)
        print("reward:", reward)
        print("done:", done)
        print("info:", info)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

state: 62
action: 4
state_prime: 62
reward: -10
done: False
info: {'prob': 1.0}
+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
state: 62
action: 2
state_prime: 82
reward: -1
done: False
info: {'prob': 1.0}
+---------+
|[34;1mR[0m: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
state: 82
action: 5
state_prime: 82
reward: -10
done: False
info: {'prob': 1.0}
+---------+
|[34;1mR[0m: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Dropoff)
state: 82
action: 2
state_prime: 82
reward: -1
done: False
info: {'prob': 1.0}
+---------+
|[34;1mR[0m: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
state: 82
action: 1
state_prime: 82
reward: -1
done: False
info: {'prob': 1

# Q-Learning on Taxi-v2
https://gym.openai.com/envs/Taxi-v2/ <br>
https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py <br>
Sutton Textbook: 6.5 Q-learning: Off-policy TD Control

In [8]:
import numpy as np
import gym
env = gym.make('Taxi-v2')

So at the high level, there are two possible situations to describe for passenger: 

1) At one of the four locations and not yet been picked up

2) Have been picked (so passenger's initial position no longer matter and taxi's location is passenger's location)

<br>

For situation 1): there are four states (four possible pickup locations)

For situation 2): there is a single state (in the taxi)

Combined situation 1) and 2) --> We have 5 unique states to describe the passenger.

<br>

Then 4 unique states to describe the destination locations, 25 states to describe taxi current location.

So 5\*4\*25 = 500 unique states to describe the whole system.

https://gym.openai.com/docs/


In [9]:
# init no.of states and actions
n_states = 500
n_actions = 6

# specify no.of iterations 
iter_max = 250000

In [10]:
# discount rate gamma
gamma = 0.9

# learning rate alpha
alp = 0.99

# epsilon greedy, for each time t, with eps probability randomly take action, 
# (1-eps) probability take the best action from current Q table
# higher eps value means more exploration
# lower eps value means more exploitation
# the hw asks to find a good balance between exploration and exploitation
eps = 0.1

# maximum iterations in each episode
# in taxi-v2, gym will stop the current episode once taken 200 iteartions
t_max = 500

In [11]:
# global set to store states visited
global_visited_states = set()

In [12]:
# init Q-value table
Q = np.zeros([n_states,n_actions])

In [13]:
for i_episode in range(iter_max):
    if i_episode % 5000 == 0:
        print("episode",i_episode)
    
    # local set to store states visited in each episode
    visted_state = set()
    
    # start with a new episode on the initial state
    state = env.reset()
    visted_state.add(state)
    
    for t in range(t_max):
        #env.render()
        
        # based on prob, perform random action or best action based on current Q-table
        prob = np.random.uniform(0, 1)
        if prob <= eps:
            action = env.action_space.sample()
        else:   
            action = np.random.choice(np.flatnonzero(Q[state] == Q[state].max()))
        
        # take the action and get next state, reward
        state_prime, reward, done, info = env.step(action)

        visted_state.add(state_prime)
        
        # after action reaches terminal states, perform Q update
        if reward == 20:
            Q[state][action] = Q[state][action] + alp * (reward + gamma*0 - Q[state][action])
            break
        
        # reach 200 iterations, proceed to a new episode
        elif done:
            break
            
        # terminal state is not reached, perform updates on Q table on state before the action
        else:
            #print(np.amax(Q[state_prime]))
            Q[state][action] = Q[state][action] + alp * (reward + gamma*np.amax(Q[state_prime]) - Q[state][action]) 
        
        # prep for a new iteration     
        state = state_prime      
        
    # append visited states to global set
    global_visited_states = global_visited_states.union(visted_state)
env.close()

episode 0
episode 5000
episode 10000
episode 15000
episode 20000
episode 25000
episode 30000
episode 35000
episode 40000
episode 45000
episode 50000
episode 55000
episode 60000
episode 65000
episode 70000
episode 75000
episode 80000
episode 85000
episode 90000
episode 95000
episode 100000
episode 105000
episode 110000
episode 115000
episode 120000
episode 125000
episode 130000
episode 135000
episode 140000
episode 145000
episode 150000
episode 155000
episode 160000
episode 165000
episode 170000
episode 175000
episode 180000
episode 185000
episode 190000
episode 195000
episode 200000
episode 205000
episode 210000
episode 215000
episode 220000
episode 225000
episode 230000
episode 235000
episode 240000
episode 245000


In [14]:
print(Q)

[[  0.           0.           0.           0.           0.           0.        ]
 [  1.62261467   2.9140163    1.62261467   2.9140163    4.348907
   -6.0859837 ]
 [  4.348907     5.94323      4.348907     5.94323      7.7147      -3.05677   ]
 ..., 
 [  7.7147       9.683        7.7147       5.94323     -1.2853      -1.2853    ]
 [  1.62261467   2.91401621   1.54551972   2.9140163   -7.37738533
   -7.37738533]
 [ 14.3         11.87        14.3         17.           5.3          5.3       ]]


In [15]:
Q.max()

20.0

In [16]:
np.nonzero(Q)

(array([  1,   1,   1, ..., 499, 499, 499]), array([0, 1, 2, ..., 3, 4, 5]))

In [17]:
visted_state

{87, 99, 107, 127, 147, 167, 179, 187, 199, 227, 279, 379, 475, 479}

In [18]:
global_visited_states

{0,
 1,
 2,
 3,
 4,
 6,
 7,
 8,
 9,
 11,
 12,
 13,
 14,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 26,
 27,
 28,
 29,
 31,
 32,
 33,
 34,
 36,
 37,
 38,
 39,
 41,
 42,
 43,
 44,
 46,
 47,
 48,
 49,
 51,
 52,
 53,
 54,
 56,
 57,
 58,
 59,
 61,
 62,
 63,
 64,
 66,
 67,
 68,
 69,
 71,
 72,
 73,
 74,
 76,
 77,
 78,
 79,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 91,
 92,
 93,
 94,
 96,
 97,
 98,
 99,
 101,
 102,
 103,
 104,
 106,
 107,
 108,
 109,
 111,
 112,
 113,
 114,
 116,
 117,
 118,
 119,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 129,
 131,
 132,
 133,
 134,
 136,
 137,
 138,
 139,
 141,
 142,
 143,
 144,
 146,
 147,
 148,
 149,
 151,
 152,
 153,
 154,
 156,
 157,
 158,
 159,
 161,
 162,
 163,
 164,
 166,
 167,
 168,
 169,
 171,
 172,
 173,
 174,
 176,
 177,
 178,
 179,
 181,
 182,
 183,
 184,
 186,
 187,
 188,
 189,
 191,
 192,
 193,
 194,
 196,
 197,
 198,
 199,
 201,
 202,
 203,
 204,
 206,
 207,
 208,
 209,
 211,
 212,
 213,
 214,
 216,
 217,
 218,
 219,
 221,
 222,
 223,
 224,

In [19]:
full_states = set([i for i in range(500)])

In [20]:
full_states.difference(global_visited_states)

{5,
 10,
 15,
 20,
 25,
 30,
 35,
 40,
 45,
 50,
 55,
 60,
 65,
 70,
 75,
 80,
 90,
 95,
 100,
 105,
 110,
 115,
 120,
 125,
 130,
 135,
 140,
 145,
 150,
 155,
 160,
 165,
 170,
 175,
 180,
 185,
 190,
 195,
 200,
 205,
 210,
 215,
 220,
 225,
 230,
 235,
 240,
 245,
 250,
 255,
 260,
 265,
 270,
 275,
 280,
 285,
 290,
 295,
 300,
 305,
 310,
 315,
 320,
 325,
 330,
 335,
 340,
 345,
 350,
 355,
 360,
 365,
 370,
 375,
 380,
 385,
 390,
 395,
 400,
 405,
 415,
 420,
 425,
 430,
 435,
 440,
 445,
 450,
 455,
 460,
 465,
 470,
 480,
 485,
 490,
 495}

In [21]:
Q[462][4]

-11.374402515012999

In [22]:
Q[398][3]

4.3489069990460933

In [23]:
Q[253][0]

-0.5856821172999982

In [24]:
Q[377][1]

9.6830000000000016

In [25]:
Q[83,5]

-12.823266037160529

<img src="hw4_submission.png">

In [26]:
Q[299][5]

0.68300000000000161

In [27]:
Q[388][4]

-11.374402515012999

In [28]:
Q[309][3]

-0.5856821172999982

In [29]:
Q[309][2]

-0.5856821172999982

In [30]:
Q[209][5]

-10.527113905569998

In [31]:
Q[168][4]

-10.527113905569998

In [32]:
Q[138][0]

9.6830000000000016

In [33]:
Q[422][2]

-1.5271139055699985

In [34]:
Q[146][3]

-3.1369622635116987

In [35]:
Q[492][0]

0.46035320300000193