In [2]:
#adapted from https://jovian.ai/americanachen/qlearning-med/v/1?utm_source=embed#C8

# some more recent versions of OpenAI gym return 2 values in env.reset() and 5 values in env.step()

import gymnasium as gym
import random
from time import sleep

random.seed(1234)

mountain = gym.make("MountainCar-v0") # render_mode="human"
state, info = mountain.reset()
print(state)
print(info)
#mountain.render()
#sleep(1)
#mountain.close()

[-0.54271615  0.        ]


In [25]:
# test random policy

epochs = 1000
lengths=[]
for taxi_run in range(epochs):
    state, info = mountain.reset()
    done = False
    trip_length = 0
    while not done and trip_length < 100:
        action = mountain.action_space.sample() # Explore a random action
            
        next_state, reward, terminated, truncated, info = mountain.step(action)       
        
        done = terminated or truncated
        trip_length +=1
    lengths.append(trip_length)
    
avg_len=sum(lengths)/epochs
print(avg_len)

print('states',mountain.observation_space.n)
print('actions',mountain.action_space.n)
        

100.0


AttributeError: 'Box' object has no attribute 'n'

: 

In [38]:
# learn using q-learning algorithm

import numpy as np

q_table = np.zeros([mountain.observation_space.n, mountain.action_space.n])
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
learning_rate = 0.1
discount_factor = 0.6
exploration = 0.01
epochs = 100000

#discretize state
for taxi_run in range(epochs):
    
    state, info = mountain.reset()
    done = False
            
    while not done:    
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = mountain.action_space.sample()
        else:
            action = np.argmax(q_table[state])
                    
        next_state, reward, terminated, truncated, info = mountain.step(action)
                
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q
                
        state = next_state

        done = terminated or truncated
        


In [39]:
#check correct action for some states

state=mountain.encode(1,1,1,0)
print(state)
print(q_table[state])
action=np.argmax(q_table[state])
print(action) 
mountain.s = state
mountain.lastaction = action
print(mountain.render())

124
[-2.48942084 -2.49113106 -2.49026462 -2.49064139 -3.74280525 -3.74191277]
0
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)



In [42]:
# check performance after learning

from IPython.display import clear_output
from time import sleep
lengths=[]
for tripnum in range(1, 11):
    state, info = mountain.reset()
   
    done = False
    trip_length = 0
    returnL=0
    while not done and trip_length < 100:
        action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated,  info = mountain.step(action)
        done = terminated or truncated
        returnL+=reward
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(mountain.render())
        sleep(.2)
        state = next_state
        trip_length += 1
    lengths.append(trip_length)
    sleep(.2)
avg_len=sum(lengths)/10
print(avg_len)

Trip number 10 Step 9
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

13.8
