In [None]:
import gym
import numpy as np
import time

from IPython.display import clear_output

In [None]:
# Build the environment and make the stochastic parts of the notebook
# reproducible.
SEED = 0

# `.unwrapped` returns the bare environment without gym's wrappers
# (presumably including the default episode time limit -- confirm for the
# installed gym version).
env = gym.make('MountainCar-v0').unwrapped

# Seed before any reset: the reset that used to precede seeding produced an
# unseeded start state that was thrown away, so it has been dropped. Every
# later cell calls env.reset() itself before stepping.
env.seed(SEED)

# The epsilon-greedy policy in the training cell samples from NumPy's global
# RNG (np.random.uniform / np.random.choice), which is a separate generator
# from the environment's, so it needs its own seed.
np.random.seed(SEED)

In [None]:
# Show the observation space first, then the action space, one per line.
print(env.observation_space, env.action_space, sep="\n")

In [None]:
# Upper bounds of the observation box, then its lower bounds.
print(env.observation_space.high, env.observation_space.low, sep="\n")

In [None]:
# Roll out ONE episode with uniformly random actions to inspect what
# env.step() returns.
# NOTE: `env` was replaced by `env.unwrapped` in the setup cell, so this loop
# only stops when the environment itself reports `done`.
env.reset()

# Number of environment steps taken in this single episode. (This used to be
# called `episodes` and was reported as such, which was misleading.)
steps = 0

try:
    while True:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        env.render()
        steps += 1

        if done:
            break
finally:
    # Release the render window even if the cell is interrupted mid-rollout.
    env.close()

print("Finished after {} steps".format(steps))
print('observation:', observation)
print('reward:', reward)
print('done:', done)
print('info:', info)

In [None]:
def box2discrete(env, obs, num_states=20):
    """Map a continuous 2-component observation onto a `num_states` x `num_states` grid.

    Each of the first two observation components is binned uniformly between
    the corresponding entries of `env.observation_space.low` and
    `env.observation_space.high`.

    Args:
        env: Object exposing `observation_space.low` and
            `observation_space.high` as indexable sequences.
        obs: Observation; `obs[0]` and `obs[1]` are discretised (position and
            velocity for MountainCar).
        num_states: Number of bins per dimension.

    Returns:
        Tuple `(new_pos, new_vel)` of Python ints, each in
        `[0, num_states - 1]`.
    """
    low = env.observation_space.low
    high = env.observation_space.high
    bin_width = (high - low) / num_states
    last_bin = num_states - 1

    new_pos = int((obs[0] - low[0]) / bin_width[0])
    new_vel = int((obs[1] - low[1]) / bin_width[1])

    # An observation sitting exactly on the upper bound would otherwise land
    # in bin `num_states`, one past the end of a table with `num_states`
    # entries per axis. Clamp both ends so the result is always a valid index.
    new_pos = min(max(new_pos, 0), last_bin)
    new_vel = min(max(new_vel, 0), last_bin)
    return new_pos, new_vel

In [None]:
# Q-table with one entry per (position bin, velocity bin, action), all zero
# to start with.
num_states = 20
q_table = np.zeros(shape=(num_states, num_states, env.action_space.n))

In [None]:
# Sanity check: discretise the observation returned by a fresh reset.
initial_obs = env.reset()
print(box2discrete(env, initial_obs))

In [None]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
episodes = 1000
lr_start = 1
lr_end = 0.005
# Learning-rate decay applied once every 100 episodes. This used to reuse
# `discount`, which silently tied the schedule to the return discount; the
# value is unchanged.
lr_decay = 0.95
discount = 0.95
epsilon = 0.05
log_every = 100  # print a progress line every this many episodes

episode_lengths = []  # steps needed to finish each training episode

for episode in range(episodes):
    state = env.reset()
    done = False
    learning_rate = max(lr_end, lr_start * (lr_decay ** (episode // 100)))
    n_steps = 0

    while not done:
        # Pass num_states explicitly so the bins always match q_table's shape.
        pos, vel = box2discrete(env, state, num_states)
        if np.random.uniform(0, 1) > epsilon:
            action = np.argmax(q_table[pos, vel])      # exploit
        else:
            action = np.random.choice(env.action_space.n)  # explore
        new_state, reward, done, info = env.step(action)
        new_pos, new_vel = box2discrete(env, new_state, num_states)

        # No future return exists after a terminal transition, so do not
        # bootstrap from the next state's value when `done` is set.
        future_value = 0.0 if done else np.max(q_table[new_pos, new_vel])
        td_target = reward + discount * future_value
        q_table[pos, vel, action] = (
            (1 - learning_rate) * q_table[pos, vel, action]
            + learning_rate * td_target
        )

        state = new_state
        n_steps += 1

    # The previous `reward == 1.0` "WON" check could never fire because
    # MountainCar's step reward is always -1. Episode length is the useful
    # progress signal here: fewer steps means a better policy.
    episode_lengths.append(n_steps)
    if (episode + 1) % log_every == 0:
        recent_mean = np.mean(episode_lengths[-log_every:])
        print(f"Episode {episode + 1}: mean steps over last {log_every} episodes = {recent_mean:.1f}")

In [None]:
# A plain print(q_table) is summarised by NumPy with "..." once an array has
# more than 1000 elements (the default print threshold), which hides most of
# a 20 x 20 x 3 table. Show a compact, complete summary instead.
print("Q-table shape:", q_table.shape)
print("Non-zero entries:", np.count_nonzero(q_table), "of", q_table.size)
print("Greedy action per (position bin, velocity bin):")
print(np.argmax(q_table, axis=-1))

In [None]:
# Evaluate the learned greedy policy (no exploration) and render each episode.
episodes = 10
max_steps = 200
won = 0

try:
    for episode in range(episodes):
        clear_output(wait=True)
        state = env.reset()
        done = False

        for step in range(max_steps):
            clear_output(wait=True)

            # Re-discretise the CURRENT state on every step. Previously the
            # bins were computed once from the initial state, so the agent
            # repeated the same action for the whole episode.
            pos, vel = box2discrete(env, state, num_states)
            action = np.argmax(q_table[pos, vel])
            new_state, reward, done, info = env.step(action)
            print("*" * 10)
            print(f"Episode: {episode+1}")
            print(f"Step {step+1}")
            print(f"Result {won}/{episode}")
            print(f"Reward: {reward}")
            env.render()
            state = new_state

            # MountainCar's step reward is always -1, so the old
            # `reward == 1.0` test never counted a win. `env` is the
            # unwrapped environment here, so `done` is only set when the car
            # reaches the goal. The former "fell in the hole" branch was an
            # unreachable FrozenLake leftover and has been removed.
            if done:
                won += 1
                print("You reached the goal")
                break
        else:
            print(f"Maximum step ({max_steps}) limit reached.")

        # Leave the episode's final status on screen before it is cleared.
        time.sleep(2)
finally:
    # Release the render window even if the cell is interrupted.
    env.close()

print(f"Percentage of winning is {(won / episodes) * 100}%")