In [1]:
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep

In [2]:
# Create the Taxi-v3 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): `.env` presumably strips gym's TimeLimit wrapper, so an episode only
# ends when the environment itself reports done — confirm for the installed gym version.
env = gym.make("Taxi-v3").env

In [12]:
def print_frames(frames, delay=.1):
    """Replay an episode as a text animation in the cell's output area.

    Each frame replaces the previous one: the output is cleared, the rendered
    board and the step's statistics are printed, and playback pauses briefly.

    Parameters
    ----------
    frames : iterable of dict
        One mapping per timestep with the keys 'frame', 'reward', 'penalty'
        and 'state'.
    delay : float, optional
        Seconds to pause after each frame. Defaults to 0.1; pass a smaller
        value (or 0) to skim through long episodes.

    Returns
    -------
    None
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers the clear until new output arrives, which avoids flicker.
        clear_output(wait=True)
        print(frame['frame'])
        print('timestep:', timestep)
        print('reward:', frame['reward'])
        print('penalty:', frame['penalty'])
        print('state:', frame['state'])
        sleep(delay)

In [10]:
def random_action_model(state):
    """Play one episode from `state` using randomly sampled actions.

    The environment is placed in `state`, then actions drawn from
    `env.action_space.sample()` are applied until the environment reports
    that the episode is done.

    Returns a list with one dict per step holding the rendered board
    ('frame'), the step's 'reward', the running count of -10 rewards
    ('penalty') and the resulting 'state'.
    """
    env.s = state
    history = []
    penalty_count = 0
    while True:
        observation, reward, finished, _ = env.step(env.action_space.sample())
        # A reward of exactly -10 is what this notebook counts as a penalty.
        if reward == -10:
            penalty_count += 1
        history.append(dict(
            frame=env.render(mode='ansi'),
            reward=reward,
            penalty=penalty_count,
            state=observation,
        ))
        if finished:
            return history

In [49]:
def Q_learning_model(state, q_table_path='q_table.txt', max_steps=None):
    """Play one episode from `state`, always taking the highest-valued action
    of a Q-table loaded from disk.

    Parameters
    ----------
    state : int
        Encoded start state. It is written to `env.s` before the rollout and
        used as the first row index into the Q-table.
    q_table_path : str, optional
        Text file readable by `np.loadtxt`. Defaults to 'q_table.txt'.
    max_steps : int or None, optional
        Upper bound on the number of environment steps. `None` (the default)
        uses the number of rows in the loaded table.

    Returns
    -------
    list of dict
        One entry per step with the keys 'frame' (rendered board), 'reward',
        'penalty' (running count of -10 rewards) and 'state'. The list is
        shorter than a full episode if the step limit was reached.

    Warns
    -----
    RuntimeWarning
        If the step limit is reached before the environment reports done.
    """
    import warnings  # only needed on the truncation path

    frames = []
    penalties = 0
    done = False
    env.s = state
    q_table = np.loadtxt(q_table_path, dtype=float)

    if max_steps is None:
        # The greedy policy is a fixed function of the state, so with
        # deterministic transitions (NOTE(review): assumed for Taxi-v3 — confirm)
        # an episode still running after as many steps as there are table rows
        # has revisited a state and would otherwise loop forever.
        max_steps = len(q_table)

    while not done and len(frames) < max_steps:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)
        if reward == -10:
            penalties += 1
        frames.append({'frame': env.render(mode='ansi'), 'reward': reward, 'penalty': penalties, 'state': state})

    if not done:
        warnings.warn(
            'Q_learning_model stopped after %d steps without reaching a terminal state' % len(frames),
            RuntimeWarning,
            stacklevel=2,
        )
    return frames

# Testing Random Model vs. Reinforcement Learning Model

In [46]:
# Fixed starting state shared by the two rollouts in this section.
# NOTE(review): the arguments are presumably (taxi_row, taxi_col, passenger_location,
# destination) — confirm against the Taxi-v3 documentation.
state = env.encode(3, 1, 2, 0)

In [47]:
# Put the environment into the chosen start state and draw the board to check the setup.
env.s = state
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)


In [13]:
# Baseline: replay an episode driven by randomly sampled actions from the start state.
print_frames(random_action_model(state))

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

timestep: 1860
reward: 20
penalty: 619
state: 0


In [50]:
# Replay an episode that follows the greedy policy of the Q-table stored in q_table.txt.
print_frames(Q_learning_model(state))

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

timestep: 10
reward: 20
penalty: 0
state: 0


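# Training Script

On a fresh kernel, run this section before the comparison above: it produces the `q_table.txt` file that `Q_learning_model` loads.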

### Preparing Q table

In [27]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# alpha weights the new estimate in the update, gamma discounts the best value of
# the next state, epsilon is the probability of taking a sampled action.
alpha, gamma, epsilon = 0.1, 0.6, 0.1

# One row per state and one column per action; every estimate starts at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(1, 100001):
    # Per-episode bookkeeping; these counters are maintained but not reported in this cell.
    epochs = penalties = reward = 0
    state, done = env.reset(), False

    while not done:
        # Explore with probability epsilon, otherwise act greedily on the current table.
        action = (
            env.action_space.sample()
            if np.random.uniform(0, 1) < epsilon
            else np.argmax(q_table[state])
        )

        next_state, reward, done, info = env.step(action)

        # A reward of exactly -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        # Blend the previous estimate with the one-step target
        # reward + gamma * (best value currently recorded for next_state).
        q_table[state, action] = (
            (1 - alpha) * q_table[state, action]
            + alpha * (reward + gamma * np.max(q_table[next_state]))
        )

        state = next_state
        epochs += 1

In [39]:
# Persist the learned table as plain text; Q_learning_model reads this file back with np.loadtxt.
# '%f' writes fixed-point numbers with six decimal places, so the stored values are rounded.
np.savetxt('q_table.txt', q_table, fmt='%f')