## Setting up the OpenAI Gym Environment:

In [1]:
# Install the packages this notebook needs into the runtime.
# NOTE(review): nothing is version-pinned; the "Taxi-v2" id used below may not
# be registered in newer gym releases — consider pinning gym. TODO confirm.
!pip install cmake 'gym[atari]' scipy




In [0]:
import gym

In [0]:
# Build the Taxi environment and keep the object's `.env` attribute.
# NOTE(review): presumably `.env` strips gym's time-limit wrapper so an episode
# only ends on a successful drop-off — confirm against the installed gym version.
env = gym.make("Taxi-v2").env

In [4]:
env.render()

+---------+
|R: | : :[35mG[0m|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [5]:
# Start a fresh episode; reset() returns the initial state index (shown below).
env.reset()

1

In [6]:
env.render()

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [7]:
# Show the environment's action space and observation (state) space.
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Discrete(6)
State Space Discrete(500)


In [0]:
state = env.encode(3,1,2,0) # (taxi row, taxi column, passenger index, destination index)

In [9]:
print("State", state)

State 328


In [0]:
# Overwrite the environment's current state with the state encoded above.
env.s = state


In [11]:
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [12]:
env.P[328]   # This dictionary has the structure {action: [(probability, nextstate, reward, done)]}.

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

## Solving without Reinforcement Learning :

In [0]:
# The env.action_space.sample() method selects one random action from the set of all possible actions.

In [14]:
# Baseline: act purely at random from the illustration state until the
# environment reports that the episode is done.
env.s = 328

epochs = 0     # number of steps taken
penalties = 0  # number of steps that returned a reward of -10
reward = 0

frames = []  # one record per step, replayed later as an animation

while True:
  action = env.action_space.sample()
  state, reward, done, info = env.step(action)

  # -10 is the reward env.P lists for actions 4 and 5 in state 328
  # (presumably an illegal pickup/drop-off — confirm).
  penalties += 1 if reward == -10 else 0

  # Store the rendered frame together with the transition that produced it.
  frames.append({
      'frame' : env.render(mode='ansi'),
      'state' : state,
      'action': action,
      'reward' : reward
  })

  epochs += 1

  if done:
    break

print("Timesteps taken : {}".format(epochs))
print("Penalties incurred : {}".format(penalties))

Timesteps taken : 256
Penalties incurred : 58


In [0]:
# Displaying the Frames :

In [16]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
  """Replay recorded steps one at a time in the cell output.

  Each element of `frames` is a mapping with the keys 'frame' (an object
  exposing getvalue()), 'state', 'action' and 'reward'. The output area is
  cleared before every step and the loop pauses 0.7 s between steps.
  """
  for i, frame in enumerate(frames):
    clear_output(wait=True)
    report = [
        frame['frame'].getvalue(),
        f"Timestep: {i + 1}",
        f"State: {frame['state']}",
        f"Action: {frame['action']}",
        f"Reward: {frame['reward']}",
    ]
    print("\n".join(report))
    sleep(.7)


print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 256
State: 0
Action: 5
Reward: 20


## Q-Learning:

In [0]:
# Initializing the Q-table to a 500 x 6 matrix of zeroes 

In [0]:
import numpy as np

In [0]:
# One row per state and one column per action, every entry starting at zero.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [20]:
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [21]:
%%time
"""Training the Agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1     # Learning Rate
gamma = 0.6     # Discount Factor
epsilon = 0.1   # Exploration Factor

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1,100001):
  state = env.reset()
  
  epochs, penalties, reward = 0, 0, 0
  done = False
  
  while not done:
    if random.uniform(0,1) < epsilon:
      action = env.action_space.sample() # Explore action space
    else:
      action = np.argmax(q_table[state]) # Exploit learned values
      
    next_state , reward, done, info = env.step(action)
    
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    
    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
    q_table[state, action] = new_value
    
    if reward == -10:
      penalties += 1
      
    state = next_state
    epochs +=1
    
  if i % 100 == 0:
    clear_output(wait = True)
    print(f"Episode : {i}")
    
    
print("Training Finished. \n")

Episode : 100000
Training Finished. 

CPU times: user 46.9 s, sys: 2.72 s, total: 49.7 s
Wall time: 48.2 s


In [22]:
q_table[328]

array([ -2.40420798,  -2.27325184,  -2.41132732,  -2.36185445,
       -10.74917054, -10.51017477])

### Evaluating the Agent:

In [23]:
"""Evaluate agent's performance after Q-Learning"""

# Run the greedy policy (always the highest-valued action in the Q-table)
# for a fixed number of episodes and average the step and penalty counts.
episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
  state = env.reset()
  epochs = 0
  penalties = 0
  reward = 0

  while True:
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    penalties += 1 if reward == -10 else 0
    epochs += 1

    if done:
      break

  total_epochs += epochs
  total_penalties += penalties


print(f"Results after {episodes} episodes: ")
print(f"Average timesteps per episode : {total_epochs / episodes}")
print(f"Average penalties per episode : {total_penalties / episodes}")

Results after 100 episodes: 
Average timesteps per episode : 12.73
Average penalties per episode : 0.0


In [0]:
# Displaying frames of the Q-Learning algo:

In [25]:
"""Evaluate agent's performance after Q-Learning"""

total_epochs , total_penalties = 0,0
episodes = 100

frames = [] # For animation

# Roll out the greedy policy and record every step so it can be replayed.
# The loop variable is named because its value is stored with each frame.
for episode in range(episodes):
  state = env.reset()
  epochs , penalties, reward = 0,0,0
  
  done = False
  
  while not done:
    action = np.argmax(q_table[state])
    state , reward, done, info = env.step(action)
    
    # A reward of -10 is counted as a penalty.
    if reward == -10:
      penalties += 1
      
    # Keep the rendered frame together with the transition and its episode index.
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
        'episode': episode
    })
    
    epochs += 1
    
  total_penalties += penalties
  total_epochs += epochs
  
  
print(f"Results after {episodes} episodes: ")
print(f"Average timesteps per episode : {total_epochs / episodes}")
print(f"Average penalties per episode : {total_penalties / episodes}")

Results after 100 episodes: 
Average timesteps per episode : 12.66
Average penalties per episode : 0.0


In [27]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded steps one at a time in the cell output.

    Args:
        frames: iterable of mappings with the keys 'frame' (an object
            exposing getvalue()), 'state', 'action' and 'reward'. An
            'episode' key is printed when present and skipped otherwise.
        delay: seconds to pause between steps; defaults to 0.1.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        # Frames recorded by the random-agent run carry no 'episode' key;
        # this definition replaces the earlier one, so it must accept them too.
        if 'episode' in frame:
            print(f"Episode: {frame['episode']}")

        sleep(delay)
        
print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 1266
State: 85
Action: 5
Reward: 20
Episode: 99
