In [141]:
import gym

env = gym.make("Taxi-v2").env

print(type(env.render()
     ))

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+

<type 'NoneType'>


In [38]:
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


##  <font color='red'>R</font>,  <font color='Green'>G</font>, <font color='yellow'>Y</font>, <font color='blue'>B</font> are the possible pickup and destination locations.
The <font color='blue'>**blue letter**</font> represents the current passenger pick-up location, and the <font color='Purple'> **purple letter**</font> is the current destination.

In [44]:
state = env.encode(0, 0, 6, 0) # (taxi row, taxi column, passenger index(pick-up location), destination index)
print("State:", state)

env.s = state
env.render()

('State:', 24)
+---------+
|[35mR[0m:[43m [0m| : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



# <font color='orange'>**P**</font> 
## We can think of it like a matrix that has the number of states as rows and number of actions as columns, i.e.<font color='orange'> *a states × actions matrix*</font>.
## It is the reward table that is created whenever we create an environment
## For this simulation it has <font color='orange'>500(0-499)</font> Entries in *P* table
## FOr each entry we have <font color='orange'>6(0-5)</font> states from :-

* 0 = **south**
* 1 = **north**
* 2 = **east**
* 3 = **west**
* 4 = **pickup**
* 5 = **dropoff**


# Reward Table (<FONT COLOR='BLUE'>P</font>)

In [47]:
env.P[34]# to Access a particular state from Reward Table

{0: [(1.0, 134, -1, False)],
 1: [(1.0, 34, -1, False)],
 2: [(1.0, 34, -1, False)],
 3: [(1.0, 14, -1, False)],
 4: [(1.0, 34, -10, False)],
 5: [(1.0, 34, -10, False)]}

This **dictionary** has the structure<FONT COLOR='ORANGE'> **{action: [(probability, nextstate, reward, done)]}**</FONT>.
## * The 0-5 corresponds to the actions (south, north, east, west, pickup, dropoff) the taxi can perform at our current state in the illustration.
## * In this env,<FONT COLOR='BLUE'> probability</font> is always <FONT COLOR='red'>1.0</font>.
## * The <FONT COLOR='BLUE'>nextstate</font> is the state we would be in if we take the action at this index of the dict
## * All the movement actions have a <FONT COLOR='red'>-1</font> reward and the pickup/dropoff actions have <FONT COLOR='red'>-10</font> reward in this particular state. If we are in a state where the taxi has a passenger and is on top of the right destination, we would see a reward of <FONT COLOR='red'>20</font> at the dropoff action (5)
## * <FONT COLOR='BLUE'>done</font> is used to tell us when we have successfully dropped off a passenger in the right location. Each successfull dropoff is the end of an episode


In [55]:
# to see first five states from 500
import time

for i in range(5):
    env.s=i
    print("State:", env.s)
    env.render()
    time.sleep(5)

('State:', 0)
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

('State:', 1)
+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

('State:', 2)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

('State:', 3)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

('State:', 4)
+---------+
|[35m[43mR[0m[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [139]:
print(env.s)# current state in which an agent is from Total 500 states
env.step(5)# We can take any of the 6(0-5) actions Defined Above at any Given state from 500 states  

4


(4, -10, False, {'prob': 1.0})

 # <FONT COLOR='ORANGE'>Output</font> 
 **env.step( param )** function --> **state, reward, done, info** 

# Without Reinforcement Learning with <FONT COLOR='ORANGE'>BRUTE FORCE</font>

1. ## <FONT COLOR='ORANGE'>env.action_space.sample()</font> method automatically selects one random action from set of all possible actions.



In [160]:
env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    print(state,reward,done,info)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

(428, -1, False, {'prob': 1.0})
(448, -1, False, {'prob': 1.0})
(448, -1, False, {'prob': 1.0})
(448, -10, False, {'prob': 1.0})
(448, -10, False, {'prob': 1.0})
(348, -1, False, {'prob': 1.0})
(248, -1, False, {'prob': 1.0})
(248, -10, False, {'prob': 1.0})
(148, -1, False, {'prob': 1.0})
(128, -1, False, {'prob': 1.0})
(108, -1, False, {'prob': 1.0})
(108, -10, False, {'prob': 1.0})
(108, -1, False, {'prob': 1.0})
(108, -10, False, {'prob': 1.0})
(8, -1, False, {'prob': 1.0})
(8, -1, False, {'prob': 1.0})
(8, -10, False, {'prob': 1.0})
(8, -1, False, {'prob': 1.0})
(28, -1, False, {'prob': 1.0})
(28, -10, False, {'prob': 1.0})
(28, -10, False, {'prob': 1.0})
(28, -1, False, {'prob': 1.0})
(28, -10, False, {'prob': 1.0})
(28, -1, False, {'prob': 1.0})
(28, -1, False, {'prob': 1.0})
(28, -10, False, {'prob': 1.0})
(128, -1, False, {'prob': 1.0})
(128, -10, False, {'prob': 1.0})
(228, -1, False, {'prob': 1.0})
(248, -1, False, {'prob': 1.0})
(268, -1, False, {'prob': 1.0})
(368, -1, Fal

(316, -1, False, {'prob': 1.0})
(316, -10, False, {'prob': 1.0})
(416, -1, False, {'prob': 1.0})
(416, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(408, -10, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(408, -10, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(308, -10, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(408, -1, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(308, -10, False, {'prob': 1.0})
(308, -10, False, {'prob': 1.0})
(308, -1, False, {'prob': 1.0})
(308, -10, False, {'prob': 1.0})
(208, -1, False, {'prob': 1.0})
(228, -1, False, {'prob': 1.0})
(248, -1, False, {'prob': 1.0})
(348, -1, False, {'prob': 1.0})
(248, -1, False, {'prob': 1.0})
(268, -1, False, {'prob': 1.0})
(268, -10, False, {'prob': 1.0})
(168, -1, False, {'prob': 1.0})


In [157]:
print((frames['frame'].getvalue()))

TypeError: list indices must be integers, not str

In [153]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print("Timestep: { }".format(i+1))
        print("State: {}".format(frame['state']))
        print("Action: {}".format(frame['action']))
        print("Reward: {}".format(frame['reward']))
        sleep(.1)
        
print_frames(frames)

TypeError: list indices must be integers, not str