In [0]:
import gym
import numpy as np

### Loading the environment

In [0]:
# Create the 'Taxi-v2' environment with gym.make
env = gym.make('Taxi-v2')

In [0]:
# Show the action space first, then the observation space
for space in (env.action_space, env.observation_space):
    print(space)

Discrete(6)
Discrete(500)


### Agent

In [2]:
from collections import defaultdict
import sys

class Agent:
    '''
        Tabular Q-learning agent.

        The agent keeps one row of action values per visited state in a
        defaultdict (so states only need to be hashable) and explores with a
        decaying epsilon-greedy behaviour policy.

        env       : environment exposing action_space.n, reset() and
                    step(action) -> (state, reward, done, info)
        alpha     : learning rate of the Q-value update
        gamma     : discount factor
        eps_start : epsilon before the first episode
        eps_decay : multiplicative decay applied to epsilon once per episode
        eps_min   : lower bound for epsilon, keeps some exploration alive
    '''

    def __init__(self, env, alpha, gamma=1.0, eps_start=1.0, eps_decay=0.9999, eps_min=0.05):
        self.env = env
        self.eps_start = eps_start
        self.gamma = gamma
        self.alpha = alpha
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        # the action space lives on the environment, not on the agent itself
        self.nA = self.env.action_space.n
        # start with an empty Q-table / all-zero policy so that take_action()
        # and save() are usable even before interact() has been called
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.policy = defaultdict(lambda: 0)

    def action_from_eps_greedy_policy(self, state, Q, epsilon, nA):
        '''
            Chooses action with max Q-value with a probability of 1-epsilon (Exploitation)
            or any other action with probability = epsilon/(nA-1) for each.
            
            sum of probabilities = (nA-1) * epsilon/(nA-1) + 1*(1-epsilon) = 1
        '''
        max_ = np.argmax(Q[state])
        prob = self.get_prob(nA, epsilon, max_)
        action = np.random.choice(np.arange(nA), p=prob)
        return action

    def get_prob(self, nA, epsilon, max_):
        '''
            returns probability distribution over the nA actions
            with max index probability = 1-epsilon
            and every other index set to probability = epsilon/(nA-1)
        '''
        if nA == 1:
            # a single action leaves nothing to explore (and would divide by zero below)
            return np.ones(1)
        prob = np.ones(nA)*epsilon/(nA-1)
        prob[max_] = 1 - epsilon
        return prob
    

    def interact(self, num_episodes):
        '''
            interact with the environment for num_episodes episodes and learn

            Starts from a fresh all-zero Q-table on every call, then stores the
            learned table in self.Q and the greedy policy in self.policy.
        '''
        Q = defaultdict( lambda: np.zeros(self.nA))
        epsilon = self.eps_start
        eps_decay = self.eps_decay  #decay rate of epsilon
        eps_min = self.eps_min       #min epsilon value, not allowing to go very low, to maintain exploration
        # loop over episodes
        for i_episode in range(1, num_episodes+1):
            # monitor progress
            if i_episode % 100 == 0:
                print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
                sys.stdout.flush() 
            #calculating epsilon
            epsilon = max(epsilon*eps_decay, eps_min)
            #observing state s0 and taking action
            state_prev = self.env.reset()
            action_prev = self.action_from_eps_greedy_policy(state_prev, Q, epsilon, self.nA)
            #loop over Q-learning steps
            while True:
                state, reward, done, info = self.env.step(action_prev)
                # Q-learning (off-policy) target: bootstrap from the best action value
                # of the next state, whatever action is actually taken next
                Q[state_prev][action_prev] += self.alpha*(reward + self.gamma*np.max(Q[state]) - Q[state_prev][action_prev])
                if done:
                    break
                #update state and action
                state_prev = state
                action_prev = self.action_from_eps_greedy_policy(state, Q, epsilon, self.nA)

        self.Q = Q
        self.policy = self.get_Policy(Q)

    def get_Policy(self, Q):
        '''
            returns the greedy policy (argmax action per state) for the Q-table;
            states missing from Q map to action 0
        '''
        policy = defaultdict(lambda: 0)
        for state, action in Q.items():
            policy[state] = np.argmax(action)
        return policy
    
    def take_action(self,state):
        '''
            take action as per policy
        '''
        return self.policy[state]

    def save(self, path='Q.csv'):
        '''
            best-effort dump of the Q-table to a csv file (one column per state,
            one row per action); failures are reported on stderr, never raised
        '''
        try:
            # imported here because the notebook does not import pandas at the top
            import pandas as pd
            data = pd.DataFrame(self.Q)
            data.to_csv(path)
        except Exception as exc:
            # keep saving optional, but do not hide why it failed
            print("Could not save Q-table to {}: {!r}".format(path, exc), file=sys.stderr)

### Training the Agent

In [7]:
# Build the Taxi environment the agent will learn in
env = gym.make('Taxi-v2')

# Agent with learning rate 0.1; every other hyper-parameter keeps its default
agent = Agent(env=env, alpha=0.1)

# Learn from 20000 episodes of interaction
agent.interact(num_episodes=20000)

# Roll out a single episode with the learned policy,
# drawing the current frame before every move
state = env.reset()
done = False
while not done:
    env.render()
    action = agent.take_action(state)
    state, reward, done, info = env.step(action)

# Uncomment to persist the Q-table for later use
#agent.save()
env.close()

Episode 20000/20000+---------+
|[34;1mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

+---------+
|[34;1mR[0m: | : :G|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m_[0m: : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : :

Since GitHub doesn't render the agent's actions properly, you can visit [NbViewer](https://nbviewer.jupyter.org/github/Sachinkumar04428/OpenAI-Gym-Solutions/blob/master/Toy%20text%20Easy/Taxi_v2.ipynb) to view them correctly.

## Testing the agent

In [9]:
# Average the total reward collected by the learned policy over a batch of episodes
num_episodes = 100
total_reward = 0
for i_episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # rendering is left out here; call env.render() to watch the episodes
        action = agent.take_action(state)             # action prescribed by the policy
        state, reward, done, info = env.step(action)  # apply it, observe next state and reward
        total_reward += reward

print(f'Average reward over {num_episodes} episodes=',total_reward/num_episodes)

Average reward over 100 episodes= 8.45


The top average scores on the [Leaderboard](https://github.com/openai/gym/wiki/Leaderboard) are from 9.23 to 9.716.