# RL Practice
Learning how to implement an RL algorithm

### *The taxi problem using OpenAI's Gym:*

In [6]:
import gym
env = gym.make("Taxi-v3").env
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+



In [7]:
# Rewards table
print('Length: ', len(env.P), ' = number of possible states (25 spaces on board x 4 destinations x (4+1) passenger locations)')
#                                                                                                   ^ bc R,G,Y,B or in taxi
print('\nFirst 3 entries:')
[env.P[i] for i in range(3)]

# Structure of each entry: {action: [(probability, nextstate, reward, done)]}
#    6 possible actions: move N/S/W/E, pick up, drop off
#    done = whether a passenger was successfully dropped off in the right location

Length:  500  = number of possible states (25 spaces on board x 4 destinations x (4+1) passenger locations)

First 3 entries:


[{0: [(1.0, 100, -1, False)],
  1: [(1.0, 0, -1, False)],
  2: [(1.0, 20, -1, False)],
  3: [(1.0, 0, -1, False)],
  4: [(1.0, 16, -1, False)],
  5: [(1.0, 0, -10, False)]},
 {0: [(1.0, 101, -1, False)],
  1: [(1.0, 1, -1, False)],
  2: [(1.0, 21, -1, False)],
  3: [(1.0, 1, -1, False)],
  4: [(1.0, 17, -1, False)],
  5: [(1.0, 1, -10, False)]},
 {0: [(1.0, 102, -1, False)],
  1: [(1.0, 2, -1, False)],
  2: [(1.0, 22, -1, False)],
  3: [(1.0, 2, -1, False)],
  4: [(1.0, 18, -1, False)],
  5: [(1.0, 2, -10, False)]}]

#### Q-Learning model:

In [8]:
import numpy as np
import random
from IPython.display import clear_output
from time import sleep

In [9]:
# 2D array that initializes all rewards to 0
q_table = np.zeros([env.observation_space.n, env.action_space.n])

alpha = .1
gamma = .6
epsilon = .1

all_epochs = []
all_penalties = []

In [10]:
# the RL (Q-Learning) model
# animation:
#    - R,G,Y,B = destinations (blue = passenger for pickup, purple = destination for dropoff)
#    - yellow block = empty taxi, green block = full taxi
#    - pauses after each episode (which is 1 successful pickup and dropoff)
for i in range(1,1001):
    state = env.reset()
    
    epochs, penalties, reward = 0,0,0
    done = False
    frames = []
    
    while not done:
        if random.uniform(0,1) < epsilon:
            # Exploration (try some random action)
            action = env.action_space.sample()
        else:
            # Exploitation (use best known action, or "exploit" current knowledge)
            # Note: indexing the q_table just by the state gives all possible actions for that state,
            #   and then argmax gives the action with the highest reward
            action = np.argmax(q_table[state])
            
        # Take the action
        next_state, reward, done, info = env.step(action)
        
        old_value = q_table[state,action]
        next_max = np.max(q_table[next_state])
        
        # Update Q-value
        q_table[state,action] = (1-alpha)*old_value + alpha*(reward + gamma*next_max)
        
        if reward == -10:
            penalties += 1
            
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward,
        })
            
        state = next_state
        epochs += 1
        
    all_penalties.append(penalties)
    if i % 100 == 0:
        sleep(1 if i != 100 else 0)
        clear_output(wait=True)
        print(f'Episode: {i}')
        sleep(1)
        
        for i, frame in enumerate(frames):
            clear_output(wait=True)
            print(frame['frame'])
            print(f"Timestep: {i + 1}")
            print(f"State: {frame['state']}")
            print(f"Action: {frame['action']}")
            print(f"Reward: {frame['reward']}")
            sleep(.1)


sleep(1)
clear_output(wait=True)
print('Training finished.')

import matplotlib.pyplot as plt
plt.plot(all_penalties, color='green')
plt.title('Penalties per Episode')
plt.ylabel('Number of Penalties')
plt.xlabel('Episode')
plt.show()

Training finished.


ModuleNotFoundError: No module named 'matplotlib'

#### DQN Model

In [20]:
from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy

env2 = gym.make("Taxi-v3")
model = DQN('MlpPolicy', env2, gamma=.6, learning_rate=.1, exploration_initial_eps=.05, 
            exploration_final_eps=.15, verbose=1)

mean_reward_before_train, _ = evaluate_policy(model, env2, n_eval_episodes=1000)
mean_reward_before_train

ImportError: cannot import name 'PPO2' from 'stable_baselines' (unknown location)

In [None]:
model.learn(total_timesteps=10000, log_interval=10)
mean_reward, _ = evaluate_policy(model, env2, n_eval_episodes=1000)
mean_reward