

# Basic reinforcement learning example, openAI Gym environment

## Goal : 
The goal is to be familiarized with the #openAI Gym

In [0]:
!pip install gym



In [0]:
import gym
from time import sleep
from IPython.display import clear_output

# Creating thr env
env = gym.make("Taxi-v2").env

env.s = 328


# Setting the number of iterations, penalties and reward to zero,
epochs = 0
penalties, reward = 0, 0

frames = []

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1

    # Put each rendered frame into the dictionary for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
    }
    )

    epochs += 1

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

# Printing all the possible actions, states, rewards.
def renderFrames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'].getvalue() )
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
renderFrames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 256
State: 0
Action: 5
Reward: 20


# Q-learning solution

In [0]:
import gym
import numpy as np
import random
from IPython.display import clear_output

# Init Taxi-V2 Env
env = gym.make("Taxi-v2").env

# Init arbitary values
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    #initial 0.6 discount factor
epsilon = 0.1


all_epochs = []
all_penalties = []
all_rewards = []

for i in range(1, 100001):
    state = env.reset()

    # Init Vars
    epochs, penalties, reward, = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            # Check the action space
            action = env.action_space.sample()
        else:
            # Check the learned values
            action = np.argmax(q_table[state])  # Utilisation des valeurs déjà apprise

        next_state, reward, done, info = env.step(action)
        print()
        
        all_rewards.append(reward) # garde les rewards

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Update the new value
        new_value = (1 - alpha) * old_value + alpha * \
            (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        print(f" Episode: { i }")
print("Penalties ", all_penalties)
print("Rewards ", all_penalties)

print("Training finished.")

# https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/ pour le rapport
# SARSA : https://blog.goodaudience.com/attempting-open-ais-taxi-v2-using-the-sarsa-max-algorithm-70a4de8c8c9c 

 Episode: 100000
Penalties  []
Rewards  []
Training finished.


# Conclusion
Plus gamma(discount factor ) est élévé il va cherché à considerer un gros grand reward. Learning rate (alpha)

In [0]:
print(q_table)

[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.27325181  -2.12208639  -2.27325183  -2.12208639  -1.870144
  -11.12208579]
 [ -1.87014345  -1.45024005  -1.87014398  -1.45024006  -0.7504
  -10.45023967]
 ...
 [ -0.95844332   0.416       -0.91357461  -1.19075874  -6.39694391
   -6.07935937]
 [ -2.15942823  -2.12204103  -2.14648392  -2.12203985  -7.17923381
   -7.15523852]
 [  2.17751996   1.3697416    3.76474205  11.          -2.60602997
   -2.75102621]]


# Using the trained Q_table to test 

In [0]:

test_episodes = 100000
env.reset()    # réinitialisation de l'environnement
rewards =[]   

for e in range(test_episodes):
    done = False
    state = env.reset()
    total_reward = 0
    while not done :
        #env.render()
        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)
        state = new_state
        total_reward += reward
        if done:
            rewards.append(total_reward)
            break
env.close()    
print('the score is: {}'.format(sum(rewards)/test_episodes))



the score is: 8.46569


# SARSA learning algorithm

In [0]:
env = gym.make("Taxi-v2").env

env.s = 328

# Init arbitary values
q_table = np.zeros([env.observation_space.n, env.action_space.n])

#hyperparameters
alpha = 0.85
gamma = 0.90
epsilon = 0.8



In [0]:
rewards=[]

for i in range(30000):
  # we store cumulative reward of each episodes in r
  r = 0
  # initialize the state,
  state = env.reset()
  # select the action 
  if random.uniform(0,1) < epsilon :
    action = env.action_space.sample()
  else:
    action = np.argmax(q_table[state])
      
  done = False
  while not done:
       
    # then we perform the action and move to the next state, and receive the reward
    nextstate, reward, done, info = env.step(action)
        
    # again, we select the next action 
    if random.uniform(0,1) < epsilon :
      nextaction = env.action_space.sample()
    else:
      nextaction = np.argmax(q_table[state])
    
    # we calculate the Q value of previous state using our update rule
    q_table[(state,action)] += alpha * (reward + gamma * q_table[(nextstate,nextaction)]-q_table[(state,action)])

    # finally we update our state and action with next action and next state
    action = nextaction
    state = nextstate
        
    # store the rewards
    r+=reward
     
    if done:
      rewards.append(r)
  #print(rewards[i])
        
env.close()

In [0]:
print(q_table)

[[  0.           0.           0.           0.           0.
    0.        ]
 [-33.16102325 -28.42239026 -29.59155876 -23.72550622 -31.40559313
  -33.96014659]
 [-33.44367501 -29.82959485 -29.42912475 -26.06650958 -26.23155519
  -31.26726039]
 ...
 [-22.91617867 -30.88381015 -25.61548424 -28.19666113 -30.6474348
  -32.26941425]
 [-22.12638081 -33.84549716 -29.5881619  -33.5094775  -31.90831879
  -35.62921668]
 [-21.36829045 -22.16514535 -28.95468587 -11.62899783  -6.97042712
  -23.00722792]]


In [0]:
test_episodes = 10000
env.reset()    # réinitialisation de l'environnement
rewards =[]   

done = False

for e in range(test_episodes):
    done = False
    state = env.reset()
    total_reward = 0
    while not done :
        #env.render()
        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)
        state = new_state
        total_reward += reward
        if done:
            rewards.append(total_reward)
            break
env.close()    
print('the score is: {}'.format(sum(rewards)/test_episodes))