In [1]:
import numpy as np
import gym

In [2]:
def eps_greedy(Q, s, eps=0.1):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    One uniform sample from [0, 1) is drawn. If it is below ``eps`` a random
    action index in ``range(Q.shape[1])`` is returned (exploration); otherwise
    the choice is delegated to ``greedy(Q, s)`` (exploitation).

    Args:
        Q: 2-D array indexed as ``Q[state]`` whose second axis enumerates actions.
        s: Index of the current state.
        eps: Probability of taking a random action.

    Returns:
        The selected action index.
    """
    explore = np.random.uniform(0, 1) < eps
    if not explore:
        return greedy(Q, s)
    return np.random.randint(Q.shape[1])

# Greedy Policy
> Returns the action with the maximum action-value (Q-value) for the given state

In [3]:
def greedy(Q, s):
    """Return the index of the largest entry in ``Q[s]``.

    Args:
        Q: Table indexed as ``Q[state]``, giving the action values of that state.
        s: Index of the state to act in.

    Returns:
        The result of ``np.argmax`` over ``Q[s]`` (first index wins on ties).
    """
    action_values = Q[s]
    best_action = np.argmax(action_values)
    return best_action

# Policy Testing


In [4]:
def run_episodes(env, Q, num_episodes=100, to_print=False):
    """Play ``num_episodes`` games with the greedy policy from ``Q`` and return the mean reward.

    The environment is reset at the start of every episode and stepped with
    ``greedy(Q, state)`` until it reports ``done``.

    Args:
        env: Environment following the classic Gym interface used in this
            notebook: ``reset()`` returns the initial state and ``step(action)``
            returns ``(next_state, reward, done, info)``.
        Q: Action-value table indexed as ``Q[state]``.
        num_episodes: Number of episodes to play.
        to_print: When true, print the mean score and the number of games.

    Returns:
        ``np.mean`` of the per-episode total rewards (``nan`` when
        ``num_episodes`` is 0, because the reward list is then empty).
    """
    tot_rew = []  # total reward of each finished episode

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        game_rew = 0

        while not done:
            state, rew, done, _ = env.step(greedy(Q, state))
            game_rew += rew

        tot_rew.append(game_rew)

    mean_rew = np.mean(tot_rew)

    if to_print:
        # '%i' formats the game count; the previous '%1 ' was not a valid
        # conversion and raised ValueError whenever to_print was enabled.
        print('Mean score: %.3f of %i games!' % (mean_rew, num_episodes))

    return mean_rew

# **SARSA**
* initialize Q Matrix
* Decay The Epsilon Until It Reaches The Threshold
* Choose Next Action
* SARSA Update
* Testing The Policy







In [5]:
def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
          eval_every=300, eval_episodes=1000, eps_min=0.01):
    """Learn a tabular action-value function with on-policy SARSA.

    Each episode first decays ``eps`` linearly, then follows an epsilon-greedy
    policy, updating ``Q[state, action]`` towards
    ``rew + gamma * Q[next_state, next_action]`` after every step. Every
    ``eval_every`` episodes the greedy policy is scored with ``run_episodes``
    and a progress line is printed.

    Args:
        env: Environment following the classic Gym interface used in this
            notebook (``reset()`` returns the state, ``step()`` returns a
            4-tuple) with discrete ``action_space.n`` and
            ``observation_space.n``.
        lr: Learning rate applied to the temporal-difference error.
        num_episodes: Number of training episodes.
        eps: Initial exploration probability.
        gamma: Discount factor.
        eps_decay: Amount subtracted from ``eps`` per episode while it is
            above ``eps_min``.
        eval_every: Evaluate when ``ep % eval_every == 0``; a falsy value
            disables evaluation and printing.
        eval_episodes: Number of episodes passed to ``run_episodes``.
        eps_min: Floor for the decayed ``eps``. An ``eps`` that already starts
            at or below the floor is left untouched.

    Returns:
        The learned Q table, an array of shape ``(nS, nA)``.
    """
    nA = env.action_space.n
    nS = env.observation_space.n

    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Linear decay, clamped so a decay step cannot overshoot below the floor.
        if eps > eps_min:
            eps = max(eps - eps_decay, eps_min)

        action = eps_greedy(Q, state, eps)

        while not done:
            next_state, rew, done, _ = env.step(action)

            # On-policy: the action used in the target is the one executed next.
            next_action = eps_greedy(Q, next_state, eps)

            # SARSA temporal-difference update. The target bootstraps from
            # Q[next_state, next_action] on every transition, including the last.
            td_target = rew + gamma * Q[next_state, next_action]
            Q[state, action] += lr * (td_target - Q[state, action])

            state, action = next_state, next_action

        if eval_every and ep % eval_every == 0:
            test_rew = run_episodes(env, Q, eval_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
    return Q

SARSA - TAXI V3 DATA

In [6]:
if __name__ == '__main__':
    # Train a SARSA agent on Taxi-v3 and keep the learned table in Q_sarsa.
    env = gym.make('Taxi-v3')
    print("SARSA")
    Q_sarsa = SARSA(
        env,
        num_episodes=5000,
        lr=0.1,
        gamma=0.95,
        eps=0.4,
        eps_decay=0.001,
    )

  deprecation(
  deprecation(


SARSA
Episode:    0  Eps:0.3990  Rew:-205.4000
Episode:  300  Eps:0.0990  Rew:-236.9230
Episode:  600  Eps:0.0100  Rew:-201.4490
Episode:  900  Eps:0.0100  Rew:-165.9270
Episode: 1200  Eps:0.0100  Rew:-72.9890
Episode: 1500  Eps:0.0100  Rew:-42.7770
Episode: 1800  Eps:0.0100  Rew:-62.2390
Episode: 2100  Eps:0.0100  Rew:-15.4300
Episode: 2400  Eps:0.0100  Rew:-10.6140
Episode: 2700  Eps:0.0100  Rew:-1.8770
Episode: 3000  Eps:0.0100  Rew:4.6940
Episode: 3300  Eps:0.0100  Rew:5.7650
Episode: 3600  Eps:0.0100  Rew:6.9760
Episode: 3900  Eps:0.0100  Rew:6.3130
Episode: 4200  Eps:0.0100  Rew:6.3490
Episode: 4500  Eps:0.0100  Rew:7.4890
Episode: 4800  Eps:0.0100  Rew:7.9000


**Q-LEARNING**

 * initialize Q Matrix
 * Decay The Epsilon Until It Reaches The Threshold
 * Choose Next Action
 * Q-Learning Update
 * Testing The Policy



In [7]:
def Q_Learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
               eval_every=300, eval_episodes=1000, eps_min=0.01):
    """Learn a tabular action-value function with off-policy Q-learning.

    Each episode first decays ``eps`` linearly, then acts epsilon-greedily,
    updating ``Q[state, action]`` towards
    ``rew + gamma * max(Q[next_state])`` after every step. Every
    ``eval_every`` episodes the greedy policy is scored with ``run_episodes``
    and a progress line is printed.

    Args:
        env: Environment following the classic Gym interface used in this
            notebook (``reset()`` returns the state, ``step()`` returns a
            4-tuple) with discrete ``action_space.n`` and
            ``observation_space.n``.
        lr: Learning rate applied to the temporal-difference error.
        num_episodes: Number of training episodes.
        eps: Initial exploration probability.
        gamma: Discount factor.
        eps_decay: Amount subtracted from ``eps`` per episode while it is
            above ``eps_min``.
        eval_every: Evaluate when ``ep % eval_every == 0``; a falsy value
            disables evaluation and printing.
        eval_episodes: Number of episodes passed to ``run_episodes``.
        eps_min: Floor for the decayed ``eps``. An ``eps`` that already starts
            at or below the floor is left untouched.

    Returns:
        The learned Q table, an array of shape ``(nS, nA)``.
    """
    nA = env.action_space.n
    nS = env.observation_space.n

    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Linear decay, clamped so a decay step cannot overshoot below the floor.
        if eps > eps_min:
            eps = max(eps - eps_decay, eps_min)

        while not done:
            action = eps_greedy(Q, state, eps)
            next_state, rew, done, _ = env.step(action)

            # Q-learning temporal-difference update: the target uses the best
            # action value in next_state, regardless of the action taken next.
            td_target = rew + gamma * np.max(Q[next_state])
            Q[state, action] += lr * (td_target - Q[state, action])

            state = next_state

        if eval_every and ep % eval_every == 0:
            test_rew = run_episodes(env, Q, eval_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
    return Q

In [8]:
if __name__ == '__main__':
    # Train a Q-learning agent on Taxi-v3.
    env = gym.make('Taxi-v3')
    print("Q_Learning")
    # Store the learned table under its own name. Assigning it to `Q_Learning`
    # would overwrite the training function with an array, so re-running this
    # cell would fail with "'numpy.ndarray' object is not callable".
    Q_qlearning = Q_Learning(env, lr=0.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

Q_Learning
Episode:    0  Eps:0.3990  Rew:-275.2310
Episode:  300  Eps:0.0990  Rew:-209.6020
Episode:  600  Eps:0.0100  Rew:-203.9030
Episode:  900  Eps:0.0100  Rew:-186.9950
Episode: 1200  Eps:0.0100  Rew:-109.0820
Episode: 1500  Eps:0.0100  Rew:-94.1440
Episode: 1800  Eps:0.0100  Rew:-29.0390
Episode: 2100  Eps:0.0100  Rew:-26.1290
Episode: 2400  Eps:0.0100  Rew:-4.1200
Episode: 2700  Eps:0.0100  Rew:-0.5320
Episode: 3000  Eps:0.0100  Rew:6.6530
Episode: 3300  Eps:0.0100  Rew:-3.9140
Episode: 3600  Eps:0.0100  Rew:3.7260
Episode: 3900  Eps:0.0100  Rew:5.8690
Episode: 4200  Eps:0.0100  Rew:-2.1160
Episode: 4500  Eps:0.0100  Rew:7.9380
Episode: 4800  Eps:0.0100  Rew:7.7880
