In [None]:
%tensorflow_version 2.x


In [None]:
import gym
import numpy as np


def td_lambtha(env, V, policy, lambtha, episodes=5000, max_steps=100,
               alpha=0.1, gamma=0.99):
    """
    Performs the TD(λ) algorithm with accumulating eligibility traces

    parameters:
        env: the openAI environment instance; it must provide `desc` (the
            tile map), `observation_space.n`, `reset()` returning a state
            and `step(action)` returning a 4-tuple
        V [numpy.ndarray of shape(s,)]: contains the value estimate;
            a float64 ndarray is updated in place
        policy: function that takes in state & returns the next action to take
        lambtha [float]: the eligibility trace decay factor
        episodes [int]: total number of episodes to train over
        max_steps [int]: the maximum number of steps per episode
        alpha [float]: the learning rate
        gamma [float]: the discount rate

    returns:
        V: the updated value estimate (a new array)
    """
    n_states = env.observation_space.n
    # Flattened tile map, computed once instead of twice on every step.
    tiles = env.desc.reshape(n_states)
    # No copy is made for a float64 ndarray, so the caller's V keeps being
    # updated in place as before.
    values = np.asarray(V, dtype=np.float64)

    for _ in range(episodes):
        state = env.reset()
        # Traces only describe visits made during the current episode.
        traces = np.zeros(n_states)

        for _ in range(max_steps):
            # Decay every trace, then mark the state being visited.
            traces *= lambtha * gamma
            traces[state] += 1

            action = policy(state)
            next_state, reward, done, _ = env.step(action)

            # Override the environment reward on hole and goal tiles.
            if tiles[next_state] == b'H':
                reward = -1

            if tiles[next_state] == b'G':
                reward = 1

            delta_t = reward + gamma * values[next_state] - values[state]

            # Spread the TD error over all states in proportion to their
            # trace; updating only the current state would ignore lambtha.
            values += alpha * delta_t * traces

            if done:
                break
            state = next_state

    return np.array(values)

# Seed NumPy's global RNG, which policy() draws from.
np.random.seed(0)

# Integer action codes returned by policy() and passed to env.step().
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

env = gym.make('FrozenLake8x8-v0')

def policy(s):
    """
    Picks an action for state s on the 8x8 grid stored in env.desc

    A single uniform draw selects one of two preference orders:
        draw > 0.5: RIGHT, DOWN, UP, falling back to LEFT
        otherwise:  DOWN, RIGHT, LEFT, falling back to UP
    The first preferred move that stays inside the grid and does not land
    on a hole tile (b'H') is returned; the fallback is returned unchecked.
    """
    if np.random.uniform() > 0.5:
        preferred, fallback = (RIGHT, DOWN, UP), LEFT
    else:
        preferred, fallback = (DOWN, RIGHT, LEFT), UP

    row, col = divmod(s, 8)
    # action -> (does the move stay inside the grid?, destination cell)
    moves = {
        RIGHT: (col != 7, (row, col + 1)),
        DOWN: (row != 7, (row + 1, col)),
        UP: (row != 0, (row - 1, col)),
        LEFT: (col != 0, (row, col - 1)),
    }

    for action in preferred:
        inside, target = moves[action]
        # The tile is only looked up when the move stays on the grid.
        if inside and env.desc[target] != b'H':
            return action
    return fallback

# Initial value estimate: -1 on hole tiles, 1 on every other tile,
# flattened to one float entry per state.
V = np.where(env.desc == b'H', -1, 1)
V = V.reshape(64).astype('float64')

np.set_printoptions(precision=4)
# Run TD(λ) with lambtha = 0.9 and display the result as the 8x8 grid.
print(td_lambtha(env, V, policy, lambtha=0.9).reshape(8, 8))
