# Setup

In [1]:
import gym
from gym.envs.registration import register
from IPython.display import clear_output
!pip install --upgrade pygame
import pygame



In [16]:
env_name = "MountainCar-v0"
env = gym.make(env_name)

In [17]:
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
type(env.action_space)

Observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Action space: Discrete(3)


gym.spaces.discrete.Discrete

In [None]:
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

# True Online Sarsa(λ)

In [6]:
import numpy as np

In [7]:
#using a modified cosine fourier basis. x: position, v: velocity, d: dimension of the features
def get_features(x, v, d):
    if x>=0.5:
        return np.zeros(d)
    
    M = (d-1)//2
    state_features = [1]
    scaled_x = (((x - (-1.2))/(0.6 - (-1.2)))*(1-(-1))) + (-1)
    scaled_v = (((v - (-0.07))/(0.07 - (-0.07)))*(1-(-1))) + (-1)

    for i in range(1, M+1):
        state_features.append(np.sin(i*np.pi*scaled_x))
    for i in range(1, M+1):
        state_features.append(np.sin(i*np.pi*scaled_v))

    return np.array(state_features)

In [27]:
def algorithm(alpha=0.4, gamma=1, λ=0.4, iterations=100, ep=0.2, bins=np.array([15, 5])):
    b_x = bins[0]
    b_v = bins[1]
    q = np.zeros((b_x, b_v, 3)) # bins for position * bins for velocity * number of actions
    observation = env.reset()
    for i in range(1, iterations+1):
        if ep>0.05:
            ep-=0.01
        x = observation[0]
        v = observation[1]
        scaled_x = int(((((x - (-1.2))/(0.5 - (-1.2)))*(1-(0))) + (0))//(1/b_x))
        scaled_v = int(((((v - (-0.7))/(0.7 - (-0.7)))*(1-(0))) + (0))//(1/b_v))
        if np.random.rand()<=ep:
            action = int(np.random.rand()//(1/3))
        else:
            action = np.argmax(q[scaled_x][scaled_v])
        Q_old = 0
        e = np.zeros((b_x, b_v, 3))
        steps = 0
        #episode
        done = False
        total_reward = 0
        while(not done):
            env.render()
            observation, reward, done, info = env.step(action)
            x_next, v_next = observation
            scaled_x_next = int(((((x_next - (-1.2))/(0.6 - (-1.2)))*(1-(0))) + (0))//(1/b_x))
            scaled_v_next = int(((((v_next - (-0.7))/(0.7 - (-0.7)))*(1-(0))) + (0))//(1/b_v))
            next_action = None
            if np.random.rand()<=ep:
                next_action = int(np.random.rand()//(1/3))
            else:
                next_action = np.argmax(q[scaled_x_next][scaled_v_next])
            delta_Q = q[scaled_x][scaled_v][action]-Q_old
            Q_old = q[scaled_x_next][scaled_v_next][next_action]
            delta = reward + gamma*q[scaled_x_next][scaled_v_next][next_action]-q[scaled_x][scaled_v][action]
            e[scaled_x][scaled_v][action] = (1-alpha)*e[scaled_x][scaled_v][action] + 1
            q += alpha*(delta+delta_Q)*e
            e = gamma*λ*e
            q[scaled_x][scaled_v][action] = q[scaled_x][scaled_v][action] - alpha*delta_Q
            action = next_action
            x = x_next
            v = v_next
            scaled_x = scaled_x_next
            scaled_v = scaled_v_next
            steps+=1
            total_reward += reward
        
        print(i, total_reward)

    return q

In [28]:
algorithm(alpha=0.4, gamma=1, λ=0.4, iterations=4, ep=0.4, bins=np.array([15, 5]))

1 -200.0
2 -1.0
3 -1.0
4 -1.0


array([[[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
      