# Mountain Car

In [1]:
import gym
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
env = gym.make('MountainCar-v0')
env.reset()

array([-0.5301305,  0.       ], dtype=float32)

In [3]:
print('Action Space:', env.action_space)
print('Observation Space:', env.observation_space, '{')
print('\t Car position: min', env.observation_space.low[0], ' - max', env.observation_space.high[0] )
print('\t Car Velocity: min', env.observation_space.low[1], '- max', env.observation_space.high[1], '\n}')

Action Space: Discrete(3)
Observation Space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32) {
	 Car position: min -1.2  - max 0.6
	 Car Velocity: min -0.07 - max 0.07 
}


In [4]:
# Define Q-learning function
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    # Determine size of discretized state space
    num_states = (env.observation_space.high - env.observation_space.low)*\
                    np.array([10, 100])
    num_states = np.round(num_states, 0).astype(int) + 1
    
    # Initialize Q table
    Q = np.random.uniform(low = -1, high = 1, 
                          size = (num_states[0], num_states[1], 
                                  env.action_space.n))
    
    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []
    
    # Calculate episodic reduction in epsilon
    reduction = (epsilon - min_eps)/episodes
    
    # Run Q learning algorithm
    for i in range(episodes):
        # Initialize parameters
        done = False
        tot_reward, reward = 0,0
        state = env.reset()
        
        # Discretize state
        state_adj = (state - env.observation_space.low)*np.array([10, 100])
        state_adj = np.round(state_adj, 0).astype(int)
    
        while done != True:   
            # Render environment for last five episodes
            if i >= (episodes - 20):
                env.render()
                
            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]]) 
            else:
                action = np.random.randint(0, env.action_space.n)
                
            # Get next state and reward
            state2, reward, done, info = env.step(action) 
            
            # Discretize state2
            state2_adj = (state2 - env.observation_space.low)*np.array([10, 100])
            state2_adj = np.round(state2_adj, 0).astype(int)
            
            #Allow for terminal states
            if done and state2[0] >= 0.5:
                Q[state_adj[0], state_adj[1], action] = reward
                
            # Adjust Q value for current state
            else:
                delta = learning*(reward + 
                                 discount*np.max(Q[state2_adj[0], 
                                                   state2_adj[1]]) - 
                                 Q[state_adj[0], state_adj[1],action])
                Q[state_adj[0], state_adj[1],action] += delta
                                     
            # Update variables
            tot_reward += reward
            state_adj = state2_adj
        
        # Decay epsilon
        if epsilon > min_eps:
            epsilon -= reduction
        
        # Track rewards
        reward_list.append(tot_reward)
        
        if (i+1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            
        if (i+1) % 100 == 0:    
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))
            
    env.close()
    
    return ave_reward_list

# Run Q-learning algorithm
rewards = QLearning(env, 0.2, 0.9, 0.8, 0, 5000)

# Plot Rewards
plt.plot(100*(np.arange(len(rewards)) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')     
plt.close()  

Episode 100 Average Reward: -200.0
Episode 200 Average Reward: -200.0
Episode 300 Average Reward: -200.0
Episode 400 Average Reward: -200.0
Episode 500 Average Reward: -200.0
Episode 600 Average Reward: -200.0
Episode 700 Average Reward: -200.0
Episode 800 Average Reward: -200.0
Episode 900 Average Reward: -200.0
Episode 1000 Average Reward: -200.0
Episode 1100 Average Reward: -200.0
Episode 1200 Average Reward: -200.0
Episode 1300 Average Reward: -200.0
Episode 1400 Average Reward: -200.0
Episode 1500 Average Reward: -200.0
Episode 1600 Average Reward: -200.0
Episode 1700 Average Reward: -200.0
Episode 1800 Average Reward: -200.0
Episode 1900 Average Reward: -200.0
Episode 2000 Average Reward: -200.0
Episode 2100 Average Reward: -200.0
Episode 2200 Average Reward: -200.0
Episode 2300 Average Reward: -200.0
Episode 2400 Average Reward: -200.0
Episode 2500 Average Reward: -200.0
Episode 2600 Average Reward: -199.99
Episode 2700 Average Reward: -200.0
Episode 2800 Average Reward: -200.0


In [16]:
n_action = env.action_space.n
env_low = env.observation_space.low
env_high = env.observation_space.high
bins = 30 # number of states for discretization
def getState(state, env_low = env_low, env_high = env_high, bins = bins):
    """Returns the discretized position and velocity of an observation"""
    discretized_env = (env_high - env_low) / bins
    discretized_pos = int((state[0] - env_low[0]) / discretized_env[0])
    discretized_vel = int((state[1] - env_low[1]) / discretized_env[1])
    return discretized_pos, discretized_vel
n_eps = 100001 # number of episodes
alpha = 0.1 # learning rate
gamma = 0.9 # discount factor
epsilon = 0.2 # explore-exploit tradeoff factor
interval = 10000

In [22]:
def chooseAction(pos, vel, q_table, epsilon = epsilon):
    """Choose action based on an epsilon greedy strategy"""
    if random.random() < epsilon: # explore
        action = env.action_space.sample()
    else: # exploit
        action = np.argmax(q_table[pos][vel])
    return action

In [25]:
# establish q-table
q_table_sarsa = np.zeros((bins, bins, env.action_space.n))
# store reward
rewards_sarsa = []
# training
for ep in range(n_eps):
    
    current_reward = 0
    done = False
    state = env.reset()
    # discretize the state
    pos, vel = getState(state)
    # choose first action
    action = chooseAction(pos, vel, q_table_sarsa)
while not done:
        
        # render for the last 10 episodes
    if ep >= (n_eps - 10): 
        env.render()
            
        # next state
    next_state, reward, done, info = env.step(action)
# discretize the state
    next_pos, next_vel = getState(next_state)
        # next action
    next_action = chooseAction(next_pos, next_vel, q_table_sarsa)
    if done and next_state[0] >= env.goal_position:
        q_table_q[next_pos][next_vel][action] = reward
        
    else:
        # update Q value: Q(S, A) <-- Q(S, A) + alpha [R + gamma * Q(S', A') - Q(S, A)]
        q_table_sarsa[pos][vel][action] += \
        alpha * (reward + gamma * q_table_sarsa[next_pos][next_vel][next_action] - q_table_sarsa[pos][vel][action])
        
        # reassign state, action, reward
    state = next_state
    pos, vel = next_pos, next_vel
    action = next_action
    current_reward += reward
    
    # update epsilon
    if epsilon > 0:
        epsilon*= (n_eps - 1)/n_eps
    if ep % interval == 0:
        print('Game no.: ', ep, 'epsilon: ', epsilon, 'with reward: ', current_reward)
        rewards_sarsa.append(current_reward)
env.close()

Game no.:  100000 epsilon:  0.19919960985670956 with reward:  -1.0
Game no.:  100000 epsilon:  0.19919761788053078 with reward:  -2.0
Game no.:  100000 epsilon:  0.19919562592427154 with reward:  -3.0
Game no.:  100000 epsilon:  0.19919363398793166 with reward:  -4.0
Game no.:  100000 epsilon:  0.19919164207151097 with reward:  -5.0
Game no.:  100000 epsilon:  0.19918965017500923 with reward:  -6.0
Game no.:  100000 epsilon:  0.19918765829842625 with reward:  -7.0
Game no.:  100000 epsilon:  0.19918566644176183 with reward:  -8.0
Game no.:  100000 epsilon:  0.1991836746050158 with reward:  -9.0
Game no.:  100000 epsilon:  0.19918168278818793 with reward:  -10.0
Game no.:  100000 epsilon:  0.19917969099127802 with reward:  -11.0
Game no.:  100000 epsilon:  0.19917769921428588 with reward:  -12.0
Game no.:  100000 epsilon:  0.19917570745721133 with reward:  -13.0
Game no.:  100000 epsilon:  0.19917371572005413 with reward:  -14.0
Game no.:  100000 epsilon:  0.1991717240028141 with reward

Game no.:  100000 epsilon:  0.1989268946073818 with reward:  -138.0
Game no.:  100000 epsilon:  0.19892490535832824 with reward:  -139.0
Game no.:  100000 epsilon:  0.19892291612916696 with reward:  -140.0
Game no.:  100000 epsilon:  0.19892092691989777 with reward:  -141.0
Game no.:  100000 epsilon:  0.19891893773052047 with reward:  -142.0
Game no.:  100000 epsilon:  0.19891694856103487 with reward:  -143.0
Game no.:  100000 epsilon:  0.19891495941144077 with reward:  -144.0
Game no.:  100000 epsilon:  0.19891297028173796 with reward:  -145.0
Game no.:  100000 epsilon:  0.19891098117192627 with reward:  -146.0
Game no.:  100000 epsilon:  0.19890899208200546 with reward:  -147.0
Game no.:  100000 epsilon:  0.19890700301197536 with reward:  -148.0
Game no.:  100000 epsilon:  0.19890501396183577 with reward:  -149.0
Game no.:  100000 epsilon:  0.19890302493158646 with reward:  -150.0
Game no.:  100000 epsilon:  0.19890103592122726 with reward:  -151.0
Game no.:  100000 epsilon:  0.19889