# MountainCar

This notebook contains our solution to the OpenAI Gym MountainCar-v0 problem.
We implemented two approaches:

- Q-learning
- SARSA

In [7]:
import pickle
import random

import gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

## Helper functions

In [8]:
# Shared environment instance; createEmptyQTable reads its action count.
env = gym.make('MountainCar-v0')

# Discretisation grids for the continuous observation. The values below are
# bin *edges*: np.digitize returns an index in 0..len(edges), so n edges
# produce n + 1 bins (createEmptyQTable sizes the table accordingly).
pos_space = np.linspace(start=-1.2, stop=0.6, num=18)    # 18 position edges
vel_space = np.linspace(start=-0.07, stop=0.07, num=28)  # 28 velocity edges

def getState(observation):
    """Map a continuous observation to its discrete (position, velocity) bins.

    Parameters
    ----------
    observation : sequence of two values
        Unpacked as ``(position, velocity)``.

    Returns
    -------
    tuple
        ``(pos_bin, vel_bin)``, the np.digitize indices of the position in
        ``pos_space`` and of the velocity in ``vel_space``.
    """
    position, velocity = observation
    return (
        np.digitize(position, pos_space),
        np.digitize(velocity, vel_space),
    )

def createEmptyQTable():
    """Build a Q-table holding 0 for every (state, action) pair.

    States are all ``(pos_bin, vel_bin)`` index pairs with
    ``pos_bin`` in ``0..len(pos_space)`` and ``vel_bin`` in
    ``0..len(vel_space)``; actions are ``0..env.action_space.n - 1``.

    Returns
    -------
    dict
        Keys are ``((pos_bin, vel_bin), action)``, values are all ``0``.
    """
    n_pos_bins = len(pos_space) + 1
    n_vel_bins = len(vel_space) + 1
    return {
        ((pos_bin, vel_bin), act): 0
        for pos_bin in range(n_pos_bins)
        for vel_bin in range(n_vel_bins)
        for act in range(env.action_space.n)
    }

def maxAction(Q, state, actions=(0, 1, 2)):
    """Return the action with the highest Q-value in ``state``.

    Parameters
    ----------
    Q : mapping
        Q-table indexed as ``Q[state, action]``.
    state : hashable
        State whose actions are compared.
    actions : sequence, optional
        Candidate actions. Defaults to ``(0, 1, 2)``.

    Returns
    -------
    The element of ``actions`` with the largest Q-value; on ties, the one
    that appears first in ``actions``.
    """
    values = np.array([Q[state, a] for a in actions])
    # np.argmax gives a position inside `actions`, not the action itself.
    # Map it back so that a non-default action set returns a real action.
    return actions[np.argmax(values)]

def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Lets a trained Q-table be stored so the simulation can be run later
    without retraining. Uses ``pickle.HIGHEST_PROTOCOL``.
    """
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in ``name + '.pkl'``.

    The file path is printed before the file is read.
    """
    source = name + '.pkl'
    print(source)
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this notebook wrote itself.
    with open(source, 'rb') as handle:
        handle.seek(0)  # start of file (where a fresh handle already is)
        return pickle.load(handle)

## Q-Learning

In [9]:
def Q_learning(alpha, gamma, epsilon, episodes):
    """Train a tabular Q-learning agent on MountainCar-v0.

    Parameters
    ----------
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    epsilon : float
        Initial exploration probability of the epsilon-greedy policy. It is
        lowered by ``2 / episodes`` after every episode and never drops
        below 0.01.
    episodes : int
        Number of training episodes.

    Returns
    -------
    list
        Mean episode score of each completed block of 100 episodes. Scores
        of a trailing, incomplete block are not reported.

    Side effects
    ------------
    Prints progress every 500 episodes and pickles the learned Q-table to
    ``trained-QLearning.pkl`` through ``save_obj``.
    """
    env = gym.make('MountainCar-v0')
    # Allow up to 1000 steps per training episode.
    env._max_episode_steps = 1000

    try:
        # Create an empty Q-table
        Q = createEmptyQTable()

        score = 0
        reward_list = []      # scores collected since the last 100-episode average
        avg_reward_list = []

        for i in range(episodes):
            done = False
            observation = env.reset()
            state = getState(observation)

            # `score` still holds the previous episode's total at this point.
            if i % 500 == 0:
                print(f'episode: {i}, score: {score}, epsilon: {epsilon:0.3f}')

            score = 0
            while not done:
                # e-Greedy strategy: explore with probability epsilon,
                # otherwise take the best known action.
                if random.uniform(0, 1) < epsilon:
                    action = env.action_space.sample()
                else:
                    action = maxAction(Q, state)

                # Observe next state based on the next step
                next_observation, reward, done, info = env.step(action)
                next_state = getState(next_observation)

                # Add reward to the score of the episode
                score += reward

                # Off-policy target: bootstrap from the greedy action in the
                # next state, regardless of what will actually be played.
                next_action = maxAction(Q, next_state)
                td_target = reward + gamma * Q[next_state, next_action]
                Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])

                # Move to next state
                state = next_state

            reward_list.append(score)

            if (i + 1) % 100 == 0:
                avg_reward_list.append(np.mean(reward_list))
                reward_list = []

            # Linear decay with a hard floor. Clamping after the subtraction
            # keeps epsilon from dipping below 0.01 (or going negative when
            # `episodes` is small) for one episode.
            epsilon = max(0.01, epsilon - 2 / episodes)
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    save_obj(Q, 'trained-QLearning')
    return avg_reward_list

## SARSA

In [10]:
def SARSA_learning(alpha, gamma, epsilon, episodes):
    """Train a tabular SARSA agent on MountainCar-v0.

    Parameters
    ----------
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    epsilon : float
        Initial exploration probability of the epsilon-greedy policy. It is
        lowered by ``2 / episodes`` after every episode and never drops
        below 0.01.
    episodes : int
        Number of training episodes.

    Returns
    -------
    list
        Mean episode score of each completed block of 100 episodes. Scores
        of a trailing, incomplete block are not reported.

    Side effects
    ------------
    Prints progress every 500 episodes and pickles the learned Q-table to
    ``trained-SARSA.pkl`` through ``save_obj``.
    """
    env = gym.make('MountainCar-v0')
    # Allow up to 1000 steps per training episode.
    env._max_episode_steps = 1000

    def _epsilon_greedy(Q, state, eps):
        # Explore with probability eps, otherwise take the best known action.
        if random.uniform(0, 1) < eps:
            return env.action_space.sample()
        return maxAction(Q, state)

    try:
        # Create an empty Q-table
        Q = createEmptyQTable()

        score = 0
        reward_list = []      # scores collected since the last 100-episode average
        avg_reward_list = []

        for i in range(episodes):
            # `score` still holds the previous episode's total at this point.
            if i % 500 == 0:
                print(f'episode: {i}, score: {score}, epsilon: {epsilon:0.3f}')

            observation = env.reset()
            state = getState(observation)
            action = _epsilon_greedy(Q, state, epsilon)

            score = 0
            done = False
            while not done:
                # Take action and observe next state
                next_observation, reward, done, info = env.step(action)
                next_state = getState(next_observation)

                # On-policy: the action used in the update target is the one
                # that will actually be played on the next step.
                next_action = _epsilon_greedy(Q, next_state, epsilon)

                # Add reward to the score of the episode
                score += reward

                td_target = reward + gamma * Q[next_state, next_action]
                Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])

                # Move to next state, and next action
                state, action = next_state, next_action

            reward_list.append(score)

            if (i + 1) % 100 == 0:
                avg_reward_list.append(np.mean(reward_list))
                reward_list = []

            # Linear decay with a hard floor. Clamping after the subtraction
            # keeps epsilon from dipping below 0.01 (or going negative when
            # `episodes` is small) for one episode.
            epsilon = max(0.01, epsilon - 2 / episodes)
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    save_obj(Q, 'trained-SARSA')
    return avg_reward_list

### Run the training

Here we run the training with the hyperparameters defined below.

Each training function records the score of every episode and returns the average score of each block of 100 episodes.

The averages were meant for plotting, which we unfortunately did not have enough time to implement properly. Each training function also saves its trained Q-table to a pickle file, so the agent can be run later without retraining.
We did, however, include a few example plots from the testing phase in our video presentation.
Those come from another notebook with a different implementation of the Q-learning algorithm; there are no corresponding plots for SARSA.

In [5]:
# Hyperparameters shared by both training runs
alpha = 0.1       # learning rate
gamma = 0.9       # discount factor
epsilon = 1       # initial exploration probability (e-greedy)
episodes = 20000  # number of training episodes

# Each call trains from an empty Q-table and pickles the result
Q_rewards = Q_learning(alpha=alpha, gamma=gamma, epsilon=epsilon, episodes=episodes)
SARSA_rewards = SARSA_learning(alpha=alpha, gamma=gamma, epsilon=epsilon, episodes=episodes)

episode: 0, score: 0, epsilon: 1.000
episode: 500, score: -1000.0, epsilon: 0.950
episode: 1000, score: -1000.0, epsilon: 0.900
episode: 1500, score: -1000.0, epsilon: 0.850
episode: 2000, score: -627.0, epsilon: 0.800
episode: 2500, score: -741.0, epsilon: 0.750
episode: 3000, score: -787.0, epsilon: 0.700
episode: 3500, score: -478.0, epsilon: 0.650
episode: 4000, score: -292.0, epsilon: 0.600
episode: 4500, score: -312.0, epsilon: 0.550
episode: 5000, score: -309.0, epsilon: 0.500
episode: 5500, score: -325.0, epsilon: 0.450
episode: 6000, score: -298.0, epsilon: 0.400
episode: 6500, score: -218.0, epsilon: 0.350
episode: 7000, score: -237.0, epsilon: 0.300
episode: 7500, score: -379.0, epsilon: 0.250
episode: 8000, score: -226.0, epsilon: 0.200
episode: 8500, score: -196.0, epsilon: 0.150
episode: 9000, score: -149.0, epsilon: 0.100
episode: 9500, score: -168.0, epsilon: 0.050
episode: 10000, score: -149.0, epsilon: 0.010
episode: 10500, score: -197.0, epsilon: 0.010
episode: 11000

### Run function
It uses the load function to read the Q-table saved during training, so we can run the agent without having to retrain.

It runs 10 episodes of at most 200 steps each, rendering the environment so you can see how the agent performs.

In [11]:
def run(pickle, episodes=10, max_steps=200):
    """Replay a saved Q-table greedily in a rendered MountainCar-v0 environment.

    Parameters
    ----------
    pickle : str
        Base name (without ``.pkl``) of the file holding the saved Q-table.
        The name shadows the ``pickle`` module inside this function only; it
        is kept so existing callers continue to work.
    episodes : int, optional
        Number of episodes to play. Defaults to 10.
    max_steps : int, optional
        Step limit per episode. Defaults to 200.
    """
    env = gym.make('MountainCar-v0')
    env._max_episode_steps = max_steps

    try:
        # Load previously saved Q-table
        Q = load_obj(pickle)

        for _ in range(episodes):
            done = False
            state = getState(env.reset())
            # Until the environment reports the episode as done
            while not done:
                env.render()
                # Take the best action for that state given trained values
                action = maxAction(Q, state)
                observation, reward, done, info = env.step(action)
                # Go to next state
                state = getState(observation)
    finally:
        # Always close the environment, even if loading or rendering fails.
        env.close()

In [12]:
# Replay the greedy policy from the Q-table saved by Q_learning
run('trained-QLearning')

trained-QLearning.pkl


In [13]:
# Replay the greedy policy from the Q-table saved by SARSA_learning
run('trained-SARSA')

trained-SARSA.pkl


# TO-DO: CONCLUSIONS

..

..

..


