In [1]:
import gym
import numpy as np
import math
from collections import deque
import time
import matplotlib
import matplotlib.pyplot as plt

class MountaincarQAgent():
    """Tabular Q-learning agent for gym's ``MountainCar-v0`` environment.

    The two-dimensional continuous observation is mapped onto a grid of
    ``buckets`` cells, and one Q-value per (cell, action) pair is learned with
    an epsilon-greedy behaviour policy whose epsilon decays with the episode
    index (see ``get_epsilon``).
    """

    def __init__(self, buckets=(12, 12), num_episodes=500000, min_epsilon=0.01, discount=0.99, decay=100, force=True):
        """Create the environment and an all-zero Q-table.

        Args:
            buckets: number of discrete cells per observation dimension.
            num_episodes: maximum number of training episodes.
            min_epsilon: lower bound for the exploration rate.
            discount: discount factor applied to the bootstrapped value.
            decay: episode scale of the logarithmic epsilon schedule.
            force: forwarded to ``gym.wrappers.Monitor`` in ``run()``.
        """
        # Accept any sequence; a tuple is required for the shape concatenation below.
        self.buckets = tuple(buckets)
        self.num_episodes = num_episodes
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay
        self.force = force

        # Fully exploratory until train() starts applying the decay schedule.
        self.epsilon = 1.0

        self.env = gym.make('MountainCar-v0')
        self.upper_bounds = [self.env.observation_space.high[0], self.env.observation_space.high[1]]
        self.lower_bounds = [self.env.observation_space.low[0], self.env.observation_space.low[1]]

        ## Concatenation of tuples to get shape (12,12,3) for buckets=(12, 12)
        self.Q_table = np.zeros(self.buckets + (self.env.action_space.n,))

        self.learning_rate = 0.008

        self.threshold = self.env.spec.reward_threshold
        print('threshold: ', self.threshold)

    def discretize_state(self, obs):
        """Map a continuous observation to a tuple of bucket indices.

        Each component is rescaled to [0, 1] relative to its bounds, spread
        over ``buckets[i]`` cells and clamped to a valid index.
        """
        discretized = list()
        for i in range(len(obs)):
            # Offset by the lower bound itself (not its absolute value) so the
            # scaling is also correct when the lower bound is positive.
            scaling = (obs[i] - self.lower_bounds[i]) / (self.upper_bounds[i] - self.lower_bounds[i])
            new_obs = int(round((self.buckets[i] - 1) * scaling))
            new_obs = min(self.buckets[i] - 1, max(0, new_obs))
            discretized.append(new_obs)
        return tuple(discretized)

    def choose_action(self, state, greedy=False):
        """Pick an action for ``state``.

        With probability ``self.epsilon`` a random action is sampled from the
        environment, otherwise the action with the highest Q-value is returned.
        When ``greedy`` is true the random branch is skipped entirely and
        ``self.epsilon`` is not consulted.
        """
        if not greedy and np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q_table[state])

    def update_q(self, state, action, reward, new_state, terminal=False):
        """Apply one Q-learning update for the transition (state, action) -> new_state.

        Args:
            state, new_state: discretized states (tuples of bucket indices).
            action: index of the action taken.
            reward: reward received for the transition.
            terminal: when true, ``new_state`` ends the episode for good, so no
                future value is bootstrapped from it.
        """
        future_value = 0.0 if terminal else np.max(self.Q_table[new_state])
        td_target = reward + self.discount * future_value
        self.Q_table[state][action] += self.learning_rate * (td_target - self.Q_table[state][action])

    def get_epsilon(self, t):
        """Return the exploration rate for episode index ``t``.

        The value is ``1 - log10((t + 1) / decay)`` clamped to
        ``[min_epsilon, 1]``.
        """
        return max(self.min_epsilon, min(1., 1. - math.log10((t + 1) / self.decay)))

    def train(self):
        """Run Q-learning for up to ``num_episodes`` episodes.

        Training stops early once the mean score of the last 100 episodes
        reaches the environment's reward threshold.

        Returns:
            (scores_array, avg_scores_array): the score of every episode and
            the running mean over the most recent (at most 100) episodes.
        """
        scores_deque = deque(maxlen=100)
        scores_array = []
        avg_scores_array = []
        print_every = 400
        time_start = time.time()

        for i_episode in range(self.num_episodes):
            current_state = self.discretize_state(self.env.reset())

            self.epsilon = self.get_epsilon(i_episode)
            done = False

            episode_reward = 0
            time_step = 0

            while not done:
                action = self.choose_action(current_state)
                obs, reward, done, info = self.env.step(action)
                new_state = self.discretize_state(obs)
                # An episode cut off by the step limit is not a true terminal
                # state; gym's TimeLimit wrapper flags it in `info` (assumes a
                # gym version that sets this key; otherwise every `done` is
                # treated as terminal).
                terminal = done and not info.get('TimeLimit.truncated', False)
                self.update_q(current_state, action, reward, new_state, terminal)
                current_state = new_state
                time_step += 1
                episode_reward += reward

            scores_deque.append(episode_reward)
            scores_array.append(episode_reward)

            avg_score = np.mean(scores_deque)
            avg_scores_array.append(avg_score)

            if i_episode % print_every == 0 and i_episode > 0:
                # Elapsed wall-clock seconds, only needed for the progress line.
                s = int(time.time() - time_start)
                print('Episode: {}, Timesteps:  {}, Score: {:5},  Avg.Score: {:.2f}, eps-greedy: {:5.2f}, Time: {:02}:{:02}:{:02}'.\
                    format(i_episode, time_step, episode_reward, avg_score, self.epsilon, s//3600, s%3600//60, s%60))

            # Only declare success on a full 100-episode window, and only when
            # the environment actually defines a reward threshold.
            window_full = len(scores_deque) == scores_deque.maxlen
            if window_full and self.threshold is not None and avg_score >= self.threshold:
                print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}'. \
                    format(i_episode, avg_score))
                break

        print('Finished training!')

        return scores_array, avg_scores_array

    def run(self):
        """Play one rendered episode with the greedy policy and record it.

        The environment is wrapped in ``gym.wrappers.Monitor`` (output
        directory ``'Mountaincar'``) the first time this is called.

        Returns:
            Number of steps the episode lasted.
        """
        # Wrap only once: repeated run() calls must not stack Monitor wrappers.
        if not isinstance(self.env, gym.wrappers.Monitor):
            self.env = gym.wrappers.Monitor(self.env, 'Mountaincar', force=self.force)
        t = 0
        done = False
        current_state = self.discretize_state(self.env.reset())
        while not done:
            self.env.render()
            t = t + 1
            # Evaluate the learned policy without exploration; this also
            # makes run() usable before train() has been called.
            action = self.choose_action(current_state, greedy=True)
            obs, reward, done, _ = self.env.step(action)
            current_state = self.discretize_state(obs)

        return t

    def close(self):
        """Close the underlying gym environment."""
        self.env.close()

In [None]:
# Build the agent with its default hyper-parameters and run Q-learning.
# train() returns the score of every episode and the running mean over the
# most recent (at most 100) episodes.
agent = MountaincarQAgent()
scores, avg_scores = agent.train()

threshold:  -110.0
Episode: 400, Timesteps:  200, Score: -200.0,  Avg.Score: -200.00, eps-greedy:  0.40, Time: 00:00:05
Episode: 800, Timesteps:  200, Score: -200.0,  Avg.Score: -200.00, eps-greedy:  0.10, Time: 00:00:11
Episode: 1200, Timesteps:  200, Score: -200.0,  Avg.Score: -200.00, eps-greedy:  0.01, Time: 00:00:16
Episode: 1600, Timesteps:  200, Score: -200.0,  Avg.Score: -197.64, eps-greedy:  0.01, Time: 00:00:21
Episode: 2000, Timesteps:  200, Score: -200.0,  Avg.Score: -198.59, eps-greedy:  0.01, Time: 00:00:27
Episode: 2400, Timesteps:  200, Score: -200.0,  Avg.Score: -198.01, eps-greedy:  0.01, Time: 00:00:32
Episode: 2800, Timesteps:  121, Score: -121.0,  Avg.Score: -187.99, eps-greedy:  0.01, Time: 00:00:38
Episode: 3200, Timesteps:  122, Score: -122.0,  Avg.Score: -185.60, eps-greedy:  0.01, Time: 00:00:43
Episode: 3600, Timesteps:  200, Score: -200.0,  Avg.Score: -197.88, eps-greedy:  0.01, Time: 00:00:48
Episode: 4000, Timesteps:  200, Score: -200.0,  Avg.Score: -197.2

Episode: 32400, Timesteps:  157, Score: -157.0,  Avg.Score: -153.70, eps-greedy:  0.01, Time: 00:06:24
Episode: 32800, Timesteps:  152, Score: -152.0,  Avg.Score: -160.58, eps-greedy:  0.01, Time: 00:06:29
Episode: 33200, Timesteps:  117, Score: -117.0,  Avg.Score: -160.75, eps-greedy:  0.01, Time: 00:06:33
Episode: 33600, Timesteps:  120, Score: -120.0,  Avg.Score: -154.17, eps-greedy:  0.01, Time: 00:06:38
Episode: 34000, Timesteps:  161, Score: -161.0,  Avg.Score: -164.43, eps-greedy:  0.01, Time: 00:06:42
Episode: 34400, Timesteps:  180, Score: -180.0,  Avg.Score: -154.68, eps-greedy:  0.01, Time: 00:06:46
Episode: 34800, Timesteps:  148, Score: -148.0,  Avg.Score: -170.50, eps-greedy:  0.01, Time: 00:06:51
Episode: 35200, Timesteps:  153, Score: -153.0,  Avg.Score: -155.72, eps-greedy:  0.01, Time: 00:06:56
Episode: 35600, Timesteps:  119, Score: -119.0,  Avg.Score: -149.14, eps-greedy:  0.01, Time: 00:07:01
Episode: 36000, Timesteps:  197, Score: -197.0,  Avg.Score: -176.04, eps-

In [None]:
%matplotlib inline

# Both series get one entry per episode, so the two lengths should match.
print('length of scores: ', len(scores), ', len of avg_scores: ', len(avg_scores))

# Learning curve: raw episode score and its running average, with the
# x-axis counting episodes from 1.
fig, ax = plt.subplots(figsize=(15, 3))
for series, series_label in ((scores, "Score"), (avg_scores, "Avg on 100 episodes")):
    ax.plot(np.arange(1, len(series) + 1), series, label=series_label)
ax.legend(bbox_to_anchor=(1.05, 1))
ax.set_ylabel('Score')
ax.set_xlabel('Episodes #')
plt.show()

In [4]:
# Play one rendered, Monitor-recorded episode with the trained agent;
# run() returns the number of steps the episode lasted.
t = agent.run()
print("Time", t)

Time 110


In [None]:
# Close the agent's gym environment.
agent.close()