In [3]:
import sys
import gym
import numpy as np
import random
import math
from collections import defaultdict, deque
import matplotlib.pyplot as plt

In [4]:
def random_control(n):
    rewards = 0
    max_reward = -float("inf")
    env = gym.make("BipedalWalker-v3")
    observation = env.reset()
    for i in range(n):
        while True:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards += reward
            if reward > max_reward:
                max_reward = reward
            if done:
                print("End Game!: Reward: ", reward)
                observation = env.reset()
                break
        env.close()
    rewards = rewards / n
    print("Average Final Reward: ", rewards)
    print("Max Final Reward: ", max_reward)

In [103]:
random_control(100)

End Game!: Reward:  -100
End Game!: Reward:  -0.10703787943472703
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -0.055409644504390544
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -0.20295348527034363
End Game!: Reward:  -100
End Game!: Reward:  -0.09710959613323211
End Game!: Reward:  0.11028303787112237
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -0.18620821034908297
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -0.06360525055726607
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -100
End Game!: Reward:  -0.13226642215251921
End Game!: Reward:  -0.07127401012554764
End Game!: Reward:  0.0018142666618005572
End Game!: Reward:  0.10635405158003053
End Game!: Reward:  -100
End Game!: Reward:  -0.09173737896730386
End Game

In [5]:
env = gym.make("BipedalWalker-v3")

In [6]:
def generate_random_episode(env):
    episode = []
    state = env.reset()
    while True:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = observation
        if done:
            break
    return episode

In [7]:
print(env.action_space)

Box([-1. -1. -1. -1.], [1. 1. 1. 1.], (4,), float32)


In [12]:
def monte_learning(env, num_episodes, generate_episode, gamma=1.0):
    xdata, ydata = [], []
    max_score = -float("inf")
    tmp_scores = deque(maxlen=100) 
    avg_scores = deque(maxlen=num_episodes)
    
    for i_episode in range(1, num_episodes + 1):
        next_state = env.reset()
        while True:  
            action = env.action_space.sample()
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
            next_state, reward, done, info = env.step(action)

            # whenever the episode ends, add the reward to the temporary array
            if done:
                # stores the maximum score over the num_episodes
                if max_score < reward:
                    max_score = reward
                tmp_scores.append(reward)
                
                break
                
        if (i_episode % 10 == 0):
            # plot updates every 10 episodes     
            xdata.append(i_episode)
            ydata.append(np.mean(tmp_scores))

        if (i_episode % 100 == 0):
            # updates user every 100 episodes 
            avg_scores = tmp_scores
            print("\nEpisode Number: ", i_episode)
            print(('Best Average Reward over %d Episodes: ' % 100), np.max(avg_scores))

    fig= plt.figure()
    ax=fig.add_subplot()
    Ln, = ax.plot(xdata,ydata)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward Over Prior 10 Episodes')
    ax.set_xlim([0,num_episodes])
    ax.set_ylim([-100,10])
    fig.savefig("img_monte_learning.png")
    print("\nAverage reward after ", num_episodes," episodes: ",np.mean(ydata))
    print("\nMax reward after ", num_episodes, " episodes: ", max_score)

In [14]:
def mc_control_test(env, num_episodes, alpha, gamma=1.0, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    nA = env.action_space.shape[0]
    Q = defaultdict(lambda: np.zeros(nA))
    epsilon = eps_start
    for i_episode in range(1, num_episodes+1):
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        epsilon = max(epsilon*eps_decay, eps_min)
        episode = generate_episode_from_Q(env, Q, epsilon, nA)
        Q = update_Q(env, episode, Q, alpha, gamma)
    policy = dict((k,np.argmax(v)) for k, v in Q.items())
    return policy, Q

In [13]:
env = gym.make("BipedalWalker-v3")
monte_learning(env, 1000, generate_random_episode)

Episode 100/1000.
Episode Number:  100
Best Average Reward over 100 Episodes:  0.10822782421112058
Episode 200/1000.
Episode Number:  200
Best Average Reward over 100 Episodes:  0.15014757639169693
Episode 300/1000.
Episode Number:  300
Best Average Reward over 100 Episodes:  0.09723373850186902
Episode 338/1000.

KeyboardInterrupt: 