In [4]:
import sys
import gym
import numpy as np
import random
import math
from collections import defaultdict, deque
import matplotlib.pyplot as plt
env = gym.make("BipedalWalker-v3")

In [5]:
def random_control(n):
    rewards = 0
    max_reward = -float("inf")
    env = gym.make("BipedalWalker-v3")
    observation = env.reset()
    for i in range(n):
        while True:
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards += reward
            if reward > max_reward:
                max_reward = reward
            if done:
                print("End Game!: Reward: ", reward)
                observation = env.reset()
                break
        env.close()
    rewards = rewards / n
    print("Average Final Reward: ", rewards)
    print("Max Final Reward: ", max_reward)

In [6]:
# taken from Hifly reference to attempt to turn continuous
# state space into a discrete state space
bucket_size_states = (4,5,5,5,4,5,4,5,2,4,5,4,5,2)
dim_states = len(bucket_size_states)

bucket_size_action = (20,20,20,20)
dim_action = len(bucket_size_action)
sBounds = [(0, math.pi),
           (-2,2),
           (-1,1),
           (-1,1),
           (0,math.pi),
           (-2,2),
           (0, math.pi),
           (-2,2),
           (0,1),
           (0, math.pi),
           (-2, 2),
           (0, math.pi),
           (-2, 2),
           (0, 1)]

def state_to_bucket(state):
    bucket_state = []
    for i in range(len(state)):
        bucket_index = int((state[i]-sBounds[i][0])
                           / (sBounds[i][1]-sBounds[i][0])*bucket_size_states[i]-1)
        bucket_state.append(bucket_index)
    return tuple(bucket_state)

def bucket_to_action(bucket_action):
    actionBounds = (-1, 1)
    action = []
    for i in range(len(bucket_action)):
        value_action = bucket_action[i] \
                       / (bucket_size_action[i] -1 ) * (actionBounds[1] - actionBounds[0]) - 1
        action.append(value_action)
    return tuple(action)

def choose_action(q_table, state, eps):
    if random.random() < eps:
        action = ()
        for i in range (0, dim_action):
            action += (random.randint(0, bucket_size_action[i]-1),)
    else:
        action = np.unravel_index(np.argmax(q_table[state]), q_table[state].shape)
    return action

def dd():
    return np.zeros(bucket_size_action)

In [7]:
def q_learning(env, num_episodes, gamma=1.0):
    xdata, ydata = [], []
    
    q_table = defaultdict(dd)
    max_score = -float("inf")
    tmp_scores = deque(maxlen=100) 
    avg_scores = deque(maxlen=num_episodes)
    
    for i_episode in range(1, num_episodes + 1):
        # sets the state according to the state_to_bucket function
        state = state_to_bucket(env.reset()[0:dim_states])
        eps = .99
        total_reward = 0

        while True:
            # update user while learning
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
            # episode = generate_episode(env) 
            # ^uncomment this if you want to see the environment 

            # sets the value of the q_table by specifying the action, state, and reward of the env
            action = choose_action(q_table, state, eps)
            action_bucket = bucket_to_action(action)
            next_state_real, reward, done, info = env.step(action_bucket)
            next_state = state_to_bucket(next_state_real[0:dim_states])
            # print(next_state)
            total_reward += reward
            # adjust learning rate if necessary
            q_table[state, action] = update_Q(0.01, eps, q_table, state, action, reward)
            state = next_state
                    
            # whenever the episode ends, add the reward to the temporary array
            if done:
                # stores the maximum score over the num_episodes
                if max_score < reward:
                    max_score = reward

                tmp_scores.append(reward)
                break
    
        if (i_episode % 10 == 0):
            # plot updates every 10 episodes     
            xdata.append(i_episode)
            ydata.append(np.mean(tmp_scores))
        
        if (i_episode % 100 == 0):
            # updates user every 100 episodes 
            avg_scores = tmp_scores
            print("\nEpisode Number: ", i_episode)
            print(('Best Average Reward over %d Episodes: ' % 100), np.max(avg_scores))
            
    fig= plt.figure()
    ax=fig.add_subplot()
    Ln, = ax.plot(xdata,ydata)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward Over Prior 10 Episodes')
    ax.set_xlim([0,num_episodes])
    ax.set_ylim([-100,10])
    fig.savefig("img_q_learning_6.png")
    print("\nAverage reward after ", num_episodes," episodes: ",np.mean(ydata))
    print("\nMax reward after ", num_episodes, " episodes: ", max_score)
    return q_table

def update_Q(alpha, gamma, q_table, state, action, reward, next_state=None):
    currentQValue = q_table[state][action] 
    Qsa_next = np.max(q_table[next_state]) 
    # uses the current Q_Value and the next state to determine the target value less the discount factor
    targetQValue = reward + (gamma * Qsa_next)  
    # returns the updated value as a function of the current value, target value, less the learning rate
    updated_value = currentQValue + (alpha * (targetQValue - currentQValue)) 
    #print(new_value)
    return updated_value

In [8]:
q_results = q_learning(env, 1000)

Episode 100/1000.
Episode Number:  100
Best Average Reward over 100 Episodes:  0.08718213958489267
Episode 200/1000.
Episode Number:  200
Best Average Reward over 100 Episodes:  0.12364452132582666
Episode 300/1000.
Episode Number:  300
Best Average Reward over 100 Episodes:  0.11248096062138535
Episode 400/1000.
Episode Number:  400
Best Average Reward over 100 Episodes:  0.13936640108049844
Episode 500/1000.
Episode Number:  500
Best Average Reward over 100 Episodes:  0.09978505933912175
Episode 600/1000.
Episode Number:  600
Best Average Reward over 100 Episodes:  0.09148430789144417
Episode 700/1000.
Episode Number:  700
Best Average Reward over 100 Episodes:  0.10479575539471575
Episode 800/1000.
Episode Number:  800
Best Average Reward over 100 Episodes:  0.08121348980644529
Episode 900/1000.
Episode Number:  900
Best Average Reward over 100 Episodes:  0.08315763258306605
Episode 1000/1000.
Episode Number:  1000
Best Average Reward over 100 Episodes:  0.0666360466950818

Average 