In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt

In [2]:
# Create the Gym environment registered under the id 'MountainCar-v0'.
# The cells below read its observation/action spaces and step this same `env` instance.
env = gym.make('MountainCar-v0')

In [3]:
# --- Dimensions reported by the environment -------------------------------
number_of_state_parameters = env.observation_space.shape[0]
number_of_possible_actions = env.action_space.n

# --- Discretisation settings ----------------------------------------------
# Every observation component is split into the same number of bins.
custom_parameter_bins = 20
q_table_parameter_bins = [custom_parameter_bins for _ in range(number_of_state_parameters)]

# Q-table shape: one axis per observation component, plus a trailing action axis.
q_table_size = [*q_table_parameter_bins, number_of_possible_actions]

# --- Observation-space geometry -------------------------------------------
# A random observation, drawn only so the printout shows what a state looks like.
state_parameter_sample = env.observation_space.sample()
# Span of each component (high - low) and the width of a single bin along it.
state_parameters_range = env.observation_space.high - env.observation_space.low
state_parameters_dividor = state_parameters_range / q_table_parameter_bins

# --- Summary ---------------------------------------------------------------
# One print call, one line per entry, so the cell output is unchanged.
print(
    f'number_of_state_parameters : {number_of_state_parameters}',
    f'number_of_possible_actions : {number_of_possible_actions}',
    '-----------------------------',
    f'custom_parameter_bins      : {custom_parameter_bins}',
    f'q_table_parameter_bins     : {q_table_parameter_bins}',
    f'q_table_size               : {q_table_size}',
    '-----------------------------',
    f'state_parameter_sample     : {state_parameter_sample}',
    f'state_parameters_range     : {state_parameters_range}',
    f'state_parameters_dividor   : {state_parameters_dividor}',
    sep='\n',
)

number_of_state_parameters : 2
number_of_possible_actions : 3
-----------------------------
custom_parameter_bins      : 20
q_table_parameter_bins     : [20, 20]
q_table_size               : [20, 20, 3]
-----------------------------
state_parameter_sample     : [0.26085252 0.05003344]
state_parameters_range     : [1.8000001 0.14     ]
state_parameters_dividor   : [0.09  0.007]


In [4]:
def make_discrete(state1, low=None, bin_width=None, bins=None):
    """Map a continuous observation onto a tuple of Q-table bin indices.

    Each component is shifted by the lower bound of its range, divided by the
    width of one bin and truncated to an integer.

    Parameters
    ----------
    state1 : array-like
        Observation to discretise, one value per state parameter.
    low : array-like, optional
        Lower bound of each component. Defaults to ``env.observation_space.low``.
    bin_width : array-like, optional
        Width of one bin per component. Defaults to ``state_parameters_dividor``.
    bins : array-like of int, optional
        Number of bins per component. Defaults to ``q_table_parameter_bins``.

    Returns
    -------
    tuple of int
        One index per component, each in ``[0, bins - 1]``, usable directly as
        the leading part of a ``q_table`` index.
    """
    # Fall back to the notebook-level settings when no override is given.
    if low is None:
        low = env.observation_space.low
    if bin_width is None:
        bin_width = state_parameters_dividor
    if bins is None:
        bins = q_table_parameter_bins

    scaled = (np.asarray(state1) - np.asarray(low)) / np.asarray(bin_width)

    # `np.int` was only an alias of the builtin `int` and has been removed from
    # NumPy, so request the integer dtype with the builtin instead.
    indices = scaled.astype(int)

    # A component sitting exactly on its upper bound scales to `bins`, which is
    # one past the last valid index; clamp so the lookup stays inside the table.
    indices = np.clip(indices, 0, np.asarray(bins) - 1)
    return tuple(indices)

def get_q_array(state2):
    """Look up the Q-table entry for the bin that contains `state2`.

    The observation is discretised with `make_discrete` and the resulting
    index tuple selects the matching slice of the global `q_table`, whose
    last axis is the action axis.
    """
    return q_table[make_discrete(state2)]

In [None]:
# Fresh table: every (state bin, action) entry starts as a uniform random value in [-2, 1).
q_table = np.random.uniform(-2, 1, q_table_size)
# To continue from a table saved earlier, load it instead of the random initialisation:
#q_table = np.load('q_table.npy')

In [None]:
# Hyperparameters read by the training loop.
# Weight of the new estimate when it is blended with the stored Q-value.
learning_rate = 0.1
# Multiplier applied to the best Q-value of the next state.
discount = 0.95
# Number of episodes the training loop runs.
episodes = 20000
# Every `show_every`-th episode is rendered and contributes one point to `stats`.
show_every = episodes//5

In [7]:
# Exploration schedule for the epsilon-greedy action choice in the training loop:
# a random action is taken whenever a uniform draw is not greater than `epsilon`.
epsilon = 1
# Episode indices bounding the window in which the training loop lowers epsilon.
start_decay_episode = 1
stop_decay_episode = episodes // 2 

# Amount subtracted from epsilon per episode inside the decay window.
epsilon_decay_value = epsilon/(stop_decay_episode - start_decay_episode)

In [9]:
%%time

# Tabular Q-learning training loop.
#
# Reads:   env, q_table, episodes, show_every, learning_rate, discount,
#          epsilon, epsilon_decay_value, start_decay_episode, stop_decay_episode
# Writes:  q_table (updated in place), epsilon (decayed), and
#          all_episode_total_reward / stats (reward bookkeeping for plotting).

all_episode_total_reward = []
stats = {'ep':[],'avg':[],'min':[],'max':[]}

for episode in range(episodes):
    # Episodes on the reporting cadence are rendered and add one point to `stats`.
    report_this_episode = episode % show_every == 0

    done = False
    current_state = env.reset()
    episode_total_reward = 0

    while not done:
        # Discretise the current state once per step and reuse the result.
        current_discrete_state = make_discrete(current_state)

        # Epsilon-greedy choice: exploit the table when the draw exceeds epsilon,
        # otherwise explore with a uniformly random action.
        if np.random.random() > epsilon:
            current_q_action = np.argmax(q_table[current_discrete_state])
        else:
            current_q_action = np.random.randint(low = 0, high = number_of_possible_actions)

        # Full Q-table index of the (state bin, action) pair taken this step.
        q_index = current_discrete_state + (current_q_action,)

        # Move the agent with the chosen action
        new_state, reward, done, info = env.step(current_q_action)
        episode_total_reward += reward
        if report_this_episode:
            env.render()

        if not done:
            # Best value reachable from the next state, and the value stored for this step.
            max_future_q_value = np.max(get_q_array(new_state))
            current_q_value = q_table[q_index]

            # Blend the old estimate with the new target and store it in the Q-table.
            new_q = (1 - learning_rate) * current_q_value + learning_rate * (reward + max_future_q_value * discount)
            q_table[q_index] = new_q

        # The episode ended with the agent at or beyond the goal position
        elif new_state[0] >= env.goal_position:
            # Pin the entry that reached the goal to 0; no future value is added.
            # NOTE(review): presumably 0 beats every other entry because step
            # rewards are negative in this environment - confirm against the env docs.
            q_table[q_index] = 0
            flag_episode = episode
            if report_this_episode:
                print(f'Reached flag in episode : {flag_episode}')

        # Set new_state as current_state to continue the chain of actions
        current_state = new_state

    # Decay epsilon inside the decay window. The window is inclusive at both
    # ends while the step size assumes one decrement fewer, so clamp at 0 to
    # keep epsilon from drifting negative on the last step.
    if stop_decay_episode >= episode >= start_decay_episode:
        epsilon = max(0.0, epsilon - epsilon_decay_value)

    all_episode_total_reward.append(episode_total_reward)

    if report_this_episode:
        # Average over the episodes actually in the window: the first report
        # (episode 0) holds a single episode, not `show_every` of them.
        recent_rewards = all_episode_total_reward[-show_every:]
        stats['ep'].append(episode)
        stats['avg'].append(sum(recent_rewards)/len(recent_rewards))
        stats['min'].append(min(recent_rewards))
        stats['max'].append(max(recent_rewards))

env.close()

Reached flag in episode : 60
Reached flag in episode : 80
Wall time: 8.81 s


In [None]:
# Draw one curve per recorded statistic against the episode index at which it was sampled.
for stat_key, legend_label in (('avg', 'Avg'), ('min', 'min'), ('max', 'max')):
    plt.plot(stats['ep'], stats[stat_key], label = legend_label)
plt.legend(loc=1)

In [None]:
# Write the current Q-table to 'q_table.npy' (path relative to the working directory).
np.save('q_table.npy',q_table)