In [1]:
import numpy as np
import gym

In [2]:
# Build the MountainCar-v0 environment; later cells read its observation/action
# spaces and step it.
env = gym.make('MountainCar-v0')

In [3]:
# Problem dimensions as reported by the environment.
number_of_state_parameters = env.observation_space.shape[0]
number_of_possible_actions = env.action_space.n

# Every state parameter is discretised into the same number of bins.
custom_parameter_bins = 5
q_table_parameter_bins = [custom_parameter_bins for _ in range(number_of_state_parameters)]

# Q-table shape: one axis per state parameter, plus a trailing axis for actions.
q_table_size = [*q_table_parameter_bins, number_of_possible_actions]

# Width of a single bin along each state parameter (full range / bin count).
state_parameters_range = env.observation_space.high - env.observation_space.low
state_parameters_dividor = state_parameters_range / q_table_parameter_bins

# A random observation, drawn only to show what a state looks like.
state_parameter_sample = env.observation_space.sample()

# Diagnostic summary of the values computed above.
print(f'number_of_state_parameters : {number_of_state_parameters}')
print(f'number_of_possible_actions : {number_of_possible_actions}')
print('-----------------------------')
print(f'custom_parameter_bins      : {custom_parameter_bins}')
print(f'q_table_parameter_bins     : {q_table_parameter_bins}')
print(f'q_table_size               : {q_table_size}')
print('-----------------------------')
print(f'state_parameter_sample     : {state_parameter_sample}')
print(f'state_parameters_range     : {state_parameters_range}')
print(f'state_parameters_dividor   : {state_parameters_dividor}')

number_of_state_parameters : 2
number_of_possible_actions : 3
-----------------------------
custom_parameter_bins      : 5
q_table_parameter_bins     : [5, 5]
q_table_size               : [5, 5, 3]
-----------------------------
state_parameter_sample     : [ 0.28189534 -0.03447452]
state_parameters_range     : [1.8000001 0.14     ]
state_parameters_dividor   : [0.36000001 0.028     ]


In [4]:
# Start from random Q-values in [-2, 1): one entry per (state bin..., action).
q_table = np.random.uniform(-2, 1, q_table_size)
print(np.shape(q_table))

(5, 5, 3)


In [5]:
def make_discrete(state):
    """Convert a continuous observation into a tuple of Q-table bin indices.

    Each component of ``state`` is shifted by the observation space's lower
    bound and divided by the per-parameter bin width
    (``state_parameters_dividor``), then truncated to an integer.

    Parameters
    ----------
    state : numpy.ndarray
        Observation with one value per state parameter.

    Returns
    -------
    tuple of int
        One bin index per state parameter, each within
        ``[0, q_table_parameter_bins[i] - 1]``, usable directly as an index
        into ``q_table``.
    """
    scaled = (state - env.observation_space.low) / state_parameters_dividor
    # `np.int` was removed in NumPy 1.24; the builtin `int` is what it aliased.
    bin_indices = scaled.astype(int)
    # An observation sitting exactly on the upper bound scales to `bins`, which
    # is one past the last valid index; clamp it into the last bin.
    highest_bin = np.asarray(q_table_parameter_bins) - 1
    bin_indices = np.clip(bin_indices, 0, highest_bin)
    return tuple(bin_indices)

def get_q_array(state):
    """Return the slice of ``q_table`` holding the Q-values for ``state``.

    The observation is first mapped to bin indices with ``make_discrete``;
    indexing ``q_table`` with those leaves the trailing per-action axis.
    """
    return q_table[make_discrete(state)]

In [6]:
# Weight given to the new estimate when it is blended with the current Q-value.
learning_rate = 0.1
# Factor applied to the best Q-value of the next state in the update target.
discount = 0.95

In [14]:
# Run one greedy episode, updating the Q-table after every step.
done = False

state = env.reset()
while not done:
    # Greedy policy: take the action with the highest Q-value in this state.
    current_q_array = get_q_array(state)
    current_action = int(np.argmax(current_q_array))

    new_state, reward, done, info = env.step(current_action)

    # Index of the single (state, action) entry this step is allowed to change.
    # Indexing with the state alone would overwrite the values of all actions.
    state_action_index = make_discrete(state) + (current_action,)

    if not done:
        # Best value reachable from the state we just landed in.
        max_future_q_array = get_q_array(new_state)
        max_future_q_action = np.max(max_future_q_array)

        current_q_action = q_table[state_action_index]

        # Q-learning update:
        #   Q <- (1 - lr) * Q + lr * (reward + discount * max_a' Q(s', a'))
        new_q = (1 - learning_rate) * current_q_action + learning_rate * (reward + max_future_q_action * discount)

        q_table[state_action_index] = new_q

    elif new_state[0] >= env.unwrapped.goal_position:
        # The episode ended by reaching the goal: set the value of the action
        # that got there to 0.
        q_table[state_action_index] = 0

    # Advance to the next state; without this every iteration would keep
    # evaluating the initial observation.
    state = new_state
    # env.render()  # uncomment to watch the episode

env.close()