In [1]:
import numpy as np
import gym

In [2]:
env_name = 'CartPole-v0'
env = gym.make(env_name)

In [3]:
numter_of_state_parameter = env.observation_space.shape[0]
number_of_possible_actions = env.action_space.n

custom_parameter_bins = 20

q_table_parameter_bins = [custom_parameter_bins] * numter_of_state_parameter
q_table_size = q_table_parameter_bins + [number_of_possible_actions,]


state_sample = env.observation_space.sample()

env.observation_space.high[1] = 100
env.observation_space.low[1] = -100

env.observation_space.high[3] = 100
env.observation_space.low[3] = -100

state_parameter_range = env.observation_space.high - env.observation_space.low
state_parameters_dividor = state_parameter_range / q_table_parameter_bins 

print(f'numter_of_state_parameter   : {numter_of_state_parameter}')
print(f'number_of_possible_actions  : {number_of_possible_actions}')
print(f'custom_parameter_bins       : {custom_parameter_bins}')
print(f'q_table_parameter_bins      : {q_table_parameter_bins}')
print(f'q_table_size                : {q_table_size}')

print(f'state_sample                : {state_sample}')
print(f'state_parameter_range       : {state_parameter_range}')
print(f'state_parameters_dividor    : {state_parameters_dividor}')

numter_of_state_parameter   : 4
number_of_possible_actions  : 2
custom_parameter_bins       : 20
q_table_parameter_bins      : [20, 20, 20, 20]
q_table_size                : [20, 20, 20, 20, 2]
state_sample                : [3.1538935e+00 1.2346671e+38 4.3703638e-02 2.1848769e+38]
state_parameter_range       : [  9.6        200.           0.83775806 200.        ]
state_parameters_dividor    : [ 0.48000002 10.          0.0418879  10.        ]


In [4]:
def make_discrete(state1):
    discrete_state1 = (state1 - env.observation_space.low)/state_parameters_dividor
    discrete_state1 = tuple(discrete_state1.astype(np.int)) 
    return(discrete_state1)

def get_q_array(state2):
    discrete_state2 = make_discrete(state2)
    q_array = q_table[discrete_state2]
    return(q_array)

In [5]:
q_table = np.random.uniform(low = -100, high = 100, size = q_table_size)

In [11]:
learning_rate = 0.1
discount = 0.95
episodes = 20000
show_every = episodes // 5

In [19]:
epsilon = 0
start_epsilon_decay_episode = 1
stop_epsilon_decay_episode = episodes // 1.4
epsilon_decay_value = epsilon / (start_epsilon_decay_episode - stop_epsilon_decay_episode) 

In [20]:
cnt_list = []
total_reward_list = []

for episode in range(episodes):
    done = False
    cnt = 1
    total_reward = 0
    current_state = env.reset()
    
    
    while not done:
        
       #Random Epsilon value setter & Get action
        if np.random.random() > epsilon:
            current_q_array = get_q_array(current_state)
            current_q_action = np.argmax(current_q_array)
        else:
            current_q_action = np.random.randint(low = 0, high = number_of_possible_actions)       

        new_state, reward, done, info = env.step(current_q_action)
        
        if episode % show_every == 0:
            env.render()
        
        if not done:
            max_future_q_array = get_q_array(new_state)
            max_future_q_value = np.max(max_future_q_array)
            current_q_value = q_table[make_discrete(current_state) + (current_q_action,)]
            
            new_q = (1 - learning_rate) * current_q_value + learning_rate * (reward + discount * max_future_q_value)
            
            q_table[make_discrete(current_state) + (current_q_action,)] = new_q
            total_reward += reward
            cnt += 1
            avg_reward = total_reward / cnt
        
        elif done and cnt < 195:
            q_table[make_discrete(current_state) + (current_q_action,)] = -200
         
        if done and cnt >= 195:
            if episode % show_every == 0:
                q_table[make_discrete(current_state) + (current_q_action,)] = 200
                print(f'Reached goal in episode : {episode} with count : {cnt}') 
                
        current_state = new_state
        
        cnt_list.append(cnt)
        total_reward_list.append(total_reward)        
        
    #Decrease epsilon by decay_values for the first half of the episodes
    if stop_epsilon_decay_episode >= episode >= start_epsilon_decay_episode:
        epsilon -= epsilon_decay_value

env.close()    

Reached goal in episode : 4000 with count : 200
Reached goal in episode : 8000 with count : 200
Reached goal in episode : 12000 with count : 200


In [21]:
max(cnt_list)

200

In [22]:
max(total_reward_list)

199.0