In [None]:
import gym
import numpy as np
import random
import math
import matplotlib.pyplot as plt

In [None]:
# Create the CartPole environment used throughout the notebook.
env = gym.make('CartPole-v0')

# Show the environment's own documentation in the cell output.
# `unwrapped` skips any wrapper layers that gym.make() puts around the
# underlying environment, and print() works outside IPython as well
# (unlike the `?obj` introspection syntax).
print(env.unwrapped.__doc__)

In [None]:
# Number of discrete buckets for each observation dimension; a dimension
# with a single bucket is effectively ignored by the Q-table.
buckets = (1, 1, 6, 3)

# Number of discrete actions offered by the environment.
actions = env.action_space.n

# Position of the action axis in the Q-table (it follows the state axes).
action_index = len(buckets)

# Filled by the training loop with one value per time step.
rewards = []

# (lower, upper) limits for every observation dimension, as reported by
# the environment.
state_value_bounds = [
    (lower, upper)
    for lower, upper in zip(env.observation_space.low, env.observation_space.high)
]

# Replace the environment's limits for dimensions 1 and 3 with narrower,
# hand-picked ranges so that bucketing is meaningful.
# NOTE(review): in CartPole these are presumably the cart velocity and the
# pole angular velocity -- confirm against the Gym documentation.
state_value_bounds[1] = [-0.5, 0.5]
state_value_bounds[3] = [math.radians(-50), math.radians(50)]

In [None]:
# Build the Q-table: one axis per discretised state dimension followed by
# one axis for the action, with every entry starting at zero.
Q_table = np.zeros(shape=(*buckets, actions))
Q_table

In [None]:
# --- Episode / step budget -------------------------------------------------
max_episodes = 800      # upper bound on the number of training episodes
max_time_steps = 200    # upper bound on the steps taken inside one episode

# --- Stopping criterion ----------------------------------------------------
solved_time = 199       # an episode counts towards the streak if it ends at this step index or later
streak_to_end = 120     # training stops once the streak counter exceeds this value
no_streaks = 0          # running count of consecutive qualifying episodes

# --- Q-learning parameters -------------------------------------------------
discount = 0.99             # weight applied to the best next-state Q-value
min_explore_rate = 0.01     # floor for the decaying exploration rate
min_learning_rate = 0.1     # floor for the decaying learning rate

In [None]:
def select_action(state_value, explore_rate):
    """Pick the next action with an epsilon-greedy rule.

    Args:
        state_value: Tuple of bucket indexes used to index the module-level
            ``Q_table``.
        explore_rate: Probability threshold; a uniform random draw below
            it triggers exploration.

    Returns:
        A random sample from ``env.action_space`` when exploring, otherwise
        the index of the largest Q-value stored for ``state_value``.
    """
    exploring = random.random() < explore_rate
    if exploring:
        return env.action_space.sample()
    return np.argmax(Q_table[state_value])


def select_explore_rate(x):
    """Return the exploration rate for step/episode index ``x``.

    The rate follows ``1 - log10((x + 1) / 25)``, capped above at 1 and
    bounded below by the module-level ``min_explore_rate``.
    """
    decayed = 1.0 - math.log10((x+1)/25)
    capped = min(1, decayed)
    return max(min_explore_rate, capped)


def select_learning_rate(x):
    """Return the learning rate for step/episode index ``x``.

    The rate follows ``1 - log10((x + 1) / 25)``, capped above at 0.5 and
    bounded below by the module-level ``min_learning_rate``.
    """
    decayed = 1.0 - math.log10((x+1)/25)
    capped = min(0.5, decayed)
    return max(min_learning_rate, capped)


def bucketize_state_value(state_value):
    """Convert a continuous observation into a tuple of bucket indexes.

    Each component is compared with its limits in the module-level
    ``state_value_bounds``: values at or below the lower limit go to
    bucket 0, values at or above the upper limit go to the last bucket, and
    values in between are mapped linearly onto ``0 .. buckets[i] - 1`` and
    rounded to the nearest index.

    Args:
        state_value: Indexable sequence of numeric observation components.

    Returns:
        Tuple with one integer bucket index per component.
    """

    def bucket_for(position, value):
        # Clamp anything outside the configured range to the edge buckets.
        lower = state_value_bounds[position][0]
        if value <= lower:
            return 0
        upper = state_value_bounds[position][1]
        if value >= upper:
            return buckets[position] - 1

        # Linear map of [lower, upper] onto [0, last bucket index].
        span = upper - lower
        shift = (buckets[position]-1)*lower/span
        scale = (buckets[position]-1)/span
        return int(round(scale*value - shift))

    return tuple(
        bucket_for(position, value)
        for position, value in enumerate(state_value)
    )

In [None]:
# Train the agent with tabular Q-learning, one episode per outer iteration.
for episode_no in range(max_episodes):
    # Both rates are chosen once per episode and stay fixed for all its steps.
    explore_rate = select_explore_rate(episode_no)
    learning_rate = select_learning_rate(episode_no)

    # Start a fresh episode and discretise its initial observation.
    observation = env.reset()

    start_state_value = bucketize_state_value(observation)
    previous_state_value = start_state_value

    for time_step in range(max_time_steps):
        env.render()
        selected_action = select_action(previous_state_value, explore_rate)
        observation, reward_gain, completed, _ = env.step(selected_action)
        state_value = bucketize_state_value(observation)

        # Q-learning update: move Q(s, a) towards
        # reward + discount * max_a' Q(s', a').
        # NOTE(review): the bootstrap term is applied even when `completed`
        # is true; confirm whether terminal transitions should be treated
        # differently.
        best_Q_value = np.amax(Q_table[state_value])
        q_index = previous_state_value + (selected_action,)
        Q_table[q_index] += learning_rate * (
            reward_gain + discount * (best_Q_value) - Q_table[q_index])

        print('Episode_number : %d' % episode_no)
        print('Time_step : %d' % time_step)
        print('Selection_action : %d' % selected_action)
        print('Current_state : %s' % str(state_value))
        print('Reward  : %f' % reward_gain)
        print('Best Q_value : %f' % best_Q_value)
        print('Learning_rate : %f' % learning_rate)
        print('Explore_rate : %f' % explore_rate)
        print('Streak_number : %d' % no_streaks)
        # Despite its name, `rewards` collects the best next-state Q-value
        # seen at every step.
        rewards.append(best_Q_value)

        if completed:
            # time_step is a zero-based index, so the episode lasted
            # time_step + 1 steps; it is an integer, hence %d.
            print('Episode %d finished after %d time steps' % (episode_no, time_step + 1))

            # Extend the streak only when the episode reached the required
            # step index; any shorter episode resets it.
            if time_step >= solved_time:
                no_streaks += 1
            else:
                no_streaks = 0
            break

        previous_state_value = state_value

    # Stop training once enough consecutive episodes have qualified.
    if no_streaks > streak_to_end:
        break
        

In [None]:
# Display the learned Q-table so it can be compared with the all-zero table shown after initialisation.
# If the training cell is still running, interrupt the kernel first and then run this cell.
Q_table 

In [None]:
# Display the results: one point per training step (across all episodes),
# showing the best next-state Q-value recorded at that step.
# If the training cell is still running, interrupt the kernel first and then run this cell.
plt.plot(rewards)
plt.title("Best Q_value")
plt.xlabel("Training step (all episodes)")
plt.ylabel("Best Q-value of next state")
plt.show()
