In [15]:
import numpy as np
import gymnasium as gym
import random
from tqdm import tqdm
import time
from matplotlib import pyplot as plt

In [16]:

class QLearn:
    def __init__(self,env:gym.Env):
        self.env = env
        self.obs_space: gym.spaces.Box = env.observation_space 

    def set_num_of_increments(self,num_of_increments: np.ndarray):
        self.num_of_increments: np.ndarray = num_of_increments
        self.increment: np.ndarray = self._obs_increment()

    
    def _obs_increment(self) -> np.ndarray:
        high = np.array([1.5, 1.5, 3, 2, 1.5, 3,1,1])
        low = np.array([-1.5,-1.5,-3,-2,-1.5,-3,0,0])
        # range = self.obs_space.high - self.obs_space.low
        range = high - low
        increment = range / self.num_of_increments
        return increment

    def regulate_obs(self, obs: np.ndarray,time_step:int = None)-> np.ndarray:
        float_num = obs // self.increment + (self.num_of_increments //2)
        float_num = float_num[:-2]
        bool_num = obs[-2:]

        low_limit = np.zeros(8)
        high_limit = self.num_of_increments -1

        reg_obs = np.concatenate((float_num,bool_num)).astype(int)
        reg_obs = reg_obs.clip(low_limit,high_limit).astype(int)
        if time_step:
            return self.add_time_dim(reg_obs,time_step)
        else:
            return reg_obs
    
    def add_time_dim(self,obs:np.ndarray, time_step: int):
        time_incr = 20
        time_step = min(4,time_step//time_incr)
        return np.insert(obs,0,time_step)


def create_Q_table(*dim):
    return np.zeros(dim)

#Bellman equation, Q
def Q_observed(s2_reward:np.float64, s2_obs: np.ndarray, q_table: np.ndarray, gamma:float):
    arg_max_Q = np.argmax(q_table[tuple(s2_obs)])
    return s2_reward + gamma * arg_max_Q

def Q_observed(reward:np.float64, s2_obs: np.ndarray, q_table: np.ndarray, gamma:float):
    arg_max_Q = np.argmax(q_table[tuple(s2_obs)])
    return reward + gamma * arg_max_Q

def Q_expected(s1_obs: np.ndarray,s1_action: np.int64, q_table: np.ndarray):
    index = *s1_obs , s1_action
    return q_table[index]

def update_Q(Q_observed: float,Q_expected: float, alpha: float, q_table: np.ndarray,s1_obs: np.ndarray,s1_action: np.int64):
    index = *s1_obs , s1_action
    q_table[index] = Q_expected + alpha * (Q_observed - Q_expected)
    return q_table

def action_policy(q_table: np.ndarray, s1_obs: np.ndarray):
    return np.argmax(q_table[tuple(s1_obs)])

def reform_reward(reward,observation):
    # min_y_speed = -0.7
    # if observation[3] < min_y_speed:
    #     y_speed=observation[3]
    # else:
    #     y_speed = 0

    # min_x_speed = 1.5
    # if abs(observation[3]) < min_x_speed:
    #     y_speed=abs(observation[3])
    # else:
    #     y_speed = 0
    # reward = reward - 2.5*(y_speed**2) -5*(observation[5]**2) - 2.5*(y_speed**2)
    # # - 5*((observation[1]-1.5)*observation[3])
    return reward

def expo_decay(large_epsilon, small_epsilon, epoch, steps):
    a = large_epsilon
    b = small_epsilon
    e = np.e
    z = 1- steps/epoch
    return z*((a-b)/e)*(e**z)+b

In [17]:
time_incre = 5
load_data = True
states = np.array([10,10,10,10,10,10,2,2])
if load_data:
    q_table = np.load("q_table_60M.npy")
else:
    q_table = create_Q_table(*states,4)

In [18]:

env = gym.make("LunarLander-v2")
observation, info = env.reset()
max_alpha = 0.1
min_alpha = 0.0001
# max_alpha = 0.01
# min_alpha = 0.01
gamma = 0.55
max_epsilon = 0.3
min_epsilon = 0.01
# max_epsilon = 0.1
# min_epsilon = 0.1
epoch = 60_000_000

agent = QLearn(env)
agent.set_num_of_increments(states)
game_step = 0
obs = agent.regulate_obs(observation)



### Train model

In [14]:
try:
    for _ in tqdm(range(epoch)):
        epsilon = expo_decay(max_epsilon,min_epsilon,epoch,_)
        alpha = expo_decay(max_alpha,min_alpha,epoch,_)
        policy_action = action_policy(q_table,obs)
        action = policy_action if random.random() > epsilon else env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        new_obs = agent.regulate_obs(observation)
        reward = reform_reward(reward,observation)

        q_observe = Q_observed(reward,new_obs,q_table,gamma)
        q_expected = Q_expected(obs,action,q_table)

        update_Q(q_observe,q_expected,alpha,q_table,obs,action)

        if terminated or truncated:
            observation, info = env.reset()
            obs = agent.regulate_obs(observation)
        else:
            obs = new_obs
except KeyboardInterrupt:
    env.close()


  0%|          | 26206/60000000 [00:10<6:22:35, 2612.61it/s]


In [None]:
from playsound import playsound
playsound('./yakemashita.wav')

#### Render Lunar Lander

In [19]:
try:
    env = gym.make("LunarLander-v2",render_mode="human")
    observation, info = env.reset()
    obs = agent.regulate_obs(observation)
    reward = 0

    while True:
        game_step += 1
        action = action_policy(q_table,obs)
        observation, reward, terminated, truncated, info = env.step(action)
        new_obs = agent.regulate_obs(observation)

        if terminated or truncated:
            observation, info = env.reset()
            game_step = 0
            obs = agent.regulate_obs(observation)
        else:
            obs = new_obs
except KeyboardInterrupt:
    env.close()

#### Check surivial rate

In [21]:
print("Start test")
try:
    env = gym.make("LunarLander-v2")
    observation, info = env.reset()

    obs = agent.regulate_obs(observation,0)


    term_count = 0
    survival_count = 0
    for _ in range(1000000):
        game_step += 1

        policy_action = action_policy(q_table,obs)
        action = policy_action
        observation, reward, terminated, truncated, info = env.step(action)
        new_obs = agent.regulate_obs(observation)

        if terminated or truncated:
            observation, info = env.reset()
            game_step = 0
            obs = agent.regulate_obs(observation)
            term_count += 1
            if reward > 99:
                survival_count +=1
                print(f"\rterm: {term_count}, survival rate: {survival_count/term_count}",end="")
        else:
            obs = new_obs
    print(f"Final rate {100000/term_count}")
    env.close()
except KeyboardInterrupt:
    env.close()

Start test
term: 635, survival rate: 0.08976377952755905

In [41]:
np.save("./q_table_60M",q_table)

In [86]:
env.close

<bound method Wrapper.close of <TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v2>>>>>>