In [26]:
import numpy as np
import gymnasium as gym
import random

In [27]:

class QLearn:
    def __init__(self,env:gym.Env):
        self.env = env
        self.obs_space: gym.spaces.Box = env.observation_space 

    def set_num_of_increments(self,num_of_increments: np.ndarray):
        self.num_of_increments: np.ndarray = num_of_increments
        self.increment: np.ndarray = self._obs_increment()

    
    def _obs_increment(self) -> np.ndarray:
        high = np.array([1.5,1.5,3,3,1.5,3,1,1])
        low = np.array([-1.5,-1.5,-3,-3,-1.5,-3,0,0])
        # range = self.obs_space.high - self.obs_space.low
        range = high - low
        increment = range / self.num_of_increments
        return increment

    def regulate_obs(self, obs: np.ndarray,time_step:int = None)-> np.ndarray:
        float_num = obs // self.increment + (self.num_of_increments //2)
        float_num = float_num[:-2]
        bool_num = obs[-2:]

        low_limit = np.zeros(8)
        high_limit = self.num_of_increments -1

        reg_obs = np.concatenate((float_num,bool_num)).astype(int)
        reg_obs = reg_obs.clip(low_limit,high_limit).astype(int)
        if time_step:
            return self.add_time_dim(reg_obs,time_step)
        else:
            return reg_obs
    
    def add_time_dim(self,obs:np.ndarray, time_step: int):
        time_incr = 20
        time_step = min(4,time_step//time_incr)
        return np.insert(obs,0,time_step)


def create_Q_table(*dim):
    return np.zeros(dim)

#Bellman equation, Q
def Q_observed(s2_reward:np.float64, s2_obs: np.ndarray, q_table: np.ndarray, gamma:float):
    arg_max_Q = np.argmax(q_table[tuple(s2_obs)])
    return s2_reward + gamma * arg_max_Q

def Q_observed(reward:np.float64, s2_obs: np.ndarray, q_table: np.ndarray, gamma:float):
    arg_max_Q = np.argmax(q_table[tuple(s2_obs)])
    return reward + gamma * arg_max_Q

def Q_expected(s1_obs: np.ndarray,s1_action: np.int64, q_table: np.ndarray):
    index = *s1_obs , s1_action
    return q_table[index]

def update_Q(Q_observed: float,Q_expected: float, alpha: float, q_table: np.ndarray,s1_obs: np.ndarray,s1_action: np.int64):
    index = *s1_obs , s1_action
    q_table[index] = Q_expected + alpha * (Q_observed - Q_expected)
    return q_table

def action_policy(q_table: np.ndarray, s1_obs: np.ndarray):
    return np.argmax(q_table[tuple(s1_obs)])


def expo_decay(large_epsilon, small_epsilon, epoch, steps):
    a = large_epsilon
    b = small_epsilon
    e = np.e
    z = 1- steps/epoch
    return z*((a-b)/e)*(e**z)+b







In [28]:
time_incre = 5
states = np.array([10,10,10,10,10,10,2,2])
# q_table = create_Q_table(time_incre,*states,4)
q_table = create_Q_table(*states,4)

In [29]:
env = gym.make("LunarLander-v2")
agent = QLearn(env)

sample = env.observation_space.sample()
print(sample)
agent.set_num_of_increments(states)
print(agent.regulate_obs(sample,70))


[ 0.8863708   0.18356812  4.439961    4.8638315  -1.8767257  -0.27511042
  0.38076794  0.08213469]
[3 7 5 9 9 0 4 0 0]


In [30]:

env = gym.make("LunarLander-v2")
observation, info = env.reset()
max_alpha = 0.1
min_alpha = 0.001
gamma = 0.55
# epsilon = 0.1
max_epsilon = 0.3
min_epsilon = 0.001
# num_of_increments = 10
epoch = 4_000_000

agent = QLearn(env)
agent.set_num_of_increments(states)
game_step = 0
obs = agent.regulate_obs(observation)



In [31]:
for _ in range(epoch):
    epsilon = expo_decay(max_epsilon,min_epsilon,epoch,_)
    alpha = expo_decay(max_alpha,min_alpha,epoch,_)
    policy_action = action_policy(q_table,obs)
    action = policy_action if random.random() > epsilon else env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    q_observe = Q_observed(reward,new_obs,q_table,gamma)
    q_expected = Q_expected(obs,action,q_table)

    update_Q(q_observe,q_expected,alpha,q_table,obs,action)

    if terminated or truncated:
        observation, info = env.reset()
        obs = agent.regulate_obs(observation)
    else:
        obs = new_obs

    if _ % 100_000 == 0:
        print(_)


0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000


In [32]:
from playsound import playsound
playsound('./yakemashita.wav')

In [33]:
env = gym.make("LunarLander-v2",render_mode="human")
observation, info = env.reset()
obs = agent.regulate_obs(observation)



for _ in range(epoch):
    game_step += 1
    action = action_policy(q_table,obs)
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    if terminated or truncated:
        observation, info = env.reset()
        game_step = 0
        obs = agent.regulate_obs(observation)
    else:
        obs = new_obs

KeyboardInterrupt: 

In [37]:
env = gym.make("LunarLander-v2")
observation, info = env.reset()

obs = agent.regulate_obs(observation,0)


term_count = 0
survival_count = 0
for _ in range(1000000):
    game_step += 1

    policy_action = action_policy(q_table,obs)
    action = policy_action
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    if terminated or truncated:
        observation, info = env.reset()
        game_step = 0
        obs = agent.regulate_obs(observation)
        term_count += 1
        if reward > 99:
            survival_count +=1
            print(f"term: {term_count}, survival rate: {survival_count/term_count}")
    else:
        obs = new_obs
print(100000/term_count)

term: 1, survival rate: 1.0
term: 3, survival rate: 0.6666666666666666
term: 30, survival rate: 0.1
term: 47, survival rate: 0.0851063829787234
term: 55, survival rate: 0.09090909090909091
term: 68, survival rate: 0.08823529411764706
term: 104, survival rate: 0.0673076923076923
term: 116, survival rate: 0.06896551724137931
term: 120, survival rate: 0.075
term: 132, survival rate: 0.07575757575757576
term: 151, survival rate: 0.0728476821192053
term: 154, survival rate: 0.07792207792207792
term: 169, survival rate: 0.07692307692307693
term: 189, survival rate: 0.07407407407407407
term: 203, survival rate: 0.07389162561576355
term: 207, survival rate: 0.07729468599033816
term: 211, survival rate: 0.08056872037914692
term: 226, survival rate: 0.07964601769911504
term: 234, survival rate: 0.0811965811965812
term: 249, survival rate: 0.08032128514056225
term: 293, survival rate: 0.07167235494880546
term: 309, survival rate: 0.07119741100323625
term: 325, survival rate: 0.07076923076923076
t

In [None]:
env.close

<bound method Wrapper.close of <TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v2>>>>>>

array([20, 20, 20, 20, 20, 20,  2,  2])