In [6]:
import numpy as np
import gymnasium as gym
import random

In [7]:

class QLearn:
    def __init__(self,env:gym.Env):
        self.env = env
        self.obs_space: gym.spaces.Box = env.observation_space 
        self.reward_list = []
        self.obs_list = []
        self.action_list = []

    def set_num_of_increments(self,num_of_increments: np.ndarray):
        self.num_of_increments: np.ndarray = num_of_increments
        self.increment: np.ndarray = self._obs_increment()

    
    def _obs_increment(self) -> np.ndarray:
        high = np.array([1.5,1.5,3,3,1.5,3,1,1])
        low = np.array([-1.5,-1.5,-3,-3,-1.5,-3,0,0])
        # range = self.obs_space.high - self.obs_space.low
        range = high - low
        increment = range / self.num_of_increments
        return increment

    def regulate_obs(self, obs: np.ndarray,time_step:int = None)-> np.ndarray:
        float_num = obs // self.increment + (self.num_of_increments //2)
        float_num = float_num[:-2]
        bool_num = obs[-2:]

        low_limit = np.zeros(8)
        high_limit = self.num_of_increments -1

        reg_obs = np.concatenate((float_num,bool_num)).astype(int)
        reg_obs = reg_obs.clip(low_limit,high_limit).astype(int)
        if time_step:
            return self.add_time_dim(reg_obs,time_step)
        else:
            return reg_obs
    
    # def add_time_dim(self,obs:np.ndarray, time_step: int):
    #     time_incr = 20
    #     time_step = min(4,time_step//time_incr)
    #     return np.insert(obs,0,time_step)


    # #Bellman equation, Q
    # def _Q_observed(s2_reward:np.float64, s2_obs: np.ndarray, q_table: np.ndarray, gamma:float):
    #     arg_max_Q = np.argmax(q_table[tuple(s2_obs)])
    #     return s2_reward + gamma * arg_max_Q


    def create_Q_table(self,*dim):
        self.q_table = np.zeros(dim)

    def _update_list(self,items:list,new_item,max_cap:int):
        if len(items) < max_cap:
            items.append(new_item)
        else:
            items.append(new_item)
            items.pop(0)
        return items

    def _Q_observed(self, gamma:float):
        reward_sum = 0
        for i in range(len(self.reward_list)):
            reward_sum += self.reward_list[0]*(gamma**i)
            
        new_obs = self.obs_list[-1]
        arg_max_Q = np.argmax(self.q_table[tuple(new_obs)])
        return reward_sum + (gamma**len(self.reward_list)) * arg_max_Q

    def _Q_expected(self):
        index = *self.obs_list[0] , self.action_list[0]
        return self.q_table[index]

    def update_Q(self,obs:np.ndarray,reward:np.float64,alpha,action,gamma,terminate:bool=False):
        if not terminate:
            self.obs_list = self._update_list(self.obs_list,obs,8)
            self.reward_list = self._update_list(self.reward_list,reward,7)
            self.action_list = self._update_list(self.action_list,action,8)
            Q_observed = self._Q_observed(gamma)
            Q_expected = self._Q_expected()

            index = *self.obs_list[0] , self.action_list[0]
            self.q_table[index] = Q_expected + alpha * (Q_observed - Q_expected)
        else:
            for _ in range(len(reward)):
                self.update_Q(obs,reward,alpha,action,gamma)



    def action_policy(self, s1_obs: np.ndarray):
        action =  np.argmax(self.q_table[tuple(s1_obs)])
        self.action_list = self._update_list(self.action_list,action,4)
        return action


def expo_decay(large_epsilon, small_epsilon, epoch, steps):
    a = large_epsilon
    b = small_epsilon
    e = np.e
    z = 1- steps/epoch
    return z*((a-b)/e)*(e**z)+b







In [8]:
env = gym.make("LunarLander-v2")
agent = QLearn(env)

time_incre = 5
states = np.array([10,10,10,10,10,10,2,2])
# q_table = create_Q_table(time_incre,*states,4)
agent.create_Q_table(*states,4)
agent.q_table.shape

(10, 10, 10, 10, 10, 10, 2, 2, 4)

In [9]:

env = gym.make("LunarLander-v2")
observation, info = env.reset()
max_alpha = 0.1
min_alpha = 0.001
gamma = 0.9
# epsilon = 0.1
max_epsilon = 0.3
min_epsilon = 0.01
# num_of_increments = 10
epoch = 4_000_000
items = []

agent.set_num_of_increments(states)
game_step = 0
obs = agent.regulate_obs(observation)



In [10]:
for _ in range(epoch):
    epsilon = expo_decay(max_epsilon,min_epsilon,epoch,_)
    alpha = expo_decay(max_alpha,min_alpha,epoch,_)
    policy_action = agent.action_policy(obs)
    action = policy_action if random.random() > epsilon else env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    agent.update_Q(obs,reward,alpha,action,gamma)

    if terminated or truncated:
        observation, info = env.reset()
        obs = agent.regulate_obs(observation)
    else:
        obs = new_obs

    if _ % 100_000 == 0:
        print(_)


0


100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000


In [11]:
from playsound import playsound
playsound('./yakemashita.wav')


    Error 259 for command:
        play ./yakemashita.wav wait
    The driver cannot recognize the specified command parameter.


PlaysoundException: 
    Error 259 for command:
        play ./yakemashita.wav wait
    The driver cannot recognize the specified command parameter.

In [12]:
env = gym.make("LunarLander-v2",render_mode="human")
observation, info = env.reset()
obs = agent.regulate_obs(observation)



for _ in range(epoch):
    game_step += 1
    action = agent.action_policy(obs)
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    if terminated or truncated:
        observation, info = env.reset()
        game_step = 0
        obs = agent.regulate_obs(observation)
    else:
        obs = new_obs

KeyboardInterrupt: 

In [9]:
env = gym.make("LunarLander-v2")
observation, info = env.reset()

obs = agent.regulate_obs(observation,0)


term_count = 0
survival_count = 0
for _ in range(100000):
    game_step += 1

    policy_action = agent.action_policy(obs)
    action = policy_action
    observation, reward, terminated, truncated, info = env.step(action)
    new_obs = agent.regulate_obs(observation)

    if terminated or truncated:
        observation, info = env.reset()
        game_step = 0
        obs = agent.regulate_obs(observation)
        term_count += 1
        if reward > 99:
            survival_count +=1
            print(f"term: {term_count}, survival rate: {survival_count/term_count}")
    else:
        obs = new_obs
print(100000/term_count)

term: 47, survival rate: 0.02127659574468085
term: 125, survival rate: 0.016
term: 257, survival rate: 0.011673151750972763
term: 408, survival rate: 0.00980392156862745
term: 490, survival rate: 0.01020408163265306
term: 540, survival rate: 0.011111111111111112
176.67844522968198


In [None]:
env.close

<bound method Wrapper.close of <TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v2>>>>>>

array([20, 20, 20, 20, 20, 20,  2,  2])