In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gymnasium 
import soulsgym
import logging
import numpy as np
from sklearn.preprocessing import normalize
from transformation import GameStateTransformer
import gym, random, pickle, os.path, math, glob
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import os
print(torch.cuda.is_available())

True


In [2]:
class Nstep_Memory_Buffer(object):
    """Fixed-capacity circular replay buffer that samples n-step transitions.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples in insertion order. Once ``memory_size`` transitions are stored,
    new ones overwrite the oldest slot.

    Args:
        memory_size: Maximum number of transitions kept.
        n_multi_step: Number of consecutive transitions folded into one sample.
        gamma: Discount factor applied to the rewards inside a sample window.
    """

    def __init__(self, memory_size=1000, n_multi_step = 1, gamma = 0.99):
        self.buffer = []
        self.memory_size = memory_size
        self.n_multi_step = n_multi_step
        self.gamma = gamma
        self.next_idx = 0  # slot that push() overwrites once the buffer is full

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when full."""
        data = (state, action, reward, next_state, done)
        # Strict '<': with '<=' the list grew to memory_size + 1 entries and
        # the extra slot was never overwritten again.
        if len(self.buffer) < self.memory_size:
            self.buffer.append(data)
        else:
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.memory_size

    def sample(self, batch_size):
        """Draw ``batch_size`` n-step samples uniformly (with replacement).

        Each sample takes a window of ``n_multi_step`` consecutive buffer
        entries, uses the first entry's state/action, sums the discounted
        rewards, and stops early at the first entry flagged ``done``.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` where
            ``states`` / ``next_states`` are the per-sample arrays joined with
            ``np.concatenate`` and the other three are Python lists.

        Raises:
            ValueError: If fewer than ``n_multi_step`` transitions are stored.
        """
        stored = self.size()
        if stored < self.n_multi_step:
            raise ValueError(
                "need at least {} stored transitions to sample, have {}".format(
                    self.n_multi_step, stored))

        states, actions, rewards, next_states, dones = [], [], [], [], []
        for _ in range(batch_size):
            # `finish` is an exclusive slice end, so it may equal `stored`;
            # stopping at stored - 1 would never sample the newest transition.
            finish = random.randint(self.n_multi_step, stored)
            begin = finish - self.n_multi_step
            window = self.buffer[begin:finish]

            sum_reward = 0  # discounted n-step reward
            for j, (_, _, step_reward, step_next_state, step_done) in enumerate(window):
                sum_reward += (self.gamma ** j) * step_reward
                states_look_ahead = step_next_state
                done_look_ahead = bool(step_done)
                if done_look_ahead:
                    # Episode ended inside the window: do not look past it.
                    break

            states.append(window[0][0])
            actions.append(window[0][1])
            rewards.append(sum_reward)
            next_states.append(states_look_ahead)
            dones.append(done_look_ahead)

        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [3]:
class DQN(nn.Module):
    """Dueling Q-network with separate advantage and state-value streams.

    Both streams are 4-layer MLPs with ReLU activations that read the raw
    input directly; their outputs are combined in ``forward``.

    Args:
        input_dims: Size of the input feature vector.
        output_dims: Number of actions (width of the advantage stream).
        layer_dims: Width of every hidden layer.
    """

    def __init__(self,input_dims,output_dims,layer_dims):
        super(DQN, self).__init__()

        # Build order (advantage, then value) and the Sequential layout are
        # kept so parameter names and initialisation order stay the same.
        self.advantage = self._make_stream(input_dims, layer_dims, output_dims)
        self.value = self._make_stream(input_dims, layer_dims, 1)

    @staticmethod
    def _make_stream(input_dims, layer_dims, out_dims):
        """Return a 3-hidden-layer ReLU MLP mapping input_dims -> out_dims."""
        return nn.Sequential(
            nn.Linear(input_dims, layer_dims),
            nn.ReLU(),
            nn.Linear(layer_dims, layer_dims),
            nn.ReLU(),
            nn.Linear(layer_dims, layer_dims),
            nn.ReLU(),
            nn.Linear(layer_dims, out_dims)
        )

    def forward(self, x):
        """Compute the forward pass of the network.

        Args:
            x: Network input; the last dimension must be ``input_dims``.

        Returns:
            Q-values with the same leading dimensions as ``x`` and a last
            dimension of ``output_dims``.
        """
        advantage = self.advantage(x)
        value     = self.value(x)
        # Centre the advantages per sample, over the action axis only. A plain
        # .mean() averages over the whole batch as well, which makes each
        # sample's Q-values depend on the other rows in the batch.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)

In [4]:
class Nstep_DQNAgent:
    """Epsilon-greedy agent trained with n-step double-DQN targets.

    The agent owns an online network (``DQN``), a target network
    (``DQN_target``), an RMSprop optimizer bound to the online network, and an
    ``Nstep_Memory_Buffer``. Both networks are built with 20 outputs and
    128-wide hidden layers.

    Args:
        in_channels: Length of one flattened state vector.
        action_space: Object exposing ``.n`` (number of actions); only used
            for unmasked random exploration.
        USE_CUDA: Run on the GPU when one is available.
        memory_size: Replay buffer capacity.
        n_multi_step: Number of steps folded into each training target.
        gamma: Discount factor.
        epsilon: Default exploration probability for ``act``.
        lr: RMSprop learning rate.
    """

    def __init__(self, in_channels = 71, action_space = [], USE_CUDA =True, memory_size = 100000, n_multi_step = 4, gamma = 0.99, epsilon  = 1, lr = 1e-3):
        self.epsilon = epsilon
        self.action_space = action_space
        self.in_channels = in_channels
        self.n_multi_step = n_multi_step
        self.gamma = gamma
        self.memory_buffer = Nstep_Memory_Buffer(memory_size, n_multi_step = n_multi_step, gamma = gamma)
        self.DQN = DQN(in_channels, 20,128)
        self.DQN_target = DQN(in_channels, 20, 128)
        self.DQN_target.load_state_dict(self.DQN.state_dict())

        # Fall back to the CPU instead of crashing when CUDA was requested but
        # is not available; every tensor below is placed via self.device.
        self.USE_CUDA = bool(USE_CUDA) and torch.cuda.is_available()
        self.device = torch.device("cuda" if self.USE_CUDA else "cpu")
        self.DQN = self.DQN.to(self.device)
        self.DQN_target = self.DQN_target.to(self.device)
        self.optimizer = optim.RMSprop(self.DQN.parameters(),lr=lr, eps=0.001, alpha=0.95)

    def value(self, state):
        """Return the online network's Q-values for ``state``.

        ``state`` may be a NumPy array or a tensor; it is converted to a
        float32 tensor on the agent's device.
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        return self.DQN(state)

    def act(self, state, epsilon = None,action_mask = None):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is drawn, otherwise the
        action with the highest Q-value is taken. When ``action_mask`` is
        given (truthy entries mark allowed actions) both branches only choose
        among allowed actions.

        Args:
            state: Single state vector.
            epsilon: Exploration probability; defaults to ``self.epsilon``.
            action_mask: Optional per-action mask; ``None`` allows everything.

        Returns:
            The chosen action index (``int`` when random, ``np.int64`` when
            greedy).
        """
        if epsilon is None:
            epsilon = self.epsilon

        allowed = None
        if action_mask is not None:
            allowed = np.asarray(action_mask, dtype=bool).reshape(-1)

        if random.random() < epsilon:
            if allowed is not None and allowed.any():
                # Explore only among the actions the mask permits.
                return int(random.choice(np.flatnonzero(allowed).tolist()))
            return random.randrange(self.action_space.n)

        with torch.no_grad():
            q_values = self.value(state).reshape(-1)
        if allowed is not None:
            mask = torch.as_tensor(allowed, device=q_values.device)
            q_values = q_values.masked_fill(~mask, float("-inf"))
        return np.int64(q_values.argmax().item())

    def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma=0.99):
        """Compute the smooth-L1 loss against n-step double-DQN targets.

        Args:
            states: Batch of start states; reshaped to ``(-1, in_channels)``.
            actions: Action taken in each start state.
            rewards: Discounted n-step reward of each sample.
            next_states: State reached at the end of each window; reshaped
                like ``states``.
            is_done: Whether the episode ended inside each window.
            gamma: Accepted for backward compatibility and ignored; the
                bootstrap term uses ``self.gamma`` so it matches the discount
                the replay buffer applied to ``rewards``.

        Returns:
            Scalar loss tensor attached to the online network's graph.
        """
        device = self.device
        actions = torch.as_tensor(actions, dtype=torch.long, device=device)       # shape: [batch_size]
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)    # shape: [batch_size]
        is_done = torch.as_tensor(is_done, dtype=torch.bool, device=device)       # shape: [batch_size]
        # Reshape on every device (previously only done on the CUDA path) and
        # use the configured state size rather than a literal 71.
        states = torch.as_tensor(states, dtype=torch.float32, device=device).reshape(-1, self.in_channels)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=device).reshape(-1, self.in_channels)

        # Q-values of the actions that were actually taken.
        predicted_qvalues = self.DQN(states)
        predicted_qvalues_for_actions = predicted_qvalues.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Targets never receive gradients, so build them without a graph.
        with torch.no_grad():
            # Double DQN: the online net picks the next action, the target
            # net evaluates it.
            best_next_actions = self.DQN(next_states).argmax(dim=1, keepdim=True)
            next_state_values = self.DQN_target(next_states).gather(1, best_next_actions).squeeze(1)
            bootstrapped = rewards + (self.gamma ** self.n_multi_step) * next_state_values
            # If the episode ended inside the window there is nothing to
            # bootstrap from: the target is the accumulated reward alone.
            target_qvalues_for_actions = torch.where(is_done, rewards, bootstrapped)

        return F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions)

    def sample_from_buffer(self, batch_size):
        """Sample an n-step batch from the replay buffer as torch tensors.

        Delegates window selection and reward accumulation to
        ``self.memory_buffer.sample`` and converts the two state arrays with
        ``torch.from_numpy``; actions, rewards and dones stay Python lists.
        """
        states, actions, rewards, next_states, dones = self.memory_buffer.sample(batch_size)
        return torch.from_numpy(states), actions, rewards, torch.from_numpy(next_states), dones

    def learn_from_experience(self, batch_size):
        """Run one optimisation step.

        Returns:
            The loss value as a float, or ``0`` when the buffer does not yet
            hold more than ``batch_size`` transitions.
        """
        if self.memory_buffer.size() <= batch_size:
            return 0

        states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
        td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones)
        self.optimizer.zero_grad()
        td_loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters without a
        # gradient are skipped.
        nn.utils.clip_grad_value_(self.DQN.parameters(), 1.0)
        self.optimizer.step()
        return td_loss.item()

    def save_model(self,n):
        """Save both networks as ``./model{n}.pth`` and ``./model_target{n}.pth``."""
        eval_net_path = './model{}.pth'.format(str(n))
        target_net_path = './model_target{}.pth'.format(str(n))
        torch.save(self.DQN, eval_net_path)
        torch.save(self.DQN_target,target_net_path)

    def _restore_network(self, network, path):
        """Copy the weights stored at ``path`` into ``network`` in place."""
        # NOTE(security): torch.load unpickles arbitrary objects; only load
        # checkpoints from a trusted source. On PyTorch >= 2.6 the default
        # weights_only=True rejects the full-module files save_model writes.
        checkpoint = torch.load(path, map_location=self.device)
        if isinstance(checkpoint, nn.Module):
            checkpoint = checkpoint.state_dict()
        network.load_state_dict(checkpoint)

    def load_model(self,n):
        """Restore the networks saved by ``save_model(n)``.

        Weights are copied into the existing modules rather than rebinding
        ``self.DQN``: the optimizer holds references to the current
        parameters, so replacing the module would leave it updating a network
        that is no longer used. The target network is restored from its own
        checkpoint when that file exists, otherwise it mirrors the online
        network.
        """
        eval_net_path = './{}.pth'.format('model'+str(n))
        target_net_path = './model_target{}.pth'.format(str(n))
        self._restore_network(self.DQN, eval_net_path)
        if os.path.exists(target_net_path):
            self._restore_network(self.DQN_target, target_net_path)
        else:
            self.DQN_target.load_state_dict(self.DQN.state_dict())
        print('DQN:load_complete')

In [5]:
# --- Training configuration -------------------------------------------------
n_epoch = 10000
learning_start = 100000      # transitions required in the buffer before learning starts
batch_size  = 64
epsilon = 0.3                # fixed exploration rate (no decay is applied)
resume_epoch = 1000          # checkpoint index to resume from
checkpoint_interval = 200    # save the networks every N episodes
target_sync_interval = 100   # copy online -> target network every N episodes

env = gymnasium.make("SoulsGymIudex-v0")
env = env.unwrapped
terminated = False
action_space = env.action_space

N_ACTIONS = env.action_space.n  # number of selectable actions

tf_transformer = GameStateTransformer()


ep_r_list = []   # mean reward per step, one entry per episode
agent = Nstep_DQNAgent(71,action_space = action_space,memory_size = 100000)
agent.load_model(resume_epoch)
losses = 0
loss_list = []   # loss of every optimisation step
for i in range(resume_epoch + 1, n_epoch):
    if i % checkpoint_interval == 0:
        agent.save_model(i)
    terminated = False
    truncated = False
    obs, info = env.reset()
    obs = tf_transformer.transform(obs)
    phase = 1  # not read in this cell
    action_mask = np.zeros(20)
    action_mask[info["allowed_actions"]] = 1
    rewards = 0   # episode return
    step = 0
    losses = 0    # summed loss over this episode
    # Stop on truncation as well; stepping an already-finished episode is
    # not valid under the Gymnasium step API.
    while not (terminated or truncated):
        action = agent.act(obs,epsilon,action_mask)
        next_obs, reward, terminated, truncated, info = env.step(action)
        next_obs = tf_transformer.transform(next_obs)
        # Flag any episode end as done so n-step windows sampled from the
        # buffer never run across two episodes.
        agent.memory_buffer.push(obs, action, reward, next_obs, terminated or truncated)
        rewards += reward
        step += 1
        if agent.memory_buffer.size() >= learning_start:
            loss = agent.learn_from_experience(batch_size)
            losses += loss
            # `losses` is a number; the per-step history belongs in loss_list.
            loss_list.append(loss)
        obs = next_obs
        action_mask[:] = 0
        action_mask[info["allowed_actions"]] = 1

    # Sync on multiples of the interval; a bare `i % 100` is truthy for every
    # episode *except* those.
    if i % target_sync_interval == 0:
        agent.DQN_target.load_state_dict(agent.DQN.state_dict())

    print("epoch{} : reward:{}".format(i,rewards))

    # Average over the episode return, not the reward of the final step.
    ep_r_list.append(rewards/step)

DQN:load_complete
epoch1001 : reward:18.74274324351548
epoch1002 : reward:-38.62158752180408
epoch1003 : reward:42.559548805749984
epoch1004 : reward:-49.65065314519134
epoch1005 : reward:24.692362010611113
epoch1006 : reward:25.16082126847694
epoch1007 : reward:16.44805724356779
epoch1008 : reward:32.93027317686041
epoch1009 : reward:17.812362851072432
epoch1010 : reward:25.967454953814332
epoch1011 : reward:14.669130259886927
epoch1012 : reward:41.11546005300252
epoch1013 : reward:16.185787440204617
epoch1014 : reward:-60.48185199262736
epoch1015 : reward:42.1385990629135
epoch1016 : reward:0.9881573000443815
epoch1017 : reward:25.647440366492738
epoch1018 : reward:-86.9910033680811
epoch1019 : reward:-42.81604252867346
epoch1020 : reward:-6.978970362935746
epoch1021 : reward:2.421032751784942
epoch1022 : reward:40.625286742259675
epoch1023 : reward:27.230267040005522
epoch1024 : reward:8.550976689399555
epoch1025 : reward:47.515757996636594
epoch1026 : reward:-27.056468060828188
epo


KeyboardInterrupt



In [None]:
# Best-effort cleanup: report a failure instead of hiding it, and let
# KeyboardInterrupt / SystemExit propagate (a bare `except:` swallows both).
try:
    env.close()
except Exception as exc:
    print("env.close() failed: {!r}".format(exc))

In [None]:
# `all_rewards` is never defined in this notebook; the training cell collects
# its per-episode results in `ep_r_list`.
print(len(ep_r_list))

In [None]:
# file_path = "array.txt"

# with open(file_path, "a") as file:
#     for item in all_rewards:
#         file.write(str(item) + "\n")