base (Python 3.10.8) に接続されました

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as option
from torch.autograd import Variable

import numpy as np
import gymnasium as gym
import time
import copy

import settings
from replay_buffer import ReplayBuffer

#make Generator
rng = np.random.default_rng()

#make environment
env = gym.make('MountainCar-v0')
obs_num = env.observation_space.shape[0]
act_num = env.action_space.n

#make Neural Network
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(obs_num, settings.HIDDEN_SIZE)
        self.fc2 = nn.Linear(settings.HIDDEN_SIZE, settings.HIDDEN_SIZE)
        self.fc3 = nn.Linear(settings.HIDDEN_SIZE, settings.HIDDEN_SIZE)
        self.fc4 = nn.Linear(settings.HIDDEN_SIZE, act_num)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        y = F.relu(self.fc4(x))
        return y
        
def main():
    Q_train = NN()
    Q_target = copy.deepcopy(Q_train)
    optimizer = option.RMSprop(Q_train.parameters(), lr=0.00015, alpha=0.95, eps=0.01)  #最適化
    
    total_step = 0
    memory = ReplayBuffer(settings.BAFFER_SIZE)
    total_rewards = []
    
    #学習開始
    print("\t".join(["episode", "epsilon", "reward", "total_step", "time"]))
    start = time.time()
    
    for episode in range(settings.EPISODE_NUM):
        pobs, _ = env.reset()
        step = 0        #step
        done = False    #judge end game
        total_reward = 0   #累積報酬
        while not done and (step < settings.STEP_MAX):
            #行動選択(適当な行動値)
            act = env.action_space.sample()
            # ε-greedy法
            if rng.random() > settings.EPSILON:
                pobs_ = np.array(pobs, dtype="float32").reshape((1, obs_num))
                pobs_ = Variable(torch.from_numpy(pobs_))
                act = Q_train(pobs_)
                max, indices = torch.max(act.data, 1)  #valueとindicesが返ってくる
                act = indices.numpy()[0]

            #実行
            obs, reward, done, _, _ = env.step(act)
            
            # update reward 1
            if obs[0] > -0.2 :
                reward = 0.5
            
            # update reward 2
            if obs[0] > 0.5:
                reward = 100
            #add memory
            memory.add((pobs, act, reward, obs, done))  #次状態、行動、報酬、状態、エピソード終了判定をbufferに格納
            
            
            #学習
            if len(memory) == settings.BAFFER_SIZE:
                if total_step % settings.TRAIN_FREQ == 0:
                    for i in range(int(settings.BAFFER_SIZE/settings.BATCH_SIZE)):
                        batch = memory.sample(settings.BATCH_SIZE)
                        pobss = np.array([b[0] for b in batch], dtype="float32").reshape((settings.BATCH_SIZE, obs_num))
                        acts = np.array([b[1] for b in batch], dtype="int")
                        rewards = np.array([b[2] for b in batch], dtype="float32")
                        obss = np.array([b[3] for b in batch], dtype="float32").reshape((settings.BATCH_SIZE, obs_num))
                        dones = np.array([b[4] for b in batch], dtype="float32")
                        
                        #set y
                        pobss_ = Variable(torch.from_numpy(pobss))
                        q = Q_train(pobss_)
                        obss_ = Variable(torch.from_numpy(obss))
                        maxs, indices = torch.max(Q_target(obss_).data, 1)
                        maxq = maxs.numpy() #maxQ
                        target = copy.deepcopy(q.data.numpy())
                        for j in range(settings.BATCH_SIZE):
                            target[j, acts[j]] = rewards[j]+settings.GAMMA*maxq[j]*(not dones[j])    #教師信号
                        optimizer.zero_grad()
                        loss = nn.MSELoss()(q, Variable(torch.from_numpy(target)))
                        loss.backward()
                        optimizer.step()
                #Q関数の更新
                if total_step % settings.UPDATE_TARGET_Q_FREQ == 0:
                    Q_target = copy.deepcopy(Q_train)
            #εの減少
            if settings.EPSILON > settings.EPSILON_MIN and total_step > settings.START_REDUCE_EPSILON:
                settings.EPSILON -= settings.EPSILON_DECREASE
                
            #次の行動へ
            total_reward += reward
            step += 1
            total_step += 1
            pobs = obs       
            
        total_rewards.append(total_reward)  #累積報酬を記録
        
        if(episode + 1) % settings.LOG_FREQ == 0:
            r = sum(total_rewards[((episode + 1) - settings.LOG_FREQ):(episode + 1)])/settings.LOG_FREQ
            elapsed_time = time.time() - start
            print("\t".join(map(str, [episode + 1, settings.EPSILON, r, total_step, str(elapsed_time)+"[sec]"])))
            start = time.time()
            

            
                        



if __name__ == "__main__":
    main()            

episode	epsilon	reward	total_step	time
1000	0.0999999999999992	-200.0	200000	288.44751715660095[sec]
2000	0.0999999999999992	-200.0	400000	289.387996673584[sec]
3000	0.0999999999999992	-200.0	600000	290.02379989624023[sec]
4000	0.0999999999999992	-200.0	800000	289.3160035610199[sec]
5000	0.0999999999999992	-200.0	1000000	290.0819492340088[sec]


base (Python 3.10.8) に接続されました

In [1]:
import gymnasium as gym
import math
import random
import numpy as np
from collections import deque
import torch
from torch import nn
import torch.nn.functional as F

GAMMA = 0.95
LEARNING_RATE = 0.001
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 1000
TOTAL_TS = 10000
SYNC_FREQ = 10
seed = 2023

torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []
        self.index = 0
        
    def append(self, obj):
        if self.size() > self.buffer_size:
            print('buffer size larger than set value, trimming...')
            self.buffer = self.buffer[(self.size() - self.buffer_size):]
        elif self.size() == self.buffer_size:
            self.buffer[self.index] = obj
            self.index += 1
            self.index %= self.buffer_size
        else:
            self.buffer.append(obj)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size, device="cpu"):
        if self.size() < batch_size:
            batch = random.sample(self.buffer, self.size())
        else:
            batch = random.sample(self.buffer, batch_size)

        res = []
        for i in range(5):
            k = np.stack(tuple(item[i] for item in batch), axis=0)
            res.append(torch.tensor(k, device=device))
        return res[0], res[1], res[2], res[3], res[4]

device = "cuda" # "cpu"
env = gym.make("CartPole-v1")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n
buffer = ReplayBuffer(buffer_size=MEMORY_SIZE)
main_net = nn.Sequential(
    nn.Linear(n_observations, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, n_actions)
).to(device)
criterion = nn.MSELoss()
optimiser = torch.optim.Adam(main_net.parameters())

from copy import deepcopy
target_net = deepcopy(main_net)

num_episode = global_ts = 0
ep_rewards = list()
while global_ts <= TOTAL_TS:
    state, done = env.reset(), False
    ep_reward, ep_ts = 0.0, 0
    while not done:
        eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * global_ts / EPS_DECAY)
        if np.random.rand() < eps:
            action = np.random.choice(n_actions)
        else:
            with torch.no_grad():
                q = main_net(torch.tensor(state[None, :], device=device)).detach().cpu().numpy()
            action = np.argmax(q[0])

        next_state, reward, done, info = env.step(action)
        ep_reward += reward
        reward = reward if not done else -reward
        buffer.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            ep_rewards.append(ep_reward)
            if (num_episode == 0) or ((num_episode + 1) % 50 == 0):
                print(f"Ep-{num_episode + 1} {global_ts + 1}/{TOTAL_TS} Eps: {eps:.2f}, Reward: {ep_reward}")
            break
        
        if buffer.size() >= BATCH_SIZE:
            obses_t, actions, rewards, obses_tp1, dones = buffer.sample(BATCH_SIZE, device)
            # Optimize the model
            with torch.no_grad():
                target_q = target_net(obses_tp1).detach().max(1)[0]
            target = rewards + GAMMA * target_q * (1 - dones.float())
            val_t = main_net(obses_t).gather(1, actions.unsqueeze(1)).squeeze(1)
            loss = F.mse_loss(val_t.float(), target.float())
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

            # Periodically sync the target and main networks
            if (global_ts + 1) % SYNC_FREQ == 0:
                target_net.load_state_dict(main_net.state_dict())
        ep_ts += 1
        global_ts += 1
    num_episode += 1

import matplotlib.pyplot as plt

def moving_average(a, n=5) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

plt.plot(moving_average(ep_rewards))
plt.xlabel("#Episode")
plt.ylabel("Ep-Return")

ValueError: too many values to unpack (expected 4)

In [2]:
import gymnasium as gym
import math
import random
import numpy as np
from collections import deque
import torch
from torch import nn
import torch.nn.functional as F

GAMMA = 0.95
LEARNING_RATE = 0.001
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 1000
TOTAL_TS = 10000
SYNC_FREQ = 10
seed = 2023

torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []
        self.index = 0
        
    def append(self, obj):
        if self.size() > self.buffer_size:
            print('buffer size larger than set value, trimming...')
            self.buffer = self.buffer[(self.size() - self.buffer_size):]
        elif self.size() == self.buffer_size:
            self.buffer[self.index] = obj
            self.index += 1
            self.index %= self.buffer_size
        else:
            self.buffer.append(obj)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size, device="cpu"):
        if self.size() < batch_size:
            batch = random.sample(self.buffer, self.size())
        else:
            batch = random.sample(self.buffer, batch_size)

        res = []
        for i in range(5):
            k = np.stack(tuple(item[i] for item in batch), axis=0)
            res.append(torch.tensor(k, device=device))
        return res[0], res[1], res[2], res[3], res[4]

device = "cuda" # "cpu"
env = gym.make("CartPole-v1")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n
buffer = ReplayBuffer(buffer_size=MEMORY_SIZE)
main_net = nn.Sequential(
    nn.Linear(n_observations, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, n_actions)
).to(device)
criterion = nn.MSELoss()
optimiser = torch.optim.Adam(main_net.parameters())

from copy import deepcopy
target_net = deepcopy(main_net)

num_episode = global_ts = 0
ep_rewards = list()
while global_ts <= TOTAL_TS:
    state, done = env.reset(), False
    ep_reward, ep_ts = 0.0, 0
    while not done:
        eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * global_ts / EPS_DECAY)
        if np.random.rand() < eps:
            action = np.random.choice(n_actions)
        else:
            with torch.no_grad():
                q = main_net(torch.tensor(state[None, :], device=device)).detach().cpu().numpy()
            action = np.argmax(q[0])

        next_state, reward, done, info, _ = env.step(action)
        ep_reward += reward
        reward = reward if not done else -reward
        buffer.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            ep_rewards.append(ep_reward)
            if (num_episode == 0) or ((num_episode + 1) % 50 == 0):
                print(f"Ep-{num_episode + 1} {global_ts + 1}/{TOTAL_TS} Eps: {eps:.2f}, Reward: {ep_reward}")
            break
        
        if buffer.size() >= BATCH_SIZE:
            obses_t, actions, rewards, obses_tp1, dones = buffer.sample(BATCH_SIZE, device)
            # Optimize the model
            with torch.no_grad():
                target_q = target_net(obses_tp1).detach().max(1)[0]
            target = rewards + GAMMA * target_q * (1 - dones.float())
            val_t = main_net(obses_t).gather(1, actions.unsqueeze(1)).squeeze(1)
            loss = F.mse_loss(val_t.float(), target.float())
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

            # Periodically sync the target and main networks
            if (global_ts + 1) % SYNC_FREQ == 0:
                target_net.load_state_dict(main_net.state_dict())
        ep_ts += 1
        global_ts += 1
    num_episode += 1

import matplotlib.pyplot as plt

def moving_average(a, n=5) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

plt.plot(moving_average(ep_rewards))
plt.xlabel("#Episode")
plt.ylabel("Ep-Return")

Ep-1 16/10000 Eps: 0.99, Reward: 16.0


  arrays = [asanyarray(arr) for arr in arrays]


ValueError: all input arrays must have the same shape

base (Python 3.10.8) に接続されました

In [1]:
batch

NameError: name 'batch' is not defined

In [2]:
import gymnasium as gym
import math
import random
import numpy as np
from collections import deque
import torch
from torch import nn
import torch.nn.functional as F

GAMMA = 0.95
LEARNING_RATE = 0.001
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 1000
TOTAL_TS = 10000
SYNC_FREQ = 10
seed = 2023

torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []
        self.index = 0
        
    def append(self, obj):
        if self.size() > self.buffer_size:
            print('buffer size larger than set value, trimming...')
            self.buffer = self.buffer[(self.size() - self.buffer_size):]
        elif self.size() == self.buffer_size:
            self.buffer[self.index] = obj
            self.index += 1
            self.index %= self.buffer_size
        else:
            self.buffer.append(obj)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size, device="cpu"):
        if self.size() < batch_size:
            batch = random.sample(self.buffer, self.size())
        else:
            batch = random.sample(self.buffer, batch_size)

        res = []
        for i in range(5):
            k = np.stack(tuple(item[i] for item in batch), axis=0)
            res.append(torch.tensor(k, device=device))
        return res[0], res[1], res[2], res[3], res[4]

device = "cuda" # "cpu"
env = gym.make("CartPole-v1")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n
buffer = ReplayBuffer(buffer_size=MEMORY_SIZE)
main_net = nn.Sequential(
    nn.Linear(n_observations, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, n_actions)
).to(device)
criterion = nn.MSELoss()
optimiser = torch.optim.Adam(main_net.parameters())

from copy import deepcopy
target_net = deepcopy(main_net)

num_episode = global_ts = 0
ep_rewards = list()
while global_ts <= TOTAL_TS:
    state, done = env.reset(), False
    ep_reward, ep_ts = 0.0, 0
    while not done:
        eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * global_ts / EPS_DECAY)
        if np.random.rand() < eps:
            action = np.random.choice(n_actions)
        else:
            with torch.no_grad():
                q = main_net(torch.tensor(state[None, :], device=device)).detach().cpu().numpy()
            action = np.argmax(q[0])

        next_state, reward, done, info, _ = env.step(action)
        ep_reward += reward
        reward = reward if not done else -reward
        buffer.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            ep_rewards.append(ep_reward)
            if (num_episode == 0) or ((num_episode + 1) % 50 == 0):
                print(f"Ep-{num_episode + 1} {global_ts + 1}/{TOTAL_TS} Eps: {eps:.2f}, Reward: {ep_reward}")
            break
        
        if buffer.size() >= BATCH_SIZE:
            obses_t, actions, rewards, obses_tp1, dones = buffer.sample(BATCH_SIZE, device)
            # Optimize the model
            with torch.no_grad():
                target_q = target_net(obses_tp1).detach().max(1)[0]
            target = rewards + GAMMA * target_q * (1 - dones.float())
            val_t = main_net(obses_t).gather(1, actions.unsqueeze(1)).squeeze(1)
            loss = F.mse_loss(val_t.float(), target.float())
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

            # Periodically sync the target and main networks
            if (global_ts + 1) % SYNC_FREQ == 0:
                target_net.load_state_dict(main_net.state_dict())
        ep_ts += 1
        global_ts += 1
    num_episode += 1

import matplotlib.pyplot as plt

def moving_average(a, n=5) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

plt.plot(moving_average(ep_rewards))
plt.xlabel("#Episode")
plt.ylabel("Ep-Return")

Ep-1 12/10000 Eps: 0.99, Reward: 12.0


  arrays = [asanyarray(arr) for arr in arrays]


ValueError: all input arrays must have the same shape

In [3]:
batch

NameError: name 'batch' is not defined