Jupyterlab Shortcuts
- Shift + Enter : Run
- Enter/Esc : Mode change(Edit/Command)
- Fn + Up/Down : Move between cell 
- A / B : Insert Cell Above/Below
- D, D : Delete selected cell

Random
- random.sample(population, k) : population sequence 에서 독립적으로 k length의 데이터를 sample

In [None]:
Pytorch

In [18]:
from collections import deque
import numpy as np
import random
import gym
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
env = gym.make('CartPole-v0')
state = env.reset()

# for discrete env
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print(state_dim, action_dim)

4 2


Network
- 

In [25]:
class Net(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(state_dim, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, action_dim)
    
    def forward(self, state):
        out = F.relu(self.fc1(state))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out

In [10]:
q = Net(state_dim, action_dim)
qtarget = Net(state_dim, action_dim)

In [21]:
# network의 weight값을 print
# network의 weight값을 다른 network의 weight값으로 맞춰줌
weight_name = 'fc1.weight'
print(q.state_dict()[weight_name][1])
print(qtarget.state_dict()[weight_name][1])
qtarget.load_state_dict(q.state_dict())
print(q.state_dict()[weight_name][1])
print(qtarget.state_dict()[weight_name][1])

tensor([-0.3967, -0.3701,  0.4617, -0.1940])
tensor([-0.3967, -0.3701,  0.4617, -0.1940])
tensor([-0.3967, -0.3701,  0.4617, -0.1940])
tensor([-0.3967, -0.3701,  0.4617, -0.1940])


In [31]:
class DQNAgent:
    def __init__(self, env, epsilon):
        # if env is discrete,
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        print(state_dim, action_dim)
        self.epsilon = epsilon
                
        self.q = Net(state_dim, action_dim)
        self.qtarget = Net(state_dim, action_dim)
        
        self.optimizer = optim.Adam(self.q.parameters())
    
    def update_qtarget_parameter(self):
        self.qtarget.load_state_dict(self.q.state_dict())    
    
    def get_action(self, state):
        # state : (ndarray) --> (tensor)
        # action : (tensor) --> (int)
        if self.epsilon < random.random():
            selected_action = random.randrange(action_dim)
        else:
            qval = self.q.forward(torch.FloatTensor(state))
            selected_action = qval.argmax().numpy()
        return selected_action
    
    def learn(self, mini_batch):
        # batch_size 만큼 크기의 loss의 합을 최소화
        # zero_grad()를 매번 해줘야 하나?
        # why l1 loss?
        losses = []
        for sample in mini_batch:
            losses.append((torch.FloatTensor(sample['predict']) - self.q.forward(torch.FloatTensor(sample['state'])))^2)
        loss = F.smooth_l1_loss()
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    
    def _compute_dqn_loss(self):
        pass
    
    def _plot(self, scores):
        plt.plot(scores)
        plt.show()

Replay Buffer
- maxlen만큼 sample들을 저장하고 있으며, 그 이상으로 sample들이 들어오면 FIFO방식으로 eject한다

In [12]:
class ReplayBuffer:
    def __init__(self, buffer_size, gamma):
        # deque는 maxlen만큼만 FIFO
        self._memory = deque(maxlen=buffer_size)
        self.gamma = gamma
    
    def add(self, sample):
        if sample['done']:
            sample['predict'] = sample['reward']
        else:
            sample['predict'] = sample['reward'] + self.gamma*max(sample['qtarget'])
        self._memory.append(sample)
    
    def sample(self, batch_size):
        # random하게 batch_size만큼 sample
        mini_batch = random.sample(self._memory, batch_size)        
        return mini_batch
    
    ###
    def store(self, sample)
        self._memory.append(sample)
    
    def sample_batch(self)
        mini_batch = random.sample(self._memory, batch_size)        
        return mini_batch

## Playing Atari with Deep Reinforcement Learning, Algorithm 1
- Q1. q 와 qtarget을 따로 둬야 하는 이유? - 행동하는 정책(q)과 학습하는 정책(q target)을 따로 둔다
- Q2. 매 스텝마다 learn을 해야 하는것인가?
- Q3. 언제 network(q, qtarget)의 parameter를 업데이트?
- Q4. DQN의 action selection strategy? greedy? -- 그냥 epsilon greedy
- Q5. zero_grad()?

In [33]:
# parameters
episode_len = 1000
plot_len = 50
buffer_size = 1000
batch_size = 10
epsilon = 0.0
gamma = 0.99

# env
env_list = ['CartPole-v0']

# init class
env = gym.make(env_list[0])
agent = DQNAgent(env, epsilon)
buffer = ReplayBuffer(buffer_size, gamma)

# To-do
# plot
scores = []
for i in range(episode_len):
    state = env.reset()
    total_reward = 0
    avg_reward = 0
    while True:
        print(state)
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        sample = {'state':state, 'reward': reward, 'done': done, 'qtarget': agent.qtarget.forward(next_state)}
        buffer.add(sample)
        if done:
            mini_batch = buffer.sample(batch_size)
            agent.learn(mini_batch)
            agent.update_qtarget_parameter()
            total_reward = 0
            break
    avg_reward += total_reward
    if i%plot_len == 0:
        avg_reward = avg_reward / plot_len
        scores.append(avg_reward)

4 2
[-0.03020097  0.04437784  0.03425706  0.00074789]


AttributeError: 'numpy.ndarray' object has no attribute 'dim'

In [None]:
# parameters
episode_len = 1000
plot_len = 50
buffer_size = 1000
batch_size = 10
epsilon = 0.0
gamma = 0.99

# env
env_list = ['CartPole-v0']

# init class
env = gym.make(env_list[0])
agent = DQNAgent(env, epsilon)
buffer = ReplayBuffer(buffer_size, gamma)

# To-do
# plot
scores = []
for i in range(episode_len):
    score = 0
    state = env.reset()
    while True:
        print(state)
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        score += reward
        sample = {'state': state, 'action': action, 
                  'next_state': next_state, 'reward': reward, 'done': done}
        buffer.store(sample)
        if done:
            mini_batch = buffer.sample(batch_size)
            agent.learn(mini_batch)
            agent.update_qtarget_parameter()
            total_reward = 0
            break
    
    if (len(buffer._memory) >= batch_size) and (i%plot_len == 0):
        scores.append(score/plot_len)
        agent._plot(scores)
        score = 0