In [2]:
import gym
import copy
import torch
import pickle
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim

from logger import Logger
from importlib import reload
import sys

ModuleNotFoundError: No module named 'logger'

In [44]:
class Network(nn.Module):
    def __init__(self, len_state, num_quant, num_actions):
        nn.Module.__init__(self)
        
        self.num_quant = num_quant
        self.num_actions = num_actions
        
        self.layer1 = nn.Linear(len_state, 256)
        self.layer2 = nn.Linear(256, num_actions*num_quant)  

    def forward(self, x):
        input = torch.Tensor(x)

        x = self.layer1(input)
        x = torch.tanh(x)
        x = self.layer2(x)
        return x.view(-1, self.num_actions, self.num_quant)
    
    def select_action(self, state, eps):
        if not isinstance(state, torch.Tensor): state = torch.Tensor([state])    
        
        action = torch.randint(0, 2, (1,))
        if random.random() > eps: action = self.forward(state).mean(2).max(1)[1]
        
        return int(action)

In [244]:
eps_start, eps_end, eps_dec = 0.25, 0.01, 5
eps = lambda steps: eps_end + (eps_start - eps_end) * np.exp(-1. * steps / eps_dec)

In [62]:
reload(sys.modules['rl_utils'])
from rl_utils import ReplayMemory, huber

env_name = 'MountainCar-v0'
env = gym.make(env_name)

memory = ReplayMemory(10000)
logger = Logger('q-net', fmt={'loss': '.5f'})

In [63]:
Z = Network(len_state=len(env.reset()), num_quant=2, num_actions=env.action_space.n)
Ztgt = Network(len_state=len(env.reset()), num_quant=2, num_actions=env.action_space.n)
optimizer = optim.Adam(Z.parameters(), 1e-3)

In [81]:
steps_done = 0
running_reward = None
gamma, batch_size = 0.99, 32 
tau = torch.Tensor((2 * np.arange(Z.num_quant) + 1) / (2.0 * Z.num_quant))
tau

tensor([0.2500, 0.7500])

In [361]:
n = 16
np.arange(1./(2 * n), 1, 1/n)
np.full(1, 2)

array([2])

In [381]:
s, e = [], []
for i in range(10):
    temp = copy.deepcopy(env)
    s.append(temp.reset(seed = i)[0])
    e.append(temp)
s

[array([-0.47260767,  0.        ], dtype=float32),
 array([-0.49763566,  0.        ], dtype=float32),
 array([-0.5476776,  0.       ], dtype=float32),
 array([-0.5828702,  0.       ], dtype=float32),
 array([-0.41138878,  0.        ], dtype=float32),
 array([-0.4389994,  0.       ], dtype=float32),
 array([-0.49236712,  0.        ], dtype=float32),
 array([-0.47498092,  0.        ], dtype=float32),
 array([-0.53460556,  0.        ], dtype=float32),
 array([-0.42595017,  0.        ], dtype=float32)]

In [373]:
def step(env, action):
    s_prime, r, done, trunc, _ = env.step(action)
    return [s_prime, r, done, trunc]

step_func = np.vectorize(step)

In [398]:
a = np.array([pow(.99, n) for n in range(3)])
(a * [1, 2, 3]).sum()

5.920299999999999

In [404]:
from time import process_time

def testgpu():
    if torch.backends.mps.is_available():
        mps_device = torch.device("mps")
    t0 = process_time()
    x = torch.ones(n1, device=mps_device)
    y = x + torch.rand(n1, device=mps_device)
    t1 = process_time()
    print(f"Total time with gpu ({n1}): {t1-t0}")
    t0 = process_time()
    x = torch.ones(n2, device=mps_device)
    y = x + torch.rand(n2, device=mps_device)
    t1 = process_time()
    print(f"Total time with gpu ({n2}): {t1-t0}")

def testcpu():
    t0 = process_time()
    x = torch.ones(n1)
    y = x + torch.rand(n1)
    t1 = process_time()
    print(f"Total time with cpu ({n1}): {t1-t0}")
    t0 = process_time()
    x = torch.ones(n2)
    y = x + torch.rand(n2)
    t1 = process_time()
    print(f"Total time with cpu ({n2}): {t1-t0}")


In [406]:
n1 = 10000
n2 = 100000000
testcpu()
testgpu()

Total time with cpu (10000): 0.0005150000000000432
Total time with cpu (100000000): 1.408566999999998
Total time with gpu (10000): 0.002881000000002132
Total time with gpu (100000000): 0.0014089999999953307


In [8]:
pow(2, np.ceil(np.log2(10000)))

16384.0

In [11]:
random.sample(1000, 32)

TypeError: Population must be a sequence.  For dicts or sets, use sorted(d).

In [6]:
a = torch.randn(10, 4)
torch.mean(a, 0)

tensor([ 0.0062, -0.2653,  0.2676,  0.2517])

In [65]:
for episode in range(501): 
    sum_reward = 0
    state = env.reset()[0]
    while True:
        steps_done += 1
        
        action = Z.select_action(torch.Tensor([state]), eps(steps_done))
        next_state, reward, done, trunc, _ = env.step(action)
        print(state, action, next_state, reward)
        
        memory.push(state, action, next_state, reward, float(done | trunc))
        sum_reward += reward
        
        if len(memory) < batch_size: break    
        states, actions, rewards, next_states, dones = memory.sample(batch_size)
        
        theta = Z(states)[np.arange(batch_size), actions]
        
        Znext = Ztgt(next_states).detach()
        Znext_max = Znext[np.arange(batch_size), Znext.mean(2).max(1)[1]]
        Ttheta = rewards + gamma * (1 - dones) * Znext_max
        
        diff = Ttheta.t().unsqueeze(-1) - theta 
        loss = huber(diff) * (tau - (diff.detach() < 0).float()).abs()
        loss = loss.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        state = next_state
        
        if steps_done % 100 == 0:
            Ztgt.load_state_dict(Z.state_dict())
            
        if done and episode % 50 == 0:
            logger.add(episode, steps=steps_done, running_reward=running_reward, loss=loss.data.numpy())
            logger.iter_info()
            
        if (done or trunc): 
            running_reward = sum_reward  if not running_reward else 0.2 * sum_reward + running_reward*0.8
            break

(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


RuntimeError: The size of tensor a (32) must match the size of tensor b (2) at non-singleton dimension 1

In [7]:
[1, 2, 3] + []

[1, 2, 3]