updated linting

qfettes committed Jun 14, 2018
1 parent 01ef451 commit 8c1d37cd63f557277e6b537f6e0464d91f3142c3
Showing with 30 additions and 31 deletions.
  1. +4 −1 .gitignore
  2. +8 −10 agents/Categorical_DQN.py
  3. +6 −7 agents/DQN.py
  4. +1 −1 agents/Double_DQN.py
  5. +7 −7 agents/QuantileRegression_DQN.py
  6. +1 −1 agents/Rainbow.py
  7. +1 −1 utils/ReplayMemory.py
  8. +2 −3 utils/wrappers.py
@@ -6,4 +6,7 @@ __pycache__/
.ipynb_checkpoints/

#VSCode Meta
-.vscode/
+.vscode/
+
+#linting
+.mypy_cache/
@@ -7,11 +7,12 @@
from networks.networks import CategoricalDQN
from utils.hyperparameters import ATOMS, V_MAX, V_MIN, device


class Model(DQN_Agent):
def __init__(self, static_policy=False, env=None):
-self.atoms=ATOMS
-self.v_max=V_MAX
-self.v_min=V_MIN
+self.atoms = ATOMS
+self.v_max = V_MAX
+self.v_min = V_MIN
self.supports = torch.linspace(self.v_min, self.v_max, self.atoms).view(1, 1, self.atoms).to(device)
self.delta = (self.v_max - self.v_min) / (self.atoms - 1)

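For context, `supports` and `delta` above build the fixed atom grid that C51 projects onto. A minimal sketch of that relationship, with illustrative V_MIN/V_MAX/ATOMS values (the real hyperparameters come from utils/hyperparameters.py):

    import torch

    # Illustrative values only; the repo reads these from utils.hyperparameters
    v_min, v_max, atoms = -10.0, 10.0, 51
    supports = torch.linspace(v_min, v_max, atoms)   # z_i = v_min + i * delta
    delta = (v_max - v_min) / (atoms - 1)            # spacing between atoms, 0.4 here
    assert abs((supports[1] - supports[0]).item() - delta) < 1e-6
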
@@ -25,23 +26,21 @@ def projection_distribution(self, batch_vars):
batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights = batch_vars

with torch.no_grad():
-max_next_dist = torch.zeros((self.batch_size, 1, self.atoms), device=device, dtype=torch.float) + 1./self.atoms
+max_next_dist = torch.zeros((self.batch_size, 1, self.atoms), device=device, dtype=torch.float) + 1. / self.atoms
if not empty_next_state_values:
max_next_action = self.get_max_next_state_action(non_final_next_states)
self.target_model.sample_noise()
max_next_dist[non_final_mask] = self.target_model(non_final_next_states).gather(1, max_next_action)
max_next_dist = max_next_dist.squeeze()


-Tz = batch_reward.view(-1, 1) + (self.gamma**self.nsteps)*self.supports.view(1, -1) * non_final_mask.to(torch.float).view(-1, 1)
+Tz = batch_reward.view(-1, 1) + (self.gamma**self.nsteps) * self.supports.view(1, -1) * non_final_mask.to(torch.float).view(-1, 1)
Tz = Tz.clamp(self.v_min, self.v_max)
b = (Tz - self.v_min) / self.delta
l = b.floor().to(torch.int64)
u = b.ceil().to(torch.int64)
l[(u > 0) * (l == u)] -= 1
u[(l < (self.atoms - 1)) * (l == u)] += 1


offset = torch.linspace(0, (self.batch_size - 1) * self.atoms, self.batch_size).unsqueeze(dim=1).expand(self.batch_size, self.atoms).to(batch_action)
m = batch_state.new_zeros(self.batch_size, self.atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (max_next_dist * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b)
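
The hunk above only touches spacing, but the surrounding code is the C51 projection of the n-step Bellman target distribution onto the fixed support. A standalone sketch of that step with hypothetical argument names, assuming `next_dist` already holds the target network's distribution for the greedy next action:

    import torch

    def project_distribution(next_dist, rewards, dones, supports, v_min, v_max, gamma, n_steps=1):
        # next_dist: (batch, atoms) probabilities; rewards/dones: (batch,) n-step return and terminal flag
        batch, atoms = next_dist.size()
        delta = (v_max - v_min) / (atoms - 1)

        # Bellman-update every atom of the support, then clamp back onto [v_min, v_max]
        Tz = rewards.view(-1, 1) + (gamma ** n_steps) * supports.view(1, -1) * (1.0 - dones.view(-1, 1))
        Tz = Tz.clamp(v_min, v_max)

        # Fractional position of each updated atom on the fixed grid
        b = (Tz - v_min) / delta
        l, u = b.floor().long(), b.ceil().long()
        l[(u > 0) & (l == u)] -= 1                 # keep l != u when b lands exactly on an atom
        u[(l < (atoms - 1)) & (l == u)] += 1

        # Split each atom's probability mass between its two neighbouring atoms
        offset = torch.arange(batch, device=next_dist.device).view(-1, 1) * atoms
        m = torch.zeros_like(next_dist)
        m.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
        m.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))
        return m
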
@@ -55,7 +54,7 @@ def compute_loss(self, batch_vars):
batch_action = batch_action.unsqueeze(dim=-1).expand(-1, -1, self.atoms)
batch_reward = batch_reward.view(-1, 1, 1)

-#estimate
+# estimate
self.model.sample_noise()
current_dist = self.model(batch_state).gather(1, batch_action).squeeze()

@@ -69,11 +68,10 @@ def compute_loss(self, batch_vars):

return loss


def get_action(self, s, eps):
with torch.no_grad():
if np.random.random() >= eps or self.static_policy or self.noisy:
-X = torch.tensor([s], device=device, dtype=torch.float) #pylint: disable=E1102
+X = torch.tensor([s], device=device, dtype=torch.float)
self.model.sample_noise()
a = self.model(X) * self.supports
a = a.sum(dim=2).max(1)[1].view(1, 1)
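
In `get_action` above, Q-values are recovered as the expected value of the categorical distribution over the support before taking the argmax. A tiny worked example (all numbers are illustrative):

    import torch

    # 1 state, 2 actions, 3 atoms on the support z = [-1, 0, 1]
    supports = torch.tensor([-1.0, 0.0, 1.0]).view(1, 1, 3)
    dist = torch.tensor([[[0.1, 0.2, 0.7],     # action 0: expectation  0.6
                          [0.6, 0.3, 0.1]]])   # action 1: expectation -0.5
    q_values = (dist * supports).sum(dim=2)    # tensor([[ 0.6000, -0.5000]])
    action = q_values.max(1)[1].view(1, 1)     # picks action 0
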
@@ -81,13 +81,13 @@ def prep_minibatch(self):

shape = (-1,)+self.num_feats

-batch_state = torch.tensor(batch_state, device=device, dtype=torch.float).view(shape) #pylint: disable=E1102
-batch_action = torch.tensor(batch_action, device=device, dtype=torch.long).squeeze().view(-1, 1) #pylint: disable=E1102
-batch_reward = torch.tensor(batch_reward, device=device, dtype=torch.float).squeeze().view(-1, 1) #pylint: disable=E1102
+batch_state = torch.tensor(batch_state, device=device, dtype=torch.float).view(shape)
+batch_action = torch.tensor(batch_action, device=device, dtype=torch.long).squeeze().view(-1, 1)
+batch_reward = torch.tensor(batch_reward, device=device, dtype=torch.float).squeeze().view(-1, 1)

-non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=device, dtype=torch.uint8) #pylint: disable=E1102
+non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=device, dtype=torch.uint8)
try: #sometimes all next states are false
-non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=device, dtype=torch.float).view(shape) #pylint: disable=E1102
+non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=device, dtype=torch.float).view(shape)
empty_next_state_values = False
except:
non_final_next_states = None
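
The `try`/`except` above guards against minibatches in which every sampled next state is terminal. A minimal illustration of the masking idea (shapes and values are made up), as a reminder of why only non-final rows receive bootstrapped targets:

    import torch

    batch_next_state = [[0.1, 0.2], None, [0.3, 0.4]]   # second transition is terminal
    non_final_mask = torch.tensor([s is not None for s in batch_next_state], dtype=torch.uint8)
    non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], dtype=torch.float)
    # Target values start at zero for every row; only rows selected by non_final_mask
    # are later filled with gamma-discounted estimates from the target network.
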
@@ -145,11 +145,10 @@ def update(self, s, a, r, s_, frame=0):
self.update_target_model()
return loss.item()


def get_action(self, s, eps=0.1):
with torch.no_grad():
if np.random.random() >= eps or self.static_policy or self.noisy:
-X = torch.tensor([s], device=device, dtype=torch.float) #pylint: disable=E1102
+X = torch.tensor([s], device=device, dtype=torch.float)
self.model.sample_noise()
tmp = timer()
a = self.model(X).max(1)[1].view(1, 1)
@@ -9,4 +9,4 @@ def __init__(self, static_policy=False, env=None):
super(Model, self).__init__(static_policy, env)

def get_max_next_state_action(self, next_states):
-return self.model(next_states).max(dim=1)[1].view(-1, 1)
\ No newline at end of file
+return self.model(next_states).max(dim=1)[1].view(-1, 1)
@@ -10,7 +10,7 @@
class Model(DQN_Agent):
def __init__(self, static_policy=False, env=None):
self.num_quantiles = QUANTILES
-self.cumulative_density = torch.tensor((2 * np.arange(self.num_quantiles) + 1) / (2.0 * self.num_quantiles), device=device, dtype=torch.float) #pylint: disable=E1102
+self.cumulative_density = torch.tensor((2 * np.arange(self.num_quantiles) + 1) / (2.0 * self.num_quantiles), device=device, dtype=torch.float)
self.quantile_weight = 1.0 / self.num_quantiles

super(Model, self).__init__(static_policy, env)
@@ -30,7 +30,7 @@ def next_distribution(self, batch_vars):
max_next_action = self.get_max_next_state_action(non_final_next_states)
quantiles_next[non_final_mask] = self.target_model(non_final_next_states).gather(1, max_next_action).squeeze(dim=1)

-quantiles_next = batch_reward + (self.gamma*quantiles_next)
+quantiles_next = batch_reward + (self.gamma * quantiles_next)

return quantiles_next

@@ -39,7 +39,7 @@ def compute_loss(self, batch_vars):

batch_action = batch_action.unsqueeze(dim=-1).expand(-1, -1, self.num_quantiles)

-#estimate
+# estimate
self.model.sample_noise()
quantiles = self.model(batch_state)
quantiles = quantiles.gather(1, batch_action).squeeze(1)
@@ -49,7 +49,7 @@ def compute_loss(self, batch_vars):
diff = quantiles_next.t().unsqueeze(-1) - quantiles.unsqueeze(0)

loss = self.huber(diff) * torch.abs(self.cumulative_density.view(1, -1) - (diff < 0).to(torch.float))
-loss = loss.transpose(0,1)
+loss = loss.transpose(0, 1)
if self.priority_replay:
self.memory.update_priorities(indices, loss.detach().mean(1).sum(-1).abs().cpu().numpy().tolist())
loss = loss * weights.view(self.batch_size, 1, 1)
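
The lines above implement the quantile regression (Huber) loss from QR-DQN. A self-contained sketch with a hypothetical function name, following the same shape conventions as the code above and the usual kappa = 1 default:

    import torch

    def quantile_huber_loss(quantiles, quantiles_next, cumulative_density, kappa=1.0):
        # quantiles, quantiles_next: (batch, N); cumulative_density: (N,) with tau_hat_i = (2i + 1) / (2N)
        diff = quantiles_next.t().unsqueeze(-1) - quantiles.unsqueeze(0)          # (N, batch, N)
        huber = torch.where(diff.abs() <= kappa,
                            0.5 * diff.pow(2),
                            kappa * (diff.abs() - 0.5 * kappa))
        # Asymmetric weighting: over- and under-estimates are penalised by tau and (1 - tau)
        loss = huber * (cumulative_density.view(1, -1) - (diff < 0).float()).abs()
        return loss.transpose(0, 1).mean(1).sum(-1).mean()
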
@@ -60,13 +60,13 @@ def compute_loss(self, batch_vars):
def get_action(self, s, eps):
with torch.no_grad():
if np.random.random() >= eps or self.static_policy or self.noisy:
-X = torch.tensor([s], device=device, dtype=torch.float) #pylint: disable=E1102
+X = torch.tensor([s], device=device, dtype=torch.float)
self.model.sample_noise()
-a = (self.model(X)*self.quantile_weight).sum(dim=2).max(dim=1)[1]
+a = (self.model(X) * self.quantile_weight).sum(dim=2).max(dim=1)[1]
return a.item()
else:
return np.random.randint(0, self.num_actions)

def get_max_next_state_action(self, next_states):
-next_dist = self.target_model(next_states)*self.quantile_weight
+next_dist = self.target_model(next_states) * self.quantile_weight
return next_dist.sum(dim=2).max(1)[1].view(next_states.size(0), 1, 1).expand(-1, -1, self.num_quantiles)
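
As in the categorical agent, greedy actions here come from the mean of the value distribution, i.e. the quantile estimates averaged with `quantile_weight = 1 / N`. A tiny worked example (illustrative numbers):

    import torch

    # 1 state, 2 actions, 4 quantiles per action
    quantiles = torch.tensor([[[0.0, 1.0, 2.0, 3.0],     # action 0: mean 1.5
                               [2.0, 2.0, 2.0, 2.0]]])   # action 1: mean 2.0
    quantile_weight = 1.0 / quantiles.size(-1)
    q_values = (quantiles * quantile_weight).sum(dim=2)  # tensor([[1.5000, 2.0000]])
    action = q_values.max(dim=1)[1].item()               # picks action 1
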
@@ -73,7 +73,7 @@ def compute_loss(self, batch_vars):

def get_action(self, s, eps):
with torch.no_grad():
-X = torch.tensor([s], device=device, dtype=torch.float) #pylint: disable=E1102
+X = torch.tensor([s], device=device, dtype=torch.float)
self.model.sample_noise()
a = self.model(X) * self.supports
a = a.sum(dim=2).max(1)[1].view(1, 1)
@@ -137,7 +137,7 @@ def sample(self, batch_size):
p_sample = self._it_sum[idx] / self._it_sum.sum()
weight = (p_sample * len(self._storage)) ** (-beta)
weights.append(weight / max_weight)
-weights = torch.tensor(weights, device=device, dtype=torch.float) #pylint: disable=E1102
+weights = torch.tensor(weights, device=device, dtype=torch.float)
encoded_sample = self._encode_sample(idxes)
return encoded_sample, idxes, weights
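
The `weights` tensor above holds the importance-sampling corrections for prioritized replay, w_i = (N * P(i))^(-beta), normalised by the largest weight. A small numeric sketch (probabilities are made up):

    import numpy as np
    import torch

    p_sample = np.array([0.4, 0.3, 0.2, 0.1])   # sampling probabilities of four stored transitions
    beta, n = 0.4, len(p_sample)

    weights = (p_sample * n) ** (-beta)          # rarely sampled transitions get larger corrections
    weights = torch.tensor(weights / weights.max(), dtype=torch.float)   # normalise so the max weight is 1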

@@ -5,7 +5,6 @@
import cv2
cv2.ocl.setUseOpenCL(False)

-#pylint: disable=E0202

class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
@@ -24,7 +23,7 @@ def reset(self, **kwargs):
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
-noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
+noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
assert noops > 0
obs = None
for _ in range(noops):
@@ -120,7 +119,7 @@ def step(self, action):

return max_frame, total_reward, done, info

-def reset(self, **kwargs): #pylint: disable=E0102
+def reset(self, **kwargs):
return self.env.reset(**kwargs)

class ClipRewardEnv(gym.RewardWrapper):
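
The wrappers touched in this file follow the standard Atari preprocessing stack. A sketch of how they might be composed, assuming the class names and constructor defaults of the baselines-style wrappers this file mirrors (the env id and the MaxAndSkipEnv name are illustrative, not confirmed by this diff):

    import gym

    def make_atari_env(env_id="PongNoFrameskip-v4"):
        env = gym.make(env_id)
        env = NoopResetEnv(env, noop_max=30)   # random number of no-op steps on reset
        env = MaxAndSkipEnv(env)               # repeat actions and max over consecutive frames
        env = ClipRewardEnv(env)               # clip rewards for training stability
        return env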
