In [1]:
import pickle

In [2]:
import random
from itertools import count

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Prefer the GPU, but fall back to the CPU so tensors created with
# `.to(device)` still work on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) tuples.

    Once ``capacity`` transitions are stored, each new transition overwrites
    the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions kept; must be positive.

        Raises:
            ValueError: if ``capacity`` is not positive (otherwise the first
                ``push`` would fail with an unrelated IndexError).
        """
        if capacity <= 0:
            raise ValueError("capacity must be positive, got %r" % (capacity,))
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index of the slot the next push writes to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when full."""
        if len(self.buffer) < self.capacity:
            # Grow the list until it reaches capacity; afterwards slots are reused.
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` where each
            element is the ``np.stack`` of that field over the sampled batch.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [5]:
class TD3():
    """Twin Delayed DDPG (TD3) agent bound to a beer-game environment.

    The agent owns the environment, two critics and one actor (each with a
    target copy), their optimizers, a replay buffer, and the history lists
    filled by ``train`` and ``test``.

    NOTE(review): the class relies on names that are not defined in the
    visible part of this notebook and must exist in the kernel beforehand:
    ``build_beer_game_uniform_multi_player``, ``action_space`` and
    ``soft_update`` (used by ``__init__`` / ``td3_update``), plus ``tqdm`` and
    a module-level ``batch_size`` (used by ``train``).
    """

    def __init__(self, heu_explore=False, role=None, target_inv=None, state_dim=16, action_dim=2):
        """Build the environment, networks, optimizers and replay buffer.

        Args:
            heu_explore: forwarded to the exploration module on every training
                step to enable heuristic-guided exploration.
            role: list of players passed to the environment builder; defaults
                to ``['retailer']``.
            target_inv: target inventory used by heuristic exploration.
            state_dim: size of the state vector fed to the networks.
            action_dim: size of the action vector produced by the actor.
        """
        # A list literal in the signature would be shared by every instance.
        if role is None:
            role = ['retailer']

        self.heu_explore = heu_explore
        self.env = build_beer_game_uniform_multi_player(players=role)

        self.ad = ActionDecoder(action_space, action_type='y')
        self.noise = ExplorationActionMod(action_space, target_inv=target_inv)

        hidden_dim = 256

        self.value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

        self.target_value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

        # soft_tau=1.0 is used to initialise each target from its online network.
        soft_update(self.value_net1, self.target_value_net1, soft_tau=1.0)
        soft_update(self.value_net2, self.target_value_net2, soft_tau=1.0)
        soft_update(self.policy_net, self.target_policy_net, soft_tau=1.0)

        self.value_criterion = nn.MSELoss()

        policy_lr = 3e-4
        value_lr  = 3e-4

        self.value_optimizer1 = optim.Adam(self.value_net1.parameters(), lr=value_lr)
        self.value_optimizer2 = optim.Adam(self.value_net2.parameters(), lr=value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        replay_buffer_size = 2 ** 20
        self.replay_buffer = ReplayBuffer(replay_buffer_size)

        self.episode_idx = 0
        self.reward_history = []    # rows appended by train(): results of test()
        self.training_history = []  # rows appended by train(): per-episode stats

    def td3_update(self,
                   step,
                   batch_size,
                   gamma=0.95,
                   soft_tau=1e-2,
                   noise_std=0.2,
                   noise_clip=0.5,
                   policy_update=2,
                  ):
        """Run one TD3 update on a mini-batch sampled from the replay buffer.

        Both critics are updated every call; the actor and all target networks
        are updated only when ``step % policy_update == 0``.

        Args:
            step: step counter used to schedule the delayed actor update.
            batch_size: number of transitions to sample.
            gamma: discount factor.
            soft_tau: interpolation factor passed to ``soft_update``.
            noise_std: std of the Gaussian noise added to the target action.
            noise_clip: absolute bound applied to that noise.
            policy_update: actor/target update period, in calls.

        Returns:
            Tuple ``(value_loss1, value_loss2)`` of Python floats.
        """
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state      = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action     = torch.FloatTensor(action).to(device)
        reward     = torch.FloatTensor(reward).unsqueeze(1).to(device)
        done       = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

        # The bootstrap target is a constant for the critic losses, so no
        # autograd graph is needed for anything computed from target networks.
        with torch.no_grad():
            next_action = self.target_policy_net(next_state)
            noise = torch.normal(torch.zeros(next_action.size()), noise_std).to(device)
            noise = torch.clamp(noise, -noise_clip, noise_clip)
            # The actor is tanh-squashed and stored actions are clipped to
            # [-1, 1], so keep the smoothed target action in that same range
            # instead of querying the critics outside the data they saw.
            next_action = torch.clamp(next_action + noise, -1.0, 1.0)

            target_q_value1  = self.target_value_net1(next_state, next_action)
            target_q_value2  = self.target_value_net2(next_state, next_action)
            target_q_value   = torch.min(target_q_value1, target_q_value2)
            expected_q_value = reward + (1.0 - done) * gamma * target_q_value

        q_value1 = self.value_net1(state, action)
        q_value2 = self.value_net2(state, action)

        value_loss1 = self.value_criterion(q_value1, expected_q_value)
        value_loss2 = self.value_criterion(q_value2, expected_q_value)

        self.value_optimizer1.zero_grad()
        value_loss1.backward()
        self.value_optimizer1.step()

        self.value_optimizer2.zero_grad()
        value_loss2.backward()
        self.value_optimizer2.step()

        if step % policy_update == 0:
            # Delayed actor update: ascend the first critic's value estimate.
            policy_loss = self.value_net1(state, self.policy_net(state))
            policy_loss = -policy_loss.mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            soft_update(self.value_net1, self.target_value_net1, soft_tau=soft_tau)
            soft_update(self.value_net2, self.target_value_net2, soft_tau=soft_tau)
            soft_update(self.policy_net, self.target_policy_net, soft_tau=soft_tau)

        return value_loss1.item(), value_loss2.item()

    def train(self, run, episode=1000):
        """Train for ``episode`` episodes, recording per-episode statistics.

        After each episode a row ``[run, episode_idx, heu_explore,
        episode_reward, mean_value_loss1, mean_value_loss2]`` is appended to
        ``self.training_history``; the losses are averaged over the number of
        environment steps in the episode. Whenever ``episode_idx`` is a
        multiple of 10, five noise-free test episodes are run and appended to
        ``self.reward_history``.

        Args:
            run: label stored in the history rows.
            episode: number of training episodes.

        NOTE(review): reads a module-level ``batch_size`` that is not defined
        in the visible notebook.
        """
        for _ in tqdm(range(episode)):
            state = self.env.reset()
            episode_reward = 0
            epi_vl1, epi_vl2 = 0, 0

            for step in count():
                action = self.policy_net.get_action(state)

                # add noise to the action during training
                action = self.noise.add_noise(action, state, self.episode_idx, exploration_strategy='gaussian', heu_explore=self.heu_explore)

                quantity = self.ad.decode(action, state)
                next_state, reward, done, _ = self.env.step(quantity)

                # The normalised action (not the decoded quantity) is stored.
                self.replay_buffer.push(state, action, reward, next_state, done)

                if len(self.replay_buffer) > batch_size:
                    step_vl1, step_vl2 = self.td3_update(step, batch_size)
                    epi_vl1 += step_vl1
                    epi_vl2 += step_vl2

                state = next_state
                episode_reward += reward

                if done:
                    self.episode_idx += 1
                    # `step` is 0-based, so the episode lasted step + 1 steps.
                    # Dividing by `step` was off by one and raised
                    # ZeroDivisionError for one-step episodes.
                    steps_taken = step + 1
                    self.training_history.append([run, self.episode_idx, self.heu_explore, episode_reward, epi_vl1/steps_taken, epi_vl2/steps_taken])
                    break

            if self.episode_idx % 10 == 0:
                for _ in range(5):
                    self.reward_history.append(self.test(run))

    def test(self, run):
        """Play one episode with the current policy and no exploration noise.

        Args:
            run: label echoed back in the result.

        Returns:
            ``[run, self.heu_explore, self.episode_idx, cul_reward]`` where
            ``cul_reward`` is the sum of rewards over the episode.
        """
        state = self.env.reset()

        cul_reward = 0
        for _ in count():
            action = self.policy_net.get_action(state)
            quantity = self.ad.decode(action, state)
            next_state, reward, done, _ = self.env.step(quantity)

            cul_reward += reward

            state = next_state
            if done:
                break

        return [run, self.heu_explore, self.episode_idx, cul_reward]

In [6]:
class ActionDecoder():
    """Map normalised actions in [-1, 1] to rounded order quantities.

    ``action_space`` must provide ``'low'`` and ``'high'``; ``action_type``
    selects how the rescaled value is post-processed.
    """

    def __init__(self, action_space, action_type='y'):
        self.low, self.high = action_space['low'], action_space['high']
        self.action_type = action_type

    def decode(self, action, state):
        """Rescale ``action`` to [low, high], post-process it and round.

        * ``'d+y'``: adds ``state[i*8+2]`` to the i-th quantity and floors the
          result at zero (presumably the i-th player's demand; verify against
          the environment's state layout).
        * ``'y'``: shifts every quantity by 8.
        * any other value: no post-processing.

        Returns:
            ``numpy`` array of quantities rounded to whole numbers.
        """
        # Affine map [-1, 1] -> [low, high], then guard against out-of-range input.
        rescaled = self.low + (np.array(action) + 1.0) * 0.5 * (self.high - self.low)
        rescaled = np.clip(rescaled, self.low, self.high)

        mode = self.action_type
        if mode == 'd+y':  # TODO
            shifted = []
            for idx in range(rescaled.size):
                shifted.append(state[idx*8+2] + rescaled[idx])
            rescaled = np.maximum(0, shifted)
        elif mode == 'y':
            rescaled = rescaled + 8.0

        return np.round(rescaled, decimals=0)



In [7]:
class ExplorationActionMod(object):
    """Perturb a normalised action for exploration during training.

    Supports Gaussian noise with a linearly decaying sigma, epsilon-greedy
    uniform actions, and (optionally, on top of either) replacing the action
    with a heuristic action derived from ``target_inv`` and the state.
    """

    def __init__(self, action_space, max_sigma=0.9, min_sigma=0.1, decay_episode=1000, target_inv=None):
        """
        Args:
            action_space: mapping providing ``'low'`` and ``'high'``.
            max_sigma: noise scale / exploration probability at ``t == 0``.
            min_sigma: noise scale / exploration probability once decayed.
            decay_episode: number of episodes over which the decay happens.
            target_inv: per-player target inventory (scalar, list or array);
                required when ``heu_explore`` is used.
        """
        self.low  = action_space['low']
        self.high = action_space['high']

        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_episode = decay_episode

        self.target_inv = target_inv

    def add_noise(self, action, state, t=0, exploration_strategy='gaussian', heu_explore=False):
        """Return an exploratory version of ``action`` clipped to [-1, 1].

        Args:
            action: normalised action (sequence or array).
            state: environment state, read only by heuristic exploration.
            t: episode index driving the linear decay.
            exploration_strategy: ``'gaussian'`` or ``'epsilon'``; any other
                value leaves the action unperturbed.
            heu_explore: if truthy, the action is replaced by the heuristic
                action with probability ``(eps / 2) ** 2``, where ``eps``
                decays linearly from ``max_sigma`` to 0.

        Raises:
            AssertionError: if ``heu_explore`` is set but ``target_inv`` is None.
        """
        progress = min(1.0, t / self.decay_episode)

        if exploration_strategy == 'gaussian':
            # sigma linearly decays from max to min
            sigma  = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
            action = action + np.random.normal(size=len(action)) * sigma

        elif exploration_strategy == 'epsilon':
            # This branch used to read `sample` / `EPS` before they were
            # assigned and always produced a single-element action.
            epsilon = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
            if random.random() < epsilon:
                # Uniform random action in [-1, 1) with the input's dimension.
                action = np.array([random.random() * 2 - 1 for _ in range(len(action))])
            else:
                action = np.array(action, dtype=float)

        if heu_explore:
            assert self.target_inv is not None

            # Unlike sigma above, this probability decays all the way to zero.
            heu_eps = self.max_sigma - self.max_sigma * progress
            if random.random() < (heu_eps / 2) ** 2:
                # exploration guided by heuristics
                action = self._heuristic_action(state)

        return np.clip(action, -1.0, 1.0)

    def _heuristic_action(self, state):
        """Compute the heuristic action from ``target_inv`` and ``state``.

        NOTE(review): the indexing assumes each player occupies 8 consecutive
        state entries; the meaning of the individual offsets (0, 1, 2, 3, 5)
        is not visible here and should be confirmed against the environment.
        """
        # Accept scalars and lists as well as arrays (`.size` needs an ndarray).
        target_inv = np.array(self.target_inv)
        n_players = target_inv.size

        # Player 0 reads offset 2 of its own slice; player i > 0 reads offset 5
        # of the previous player's slice.
        d = np.array([state[2]] + [state[(i - 1) * 8 + 5] for i in range(1, n_players)])
        unfilled_demand = d + [state[i * 8 + 1] for i in range(n_players)]
        unfilled_demand[0] = state[1]

        bsq = target_inv - np.array([state[i * 8 + 0] + state[i * 8 + 3] - unfilled_demand[i] for i in range(n_players)])

        # Scale the clipped difference into the normalised action range.
        return np.clip(bsq - d, -8, 8) / 8.0

#https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/gaussian_strategy.py
    
# Reference (Gaussian exploration strategy): https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/gaussian_strategy.py

In [8]:
# utilized = [i for i in range(32)]
# utilized.remove(26)
# utilized.remove(18)
# utilized.remove(10)


class ValueNetwork(nn.Module):
    """Critic: two hidden ReLU layers mapping (state, action) to one value."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Output layer starts with small uniform weights and bias in [-init_w, init_w].
        for param in (self.linear3.weight, self.linear3.bias):
            param.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return the value for each (state, action) row; inputs are joined on dim 1."""
        features = torch.cat((state, action), dim=1)
        for hidden_layer in (self.linear1, self.linear2):
            features = F.relu(hidden_layer(features))
        return self.linear3(features)

    

class PolicyNetwork(nn.Module):
    """Actor: two hidden ReLU layers followed by a tanh-squashed action head."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_actions)

        # Output layer starts with small uniform weights and bias in [-init_w, init_w].
        for param in (self.linear3.weight, self.linear3.bias):
            param.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return actions in [-1, 1] for a batch of states."""
        hidden = state
        for hidden_layer in (self.linear1, self.linear2):
            hidden = F.relu(hidden_layer(hidden))
        return torch.tanh(self.linear3(hidden))

    def get_action(self, state):
        """Return the action for a single state as a 1-D numpy array.

        The state is wrapped into a batch of one on the module-level
        ``device``; the result is detached and moved back to the CPU.
        """
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)
        output = self.forward(batch)
        return output.detach().cpu().numpy()[0]

In [9]:
# Load the previously saved models; the evaluation cell below indexes the
# result and calls `.test(...)` on its first element.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# open pickle files that come from a trusted source.
# NOTE(review): presumably the pickle holds instances of the classes defined
# above, in which case those cells must be run first -- confirm.
with open('models_single_player_manufacturer.pickle', 'rb') as f:
    loadedmodels = pickle.load(f)

In [10]:
# Mean cumulative test reward of the first loaded model over `runs` episodes.
runs = 100
agent = loadedmodels[0]

r = 0
for i in range(runs):
    # The reward is read from index 3 of the returned list. Assuming the
    # loaded object is a TD3 agent as defined above, the argument (1000) is
    # only echoed back as the run label.
    result = agent.test(1000)
    r = r + result[3]

print(r/runs)

-1248.14
