In [1]:
import numpy as np
import gym
from gym import spaces
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
import copy
import collections

In [4]:
params = {  'double_q':         True,
            'nn_layer_width':   32,
            'rollout_steps':    1,        # no nstep rollout
            'replay_size':      24*365*2, # 240*365*2/10,
            'replay_initial':   24*10,    # 240*10/10,
            'target_net_sync':  24*10,    # 240*10/10,
            'epsilon_frames':   24*30*6,  # 240*30*6/10,
            'epsilon_start':    1.0,
            'epsilon_final':    0.01,
            'learning_rate':    1E-3,
            'gamma':            0.90, 
            'batch_size':       32
        }

In [5]:
class DQN(nn.Module):
    def __init__(self, input_shape, n_actions, layer_width):
        super(DQN, self).__init__()
        
        self.fc = nn.Sequential(
            nn.Linear(input_shape, layer_width),
            nn.ReLU(),
            nn.Linear(layer_width, n_actions)
        )

    def forward(self, x):
        return self.fc(x.float())

In [6]:
class TargetNet:
    """
    Wrapper around model which provides copy of it instead of trained weights
    """
    def __init__(self, model):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def sync(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def alpha_sync(self, alpha):
        """
        Blend params of target net with params from the model
        :param alpha:
        """
        assert isinstance(alpha, float)
        assert 0.0 < alpha <= 1.0
        state = self.model.state_dict()
        tgt_state = self.target_model.state_dict()
        for k, v in state.items():
            tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v
        self.target_model.load_state_dict(tgt_state)

# End of TargetNet()
########################################################

In [7]:
class night_utility_generator(object): # generates utility of data, dependent on time
    def __init__(self):
        pass    
    def get_utility(self, time):
        assert 0<=time<24, 'Invalid time'
        if 0<=time<6 or 18<time<24:
            utility = 0.5
        else:
            utility = 0.2
        return utility

In [8]:
class toy_utility(gym.Env):
    def __init__(self):
        super(toy_utility, self).__init__()

        # Actions = 10 discrete duty cycles
        self.NO_OF_DUTY_CYCLES = 10
        self.action_space = spaces.Discrete(n=self.NO_OF_DUTY_CYCLES)

        # Observation = [utility]
        self.observation_space = spaces.Box(low=0, 
                                            high=1, 
                                            shape=(1,))
        self.time_obs = 0
        self.time_idx = 0

        self.MIN_DC = 1/self.NO_OF_DUTY_CYCLES # Minimum duty cycle

    def reset(self):
        self.TIMESLOTS_PER_DAY = 24
        self.utility_gen = night_utility_generator()
        self.time = 0
        self.time_idx = 0
        self.utility_obs = self.utility_gen.get_utility(self.time_obs)
        return np.array(self.utility_obs)

    def step(self,action):
        self.time_idx += 1
        self.time_obs = self.time_idx%self.TIMESLOTS_PER_DAY
        self.utility_obs = self.utility_gen.get_utility(self.time_obs)
        
        reward = self.reward(action)
        
        if self.time_idx == self.TIMESLOTS_PER_DAY*1000:
            done = True
        else:
            done = False
        
        info = {}
        return np.array(self.utility_obs), reward, done, info

    def reward(self,action):
        sense_dc = action/self.NO_OF_DUTY_CYCLES + self.MIN_DC
        if sense_dc >= self.utility_obs:
            return self.utility_obs
        else:
            return sense_dc *0.75


In [9]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states),np.array(actions), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), np.array(next_states)

In [10]:
def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=False):
    states, actions, rewards, dones, next_states = batch    
    
    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones,dtype=torch.bool).to(device)

    state_action_values = net(states_v).gather(dim=1, 
                                               index=actions_v.unsqueeze(-1)).squeeze(-1)

    if double: # double-DQN
        next_state_actions = net(next_states_v).max(dim=1)[1] # get greedy actions from policy net
        next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(dim=1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [11]:
def calc_values_of_states(states, net, device="cpu"):
    mean_vals = []
    # np.array_split splits the array into sub-arrays 
    # (in this case, each sub-array has a max size of 64)
    # For an array of length l that should be split into n sections, 
    # it returns l % n sub-arrays of size l//n + 1 and the rest of size l//n.
    for batch in np.array_split(states, 64): 
        states_v = torch.tensor(batch).to(device)
        action_values_v = net(states_v) #get q_values of all actions for all the 64 states
        best_action_values_v = action_values_v.max(1)[0] # get maximum q_values for each of the 64 states
        mean_vals.append(best_action_values_v.mean().item()) # take mean of the 64 max q_values
    return np.mean(mean_vals)

In [12]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [13]:
env = toy_utility()



In [14]:
device = torch.device('cpu')
policy_net = DQN(env.observation_space.shape[0],
                               env.action_space.n,
                               params['nn_layer_width']).to(device)
target_net = TargetNet(policy_net)
target_net.sync()
buffer = ExperienceBuffer(params['replay_size'])
optimizer = optim.Adam(policy_net.parameters(), lr=params['learning_rate'])

In [19]:
utility_log =[]
action_log = []
reward_log = []
frame_idx = 0

state = env.reset()
utility_log.append(state)

is_done = False

while not is_done:
    frame_idx += 1
    # Get epsilon
    epsilon = max(params['epsilon_start'], params['epsilon_final'] - frame_idx /params['epsilon_frames'])

    # Get action
    if np.random.random() < epsilon: # random exploratory action
        action = np.random.randint(env.action_space.n)
    else: # greedy action
        with torch.no_grad():
            state_v = torch.tensor(np.array(state, copy=False)).unsqueeze(dim=0).to(device) # convert to torch tensor
            q_vals_v = policy_net(state_v) # get Q-values
            _, action_idx = torch.max(q_vals_v, dim=1) # Argmax Q-values and extract action
            action = int(action_idx.item()) # convert from torch to python variable

    # do step in the environment
    new_state, reward, is_done, _ = env.step(action)
    
    # Record log
    utility_log.append(new_state)
    action_log.append(action/10 + 0.1)
    reward_log.append(reward)
            
    # Record experience
    exp = Experience(state, action, reward, is_done, new_state)
    buffer.append(exp)

    if len(buffer) < params['replay_initial']:
        continue 
    # Train
    optimizer.zero_grad()
    batch = buffer.sample(params['batch_size'])
    loss_v = calc_loss(batch, 
                      policy_net, 
                      target_net.target_model, 
                      device=device,
                       gamma = params['gamma'],
                      double=params['double_q'])
    loss_v.backward()
    optimizer.step()
        
    


# for i in range(50):
#     action =  np.random.randint(10)
#     next_state, reward, done, info = env.step(action)
    

RuntimeError: size mismatch, m1: [1 x 32], m2: [1 x 32] at C:\w\1\s\tmp_conda_3.6_171155\conda\conda-bld\pytorch_1570813991702\work\aten\src\TH/generic/THTensorMath.cpp:197

In [None]:
plt.plot(action_log)
plt.plot(utility_log)
plt.plot(reward_log)