In [1]:
import sys
sys.path.append("/Users/ratnadeepb/research/park")

In [28]:
import numpy as np
import random
# from IPython.display import clear_output
from collections import deque
import progressbar

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Reshape #, Embedding
from tensorflow.keras.optimizers import Adam
import park

In [15]:
# default floatx is float32
# to change individual layer type add `dtype=float64` to layer constructor
tf.keras.backend.set_floatx('float64') # change all layer default output to float64; default is float32

In [129]:
env = park.make("load_balance")
print('Number of states: {}'.format(env.observation_space.shape[0]))
print('Number of actions: {}'.format(env.action_space.n))

INFO:root:Making new env load_balance


Number of states: 11
Number of actions: 10


In [120]:
tf.Tensor(state.reshape(-1, 1), dtype="float64")

TypeError: __init__() missing 1 required positional argument: 'value_index'

In [138]:
class Agent:
    def __init__(self, env, optimizer):
        
        # Initialize atributes
#         self._state_size = env.observation_space.shape[0]
        self._state_size = 1
        self._action_size = env.action_space.n
        self._optimizer = optimizer
        
        self.expirience_replay = deque(maxlen=2000)
        
        # Initialize discount and exploration rate
        self.gamma = 0.6
        self.epsilon = 0.1
        
        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model()

    def store(self, state, action, reward, next_state, terminated):
        # Store experience
        self.expirience_replay.append((state, action, reward, next_state, terminated))
    
    def _build_compile_model(self):
        # Create the model
        # a feedforward network
        model = Sequential([
                # Adds a densely-connected layer with 64 units to the model:
#                 Dense(64, activation='relu', input_shape=self._state_size),
#                 Dense(64, activation='relu', input_shape=(None, 11, 1)),
                Dense(64, activation='relu', input_dim=self._state_size),
                Dense(64, activation='relu'),
                tf.keras.layers.Dropout(0.2),
                # Add an output layer with env.action_space.n (10) output units:
                Dense(env.action_space.n)])
        
        # Configure a model for mean-squared error regression.
        model.compile(loss='mse', optimizer=self._optimizer, metrics=['mae'])
        return model

    def align_target_model(self):
        self.target_network.set_weights(self.q_network.get_weights())
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            # Explore
            return env.action_space.sample()
        # greedy action
        q_values = self.q_network.predict([state.reshape(-1, 1)])
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        # pick random samples from experience memory and train the network
        minibatch = random.sample(self.expirience_replay, batch_size)
        
        for state, action, reward, next_state, terminated in minibatch:
            
            target = self.q_network.predict(state)
            
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict([next_state.reshape(-1, 1)])
                target[0][action] = reward + self.gamma * np.amax(t)
            
            self.q_network.fit(state, target, epochs=1, verbose=0)

In [139]:
optimizer = Adam(learning_rate=0.01)
agent = Agent(env, optimizer)

batch_size = 10
num_of_episodes = 2
timesteps_per_episode = 10
agent.q_network.summary()

Model: "sequential_50"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_147 (Dense)            (None, 64)                128       
_________________________________________________________________
dense_148 (Dense)            (None, 64)                4160      
_________________________________________________________________
dropout_34 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_149 (Dense)            (None, 10)                650       
Total params: 4,938
Trainable params: 4,938
Non-trainable params: 0
_________________________________________________________________


In [140]:
for e in range(0, num_of_episodes):
    # Reset the env
    state = env.reset()
    
    # Initialize variables
    reward = 0
    terminated = False
    
    bar = progressbar.ProgressBar(maxval=timesteps_per_episode/10, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    
    for timestep in range(timesteps_per_episode):
        # Run Action
        action = agent.act(state)
        
        # Take action    
        # step() is a function provided by OpenAIGym to step through the environment
        # the step function provides if episode has terminated
        next_state, reward, terminated, info = env.step(action)
        agent.store(state, action, reward, next_state, terminated)
        
        state = next_state
        
        if terminated:
            agent.align_target_model()
            break
            
        if len(agent.expirience_replay) > batch_size:
            agent.retrain(batch_size)
        
        if timestep%10 == 0:
            bar.update(timestep/10 + 1)
    
    bar.finish()
    if (e + 1) % 10 == 0:
        print("**********************************")
        print("Episode: {}".format(e + 1))
#         env.render()
        print("**********************************")



In [141]:
agent.q_network.weights

[<tf.Variable 'dense_147/kernel:0' shape=(1, 64) dtype=float64, numpy=
 array([[-0.07484313, -0.03218543,  0.08100189, -0.16856313, -0.05738871,
         -0.15596949, -0.14344993, -0.09284663,  0.04901748, -0.05874215,
          0.1394715 ,  0.12418054,  0.09317688,  0.1738387 , -0.17761232,
          0.19872926, -0.04659388,  0.00336205, -0.11145804, -0.02452014,
          0.09349994,  0.1200101 ,  0.06909576, -0.1703019 , -0.08488101,
          0.11024516, -0.16083452,  0.18351591, -0.06915469, -0.02201007,
          0.11208523, -0.20749332, -0.05990693,  0.07194697,  0.02194111,
         -0.1948713 ,  0.07672964, -0.26843145, -0.10092921,  0.04712746,
          0.05592956, -0.02204784,  0.18843578, -0.2820346 ,  0.05883852,
          0.04763998, -0.02797758, -0.00174423, -0.21928572,  0.02982591,
          0.13115012,  0.07525647,  0.12058703, -0.08779514,  0.08614374,
          0.07572894,  0.08091547, -0.05658172, -0.24889282, -0.02891738,
         -0.21521194, -0.22526737, -0.270

In [142]:
agent.target_network.weights

[<tf.Variable 'dense_150/kernel:0' shape=(1, 64) dtype=float64, numpy=
 array([[-0.07484313, -0.03218543,  0.17315779, -0.16856313,  0.0026635 ,
         -0.15596949, -0.14344993, -0.09284663,  0.07072231, -0.05874215,
          0.13977964,  0.09572458,  0.19067264,  0.22574636, -0.17761232,
          0.29249721,  0.01542693,  0.08546279, -0.11145804, -0.02452014,
          0.12830808,  0.05534588,  0.0334858 , -0.1703019 , -0.08488101,
          0.17767946, -0.16083452,  0.28813957, -0.06915469,  0.02237131,
          0.14971392, -0.20749332, -0.05990693,  0.17887608,  0.13525768,
         -0.1948713 ,  0.17141749, -0.26843145, -0.10092921,  0.06444795,
          0.12692374,  0.1562484 ,  0.21710207, -0.2820346 ,  0.19915185,
          0.12244356,  0.05251989, -0.00174423, -0.21928572,  0.13328565,
          0.22977678,  0.16539467,  0.25694555, -0.08779514,  0.18567115,
          0.08629407,  0.0757643 ,  0.00623719, -0.24889282, -0.02891738,
         -0.21521194, -0.22526737, -0.270

In [143]:
agent.target_network.get_weights()

[array([[-0.07484313, -0.03218543,  0.17315779, -0.16856313,  0.0026635 ,
         -0.15596949, -0.14344993, -0.09284663,  0.07072231, -0.05874215,
          0.13977964,  0.09572458,  0.19067264,  0.22574636, -0.17761232,
          0.29249721,  0.01542693,  0.08546279, -0.11145804, -0.02452014,
          0.12830808,  0.05534588,  0.0334858 , -0.1703019 , -0.08488101,
          0.17767946, -0.16083452,  0.28813957, -0.06915469,  0.02237131,
          0.14971392, -0.20749332, -0.05990693,  0.17887608,  0.13525768,
         -0.1948713 ,  0.17141749, -0.26843145, -0.10092921,  0.06444795,
          0.12692374,  0.1562484 ,  0.21710207, -0.2820346 ,  0.19915185,
          0.12244356,  0.05251989, -0.00174423, -0.21928572,  0.13328565,
          0.22977678,  0.16539467,  0.25694555, -0.08779514,  0.18567115,
          0.08629407,  0.0757643 ,  0.00623719, -0.24889282, -0.02891738,
         -0.21521194, -0.22526737, -0.27067115,  0.07373622]]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [144]:
predictions = agent.target_network.predict(state)

In [145]:
tf.nn.softmax(predictions).numpy()

array([[2.08133906e-023, 5.38588840e-001, 3.51537731e-020,
        1.30923538e-014, 3.44650671e-051, 1.49113586e-006,
        7.85996391e-028, 3.04814369e-018, 8.12625168e-064,
        4.61409669e-001],
       [1.19858361e-011, 5.17721167e-001, 4.07615134e-010,
        1.79405487e-007, 7.87012688e-025, 1.19350524e-003,
        9.54757569e-014, 3.38769756e-009, 8.01598259e-031,
        4.81085144e-001],
       [1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.00000000e-001],
       [7.18458417e-009, 5.10575050e-001, 9.70732865e-008,
        8.68440316e-006, 1.33111905e-018, 5.76873461e-003,
        2.02754389e-010, 4.63488723e-007, 5.01850435e-023,
        4.83646963e-001],
       [1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.00000000e-001, 1.00000000e-001, 1.00000000e-001,
        1.0

In [117]:
state.shape

(11,)