# Consolidation with DQN

In [1]:
"""
Gossip-based virtual machine consolidation in a cloud environment.
"""

'\nGossip-based virtual machine consolidation in a cloud environment.\n'

#### Load dependencies

In [2]:
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np
import random
import math
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os

Using TensorFlow backend.


# Define the environment

In [3]:
class ConsolidationEnv(gym.Env):
    """
    Gossip-based virtual machine (VM) consolidation environment.

    Description:
        A datacenter hosts many virtual machines (VMs). VMs need to be
        consolidated onto some of the physical machines (PMs) so that the
        other physical machines can be turned off.
    Source:
        This environment corresponds to gossip-based VM consolidation.
    Observation:
        Type: Box(2)
        Num   Observation            Min    Max
        0     CPU Utilization        0.0    100.0
        1     Memory Utilization     0.0    100.0

    Actions:
        Type: Discrete(9)
        Num   Action
        0     Low
        1     Medium
        2     High
        3     xHigh
        4     2xHigh
        5     3xHigh
        6     4xHigh
        7     5xHigh
        8     Overload

    Rewards-out:
        In sender mode, rewards are given to a PM to move out its VMs so
        that it can switch off.
        Level   Reward
        0       1000
        1       900
        2       800
        3       700
        4       600
        5       500
        6       400
        7       300
        8       200

    Rewards-in:
        In recipient mode, rewards are given to avoid SLA violation, which
        occurs when a PM moves to an overload state.
        Level   Reward
        0-7     100
        8       -2000

    Starting State:
        Both observations are drawn uniformly at random from [0, 100).
    Episode Termination:
        Every episode consists of a single decision: ``step`` returns the
        sender-mode reward for the chosen level and ``done = True``.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second' : 50
    }

    def __init__(self):
        """Set up the utilization bounds, the spaces and the reward tables."""
        self.cpu_min_utilization = 0.0
        self.cpu_max_utilization = 100.0
        self.memory_min_utilization = 0.0
        self.memory_max_utilization = 100.0

        # Observation bounds, ordered [CPU utilization, memory utilization].
        low = np.array([
            self.cpu_min_utilization,
            self.memory_min_utilization], dtype=np.float32)

        high = np.array([
            self.cpu_max_utilization,
            self.memory_max_utilization], dtype=np.float32)

        # Sender-mode reward per action level (see "Rewards-out" above).
        self.reward_out = {
            0 : 1000,
            1 : 900,
            2 : 800,
            3 : 700,
            4 : 600,
            5 : 500,
            6 : 400,
            7 : 300,
            8 : 200
        }

        # Recipient-mode reward per action level (see "Rewards-in" above).
        # step() does not consult this table.
        self.reward_in = {
            0 : 100,
            1 : 100,
            2 : 100,
            3 : 100,
            4 : 100,
            5 : 100,
            6 : 100,
            7 : 100,
            8 : -2000
        }

        # One discrete action per reward level (9 levels).
        self.action_space = spaces.Discrete(len(self.reward_out)) # actions
        self.observation_space = spaces.Box(low, high, dtype=np.float32) # states

        print("Reward {}".format(self.reward_out))

        self.seed()
        self.viewer = None
        self.state = None

        # None until an episode has finished; afterwards counts the step()
        # calls made without an intervening reset().
        self.steps_beyond_done = None

    def seed(self, seed=None):
        """Seed the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """
        Apply one consolidation decision.

        Args:
            action: level in ``action_space`` (0 = Low ... 8 = Overload).

        Returns:
            ``(observation, reward, done, info)`` where the observation is
            the current (CPU, memory) state, the reward is
            ``reward_out[action]``, ``done`` is always True and ``info`` is
            an empty dict.

        Raises:
            AssertionError: if ``action`` is not in ``action_space``.
            RuntimeError: if called before ``reset()``.
        """
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        if self.state is None:
            raise RuntimeError("Call reset() before step().")

        # NOTE(review): this file defines no transition dynamics for the
        # consolidation model, so the observation is returned unchanged and
        # each episode ends after a single decision. Revisit once the effect
        # of a migration on CPU/memory utilization is specified.
        done = True

        if self.steps_beyond_done is None:
            # First (and only valid) step of the episode.
            self.steps_beyond_done = 0
            reward = self.reward_out[int(action)]
        else:
            if self.steps_beyond_done == 0:
                logger.warn("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}

    def reset(self):
        """Start a new episode with random CPU and memory utilization in [0, 100)."""
        self.state = self.np_random.uniform(low=0.0, high=100.0, size=(2,))
        self.steps_beyond_done = None
        return np.array(self.state)


In [4]:
# Instantiate the consolidation environment; its constructor prints the sender-mode reward table.
env = ConsolidationEnv()

Reward {0: 1000, 1: 900, 2: 800, 3: 700, 4: 600, 5: 500, 6: 400, 7: 300, 8: 200}


In [5]:
# Number of observation features, read from the first dimension of the observation space's shape.
state_size = env.observation_space.shape[0]
state_size

2

In [6]:
# Number of discrete actions exposed by the environment's action space.
action_size = env.action_space.n
action_size

9

In [7]:
# Minibatch size meant for agent.replay(batch_size); that call is currently commented out in the training cell.
batch_size = 32

In [8]:
n_episodes = 1000 # number of episodes the training loop runs

In [9]:
# Directory where the training cell writes weight checkpoints.
# NOTE(review): 'model_outpu' looks like a typo for 'model_output' and 'cartpole' like a leftover
# from a CartPole example -- confirm before renaming, since existing checkpoints may live here.
output_dir = 'model_outpu/cartpole/'

In [10]:
# Create the checkpoint directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists and avoids the race between a
# separate existence check and the creation call.
os.makedirs(output_dir, exist_ok=True)

In [11]:
class DQNAgent:
    """
    Deep Q-Network agent with an experience-replay memory and an
    epsilon-greedy policy.

    A small fully connected network maps a state to one Q-value per action.
    Experiences are stored with ``remember`` and used for training by
    ``replay``.
    """

    def __init__(self, state_size, action_size):
        """
        Args:
            state_size: number of features in a state (network input width).
            action_size: number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # replay memory; the oldest experiences are dropped once 2000 are stored
        self.gamma = 0.95 # discount rate applied to the estimated future reward
        self.epsilon = 1.0 # exploration rate: probability of taking a random action
        self.epsilon_decay = 0.995 # factor applied to epsilon after every training replay
        self.epsilon_min = 0.01 # epsilon is no longer decayed once it is at or below this value
        self.learning_rate = 0.001 # learning rate passed to the Adam optimizer
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network that approximates the Q-value function."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu')) # 1st hidden layer; states as input
        model.add(Dense(24, activation='relu')) # 2nd hidden layer
        model.add(Dense(self.action_size, activation='linear')) # one output neuron (Q-value) per action
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one experience tuple in the replay memory for later training."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
        Choose an action with an epsilon-greedy policy.

        Args:
            state: model input for the current state; assumed to be a batch
                of one row, i.e. shape (1, state_size) -- TODO confirm
                against callers.

        Returns:
            int: a random action index with probability ``epsilon``,
            otherwise the index of the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon: # explore: take a random action
            return random.randrange(self.action_size)
        act_values = self.model.predict(state) # exploit: predict Q-values for the current state
        # Cast so both branches return a plain Python int (argmax yields a NumPy integer).
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """
        Train the network on a random minibatch drawn from memory.

        Does nothing (no training, no epsilon decay) while the memory holds
        fewer than ``batch_size`` experiences; ``random.sample`` would raise
        ValueError in that case.

        Args:
            batch_size: number of stored experiences to sample and train on.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size) # sample a minibatch from memory
        for state, action, reward, next_state, done in minibatch:
            target = reward # terminal experience: the target is the observed reward
            if not done: # otherwise add the discounted best predicted Q-value of the next state
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state) # current Q-value estimates for this state
            target_f[0][action] = target # only the taken action's Q-value is moved toward the target
            self.model.fit(state, target_f, epochs=1, verbose=0) # one gradient pass per sampled experience
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

In [12]:
# Build the DQN agent sized to the environment's observation and action spaces.
agent = DQNAgent(state_size, action_size)

In [13]:
done = False  # not read anywhere in this loop
for episode in range(n_episodes):
    print("episode: {}/{}".format(episode,n_episodes))

    # Draw a fresh random (CPU, memory) observation and add a batch dimension for the network.
    state = np.reshape(env.reset(), [1, state_size])
    print("State[CPU, Memory] => {}".format(state))

    # The agent picks one of the action levels 0..8; the sender-mode reward is
    # looked up directly from the environment's table rather than via env.step().
    action = agent.act(state)
    reward = env.reward_out[action]
    print("Action:{}, Reward:{}".format(action,reward))

    # TODO: training is not wired up yet. The disabled scaffold called
    # env.step(action), reshaped next_state to [1, state_size], stored the
    # transition with agent.remember(state, action, reward, next_state, done),
    # and ran agent.replay(batch_size) once len(agent.memory) > batch_size.

    # Write a weights checkpoint on every 50th episode (0, 50, 100, ...).
    if episode % 50 != 0:
        continue
    checkpoint_name = "weights_" + '{:04d}'.format(episode) + ".hdf5"
    agent.save(output_dir + checkpoint_name)


episode: 0/1000
State[CPU, Memory] => [[10.92854835 26.52157917]]
Action:4, Reward:600
episode: 1/1000
State[CPU, Memory] => [[95.34512827 18.81774225]]
Action:8, Reward:200
episode: 2/1000
State[CPU, Memory] => [[70.42237247  2.04040613]]
Action:4, Reward:600
episode: 3/1000
State[CPU, Memory] => [[43.54217353 21.71481306]]
Action:6, Reward:400
episode: 4/1000
State[CPU, Memory] => [[ 2.93043874 73.25766216]]
Action:4, Reward:600
episode: 5/1000
State[CPU, Memory] => [[54.28609452 79.49538945]]
Action:3, Reward:700
episode: 6/1000
State[CPU, Memory] => [[89.50862328 51.91015832]]
Action:5, Reward:500
episode: 7/1000
State[CPU, Memory] => [[77.57402538 70.35001779]]
Action:7, Reward:300
episode: 8/1000
State[CPU, Memory] => [[64.6195723   3.47058817]]
Action:5, Reward:500
episode: 9/1000
State[CPU, Memory] => [[11.8893452  48.39524371]]
Action:2, Reward:800
episode: 10/1000
State[CPU, Memory] => [[59.97204866 63.16146302]]
Action:7, Reward:300
episode: 11/1000
State[CPU, Memory] => [[7

episode: 303/1000
State[CPU, Memory] => [[29.74773721  5.02825547]]
Action:5, Reward:500
episode: 304/1000
State[CPU, Memory] => [[79.60448237 13.6643466 ]]
Action:4, Reward:600
episode: 305/1000
State[CPU, Memory] => [[26.15293591  4.98936451]]
Action:1, Reward:900
episode: 306/1000
State[CPU, Memory] => [[19.29452223 65.92604431]]
Action:8, Reward:200
episode: 307/1000
State[CPU, Memory] => [[11.84554571 74.8423891 ]]
Action:7, Reward:300
episode: 308/1000
State[CPU, Memory] => [[40.10129563 43.07927374]]
Action:6, Reward:400
episode: 309/1000
State[CPU, Memory] => [[48.28722504 81.96162111]]
Action:6, Reward:400
episode: 310/1000
State[CPU, Memory] => [[56.76551366 92.5952803 ]]
Action:2, Reward:800
episode: 311/1000
State[CPU, Memory] => [[ 0.7617319  85.84236511]]
Action:0, Reward:1000
episode: 312/1000
State[CPU, Memory] => [[63.47574206 43.3242725 ]]
Action:6, Reward:400
episode: 313/1000
State[CPU, Memory] => [[85.434654   37.86078578]]
Action:2, Reward:800
episode: 314/1000
St

Action:0, Reward:1000
episode: 396/1000
State[CPU, Memory] => [[93.92183178  5.39240051]]
Action:6, Reward:400
episode: 397/1000
State[CPU, Memory] => [[21.97331507 60.24802151]]
Action:6, Reward:400
episode: 398/1000
State[CPU, Memory] => [[15.46771697 99.15965571]]
Action:7, Reward:300
episode: 399/1000
State[CPU, Memory] => [[62.6708661   5.05400231]]
Action:3, Reward:700
episode: 400/1000
State[CPU, Memory] => [[97.08958615 82.48519056]]
Action:5, Reward:500
episode: 401/1000
State[CPU, Memory] => [[46.99971284 67.24770133]]
Action:1, Reward:900
episode: 402/1000
State[CPU, Memory] => [[95.82643142 52.29705682]]
Action:3, Reward:700
episode: 403/1000
State[CPU, Memory] => [[18.97299748  0.59856541]]
Action:6, Reward:400
episode: 404/1000
State[CPU, Memory] => [[16.14120193 51.34012508]]
Action:4, Reward:600
episode: 405/1000
State[CPU, Memory] => [[86.96405894 95.4690438 ]]
Action:8, Reward:200
episode: 406/1000
State[CPU, Memory] => [[53.60102683 36.38325389]]
Action:2, Reward:800

episode: 636/1000
State[CPU, Memory] => [[20.46308811  4.96379661]]
Action:5, Reward:500
episode: 637/1000
State[CPU, Memory] => [[38.56984156 59.39591238]]
Action:6, Reward:400
episode: 638/1000
State[CPU, Memory] => [[16.79614626 92.45073708]]
Action:8, Reward:200
episode: 639/1000
State[CPU, Memory] => [[81.28045233 84.92422014]]
Action:6, Reward:400
episode: 640/1000
State[CPU, Memory] => [[66.52867836 85.67895759]]
Action:0, Reward:1000
episode: 641/1000
State[CPU, Memory] => [[57.80360655 11.48720336]]
Action:0, Reward:1000
episode: 642/1000
State[CPU, Memory] => [[68.65691135 10.52378188]]
Action:1, Reward:900
episode: 643/1000
State[CPU, Memory] => [[89.41939097 93.05187814]]
Action:5, Reward:500
episode: 644/1000
State[CPU, Memory] => [[79.0923829   8.28739598]]
Action:6, Reward:400
episode: 645/1000
State[CPU, Memory] => [[46.30551236 34.67288686]]
Action:6, Reward:400
episode: 646/1000
State[CPU, Memory] => [[32.7139375  58.29769817]]
Action:2, Reward:800
episode: 647/1000
S

episode: 969/1000
State[CPU, Memory] => [[26.42074847 99.66675315]]
Action:3, Reward:700
episode: 970/1000
State[CPU, Memory] => [[50.27957732 14.73572232]]
Action:4, Reward:600
episode: 971/1000
State[CPU, Memory] => [[72.95249809 21.43627327]]
Action:0, Reward:1000
episode: 972/1000
State[CPU, Memory] => [[22.51056834 86.38094905]]
Action:2, Reward:800
episode: 973/1000
State[CPU, Memory] => [[49.70654063 33.20880256]]
Action:5, Reward:500
episode: 974/1000
State[CPU, Memory] => [[59.71864232 10.3822512 ]]
Action:8, Reward:200
episode: 975/1000
State[CPU, Memory] => [[23.63127402 42.76442777]]
Action:2, Reward:800
episode: 976/1000
State[CPU, Memory] => [[44.41727397  5.73562231]]
Action:8, Reward:200
episode: 977/1000
State[CPU, Memory] => [[ 7.94569947 30.13671026]]
Action:1, Reward:900
episode: 978/1000
State[CPU, Memory] => [[33.12591311 16.71781865]]
Action:3, Reward:700
episode: 979/1000
State[CPU, Memory] => [[59.94610853 47.00272967]]
Action:6, Reward:400
episode: 980/1000
St