In [1]:
import matplotlib.pyplot as plt
import time
import random
from collections import deque
import tensorflow as tf
from tqdm import tqdm 
from tensorflow.keras import Sequential 
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
import math
from tensorflow.keras.optimizers import Adam
from tqdm import trange
import pandas as pd
from IPython.display import clear_output
from deadlineSchedulingEnv import deadlineSchedulingEnv
import concurrent.futures
import numpy as np

In [2]:
# Report which devices TensorFlow can see and which TensorFlow version is loaded.
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")
print(f"TensorFlow version: {tf.__version__}")


TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.13.0


In [3]:
# Default hyperparameters picked up by DQNSolver.
GAMMA = 0.99  # stored as DQNSolver.gamma; the training target in DQNSolver.train does not apply it
LEARNING_RATE = 1e-5  # learning rate handed to the Adam optimizer in QModel.create

MEMORY_SIZE = 1000  # maximum number of transitions kept in the replay deque
BATCH_SIZE = 32  # number of transitions sampled per training step

EPSILON_MAX = 1.0  # initial exploration probability
EPSILON_MIN = 0.01  # stored as DQNSolver.min_eps; decrement_epsilon does not currently enforce it
EXPLORATION_DECAY = 0.9995  # epsilon is multiplied by this on every decrement_epsilon call

In [8]:
# Problem and network dimensions shared by the cells below.
state_dim=3  # length of one state vector (network input width)
action_dim=2  # number of actions (network output width)
intermediate_dim=16  # not referenced by the code visible in this notebook
T=13  # first state coordinate takes values range(T) when the state grids are built
B=10  # second state coordinate takes values range(B) when the state grids are built
processing_costs=[0.1,0.3,0.6,0.8]  # processing cost per arm type (same values are passed to step for types 0-3)
num_states=T*B  # number of (i, j) grid points for a single arm type

In [5]:
class QModel:
    """Pair of identically shaped Keras networks: a policy net and a target net.

    Both networks are built by `create`; right after construction the target
    network receives a copy of the policy network's weights.

    Parameters
    ----------
    input_dim : tuple, list or int
        Per-sample input shape (without the batch axis). A bare integer ``d``
        is treated as the shape ``(1, d)``.
    output_dim : int
        Number of outputs of the final linear layer (one Q-value per action).
    lr : float
        Learning rate for the Adam optimizer.
    """

    def __init__(self, input_dim, output_dim, lr):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr
        self.Qpolicy = self.create()
        self.Qtarget = self.create()
        # Start both networks from the same weights.
        self.Qtarget.set_weights(self.Qpolicy.get_weights())

    def create(self):
        """Build and compile one 512-256-128 ReLU MLP with a linear output layer."""
        # Use the shape given to the constructor instead of the notebook-level
        # `state_dim`, so the model no longer depends on a global.
        if isinstance(self.input_dim, (tuple, list)):
            input_shape = tuple(self.input_dim)
        else:
            input_shape = (1, self.input_dim)

        model = Sequential()
        model.add(tf.keras.layers.InputLayer(input_shape=input_shape))
        for units in (512, 256, 128):
            model.add(Dense(units, activation='relu'))
        model.add(Dense(self.output_dim, activation='linear'))
        # NOTE(review): 'accuracy' is not a meaningful metric for an MSE
        # regression target; it is kept so the metrics dict stays unchanged.
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=self.lr),
            loss="mse",
            metrics=['accuracy'],
        )
        return model

In [6]:
class DQNSolver:
    """Epsilon-greedy DQN agent built on a `QModel` policy/target network pair.

    Transitions are stored in a bounded replay deque and `train` fits the
    policy network on a random mini-batch. The regression target for the
    action taken is

        reward + (1 - action) * subsidy
               + max_a' Q_target(s', a') - mean(Q_policy(all_states))

    i.e. the mean policy Q-value over `all_states` is subtracted as a baseline
    and no discount factor is applied (`gamma` is stored but not used here).
    For transitions flagged `done` only the first two terms are used.

    Parameters
    ----------
    all_states : array-like, shape (n_states, state_space)
        Reference states over which the baseline Q-value is averaged.
    state_space : int
        Length of one state vector.
    action_space : int
        Number of discrete actions.
    decay_coe : float
        Stored on the instance; not used by the current epsilon schedule.
    memory_size : int
        Maximum number of transitions kept in the replay deque.
    EXPLORATION_DECAY : float
        Factor epsilon is multiplied by on every `decrement_epsilon` call.
    LEARNING_RATE : float
        Learning rate passed to `QModel`.
    EPSILON_MAX : float
        Initial exploration probability.
    EPSILON_MIN : float
        Stored as `min_eps`; not enforced by `decrement_epsilon`.
    BATCH_SIZE : int
        Number of transitions sampled per training step.
    """

    def __init__(self, all_states, state_space, action_space, decay_coe=0.99,
                 memory_size=MEMORY_SIZE, EXPLORATION_DECAY=EXPLORATION_DECAY,
                 LEARNING_RATE=LEARNING_RATE, EPSILON_MAX=EPSILON_MAX,
                 EPSILON_MIN=EPSILON_MIN, BATCH_SIZE=BATCH_SIZE):
        self.all_states = all_states
        self.states = state_space
        self.n_actions = action_space
        self.actions = list(range(self.n_actions))

        self.lr = LEARNING_RATE
        self.gamma = GAMMA
        self.epsilon = EPSILON_MAX
        self.decay_coe = decay_coe
        self.min_eps = EPSILON_MIN
        self.batch_size = BATCH_SIZE
        self.memory = deque(maxlen=memory_size)  # replay memory

        self.terminal_state = False  # end of the episode
        self.target_counter = 0
        self.exploration_decay = EXPLORATION_DECAY

        self.model = QModel((1, self.states), self.n_actions, self.lr)

    def state_shape(self, states):
        """Return `states` as an ndarray with one extra leading axis of length 1."""
        states = np.array(states)
        return states.reshape(-1, *states.shape)

    def update_target_model(self):
        """Copy the policy network's weights into the target network."""
        self.model.Qtarget.set_weights(self.model.Qpolicy.get_weights())

    def decrement_epsilon(self):
        """Multiply epsilon by `exploration_decay`.

        No lower bound is applied: `min_eps` is stored on the instance but is
        not consulted here.
        """
        self.epsilon *= self.exploration_decay

    def forget(self):
        """Drop every stored transition."""
        self.memory.clear()

    def remember(self, s, a, r, s_, done, subsidy):
        """Append one transition (with the subsidy in force) to the replay memory."""
        self.memory.append([self.state_shape(s), a, r, self.state_shape(s_), done, subsidy])

    def act(self, states):
        """Pick an action epsilon-greedily for a single state.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the arg-max of the policy network's Q-values.
        """
        if np.random.random() > (1 - self.epsilon):
            return np.random.choice(self.actions)

        # The original reshape result was discarded; keep it so the network
        # always receives a (batch=1, 1, state) tensor whatever shape came in.
        batch = np.array(states).reshape(1, 1, self.states)
        q_values = np.array(self.model.Qpolicy.predict_on_batch(batch))
        return np.argmax(q_values)

    def minibatch(self):
        """Sample `batch_size` transitions uniformly without replacement."""
        return random.sample(self.memory, self.batch_size)

    def _baseline_q(self):
        """Mean policy-network Q-value over every (reference state, action) pair."""
        reference = np.array(self.all_states).reshape(len(self.all_states), 1, self.states)
        q_all = np.array(self.model.Qpolicy.predict_on_batch(reference))
        return q_all.sum() / (len(self.all_states) * self.n_actions)

    def train(self):
        """Run one gradient step on a random mini-batch.

        Returns
        -------
        dict or None
            The metrics dict from `train_on_batch`, or None when the replay
            memory holds fewer than `batch_size` transitions (no training).
        """
        if len(self.memory) < self.batch_size:
            return None

        batch = self.minibatch()

        # Instance values replace the globals BATCH_SIZE / state_dim so a solver
        # built with non-default sizes trains on correctly shaped tensors.
        s = np.array([row[0] for row in batch]).reshape(self.batch_size, 1, self.states)
        qvalue = np.array(self.model.Qpolicy.predict_on_batch(s))

        s_ = np.array([row[3] for row in batch]).reshape(self.batch_size, 1, self.states)
        future_qvalue = np.array(self.model.Qtarget.predict_on_batch(s_))

        # The baseline depends only on the current policy weights, which do not
        # change inside the loop, so it is computed at most once per call
        # instead of once per sampled transition.
        baseline = None

        for index, (state, action, reward, state_, done, subsidy) in enumerate(batch):
            target = reward + (1 - action) * (subsidy)
            if not done:
                if baseline is None:
                    baseline = self._baseline_q()
                target = target + np.max(future_qvalue[index][0]) - baseline
            # Only the taken action's Q-value is moved; the other keeps its
            # current prediction and therefore contributes zero error.
            qvalue[index][0][int(action)] = target

        y = qvalue.reshape(self.batch_size, 1, self.n_actions)
        return self.model.Qpolicy.train_on_batch(s, y, return_dict=True)
            


                
  

In [31]:
# Two actions per arm: 0 and 1 (action sums are later plotted as "arms activated").
action_space = [0, 1]

# State grids with rows [i, j, k]: i in range(T), j in range(B), k = arm type.
# Row order is k-major, then i, then j.
state_space_dqn = np.array(
    [[i, j, k] for k in range(0, 4) for i in range(T) for j in range(B)]
).reshape(4 * T * B, 3)

# One grid per arm type, with the type fixed in the last column.
state_space_arm1 = np.array([[i, j, 0] for i in range(T) for j in range(B)]).reshape(T * B, 3)
state_space_arm2 = np.array([[i, j, 1] for i in range(T) for j in range(B)]).reshape(T * B, 3)
state_space_arm3 = np.array([[i, j, 2] for i in range(T) for j in range(B)]).reshape(T * B, 3)
state_space_arm4 = np.array([[i, j, 3] for i in range(T) for j in range(B)]).reshape(T * B, 3)

N = 20  # total number of arms
M = 5   # activation count the subsidy update compares action sums against

epsilon = 1
subsidy = 0
arm_indexes = list(range(N))
gamma = 0.9995

In [32]:
# Spot-check one row of the type-2 state grid.
state_space_arm3[100]

array([10,  0,  2])

In [33]:
def newArrival(state_space):
    """Draw one row uniformly at random from `state_space`.

    The chosen row is returned as a fresh float32 vector of shape (3,).
    """
    candidates = list(state_space)
    (picked,) = random.sample(candidates, 1)
    return np.array(picked, dtype=np.float32).reshape(3,)


In [34]:
# Draw one random state from the type-2 grid (the result varies between runs).
newArrival(state_space_arm3)

array([11.,  6.,  2.], dtype=float32)

In [35]:
def step(state, action, state_space, processing_cost=0.8):
    """Compute the next state and reward for one arm.

    The state is read as ``[state[0], state[1], state[2]]``; judging by how the
    indices are used, these are steps-to-deadline, remaining load and arm type.
    The input `state` is never modified.

    Transitions
    -----------
    * ``state[0] == 0``: reward 0, next state drawn with ``newArrival``.
    * ``state[0] > 1``: state[0] drops by one. Action 1 also reduces the load
      by one and earns ``1 - processing_cost``; if the load was already
      exhausted the load stays 0 and the reward is 0. Action 0 earns 0.
    * ``state[0] == 1``: next state drawn with ``newArrival``. Action 1 earns
      ``(1 - processing_cost) - 0.2 * (load - 1) ** 2`` (0 if the load is 0);
      action 0 earns ``-0.2 * load ** 2``.

    Parameters
    ----------
    state : sequence of 3 numbers
    action : int
        0 or 1.
    state_space : array-like
        Rows that ``newArrival`` samples the next state from.
    processing_cost : float

    Returns
    -------
    (numpy.ndarray, float)
        Next state as a float32 vector of shape (3,), and the reward.

    Raises
    ------
    ValueError
        If `action` is not 0 or 1, or the state is outside the cases above
        (the original code failed with UnboundLocalError here).
    """
    if action not in (0, 1):
        raise ValueError(f"action must be 0 or 1, got {action!r}")

    # Work on plain Python floats so the caller's array is never mutated and
    # the reward arithmetic does not depend on the input dtype.
    deadline = float(state[0])
    load = float(state[1])
    arm_type = state[2]

    if not load >= 0:
        raise ValueError(f"remaining load must be non-negative, got {load!r}")

    if deadline == 0:
        return newArrival(state_space), 0

    if deadline > 1:
        if action == 1:
            reward = (1 - processing_cost)
            load -= 1
            if load < 0:
                # Nothing was left to process: no progress, no reward.
                load = 0
                reward = 0
        else:
            reward = 0
        next_state = np.array([deadline - 1, load, arm_type], dtype=np.float32)
        return next_state, reward

    if deadline == 1:
        if action == 1:
            if load == 0:
                reward = 0
            else:
                reward = ((1 - processing_cost) - 0.2 * ((load - 1) ** 2))
        else:
            reward = -0.2 * (load ** 2)
        return newArrival(state_space), reward

    raise ValueError(f"unsupported value for state[0]: {deadline!r}")

In [36]:
# Smoke-test step(): take action 1 in state [1, 3, 0] with the first processing cost.
# In this state step() draws the next state with newArrival, so the printed
# state varies between runs; the reward is deterministic.
nextState, reward=step(np.array([1,3,0]),1,state_space_arm1,processing_costs[0])
print(nextState)
print(reward)

[3. 7. 0.]
0.09999999999999998


In [43]:
# Check the shape of the state returned by step().
nextState.shape

(3,)

In [37]:
# Build the single solver used for every arm; state_space_dqn (all arm types)
# is the reference state set for the baseline term in DQNSolver.train.
dqn_solver=DQNSolver(all_states=state_space_dqn,state_space=state_dim,action_space=action_dim)



2024-04-05 02:37:15.347698: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-04-05 02:37:15.347769: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-04-05 02:37:15.347773: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-04-05 02:37:15.347924: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-05 02:37:15.348696: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [40]:
def _build_arrival_space(first_column):
    """Return rows [first_column, i, j] for i in 1..T-1 and j in 0..B-1,
    followed by one final row [first_column, 0, 0]."""
    rows = [[first_column, i, j] for i in range(1, T) for j in range(B)]
    rows.append([first_column, 0, 0])
    return np.array(rows)


# One arrival space per arm type, built by a single helper instead of four
# copy-pasted stanzas (array contents and row order are unchanged).
# NOTE(review): these rows carry the arm index in column 0, whereas
# state_space_arm1..4 and step() use column 0 for the first state coordinate
# and column 2 for the arm type - confirm which layout is intended.
newArrival_state_space_arm1 = _build_arrival_space(0)
newArrival_state_space_arm2 = _build_arrival_space(1)
newArrival_state_space_arm3 = _build_arrival_space(2)
newArrival_state_space_arm4 = _build_arrival_space(3)

In [42]:
# Initial state of every arm: int(N/4) random draws from each of the four
# arrival spaces, in arm-type order.
current_state = [
    newArrival(arrival_space)
    for arrival_space in (
        newArrival_state_space_arm1,
        newArrival_state_space_arm2,
        newArrival_state_space_arm3,
        newArrival_state_space_arm4,
    )
    for _ in range(int(N / 4))
]

# Per-iteration logs filled by the training loop.
rewards, subsidies, action_sums = [], [], []
betas, alphas = [], []
epsilon = 1

In [None]:
def for_each_arm(x):
    """Act on one arm, step it, store the transition and train the solver.

    Parameters
    ----------
    x : sequence
        ``[current_state, dqn_solver, subsidy, arm_type]`` where `arm_type`
        0, 1, 2 selects processing cost 0.1, 0.3, 0.6 and any other value
        selects 0.8.

    Returns
    -------
    (action, new_state)
        The action taken and the arm's state after the step.
    """
    current_state, dqn_solver, subsidy, arm_type = x[0], x[1], x[2], x[3]

    costs = (0.1, 0.3, 0.6, 0.8)
    type_idx = arm_type if arm_type in (0, 1, 2) else 3

    # Only the state grid of this arm's type is needed; the original rebuilt
    # eight arrays on every call.
    arm_state_space = np.array([[i, j, type_idx] for i in range(T) for j in range(B)])

    # Private copies: np.reshape returns a view, so an in-place update inside
    # step() would otherwise rewrite the "before" state stored in replay memory
    # as well as the caller's array.
    state_before = np.array(current_state, dtype=np.float32).reshape(3,)
    s = state_before.reshape(1, 3).copy()

    action = dqn_solver.act(s)

    # step() is called as step(state, action, state_space, processing_cost=...).
    # NOTE(review): the original call also passed a separate new-arrival space
    # as a fourth positional argument, which the step() defined in this
    # notebook has no parameter for (it raised TypeError). New arrivals are
    # therefore drawn from the per-type state grid - confirm the intended
    # arrival distribution.
    new_state, reward = step(
        state_before, action, arm_state_space, processing_cost=costs[type_idx]
    )

    s_ = np.reshape(new_state, (1, 3))
    dqn_solver.remember(s, action, reward, s_, False, subsidy)
    dqn_solver.train()

    # Return the stepped state so the caller advances the arm; the original
    # returned the input state, which never picked up new arrivals.
    return action, new_state

In [50]:
# Scratch check of the arm-type boundary test used in the training loop
# (note: this rebinds the global name `j`).
j=15
j%(N/4)==0

True

In [None]:
# Main loop: every iteration each arm acts/steps/trains once via for_each_arm,
# then the subsidy is moved by beta * (number of 1-actions - M).
arms_per_type = max(1, int(N / 4))  # arms are laid out in four equal blocks by type

# One executor for the whole run instead of a new pool on each iteration.
# NOTE(review): all workers share one dqn_solver, and for_each_arm appends to
# its replay memory and trains it - confirm this is safe to run concurrently.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for i in range(1, 150001):
        # for_each_arm returns only (action, state), so no reward is
        # accumulated here and `rewards` records 0 for every iteration.
        reward_sum = 0
        beta = 1 / (1 + np.ceil(i * np.log(i) / 5000))

        arm_inputs = [
            [arm_state, dqn_solver, subsidy, min(arm_idx // arms_per_type, 3)]
            for arm_idx, arm_state in enumerate(current_state)
        ]

        # executor.map yields results in submission order. The original used
        # as_completed, which yields in completion order, so next states could
        # be written back to the wrong arm (and hence the wrong arm type).
        results = list(executor.map(for_each_arm, arm_inputs))

        action_sum = 0
        for k, (arm_action, arm_next_state) in enumerate(results):
            action_sum += arm_action
            current_state[k] = arm_next_state

        if i % 100 == 0:
            clear_output(wait=True)
            print(i)

        dqn_solver.update_target_model()
        dqn_solver.decrement_epsilon()
        subsidy += beta * (action_sum - M)

        subsidies.append(subsidy)
        rewards.append(reward_sum)
        action_sums.append(action_sum)
        betas.append(beta)

In [None]:
# Exploration probability left after training.
dqn_solver.epsilon

In [None]:
# Subsidy value recorded after every training-loop iteration.
fig, ax = plt.subplots()
ax.plot(subsidies)
ax.set_xlabel('Steps')
ax.set_ylabel('Subsidy')
ax.set_title('Subsidy vs Steps for Deadline Scehduling')

In [None]:
# Wrap the logged rewards in a DataFrame (the next cell rebinds `df`).
df=pd.DataFrame(rewards)


In [None]:
# Running mean (window 100000, at least 1 sample) of the per-iteration action sums.
df = pd.DataFrame()
df['action_sums'] = action_sums
smoothed_action_sums = df['action_sums'].rolling(100000, min_periods=1).mean()

fig, ax = plt.subplots()
ax.plot(smoothed_action_sums)
ax.set_xlabel('Steps')
ax.set_ylabel('Number of Arms activated')
ax.set_title(' Moving Average of number of arms activated for Deadline Scehduling ')
