# RL Agent for Training Num of Users for a Server

## Preset Data:

In [1]:
import random
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
datafile = 'dataset/dual_s_data_mini.csv'
users_low = 1
users_res = 2
users_high = 19
#number_of_service = 2

latency_threshold = 10

1. states: RAM(MB), #Cores, BG WL(%), GPU(MB), S1:Users, S2:Users

2. More reward for the number closer to (s1,s2) with latency below given threshold

## RL Agent:

In [3]:
for action in range(25):
    u1 = (action//5)*4 + 1
    u2 = (action%5)*4 + 1
    print(action, ":", (u1, u2))

0 : (1, 1)
1 : (1, 5)
2 : (1, 9)
3 : (1, 13)
4 : (1, 17)
5 : (5, 1)
6 : (5, 5)
7 : (5, 9)
8 : (5, 13)
9 : (5, 17)
10 : (9, 1)
11 : (9, 5)
12 : (9, 9)
13 : (9, 13)
14 : (9, 17)
15 : (13, 1)
16 : (13, 5)
17 : (13, 9)
18 : (13, 13)
19 : (13, 17)
20 : (17, 1)
21 : (17, 5)
22 : (17, 9)
23 : (17, 13)
24 : (17, 17)


In [4]:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding


class yolosystem(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, n_actions, filename):
        
        super(yolosystem, self).__init__()
        
        self.n_actions = n_actions #total number of action space after ranging [10, 20, 30 ...]
        self.action_space = spaces.Discrete(self.n_actions) #total number of users in the action space; starts with zero
        self.observation_space = spaces.Box(low=np.array([0,0,0,0,0,0]), high=np.array([11000]*6), shape=(6, ), dtype=np.int32) #<RAM, Core, Workload>
        self.seed()
        self.current_obs = np.array( [3000, 2, 40, 2, 1, 1] ) #current observation = <ram, cores, workload%>

        #Load dataset
        self.df = pd.read_csv(filename)
        #data preprocessing step
#         self.df['ram'] = self.df['ram'].div(1000).round(0).astype(int)
#         self.df['workload_cpu'] = self.df['workload_cpu'].div(10).round(0).astype(int)
        self.df['workload_gpu'] = self.df['workload_gpu'].multiply(1/80).round(0).astype(int) #round gpu workload
#         self.df['users_yolo'] = self.df['users_yolo'].div(100).round(0).astype(int)
#         self.df['users_mnet'] = self.df['users_mnet'].div(100).round(0).astype(int)
        
        # ... others
        #get unique data in set
        self.ram = self.df.ram.unique()
        self.cores = self.df.cores.unique()
        self.workload_cpu = self.df.workload_cpu.unique()
        print(self.df)
       
        

    def seed(self, seed=1010):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(action) #action should be in action space
        state = self.current_obs
        done = True #Episodes ends after each action

        #compute latecy from the number of users
        reward = self.get_reward(state, action) #linear latency           
#         print(action, reward)
        self.current_obs = self.get_random_state() #go to a random state
        
#         print(self.current_obs)
        return self.current_obs, reward, done, {} #no-states, reward, episode-done, no-info

    def reset(self):
        self.current_obs = self.get_random_state()
        return self.current_obs #current state of the system with no load

    def render(self, mode='human', close=False):
        print(f"Current State:<{self.current_obs}>")
        
    
    #compute latency
    def get_reward(self, state, action):
        #change action to users
        
        #100 action space
#         u1 = (action//10)*2 + 1
#         u2 = (action%10)*2 + 1
        
        #25 action space
        u1 = (action//5)*4 + 1
        u2 = (action%5)*4 + 1
        #sample time from dataframe
        gram = state[0]
        gcores = state[1]
        gwl_c = state[2]
        gwl_g = state[3]
        gs1 = u1
        gs2 = u2
#         print("user:", gs1, gs2, "act:", action)

        
        fetch_state = self.df.loc[ (self.df['ram'] == gram) & (self.df['cores']== gcores) & (self.df['workload_cpu']==gwl_c) & (self.df['workload_gpu']==gwl_g) & (self.df['users_yolo']==gs1) & (self.df['users_mnet']==gs2)]
        
        
        if fetch_state.empty:
            return -20 #DQN 8
#             return 0 #dn 9
#         print(fetch_state)
        time1 = fetch_state.sample().iloc[0]['time_yolo'] #fetch time from the dataframe
        time2 = fetch_state.sample().iloc[0]['time_mnet']
        tm = max(time1, time2)
        
        #compute reward=======================
#         print("time", tm)
        if (tm <= latency_threshold): # and (gs1 <= state[4]) and (gs2 <= state[5]):
#             return 100*np.exp( ( 0.005*(gs1 - state[4]) ) ) + 100*np.exp( ( 0.005*(gs2 - state[5]) ) ) #dqn9
#             return 200*np.exp( ( 0.004*(gs1 - state[4]) ) ) + 200*np.exp( ( 0.004*(gs2 - state[5]) ) ) #dqn10
#             return -np.exp( ( -0.01*(gs1 - state[4]) ) ) - np.exp( ( -0.01*(gs2 - state[5]) ) )
#             return np.exp( ( 0.01*(gs1 - state[4]) ) ) + np.exp( ( 0.01*(gs2 - state[5]) ) ) #DQN6
            return  (gs1 - state[4]) +  (gs2 - state[5]) + u1 + u2 #DQN8 (best)
#             return  1 + u1 + u2 #DQN7
        else:
            return - u1 - u2    #DQN 8  
#             return 0 #dn9

        
    
    #get to some random state after taking an action
    def get_random_state(self):
        #generate state randomly
        gram = np.random.choice(self.ram, 1)[0]
        gcores = np.random.choice(self.cores, 1)[0]
        gwl_c = np.random.choice(self.workload_cpu, 1)[0]
        
        #fetch gamma for the state
        fetch_state = self.df.loc[ (self.df['ram'] == gram) & (self.df['cores']== gcores) & (self.df['workload_cpu']==gwl_c) ]
        gwl_g = fetch_state.sample().iloc[0]['workload_gpu'] #fetch workload randmoly
        
        gs1 = random.randrange(1, 20, 5)
        gs2 = random.randrange(1, 20, 5)
        
        return np.array( [gram, gcores, gwl_c, gwl_g, gs1, gs2] )

## RL Agent with Baseline3

In [5]:
from stable_baselines3.common.env_checker import check_env
env = yolosystem(25, datafile ) #100 and 25
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

         ram  cores  workload_cpu  workload_gpu  users_yolo  users_mnet  \
0       3000      2            40             2           1           1   
1       3000      2            40             2           1           3   
2       3000      2            40             2           1           5   
3       3000      2            40             2           1           7   
4       3000      2            40             2           1           9   
...      ...    ...           ...           ...         ...         ...   
23995  11000      5            60            10          19          11   
23996  11000      5            60            10          19          13   
23997  11000      5            60            10          19          15   
23998  11000      5            60            10          19          17   
23999  11000      5            60            10          19          19   

       time_yolo  time_mnet  
0       4.181969   6.505600  
1       4.096301   6.710087  
2       4

In [6]:
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

Box(0, 11000, (6,), int32)
Discrete(25)
20


In [7]:
for i in range(100):
    t = env.get_reward([3000, 2, 40, 2, 1, 1], i)
    print(t)

2
10
18
26
34
10
18
26
34
42
18
26
34
42
50
26
34
42
50
58
34
42
50
58
66
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20
-20


In [8]:
n_steps = 10

for step in range(n_steps):
    print("Step {}".format(step + 1))
    obs, reward, done, info = env.step(env.action_space.sample())
    print('reward=', reward)
    env.render()

Step 1
reward= 45.0
Current State:<[3000.    3.   40.    6.   10.   15.]>
Step 2
reward= 19.0
Current State:<[9000.    3.   60.    3.   11.    2.]>
Step 3
reward= 23.0
Current State:<[11000.     2.    50.     3.    16.    19.]>
Step 4
reward= -23.0
Current State:<[3000.    4.   50.    2.   11.   17.]>
Step 5
reward= -16.0
Current State:<[7000.    2.   40.    2.   18.    6.]>
Step 6
reward= 36.0
Current State:<[5000.    2.   50.   10.    7.   18.]>
Step 7
reward= 3.0
Current State:<[5000.    5.   50.   10.   12.    6.]>
Step 8
reward= 50.0
Current State:<[9000.    2.   40.    3.   14.   11.]>
Step 9
reward= 43.0
Current State:<[11000.     3.    40.    10.    13.    10.]>
Step 10
reward= 13.0
Current State:<[3000.    3.   60.    2.    9.   10.]>


In [9]:

from stable_baselines3.common.monitor import Monitor
import os
# Create log dir
log_dir = './agent_tensorboard/'
os.makedirs(log_dir, exist_ok=True)

env = Monitor(env, log_dir)


In [10]:
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv

# wrap it
env = DummyVecEnv([lambda: env])

In [11]:
# from stable_baselines3 import A2C
# from stable_baselines3.common.env_util import make_vec_env


# model = A2C("MlpPolicy", env, verbose=1, tensorboard_log='./agent_tensorboard/')
# model.learn(total_timesteps=25000)

In [12]:
model = DQN(MlpPolicy, env, verbose=0, tensorboard_log = log_dir, exploration_fraction=0.4, learning_starts=10000,  train_freq=30, target_update_interval=5000, exploration_final_eps=0.05)

In [13]:
model.learn(total_timesteps=30000) #reset_num_timesteps=False

<stable_baselines3.dqn.dqn.DQN at 0x7fd273185fa0>

In [14]:
# Save the agent
model.save(f"edge_agent_action")
# model.save(f"edge_agent_{latency_threshold}_lin")
# del model  # delete trained model to demonstrate loading

In [15]:
# Load the trained agent
# from stable_baselines3 import DQN
# model = DQN.load("edge_agent_action_threat")
#return action and state
#model.predict(np.array([2000, 4, 30]), deterministic=True)

## Predict

In [None]:
# import time

# begin = time.time()
# model.predict(np.array([3, 4, 5, 2, 5, 5]), deterministic=True)
# end = time.time()
# t = end-begin
# print(f"{t}seconds")

In [None]:
act = model.predict(np.array([3000, 2, 40, 2, 4, 4]), deterministic=True)[0]
print(act)
u1 = (act//10)*2 + 1
u2 = (act%10)*2 + 1
print(u1, u2)