# This cell has two classes:
1. `EnvChannel`: Defines the environment
2. `ControlAgent`: Defines the control agent

# Parameters:
1. ```alpha```: Learning rate of agent
2. `gamma`: Discount factor
3. `epsilon`: for epsilon-greedy policy
4. `num_states`: Number of states in system, currently three(Good, Medium, Bad)
5. `d1`: Delay threshold for Good state and medium state
6. `d2`: Delay threshold for Medium and bad state

# Return:
1. ```action (int)```: -1: Decrease Resolution; 0: No change; 1: Increase Resolution
2. `state (int)`: 0: Bad; 1: Medium; 2: Good


# Arguments:
1. `state_list`: List of integer states. 
2. `avg_confidence`: (Float) Average confidence 
    
# Sample use:

Initialize ControlAgent class:
```
agent = ControlAgent(resolution_list=[100,200,300], d1=0.00103,d2=0.00161)
```
To train and get optimal actions
```
action = agent.get_signal(delay_list = [.5,.6], curr_resolution=200, error_score=.5)
```

To get random actions:
```
action = agent.get_signal(delay_list, avg_confidence, random_actions=True)
```

# To Do:
1. ~~Instead of taking state_list directly, estimate the states indirectly from packet delay.~~ **Done**
2. ~~Update the function get_delay_factor.~~ **Done** 


In [124]:
import numpy as np
import pickle
import sys


class EnvChannel:
    def __init__(self, resolution_list, d1=.01, d2=.02):

        self.num_delay_bins = 3
        self.num_resolutions = len(resolution_list)

        self.resolution_dict = {resolution: i for i, resolution in enumerate(resolution_list)}
        

        self.num_states = self.num_resolutions*self.num_delay_bins
        self.curr_state = (0,0)
        self.prev_state = (0,0)
        self.reward = 0
        self.action = 0
        self.num_actions = 3
        self.valid_actions = [0, 1, 2]  # Reduce, No change, Increase
        self.d1 = d1
        self.d2 = d2
        self.avg_delay = 0
        self.curr_resolution = 0
        self.error_score=  0

    def sample_action(self):
        return np.random.choice(self.valid_actions)
    

    def get_reward(self):
        return 0.5*self.resolution_state/(self.delay_state+1) +0.5*self.error_score
    
    
    def step(self, action):
        self.action = action
        self.reward = self.get_reward()
        return self.reward, self.estimate_state()

    
    def estimate_state(self):
        self.resolution_state = self.resolution_dict[self.curr_resolution]
        if self.avg_delay <= self.d1:
            self.delay_state = 0  # Good state
        elif self.avg_delay <= self.d2:
            self.delay_state = 1  # Medium state
        else:
            self.delay_state = 2  # Bad state
        self.curr_state = (self.delay_state, self.resolution_state)
        return self.curr_state

    
    def reset(self):
        self.__init__(num_states=self.num_states)
        return self.curr_state


class ControlAgent:
    def __init__(self,
                 resolution_list,
                 d1,
                 d2,
                 alpha=0.1,
                 gamma=.1,
                 epsilon=.3,
                 random_actions=False):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.env = EnvChannel(resolution_list, d1, d2)
        self.q_table = np.zeros([self.env.num_delay_bins, self.env.num_resolutions, self.env.num_actions])
        self.all_epochs = []
        self.action_record = []
        self.state_record = []
        self.penalties = []
        self.iteration_i = 0
        self.prev_state = None
        self.prev_avg_delay = 0
        self.random_actions = random_actions
        
  
    def map_action(self, state):
        if state == 0:
            return -1
        if state == 1:
            return 0
        if state == 2:
            return 1
        
 
    def get_signal(self, delay_list, curr_resolution, error_score):
        self.env.avg_delay = np.average(delay_list)
        self.env.curr_resolution = curr_resolution
        self.env.error_score = np.abs(error_score)/ 100.0
        
        self.iteration_i += 1
        if np.mod(self.iteration_i+2, 10) == 0:
            self.alpha = self.alpha*.95
            self.epsilon = self.epsilon*.95
            

        if self.iteration_i == 1:
            state = self.env.estimate_state()
            action = self.env.sample_action()
            reward = 0 

        else:
  
            reward, state = self.env.step(self.prev_action)
            self.state_record.append(self.env.curr_state)
            

            if not self.random_actions:
                old_qvalue = self.q_table[self.prev_state[0],self.prev_state[1], 
                                          self.prev_action]
                next_max = np.max(self.q_table[state[0],state[1], :])

                new_qvalue = (1 - self.alpha) * old_qvalue + \
                    self.alpha * (reward + self.gamma * next_max)
                self.q_table[self.prev_state[0],self.prev_state[1], 
                             self.prev_action] = new_qvalue
                self.penalties.append(reward)
                self.action_record.append(self.prev_action)
                

                if np.random.uniform(0, 1) < self.epsilon:
                    action = self.env.sample_action()  # Explore action space

                else:
                    # Exploit learned values
                    action = np.argmax(self.q_table[state[0],state[1], :])
            else:
                action = self.env.sample_action()  # Explore action space

        self.prev_state = state
        self.prev_action = action
        return self.map_action(action), action, state, reward 

In [125]:
# FOR TESTING
import pandas as pd 
d1 = 0.00103
d2 = 0.00161
agent = ControlAgent(d1,d2, gamma=0.0)
f_name = "../datare.csv"
df =pd.read_csv(f_name)

In [126]:
df_new = pd.DataFrame(columns=['State', 'Action', 'Delay'])
len_df = len(df)
len_df = 50

tmp_delay = []
reward = 0
reward_list = []
for j in range(len_df):
    i = np.mod(j, 99)
#     print(i)
    delay = df['Delay'][i]
    conf = df['Conf_score'][i]
    action, state, rew = agent.get_signal(delay, conf)
    print(state, action, rew)
#     print(agent.q_table)
    reward += rew
    if np.mod(j, 9)==0:
        reward_list.append(np.average(reward))
        reward = 0
 
#     df_new.loc[i] = [state, action, delay]
# df_new.to_csv("data_my_.csv")

-1 1 0
0 -1 -2
1 -1 0
-1 -1 -2
-1 -1 2
-1 -1 2
0 -1 2
0 0 0
-1 1 0
1 1 -2
1 1 2
0 1 2
1 1 0
-1 -1 2
0 -1 2
0 -1 0
0 -1 0
0 -1 0
0 -1 0
1 1 0
0 -1 2
0 0 0
1 1 0
0 -1 2
-1 -1 0
1 1 2
1 1 2
1 1 2
-1 -1 2
0 -1 2
-1 -1 0
0 -1 2
1 0 0
-1 -1 0
0 0 2
-1 -1 0
0 0 2
0 -1 0
1 1 0
0 -1 2
1 -1 0
1 1 -2
0 -1 2
1 1 0
1 1 2
-1 -1 2
1 1 2
-1 -1 2
-1 -1 2
-1 -1 2
