# [ Monte Carlo Control ]
- on-policy Monte Carlo Control

## 1. 특징
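1) value function 대신 action value function (=Q function) 사용 (단, 아래 실습 코드는 Q table이 아니라 state value table인 `value_table`을 학습하고, 다음 state들의 value를 비교해서 action을 선택함)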

2) e-greedy exploration

## 2. 실습
- H 2개를 피해서 Goal에 도착하기

![image.png](attachment:image.png)

### (1) Environment

In [1]:
import numpy as np

In [122]:
class Env:
    """Grid world with two trap cells and one goal cell.

    States are ``[x, y]`` lists. Moves that would leave the grid are clamped
    back onto its border, and an episode ends on a trap or on the goal.
    """

    def __init__(self, grid_w, grid_h):
        self.grid_w = grid_w
        self.grid_h = grid_h
        # Terminal cells.
        self.goal = [2, 2]
        self.trap1 = [1, 2]
        self.trap2 = [2, 1]
        # (dx, dy) offsets in the order: up, down, left, right.
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    @staticmethod
    def _clip(value, upper):
        """Clamp ``value`` to 0 from below, otherwise to ``upper`` from above."""
        if value < 0:
            return 0
        if value > upper:
            return upper
        return value

    def step(self, state, action, reward, penalty):
        """Apply ``action`` to ``state`` and report the outcome.

        Args:
            state: current ``[x, y]`` position.
            action: ``(dx, dy)`` offset added to the position.
            reward: value returned when the move lands on the goal.
            penalty: value returned for every non-terminal move.

        Returns:
            ``(next_state, R, done)`` where ``R`` is -1 on a trap, ``reward``
            on the goal and ``penalty`` otherwise; ``done`` is True only on a
            trap or on the goal.
        """
        col, row = state
        landed = [
            self._clip(col + action[0], self.grid_w - 1),
            self._clip(row + action[1], self.grid_h - 1),
        ]

        if landed in (self.trap1, self.trap2):
            return landed, -1, True
        if landed == self.goal:
            return landed, reward, True
        return landed, penalty, False

    def reset(self):
        """Return the fixed start state of every episode."""
        return [0, 0]
            
        

### (2) Agent

In [123]:
class Agent:
    """Epsilon-greedy Monte Carlo agent that learns a state-value table.

    The agent keeps one value per grid cell, picks moves by comparing the
    values of the cells each action would lead to, and updates the table from
    the samples collected during an episode.
    """

    def __init__(self, grid_w, grid_h):
        # (dx, dy) offsets and their printable names, index-aligned.
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.actions_word = ['UP', 'DOWN', 'LEFT', 'RIGHT']
        self.grid_w = grid_w
        self.grid_h = grid_h
        self.value_table = np.zeros((self.grid_w, self.grid_h))
        self.eps = 0.1   # exploration probability
        self.lr = 0.01   # step size of the value update
        self.dis = 0.9   # discount factor
        self.memory = []  # [S, R, D] samples of the current episode

    def next_states(self, state):
        """Return the state reached by each action, in ``self.actions`` order.

        Every candidate is computed from ``state`` itself; coordinates that
        would leave the grid are clamped onto its border.
        """
        x, y = state
        max_x = self.grid_w - 1
        max_y = self.grid_h - 1
        return [
            [min(max(x + dx, 0), max_x), min(max(y + dy, 0), max_y)]
            for dx, dy in self.actions
        ]

    def get_action(self, current_state):
        """Choose an action for ``current_state`` with an epsilon-greedy rule.

        With probability ``self.eps`` a uniformly random action is returned.
        Otherwise the action whose successor state has the highest value is
        returned, with ties broken uniformly at random.
        """
        # (1) Exploration
        if np.random.uniform() < self.eps:
            idx = np.random.choice(len(self.actions))

        # (2) Exploitation
        else:
            next_v = np.array([
                self.value_table[tuple(s)]
                for s in self.next_states(current_state)
            ])
            max_index_list = np.where(next_v == np.amax(next_v))[0]
            if len(max_index_list) > 1:  # several equally good actions
                idx = np.random.choice(max_index_list)
            else:
                idx = max_index_list[0]

        # (3) Action!
        return self.actions[idx]

    def memorizer(self, S, R, D):
        """Append one ``[state, reward, done]`` sample to the episode memory."""
        self.memory.append([S, R, D])

    def update(self):
        """Update the value table from the samples stored in ``self.memory``.

        The episode is walked backwards so the discounted return can be built
        incrementally. The return is accumulated at every step; each distinct
        state is updated only once, at its latest occurrence in the episode.
        The memory itself is not cleared here.
        """
        G_t = 0
        visited = set()
        for sample in reversed(self.memory):
            S = tuple(sample[0])
            R = sample[1]
            # Accumulate on every step, including revisited states, so that
            # no reward is dropped from the return of earlier states.
            G_t = R + self.dis * G_t
            if S in visited:
                continue
            visited.add(S)
            V_t = self.value_table[S]
            self.value_table[S] = V_t + self.lr * (G_t - V_t)

    def save_actions(self, action_seq, action):
        """Append the printable name of ``action`` to ``action_seq`` in place."""
        idx = self.actions.index(action)
        action_seq.append(self.actions_word[idx])
                
            

### (3) Implementation

In [124]:
# Experiment settings.
num_episode = 10000  # number of episodes to run
reward = 2           # passed to env.step as the value for reaching the goal
penalty = -0.3       # passed to env.step as the value of a non-terminal move
num_success = 0      # episodes that ended on the goal so far

# Environment and agent share the same 5x5 grid.
agent = Agent(grid_w=5, grid_h=5)
env = Env(grid_w=5, grid_h=5)

In [125]:
# Run the episodes: act until a trap or the goal is reached, then learn from
# the samples collected during that episode.
for episode in range(num_episode):
    action_seq = []
    total_R = 0
    num_walk = 0
    state = env.reset()

    while True:  # loop until finished (trap or goal)
        # Choose the move only when a step is actually going to be taken, so
        # no action is selected for a terminal state.
        action = agent.get_action(state)
        S, R, D = env.step(state, action, reward, penalty)  # next state, reward, done
        agent.memorizer(S, R, D)  # save "next S,R,D" in memory
        agent.save_actions(action_seq, action)  # save the action name
        num_walk += 1
        total_R += R
        state = S

        # finished
        if D:
            if episode % 500 == 0:
                print('finished at state', state)
                print('Episode :{}\n Number of steps :{}\n Action sequence : {}\n Total Reward : {}\n'.format(episode, num_walk, action_seq, total_R))
            if state == env.goal:
                num_success += 1
            agent.update()
            agent.memory = []  # start the next episode with an empty memory
            break
        

finished at state [1, 2]
Episode :0
 Number of steps :11
 Action sequence : ['UP', 'RIGHT', 'DOWN', 'LEFT', 'UP', 'UP', 'DOWN', 'RIGHT', 'LEFT', 'RIGHT', 'RIGHT']
 Total Reward : -3.9999999999999996

finished at state [2, 2]
Episode :500
 Number of steps :9
 Action sequence : ['LEFT', 'LEFT', 'DOWN', 'DOWN', 'LEFT', 'DOWN', 'RIGHT', 'RIGHT', 'UP']
 Total Reward : -0.3999999999999999

finished at state [2, 2]
Episode :1000
 Number of steps :10
 Action sequence : ['DOWN', 'LEFT', 'DOWN', 'LEFT', 'DOWN', 'DOWN', 'RIGHT', 'UP', 'RIGHT', 'UP']
 Total Reward : -0.6999999999999997

finished at state [2, 2]
Episode :1500
 Number of steps :6
 Action sequence : ['DOWN', 'DOWN', 'DOWN', 'RIGHT', 'RIGHT', 'UP']
 Total Reward : 0.5

finished at state [2, 2]
Episode :2000
 Number of steps :9
 Action sequence : ['LEFT', 'DOWN', 'DOWN', 'LEFT', 'LEFT', 'DOWN', 'RIGHT', 'RIGHT', 'UP']
 Total Reward : -0.3999999999999999

finished at state [2, 2]
Episode :2500
 Number of steps :11
 Action sequence : ['D

In [126]:
# Share of episodes that ended on the goal, as a percentage.
print('Accuracy :', num_success / num_episode * 100, '%')

Accuarcy : 89.16 %


## 3. Reference

"숨니야의 무작정 따라하기" 통해 공부하고 참고하였습니다 :)

( 출처 : https://sumniya.tistory.com/11?category=781573 )