# FrozenLake-v1 환경을 통한 Monte Carlo Method 실습

# ToDo: MC에 대한 설명 추가
- sampling을 통해 추정(approximate)한다.
- 수식 추가

## Library Import

In [1]:
import gym
import numpy as np

**MC 에서는 episode가 종단 상태를 만나 끝에 도달해야 한다.**

FrozenLake 에서는 Goal에 도착하면 1의 reward를 얻고, 나머지 상태에서는 0의 reward를 얻는다.

In [2]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=20)

## Generate episode (sampling)

In [3]:
def generate_episode(env, policy):
    states, actions, rewards = [], [], []
    
    state = env.reset()
    
    while True:
        # Append State
        states.append(state)
        
        # Append Action
        probs = policy[state]
        action = np.random.choice(np.arange(len(probs)), p=probs)
        actions.append(action)
        
        state, reward, done, info = env.step(action)
        
        # Append reward
        rewards.append(reward)
        
        if done:
            break
    
    return states, actions, rewards

**몇번의 episode가 있어야 goal에 도달하는 episode를 얻을 수 있을까?**

- policy probability가 동일한 random policy에서 시뮬레이션해보자.

In [4]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=20)

policy = np.ones([env.nS, env.nA]) / env.nA

step = 0
while True:
    step += 1
    states, actions, rewards = generate_episode(env, policy)
    
    if rewards[-1] == 1.0:
        break
    
print("step:", step)
print('states:', states)
print('actions:', actions)
print('rewards:', rewards)

step: 84
states: [0, 0, 1, 2, 6, 10, 14]
actions: [0, 1, 1, 1, 2, 2, 3]
rewards: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


# Monte Carlo predictino for Value function

## Every-visit MC prediction

각 episode에서 마주치는 모든 state에 대해 state value function을 update한다.

In [5]:
def every_visit_MC_prediction(env, policy, n_sample, gamma = 1.0):
    
    # 특정 state를 방문한 횟수
    N = np.zeros(env.nS)
    
    # state value function
    V = np.zeros(env.nS)
    
    for i in range(n_sample):
        states, actions, rewards = generate_episode(env, policy)
        
        G = 0
        
        for t in range(len(states) -1, -1, -1):
            S = states[t]
            G = gamma * G + rewards[t]
            N[S] += 1
            V[S] = V[S] + (G - V[S]) / N[S]
    
    return V

In [6]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=30)

# sample의 갯수
n_sample = 50000

# random policy
random_policy = np.ones([env.nS, env.nA]) / env.nA

every_visit_Value_function = every_visit_MC_prediction(env, random_policy, n_sample, 0.9)

print('State Value function')
print(every_visit_Value_function)

State Value function
[0.00458278 0.00419146 0.00898969 0.00363001 0.00684434 0.
 0.02625331 0.         0.01960489 0.05973598 0.11115034 0.
 0.         0.1330648  0.38556069 0.        ]


## First-visit MC prediction

각 episode를 통해 backprop update 중에 마주치는 각 state는 한번씩만 update된다. (중복된 state는 update하지 않는다.)

- 구현하는 방법은 여러가지가 있을 수 있음

In [7]:
def first_visit_MC_prediction(env, policy, n_sample, gamma = 1.0):
    
    N = np.zeros(env.nS)
    V = np.zeros(env.nS)
    visit = np.zeros(env.nS, dtype=int) - 1
    
    for i in range(n_sample):
        states, actions, rewards = generate_episode(env, policy)
        
        G = 0
        
        for t in range(len(states) - 1, -1, -1):
            S = states[t]
            G = gamma * G + rewards[t]
            
            if visit[S] != i:
                visit[S] = i
                N[S] += 1
                V[S] = V[S] + (G - V[S]) / N[S]
    
    return V

In [8]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=30)

# sample의 갯수
n_sample = 50000

# random policy
random_policy = np.ones([env.nS, env.nA]) / env.nA

first_visit_Value_function = first_visit_MC_prediction(env, random_policy, n_sample, 0.9)

print('State Value function')
print(first_visit_Value_function)

State Value function
[0.00572872 0.00471985 0.01131949 0.00329021 0.00779079 0.
 0.02619256 0.         0.02077171 0.06251912 0.11460876 0.
 0.         0.14114219 0.43596378 0.        ]


### 두 방법의 비교

In [9]:
for i in range(env.nS):
    print('{:.4f} {:.4f}'.format(every_visit_Value_function[i], first_visit_Value_function[i]))

0.0046 0.0057
0.0042 0.0047
0.0090 0.0113
0.0036 0.0033
0.0068 0.0078
0.0000 0.0000
0.0263 0.0262
0.0000 0.0000
0.0196 0.0208
0.0597 0.0625
0.1112 0.1146
0.0000 0.0000
0.0000 0.0000
0.1331 0.1411
0.3856 0.4360
0.0000 0.0000


# Monte Carlo predictino for Q-function

## Every-visit MC prediction

In [10]:
def every_visit_MC_Q_prediction(env, policy, n_sample, gamma = 1.0):
    N = np.zeros([env.nS, env.nA])
    Q = np.zeros([env.nS, env.nA])
    
    for i in range(n_sample):
        states, actions, rewards = generate_episode(env, policy)
        
        G = 0
        
        for t in range(len(states)-1, -1, -1):
            S = states[t]
            A = actions[t]
            G = gamma * G + rewards[t]
            
            N[S, A] += 1
            Q[S, A] = Q[S, A] + (G - Q[S, A]) / N[S, A]
            
    return Q

In [11]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=30)

# sample의 갯수
n_sample = 50000

# random policy
random_policy = np.ones([env.nS, env.nA]) / env.nA

every_visit_Q = every_visit_MC_Q_prediction(env, random_policy, n_sample, 0.9)

print('Action Value function')
print(every_visit_Q)

Action Value function
[[0.00487364 0.00483908 0.00453006 0.00350828]
 [0.00219985 0.00415718 0.00450523 0.00555318]
 [0.01294797 0.01013901 0.01071531 0.00530159]
 [0.00361115 0.00447873 0.00251629 0.00548525]
 [0.00945723 0.00710523 0.00766139 0.00348851]
 [0.         0.         0.         0.        ]
 [0.0309953  0.0344699  0.03604571 0.00157178]
 [0.         0.         0.         0.        ]
 [0.00741656 0.02422018 0.02003064 0.02589799]
 [0.03993293 0.08026032 0.0735515  0.0373133 ]
 [0.1305101  0.13630822 0.15018893 0.02636338]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.04343365 0.15288292 0.16409099 0.12050047]
 [0.19681495 0.5060536  0.50226863 0.40113193]
 [0.         0.         0.         0.        ]]


## First-visit MC prediction

In [12]:
def first_visit_MC_Q_prediction(env, policy, n_sample, gamma = 1.0):
    N = np.zeros([env.nS, env.nA])
    Q = np.zeros([env.nS, env.nA])
    visit = np.zeros([env.nS, env.nA], dtype='int') - 1
    for i in range(n_sample):
        states, actions, rewards = generate_episode(env, policy)
        
        G = 0
        
        for t in range(len(states)-1, -1, -1):
            S = states[t]
            A = actions[t]
            G = gamma * G + rewards[t]
            
            if visit[S, A] != i:
                visit[S, A] = i
                N[S, A] += 1
                Q[S, A] = Q[S, A] + (G - Q[S, A]) / N[S, A]
            
    return Q

In [13]:
env = gym.make('FrozenLake-v1')
env = gym.wrappers.TimeLimit(env, max_episode_steps=30)

# sample의 갯수
n_sample = 50000

# random policy
random_policy = np.ones([env.nS, env.nA]) / env.nA

first_visit_Q = first_visit_MC_Q_prediction(env, random_policy, n_sample, 0.9)

print('Action Value function')
print(first_visit_Q)

Action Value function
[[0.00506254 0.00544273 0.00465041 0.00421077]
 [0.0028829  0.00409655 0.00412922 0.00550699]
 [0.01117867 0.0092762  0.01357478 0.00458708]
 [0.00410235 0.00363861 0.00280675 0.00620961]
 [0.00877252 0.00809771 0.00675081 0.00335377]
 [0.         0.         0.         0.        ]
 [0.0301989  0.02706705 0.0360676  0.00292674]
 [0.         0.         0.         0.        ]
 [0.0089488  0.02178521 0.02230243 0.02507339]
 [0.04756612 0.07964776 0.0708331  0.04238941]
 [0.14927881 0.12141953 0.1238912  0.01923141]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.05543773 0.17754105 0.17366302 0.12896567]
 [0.20935614 0.50253496 0.51651626 0.42278114]
 [0.         0.         0.         0.        ]]


### 두 방법의 비교

In [14]:
for i in range(env.nS):
    print('{}\n{}\n'.format(np.round(every_visit_Q[i], 4), np.round(first_visit_Q[i], 4)))

[0.0049 0.0048 0.0045 0.0035]
[0.0051 0.0054 0.0047 0.0042]

[0.0022 0.0042 0.0045 0.0056]
[0.0029 0.0041 0.0041 0.0055]

[0.0129 0.0101 0.0107 0.0053]
[0.0112 0.0093 0.0136 0.0046]

[0.0036 0.0045 0.0025 0.0055]
[0.0041 0.0036 0.0028 0.0062]

[0.0095 0.0071 0.0077 0.0035]
[0.0088 0.0081 0.0068 0.0034]

[0. 0. 0. 0.]
[0. 0. 0. 0.]

[0.031  0.0345 0.036  0.0016]
[0.0302 0.0271 0.0361 0.0029]

[0. 0. 0. 0.]
[0. 0. 0. 0.]

[0.0074 0.0242 0.02   0.0259]
[0.0089 0.0218 0.0223 0.0251]

[0.0399 0.0803 0.0736 0.0373]
[0.0476 0.0796 0.0708 0.0424]

[0.1305 0.1363 0.1502 0.0264]
[0.1493 0.1214 0.1239 0.0192]

[0. 0. 0. 0.]
[0. 0. 0. 0.]

[0. 0. 0. 0.]
[0. 0. 0. 0.]

[0.0434 0.1529 0.1641 0.1205]
[0.0554 0.1775 0.1737 0.129 ]

[0.1968 0.5061 0.5023 0.4011]
[0.2094 0.5025 0.5165 0.4228]

[0. 0. 0. 0.]
[0. 0. 0. 0.]



# Monte Calor Control with ${\varepsilon}$-Greedy

## TODO
- 그림과 함께 설명 추가
- Dynamic Programming의 Policy iteration과 어떤 차이가 있는지
- epsilon greedy 방법 설명 (탐험)

설명 추가
- Q function 사용
- epsilon greedy
- value iteration 차용
- GILE: Greedy in the Limit Infinite Exploration