gymnasium 설치: `conda install -c conda-forge gymnasium`

In [None]:
import gymnasium as gym
import numpy as np
import random


In [None]:
# Create the environment (FrozenLake with slipping disabled)
env = gym.make("FrozenLake-v1", is_slippery=False)

# reset() returns a pair that is unpacked into the initial state and an info object
state, info = env.reset()
print("초기 상태:", state)

# Sizes of the observation (state) space and the action space
print("상태 개수:", env.observation_space.n)
print("행동 개수:", env.action_space.n)

In [None]:
# Random policy
def run_random_episode(env):
    """Play one episode with randomly sampled actions and return its total reward.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.

    Returns:
        The sum of the rewards collected until the episode ends.
    """
    _obs, _info = env.reset()
    episode_return = 0

    while True:
        _obs, step_reward, terminated, truncated, _info = env.step(
            env.action_space.sample()
        )
        episode_return += step_reward

        # terminated: reached the goal or fell into a hole;
        # truncated: hit the maximum number of steps.
        if terminated or truncated:
            return episode_return

# Play 10 random-policy episodes and print the total reward of each one.
for i in range(10):
    print(f"Episode {i+1} reward:", run_random_episode(env))


In [None]:
# Measure the success rate (mean reward)
def random_success_rate(env, n=200):
    """Return the mean total reward of ``n`` random-policy episodes.

    Args:
        env: Environment passed through to ``run_random_episode``.
        n: Number of episodes to play.

    Returns:
        The average episode reward as a built-in ``float``.
    """
    episode_returns = np.asarray([run_random_episode(env) for _ in range(n)])
    return float(episode_returns.mean())

# Estimate the random policy's mean reward over 500 episodes.
print("랜덤 성공률(대략):", random_success_rate(env, n=500))


In [None]:
# Initialise the Q-table: one row per state, one column per action, all zeros.
n_states = env.observation_space.n
n_actions = env.action_space.n
Q = np.zeros((n_states, n_actions), dtype=np.float32)

# Show the table's shape and the (all-zero) action values of state 0.
print("Q-table shape:", Q.shape)
print("Q[0] =", Q[0])


In [None]:
# Epsilon-greedy action selection
def choose_action(Q, state, epsilon, env):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` an action is sampled from
    ``env.action_space``; otherwise the action with the largest value in
    ``Q[state]`` is taken (``np.argmax`` returns the first index on ties).

    Args:
        Q: Q-table indexed as ``Q[state]`` -> array of action values.
        state: Index of the current state.
        epsilon: Exploration probability; ``0.0`` is always greedy and
            ``1.0`` is always random.
        env: Environment exposing ``action_space.sample()``.

    Returns:
        The chosen action as a built-in ``int``. Both branches are cast so
        callers never receive a NumPy scalar, matching the ``int(...)`` cast
        used by the greedy evaluation code.
    """
    if random.random() < epsilon:
        # Explore: uniformly sampled action.
        return int(env.action_space.sample())
    # Exploit: best known action for this state.
    return int(np.argmax(Q[state]))

# Sanity check: random.random() is always < 1.0 (random branch) and never < 0.0 (argmax branch).
print("test eps=1.0:", [choose_action(Q, 0, 1.0, env) for _ in range(5)])
print("test eps=0.0:", [choose_action(Q, 0, 0.0, env) for _ in range(5)])


In [None]:
# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
alpha = 0.1            # learning rate (step size of each Q update)
gamma = 0.99           # discount factor applied to the next state's value
epsilon = 1.0          # initial exploration probability
epsilon_decay = 0.995  # multiplicative decay applied after every episode
epsilon_min = 0.05     # lower bound for epsilon

num_episodes = 2000
rewards = []  # total reward of each episode, in order

for episode in range(num_episodes):
    state, info = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = choose_action(Q, state, epsilon, env)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Do not bootstrap from a terminal state: nothing follows it, so the
        # TD target is just the reward. A truncated episode (step limit) is
        # not terminal, so it still bootstraps from the next state's value.
        # This is equivalent to the unmasked update while terminal rows stay
        # zero, and remains correct if Q is ever initialised to non-zero values.
        future_value = 0.0 if terminated else gamma * np.max(Q[next_state])
        Q[state, action] += alpha * (reward + future_value - Q[state, action])

        state = next_state
        total_reward += reward

    # Decay exploration once per episode, never going below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    rewards.append(total_reward)

    # Progress report every 100 episodes.
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode+1}, avg reward (last 100): {np.mean(rewards[-100:]):.3f}")

print("학습 끝! 마지막 100 에피소드 평균:", np.mean(rewards[-100:]))


In [None]:
# Success rate of the greedy policy after training

def run_greedy(Q, env, n=500):
    """Evaluate the greedy policy derived from ``Q`` over ``n`` episodes.

    In every step the action with the largest value in ``Q[state]`` is taken.
    The reward of each episode's final step is added to a running total.

    Args:
        Q: Q-table indexed as ``Q[state]`` -> array of action values.
        env: Environment exposing ``reset()`` and ``step(action)``.
        n: Number of evaluation episodes.

    Returns:
        The accumulated final-step rewards divided by ``n``.
    """
    final_rewards = 0
    for _ in range(n):
        state, _info = env.reset()
        while True:
            best_action = int(np.argmax(Q[state]))
            state, reward, terminated, truncated, _info = env.step(best_action)
            if terminated or truncated:
                break
        final_rewards += reward
    return final_rewards / n

# Evaluate the greedy policy from the current Q-table over 500 episodes.
print("학습 후 greedy 성공률:", run_greedy(Q, env, n=500))


In [None]:
# Arrow glyph printed for each action index.
arrow = {
    0: "←",
    1: "↓",
    2: "→",
    3: "↑",
}


def print_policy(Q, nrow=4, ncol=4):
    """Print the greedy action of every state as an arrow grid.

    States ``0 .. nrow * ncol - 1`` are laid out row by row; each cell shows
    the arrow for ``argmax(Q[state])``.

    Args:
        Q: Q-table indexed as ``Q[state]`` -> array of action values.
        nrow: Number of grid rows to print.
        ncol: Number of grid columns per row.
    """
    glyphs = [arrow[int(np.argmax(Q[s]))] for s in range(nrow * ncol)]
    for row in range(nrow):
        start = row * ncol
        print(" ".join(glyphs[start:start + ncol]))

# Show the greedy policy of the current Q-table using the default 4x4 grid.
print_policy(Q)
