In [None]:
# Install gym-walk into the kernel's own environment (%pip targets the running
# kernel, unlike a bare shell `pip`). The URL is pinned to the commit this
# notebook last resolved to, so a fresh run installs the same code.
%pip install git+https://github.com/mimoralea/gym-walk@5999016267d6de2f5a63307fb00dfd63de319ac1#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-yuc_mxrn/gym-walk_7d616d6feeec4330b8c97e7327e8de04
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-yuc_mxrn/gym-walk_7d616d6feeec4330b8c97e7327e8de04
  Resolved https://github.com/mimoralea/gym-walk to commit 5999016267d6de2f5a63307fb00dfd63de319ac1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import warnings ; warnings.filterwarnings('ignore')

import gym, gym_walk
import numpy as np

import random
import warnings

# DeprecationWarning is already covered by the blanket 'ignore' filter installed
# on the first line of this cell, so this narrower filter has no additional effect.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Seed Python's and NumPy's global RNGs so draws made through them repeat across runs.
# NOTE(review): the gym environment's own RNG is not seeded anywhere in this
# notebook -- confirm whether it needs seeding for fully reproducible rollouts.
random.seed(123); np.random.seed(123)

In [None]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print a policy as a grid of action symbols.

    Args:
        pi: Indexable mapping a state index to an action index.
        P: Transition table; ``P[s]`` maps each action to a list of 4-tuples
            whose last element is the ``done`` flag.
        action_symbols: Symbols to display, indexed by action number.
        n_cols: Number of states printed per row.
        title: Heading printed before the grid.

    States whose every listed transition is flagged ``done`` are rendered as a
    blank cell; all other states show their zero-padded index and the symbol of
    the action chosen by ``pi``.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        chosen = pi[state]
        print("| ", end="")
        # A state counts as terminal when no transition out of it continues the episode.
        is_terminal = all(
            done
            for transitions in P[state].values()
            for _prob, _next_state, _reward, done in transitions
        )
        if is_terminal:
            print("".rjust(9), end=" ")
        else:
            label = str(state).zfill(2)
            symbol = symbol_of[chosen].rjust(6)
            print(label, symbol, end=" ")
        # Close the row once n_cols cells have been written.
        if (state + 1) % n_cols == 0:
            print("|")

In [None]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print per-state values as a grid.

    Args:
        V: Indexable holding one value (or row of values) per state index.
        P: Transition table; ``P[s]`` maps each action to a list of 4-tuples
            whose last element is the ``done`` flag.
        n_cols: Number of states printed per row.
        prec: Number of decimals passed to ``np.round`` before printing.
        title: Heading printed before the grid.

    States whose every listed transition is flagged ``done`` are rendered as a
    blank cell; all other states show their zero-padded index and rounded value.
    """
    print(title)
    for state in range(len(P)):
        value = V[state]
        print("| ", end="")
        # A state counts as terminal when no transition out of it continues the episode.
        is_terminal = all(
            done
            for transitions in P[state].values()
            for _prob, _next_state, _reward, done in transitions
        )
        if is_terminal:
            print("".rjust(9), end=" ")
        else:
            label = str(state).zfill(2)
            shown = f"{np.round(value, prec)}".rjust(6)
            print(label, shown, end=" ")
        # Close the row once n_cols cells have been written.
        if (state + 1) % n_cols == 0:
            print("|")

In [None]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    """Estimate how often following ``pi`` ends an episode in ``goal_state``.

    Args:
        env: Environment whose ``reset()`` returns a state and whose ``step(a)``
            returns a ``(state, reward, done, info)`` 4-tuple.
        pi: Indexable mapping a state to the action to take.
        goal_state: State that counts as success when it is the last state seen.
        n_episodes: Number of rollouts to average over.
        max_steps: Cap on the number of steps per rollout.

    Returns:
        The mean of the per-episode success flags (``np.mean`` of booleans).
    """
    outcomes = []
    for _episode in range(n_episodes):
        state = env.reset()
        steps_taken = 0
        # Follow the policy until the episode terminates or the step cap is hit.
        while steps_taken < max_steps:
            state, _reward, done, _info = env.step(pi[state])
            steps_taken += 1
            if done:
                break
        outcomes.append(state == goal_state)
    return np.mean(outcomes)

In [None]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate the average undiscounted return obtained by following ``pi``.

    Args:
        env: Environment whose ``reset()`` returns a state and whose ``step(a)``
            returns a ``(state, reward, done, info)`` 4-tuple.
        pi: Indexable mapping a state to the action to take.
        n_episodes: Number of rollouts to average over.
        max_steps: Cap on the number of steps per rollout.

    Returns:
        The mean over episodes of the summed (undiscounted) rewards.
    """
    episode_returns = []
    for _episode in range(n_episodes):
        state = env.reset()
        total = 0.0
        steps_taken = 0
        # Accumulate reward until the episode terminates or the step cap is hit.
        while steps_taken < max_steps:
            state, reward, done, _info = env.step(pi[state])
            total += reward
            steps_taken += 1
            if done:
                break
        episode_returns.append(total)
    return np.mean(episode_returns)

In [None]:
# Build the FrozenLake environment used by every experiment below.
env = gym.make('FrozenLake-v1')
# Transition table read off the wrapped environment. Judging by the dump in the
# next cell, P[state][action] is a list of what appear to be
# (probability, next_state, reward, done) tuples.
P = env.env.P
# Reset once and keep the returned start state; it is not referenced again in
# the cells shown here.
init_state = env.reset()
# State index treated as the goal; passed to probability_success as the success criterion.
goal_state = 15
#LEFT, RIGHT = range(2)

In [None]:
# Display the raw transition table for inspection.
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [None]:
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    """Build a per-step schedule that moves from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a log-spaced ramp
    (normalised to run from 0 to 1) that starts at ``init_value`` and ends at
    ``min_value``; every remaining entry is ``min_value``.

    Args:
        init_value: Value of the first entry of the schedule.
        min_value: Value reached at the end of the decay and held afterwards.
        decay_ratio: Fraction of ``max_steps`` spent decaying.
        max_steps: Total length of the returned schedule.
        log_start: Starting exponent handed to ``np.logspace``.
        log_base: Base handed to ``np.logspace``.

    Returns:
        A 1-D NumPy array of length ``max_steps``.
    """
    # Clamp so a ratio above 1 (or below 0) cannot request a negative-sized
    # decay segment or a negative-sized constant tail.
    decay_steps = max(0, min(int(max_steps * decay_ratio), max_steps))

    if decay_steps < 2:
        # With zero or one decay step there is no ramp to normalise: the
        # min/max rescaling below would fail on an empty array or divide by
        # zero and yield NaN. A single step simply sits at init_value.
        decaying = np.full(decay_steps, init_value, dtype=float)
    else:
        ramp = np.logspace(log_start, 0, decay_steps, base=log_base)
        # Rescale the ramp to [0, 1] so it interpolates exactly between the endpoints.
        ramp = (ramp - ramp.min()) / (ramp.max() - ramp.min())
        decaying = init_value + (min_value - init_value) * ramp

    # Hold min_value for whatever remains of the schedule.
    tail = np.full(max_steps - decay_steps, min_value)
    return np.concatenate((decaying, tail))

In [None]:
def generate_trajectory(select_action, Q, epsilon, env, max_steps=200):
    """Roll out one episode and record its (state, action, reward) steps.

    Args:
        select_action: Callable ``(state, Q, epsilon) -> action``.
        Q: Action-value table forwarded to ``select_action``.
        epsilon: Exploration parameter forwarded to ``select_action``.
        env: Environment whose ``reset()`` returns a state and whose ``step(a)``
            returns a ``(state, reward, done, info)`` 4-tuple.
        max_steps: Maximum number of steps to record; the episode is cut off
            there if it has not terminated.

    Returns:
        An object-dtype NumPy array built from the list of
        ``(state, action, reward)`` tuples, in the order they occurred.
    """
    current = env.reset()
    steps = []
    for _step in range(max_steps):
        action = select_action(current, Q, epsilon)
        successor, reward, finished, _info = env.step(action)
        # Record the state the action was taken from, not the one it led to.
        steps.append((current, action, reward))
        current = successor
        if finished:
            break
    return np.array(steps, dtype=object)

In [None]:
def mc_control(env, gamma=1.0, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
               init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
               n_episodes=3000, max_steps=200, first_visit=True):
    """Monte Carlo control with an epsilon-greedy behaviour policy.

    Each episode is generated with ``generate_trajectory`` and the action-value
    table is moved towards the observed returns with a step size ``alpha``.
    Both ``alpha`` and ``epsilon`` follow per-episode schedules produced by
    ``decay_schedule``.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n``; it is also handed to ``generate_trajectory``.
        gamma: Discount factor applied when accumulating returns.
        init_alpha, min_alpha, alpha_decay_ratio: Step-size schedule parameters.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration schedule
            parameters.
        n_episodes: Number of episodes to learn from.
        max_steps: Per-episode step cap forwarded to ``generate_trajectory``.
        first_visit: If true, each (state, action) pair is updated once per
            episode, using the return that follows its FIRST occurrence.
            If false, every occurrence triggers an update.

    Returns:
        ``(Q, V, pi)``: the action-value table of shape ``(nS, nA)``, the
        greedy state values ``max_a Q[s, a]``, and the greedy policy
        ``argmax_a Q[s, a]``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA))

    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    def select_action(state, Q, epsilon):
        # Explore uniformly with probability epsilon, otherwise act greedily.
        if np.random.rand() < epsilon:
            return np.random.choice(nA)
        return np.argmax(Q[state])

    for episode in range(n_episodes):
        epsilon = epsilons[episode]
        alpha = alphas[episode]
        trajectory = generate_trajectory(select_action, Q, epsilon, env, max_steps)

        # Returns are accumulated back-to-front, so a "seen" set filled during
        # that sweep would mark the LAST occurrence of each pair. Record the
        # index of the first occurrence up front instead, and only update there.
        first_occurrence = {}
        if first_visit:
            for t, (state, action, _) in enumerate(trajectory):
                first_occurrence.setdefault((state, action), t)

        G = 0
        for t in reversed(range(len(trajectory))):
            state, action, reward = trajectory[t]
            G = gamma * G + reward
            if first_visit and first_occurrence[(state, action)] != t:
                continue
            Q[state, action] += alpha * (G - Q[state, action])

    pi = np.argmax(Q, axis=1)
    V = np.max(Q, axis=1)

    return Q, V, pi

In [None]:
# Learn action values, state values and a greedy policy from 15000 episodes.
optimal_Q, optimal_V, optimal_pi = mc_control(env, n_episodes=15000)
print('Name:SANJAY T    Register Number: 212222110039')
# The state-value printer is reused for Q: each cell then shows that state's row of action values.
print_state_value_function(
    optimal_Q, P, n_cols=4, prec=2, title='Action-value function:')
print_state_value_function(
    optimal_V, P, n_cols=4, prec=2, title='State-value function:')
print_policy(optimal_pi, P)

Name:SANJAY T    Register Number: 212222110039
Action-value function:
| 00 [0.43 0.31 0.34 0.36] | 01 [0.14 0.09 0.07 0.29] | 02 [0.24 0.09 0.12 0.08] | 03 [0.05 0.03 0.04 0.08] |
| 04 [0.45 0.28 0.25 0.23] |           | 06 [0.23 0.06 0.16 0.05] |           |
| 08 [0.25 0.28 0.25 0.49] | 09 [0.19 0.54 0.25 0.23] | 10 [0.51 0.26 0.24 0.1 ] |           |
|           | 13 [0.22 0.28 0.62 0.28] | 14 [0.33 0.57 0.78 0.61] |           |
State-value function:
| 00   0.43 | 01   0.29 | 02   0.24 | 03   0.08 |
| 04   0.45 |           | 06   0.23 |           |
| 08   0.49 | 09   0.54 | 10   0.51 |           |
|           | 13   0.62 | 14   0.78 |           |
Policy:
| 00      < | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      > |           |


In [None]:
print('Name:SANJAY T    Register Number: 212222110039')
# Evaluate the learned greedy policy with fresh rollouts (each helper defaults to
# 100 episodes): first the percentage of episodes that end in goal_state, then
# the mean undiscounted return. The two helpers run separate sets of episodes,
# so the two figures need not agree exactly.
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, optimal_pi, goal_state=goal_state)*100,
    mean_return(env, optimal_pi)))

Name:SANJAY T    Register Number: 212222110039
Reaches goal 72.00%. Obtains an average undiscounted return of 0.6100.
