<a href="https://colab.research.google.com/github/sravanipopuri2006/-Alpha-beta-pruning-of-Minimax-Search-Algorithm/blob/main/MonteCarloControlExp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import warnings ; warnings.filterwarnings('ignore')

import gym, gym_walk
import numpy as np

import random
import warnings

# Hide DeprecationWarning messages raised while the cells below run.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Print NumPy floats in fixed-point form rather than scientific notation.
np.set_printoptions(suppress=True)
# Seed NumPy's global generator and Python's `random` module with the same value.
np.random.seed(123)
random.seed(123)

In [36]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-794825_v/gym-walk_5cc4550efc6b4d7589034f0fb69c76d5
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-794825_v/gym-walk_5cc4550efc6b4d7589034f0fb69c76d5
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... done


In [35]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print the action chosen in each state as a grid of ``n_cols`` columns.

    Parameters
    ----------
    pi : indexable by state
        ``pi[s]`` is the position in ``action_symbols`` of the action for state ``s``.
    P : sized container indexable by state
        ``P[s]`` maps each action to an iterable of 4-tuples whose last element
        is a done flag. A state whose transitions are all flagged done is
        printed as a blank cell.
    action_symbols : sequence of str
        Symbol printed for each action index.
    n_cols : int
        Number of cells per printed row.
    title : str
        Heading printed before the grid.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    n_states = len(P)
    for s in range(n_states):
        a = pi[s]
        print("| ", end="")
        # A state is treated as terminal when every listed transition is flagged done.
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), symbol_of[a].rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")
    # Close a partially filled last row so later output starts on a new line.
    if n_states % n_cols != 0:
        print("|")

In [34]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print one value per state as a grid of ``n_cols`` columns.

    Parameters
    ----------
    V : indexable by state
        ``V[s]`` is the value shown for state ``s``; it is rounded with
        ``np.round(V[s], prec)`` before printing, so an array per state
        (e.g. a row of action values) is printed as a rounded array.
    P : sized container indexable by state
        ``P[s]`` maps each action to an iterable of 4-tuples whose last element
        is a done flag. A state whose transitions are all flagged done is
        printed as a blank cell.
    n_cols : int
        Number of cells per printed row.
    prec : int
        Number of decimals kept when rounding.
    title : str
        Heading printed before the grid.
    """
    print(title)
    n_states = len(P)
    for s in range(n_states):
        v = V[s]
        print("| ", end="")
        # A state is treated as terminal when every listed transition is flagged done.
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), '{}'.format(np.round(v, prec)).rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")
    # Close a partially filled last row so later output starts on a new line.
    if n_states % n_cols != 0:
        print("|")

In [33]:
# Build the FrozenLake environment used by the rest of the notebook.
env = gym.make('FrozenLake-v1')
# `unwrapped` returns the base environment however many wrappers gym.make()
# stacks on top, so the lookup does not depend on wrapper depth or on
# wrappers forwarding attributes.
P = env.unwrapped.P
# NOTE(review): only `random` and `np.random` are seeded in this notebook; the
# environment may draw from its own generator -- confirm if exact
# reproducibility of the results matters.
init_state = env.reset()

In [None]:
# Display the `P` table pulled from the environment in the previous cell.
P

Exponentially decaying schedule


In [32]:
import numpy as np

def decay_schedule(init_value, min_value,decay_ratio, max_steps,log_start=-2, log_base=10):
    """Build a per-step schedule that decays from ``init_value`` and then holds.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed,
    min-max-normalised ``np.logspace`` curve rescaled to the
    ``[min_value, init_value]`` interval; the remaining entries repeat the
    last decayed value.

    Parameters
    ----------
    init_value, min_value : float
        End points of the rescaled curve.
    decay_ratio : float
        Fraction of ``max_steps`` spent decaying; must lie in ``[0, 1]``.
    max_steps : int
        Length of the returned schedule.
    log_start : float
        Starting exponent passed to ``np.logspace`` (the end exponent is 0).
    log_base : float
        Base passed to ``np.logspace``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``max_steps``.

    Raises
    ------
    ValueError
        If the number of decay steps is negative or exceeds ``max_steps``.
    """
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    if decay_steps < 0 or rem_steps < 0:
        raise ValueError(
            "decay_ratio must be in [0, 1] and max_steps must be non-negative "
            "(got decay_ratio={!r}, max_steps={!r})".format(decay_ratio, max_steps))

    # No decay phase at all: the schedule sits at its floor from the first step.
    if decay_steps == 0:
        return np.full(max_steps, min_value, dtype=float)

    # A single decay step cannot be min-max normalised (0/0 -> NaN), so emit the
    # initial value once and hold the floor for the rest.
    if decay_steps == 1:
        return np.concatenate((np.array([init_value], dtype=float),
                               np.full(rem_steps, min_value, dtype=float)))

    values = np.logspace(log_start, 0, decay_steps,base=log_base, endpoint=True)[::-1]
    # Normalise the curve to [0, 1], then rescale it to [min_value, init_value].
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    # Hold the last decayed value for the remaining steps.
    values = np.pad(values, (0, rem_steps), 'edge')
    return values




Exploratory Policy Trajectories

In [11]:
from itertools import count
def generate_trajectory(select_action,Q,epsilon,env,max_steps=200):
    """Roll out episodes until one reports ``done`` and return its experiences.

    Each attempt starts from ``env.reset()`` and repeatedly calls
    ``select_action(state, Q, epsilon)`` followed by ``env.step(action)``
    (expected to return a 4-tuple ``next_state, reward, done, info``).
    An attempt that has not finished by the time the step budget is reached
    is thrown away and a new attempt is started.

    Returns
    -------
    numpy.ndarray
        Object array built from the list of
        ``(state, action, reward, next_state, done)`` tuples of the episode
        that finished.
    """
    episode = []
    finished = False
    while not finished:
        state = env.reset()
        step_idx = 0
        while True:
            action = select_action(state, Q, epsilon)
            next_state, reward, finished, _ = env.step(action)
            episode.append((state, action, reward, next_state, finished))
            if finished:
                break
            # Budget exhausted without termination: discard and retry.
            if step_idx >= max_steps - 1:
                episode = []
                break
            state = next_state
            step_idx += 1
    return np.array(episode, object)



In [31]:
from tqdm import tqdm

Monte Carlo control

In [42]:
def mc_control(
    env, gamma=1.0,
    init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
    init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
    n_episodes=3000, max_steps=200, first_visit=True
):
    """Estimate action values from sampled episodes and derive a greedy policy.

    For every episode a trajectory is generated with ``generate_trajectory``
    using ``select_action`` and the episode's epsilon. For each
    (state, action) occurrence -- only the first one per episode when
    ``first_visit`` is true -- the discounted return from that step onward is
    computed and ``Q[state][action]`` is moved toward it by the episode's
    step size.

    Parameters
    ----------
    env : environment exposing ``observation_space.n`` and ``action_space.n``
    gamma : float
        Discount base used to build the per-step discount factors.
    init_alpha, min_alpha, alpha_decay_ratio : float
        Arguments forwarded to ``decay_schedule`` for the step size.
    init_epsilon, min_epsilon, epsilon_decay_ratio : float
        Arguments forwarded to ``decay_schedule`` for epsilon.
    n_episodes : int
        Number of episodes to sample.
    max_steps : int
        Step budget passed to ``generate_trajectory``; also the number of
        discount factors prepared.
    first_visit : bool
        When true, repeated (state, action) pairs within an episode are skipped.

    Returns
    -------
    Q : ndarray, shape (nS, nA)
        Final action-value table.
    V : ndarray, shape (nS,)
        Row-wise maximum of ``Q``.
    pi : dict
        Maps each state index to the arg-max action of ``Q``.
    Q_track : ndarray, shape (n_episodes, nS, nA)
        Copy of ``Q`` stored after every episode.
    pi_track : list of ndarray
        Arg-max actions per state stored after every episode.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Discount factors gamma**0, gamma**1, ..., gamma**(max_steps - 1).
    discounts = np.logspace(0, max_steps, num=max_steps, base=gamma, endpoint=False)

    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    Q = np.zeros((n_states, n_actions))
    Q_track = np.zeros((n_episodes, n_states, n_actions))
    pi_track = []

    for episode in tqdm(range(n_episodes), leave=False):
        trajectory = generate_trajectory(select_action, Q, epsilons[episode], env, max_steps)

        # Marks the (state, action) pairs already updated in this episode.
        seen = np.zeros((n_states, n_actions), dtype=np.bool_)

        for t, (state, action, _, _, _) in enumerate(trajectory):
            if first_visit and seen[state][action]:
                continue
            seen[state][action] = True

            # Discounted return accumulated from step t to the end of the episode.
            tail = trajectory[t:]
            n_steps = len(tail)
            rewards = np.array([step[2] for step in tail])
            G = np.sum(discounts[:n_steps] * rewards)

            error = G - Q[state][action]
            Q[state][action] += alphas[episode] * error

        # Assignment into the tracking array stores a copy of the current table.
        Q_track[episode] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    pi = dict(enumerate(np.argmax(Q, axis=1)))

    return Q, V, pi, Q_track, pi_track




In [39]:
import numpy as np

def select_action(state, Q, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform sample from ``np.random.rand()`` is compared with ``epsilon``:
    below it, a uniformly random action index in ``[0, Q.shape[1])`` is
    returned; otherwise the arg-max of ``Q[state]`` is returned.
    """
    n_actions = Q.shape[1]
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(Q[state])
    return np.random.randint(n_actions)

In [41]:
# Run Monte Carlo control with its default hyper-parameters on the environment.
optimal_Q, optimal_V, optimal_pi, Q_track, pi_track = mc_control(env=env)

# The value printer rounds whatever each state holds, so it also prints the
# per-state rows of action values.
print_state_value_function(V=optimal_Q, P=P, title='Action-value function:', prec=2, n_cols=4)
print_state_value_function(V=optimal_V, P=P, title='State-value function:', prec=2, n_cols=4)
print_policy(pi=optimal_pi, P=P)


                                                     

Action-value function:
| 00 [0.07 0.09 0.08 0.08] | 01 [0.02 0.05 0.07 0.07] | 02 [0.09 0.09 0.09 0.08] | 03 [0.06 0.06 0.06 0.09] |
| 04 [0.11 0.03 0.03 0.02] |           | 06 [0.08 0.09 0.09 0.02] |           |
| 08 [0.01 0.1  0.04 0.02] | 09 [0.02 0.07 0.15 0.07] | 10 [0.28 0.18 0.08 0.09] |           |
|           | 13 [0.03 0.02 0.02 0.19] | 14 [0.1  0.2  0.63 0.28] |           |
State-value function:
| 00   0.09 | 01   0.07 | 02   0.09 | 03   0.09 |
| 04   0.11 |           | 06   0.09 |           |
| 08    0.1 | 09   0.15 | 10   0.28 |           |
|           | 13   0.19 | 14   0.63 |           |
Policy:
| 00      v | 01      ^ | 02      v | 03      ^ |
| 04      < |           | 06      > |           |
| 08      v | 09      > | 10      < |           |
|           | 13      ^ | 14      > |           |


