In [1]:
# A simple implementation of Monte Carlo prediction for a GridWorld environment.
# The code is written in Python 3 and uses the numpy and pprint libraries.
# It estimates the state-value function of the GridWorld environment.
# Code written by: Sourabh Vamdevan.




import numpy as np
import pprint

class GridWorld:
    """Deterministic square grid world with two terminal corners.

    States are ``(row, col)`` tuples.  The top-left cell ``(0, 0)`` and the
    bottom-right cell ``(size - 1, size - 1)`` are terminal.  Every move made
    from a non-terminal state yields a reward of ``-1``; moves that would
    leave the grid keep the agent on the edge cell.

    ``reset()`` must be called before ``step()``: until then ``state`` is
    ``None`` and cannot be unpacked into a row and column.

    Args:
        size: Side length of the grid.  Defaults to 4, the original
            hard-coded layout.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """

    # Row/column offset applied by each action name.
    _MOVES = {
        "up": (-1, 0),
        "down": (1, 0),
        "left": (0, -1),
        "right": (0, 1),
    }

    def __init__(self, size=4):
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.states = [(i, j) for i in range(size) for j in range(size)]
        self.terminal_states = [(0, 0), (size - 1, size - 1)]
        self.actions = ["up", "down", "left", "right"]
        self.state = None

    def reset(self):
        """Place the agent on the bottom-left cell and return that state."""
        self.state = (self.size - 1, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        When the current state is already terminal, nothing moves and
        ``(state, 0, True)`` is returned without validating ``action``.

        Raises:
            ValueError: If ``action`` is not one of ``self.actions`` (only
                checked from a non-terminal state).
        """
        if self.state in self.terminal_states:
            return self.state, 0, True

        try:
            di, dj = self._MOVES[action]
        except (KeyError, TypeError):
            # TypeError covers unhashable actions, which the original
            # comparison chain also rejected as invalid.
            raise ValueError("Invalid action") from None

        i, j = self.state
        last = self.size - 1
        # Clamp to the grid so that walking into a wall is a no-op move.
        next_state = (min(max(i + di, 0), last), min(max(j + dj, 0), last))

        reward = -1
        done = next_state in self.terminal_states
        self.state = next_state
        return next_state, reward, done


def policy():
    """Pick one of the four grid moves at random.

    Takes no arguments and ignores the current state.  ``np.random.choice``
    is called without a probability vector, so each action is equally
    likely.

    Returns:
        The chosen action name ("up", "down", "left" or "right") as returned
        by ``np.random.choice``.
    """
    return np.random.choice(["up", "down", "left", "right"])

# Monte Carlo Prediction
def monte_carlo_prediction(env, policy, episodes, gamma=0.9, verbose=True):
    """Estimate state values with first-visit Monte Carlo prediction.

    For each episode the environment is reset and actions drawn from
    ``policy`` are applied until ``env.step`` reports ``done``.  The
    discounted return following the *first* occurrence of every state in the
    episode is recorded, and a state's value is the mean of its recorded
    returns over all episodes.

    Args:
        env: Environment exposing ``states`` (iterable of hashable states),
            ``reset()`` returning the start state and ``step(action)``
            returning ``(next_state, reward, done)``.
        policy: Zero-argument callable returning the next action.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        verbose: When true (the default), print a progress line after every
            episode.

    Returns:
        Dict mapping every state in ``env.states`` to its estimated value.
        States that were never visited keep the initial value ``0``.
    """
    returns = {state: [] for state in env.states}  # sampled returns per state

    for episode in range(episodes):
        # Roll out one complete episode, remembering for every step the state
        # the action was taken in and the reward it produced.
        state = env.reset()
        episode_data = []
        done = False
        while not done:
            action = policy()
            next_state, reward, done = env.step(action)
            episode_data.append((state, reward))
            state = next_state

        # Time step at which each state first appears.  Looking the position
        # up with list.index((state, reward)) always finds the earliest
        # matching pair, which turned the first-visit test into a no-op.
        first_visit = {}
        for t, (visited, _) in enumerate(episode_data):
            first_visit.setdefault(visited, t)

        # Walk backwards so G accumulates the discounted return from step t.
        G = 0
        for t in range(len(episode_data) - 1, -1, -1):
            visited, reward = episode_data[t]
            G = reward + gamma * G
            if first_visit[visited] == t:
                returns[visited].append(G)

        if verbose:
            print(f"Episode {episode + 1}/{episodes} completed")

    # Averaging once at the end gives the same means as re-averaging after
    # every update, without the repeated work.
    value_table = {state: 0 for state in env.states}
    for state, sampled in returns.items():
        if sampled:
            value_table[state] = np.mean(sampled)

    return value_table


if __name__ == "__main__":
    # Experiment settings: discount factor and number of sampled episodes.
    gamma = 0.9
    episodes = 1000
    env = GridWorld()

    print("Running Monte Carlo Prediction...")
    value_table = monte_carlo_prediction(
        env=env,
        policy=policy,
        episodes=episodes,
        gamma=gamma,
    )

    # Show the estimated value of every grid cell.
    print("\nFinal State-Value Function:")
    pprint.pprint(value_table)


Running Monte Carlo Prediction...
Episode 1/1000 completed
Episode 2/1000 completed
Episode 3/1000 completed
Episode 4/1000 completed
Episode 5/1000 completed
Episode 6/1000 completed
Episode 7/1000 completed
Episode 8/1000 completed
Episode 9/1000 completed
Episode 10/1000 completed
Episode 11/1000 completed
Episode 12/1000 completed
Episode 13/1000 completed
Episode 14/1000 completed
Episode 15/1000 completed
Episode 16/1000 completed
Episode 17/1000 completed
Episode 18/1000 completed
Episode 19/1000 completed
Episode 20/1000 completed
Episode 21/1000 completed
Episode 22/1000 completed
Episode 23/1000 completed
Episode 24/1000 completed
Episode 25/1000 completed
Episode 26/1000 completed
Episode 27/1000 completed
Episode 28/1000 completed
Episode 29/1000 completed
Episode 30/1000 completed
Episode 31/1000 completed
Episode 32/1000 completed
Episode 33/1000 completed
Episode 34/1000 completed
Episode 35/1000 completed
Episode 36/1000 completed
Episode 37/1000 completed
Episode 38/10