In [1]:
import numpy as np

class Environment:
    def __init__(self):
        self.states = ['S', 'A', 'B', 'C', 'G']  # States: S - Start, A, B, C, G - Goal
        self.num_states = len(self.states)
        self.actions = ['left', 'right']  # Actions: left, right
        self.num_actions = len(self.actions)
        self.transitions = {
            'S': {'left': 'S', 'right': 'A'},
            'A': {'left': 'S', 'right': 'B'},
            'B': {'left': 'A', 'right': 'C'},
            'C': {'left': 'B', 'right': 'G'},
            'G': {}
        }  # Transition function
        self.rewards = {
            'S': {'left': 0, 'right': 0},
            'A': {'left': 0, 'right': 0},
            'B': {'left': 0, 'right': 0},
            'C': {'left': 0, 'right': 0},
            'G': {}
        }  # Reward function

    def reset(self):
        return 'S'  # Reset to initial state 'S'

    def step(self, state, action):
        next_state = self.transitions[state][action]
        reward = self.rewards[state][action]
        done = (next_state == 'G')  # Terminates if reaching the goal state 'G'
        return next_state, reward, done

class MonteCarloOffPolicyControl:
    def __init__(self, env, num_episodes, gamma, behavior_policy, target_policy, epsilon):
        self.env = env
        self.num_episodes = num_episodes
        self.gamma = gamma
        self.behavior_policy = behavior_policy
        self.target_policy = target_policy
        self.epsilon = epsilon
        self.q_values = {(s, a): 0 for s in env.states for a in env.actions}
        self.c_values = {(s, a): 0 for s in env.states for a in env.actions}

    def generate_episode(self):
        episode = []
        state = self.env.reset()
        while True:
            action = np.random.choice(self.env.actions, p=self.behavior_policy[state])
            next_state, reward, done = self.env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
            if done:
                break
        return episode

    def update_q_values(self, episode):
        G = 0
        weight = 1
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            self.c_values[(state, action)] += weight
            self.q_values[(state, action)] += (weight / self.c_values[(state, action)]) * (G - self.q_values[(state, action)])
            if action != self.target_policy[state]:
                break
            weight /= self.behavior_policy[state][action]

    def run(self):
        for _ in range(self.num_episodes):
            episode = self.generate_episode()
            self.update_q_values(episode)

# Example usage:
env = Environment()
# Define behavior policy (e.g., epsilon-greedy)
behavior_policy = {'S': [0.5, 0.5], 'A': [0.5, 0.5], 'B': [0.5, 0.5], 'C': [0.5, 0.5], 'G': []}
# Define target policy (e.g., greedy)
target_policy = {'S': 1, 'A': 1, 'B': 1, 'C': 1, 'G': []}
mc_off_policy_control = MonteCarloOffPolicyControl(env, num_episodes=1000, gamma=0.9, behavior_policy=behavior_policy, target_policy=target_policy, epsilon=0.1)
mc_off_policy_control.run()
print("Q Values:", mc_off_policy_control.q_values)

Q Values: {('S', 'left'): 0, ('S', 'right'): 0, ('A', 'left'): 0, ('A', 'right'): 0, ('B', 'left'): 0, ('B', 'right'): 0, ('C', 'left'): 0, ('C', 'right'): 0.0, ('G', 'left'): 0, ('G', 'right'): 0}


In [None]:
import numpy as np

# Define the grid world
GRID_SIZE = 5
START_STATE = (0, 0)
GOAL_STATE = (GRID_SIZE-1, GRID_SIZE-1)
OBSTACLES = [(1, 1), (2, 2), (3, 3)]
ACTIONS = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up

# Define the SARSA parameters
alpha = 0.1
gamma = 0.9
epsilon = 0.1
num_episodes = 1000

# Initialize the Q-values
Q = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

# Define the epsilon-greedy policy
def epsilon_greedy_policy(state, Q, epsilon):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(len(ACTIONS))  # Explore action space
    else:
        return np.argmax(Q[state])  # Exploit learned values

# Run SARSA
for _ in range(num_episodes):
    state = START_STATE
    action = epsilon_greedy_policy(state, Q, epsilon)
    while state != GOAL_STATE:
        next_state = (state[0] + ACTIONS[action][0], state[1] + ACTIONS[action][1])
        if next_state in OBSTACLES or not(0 <= next_state[0] < GRID_SIZE) or not(0 <= next_state[1] < GRID_SIZE):
            next_state = state  # Stay in the current state if hitting an obstacle or out of bounds
        next_action = epsilon_greedy_policy(next_state, Q, epsilon)
        reward = 1 if next_state == GOAL_STATE else 0
        Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])
        state = next_state
        action = next_action

# Print the learned Q-values
for i in range(GRID_SIZE):
    for j in range(GRID_SIZE):
        print(f"State: ({i}, {j}) - Q-values: {Q[(i, j)]}")


State: (0, 0) - Q-values: [0.42526003 0.31670972 0.17809119 0.32334191]
State: (0, 1) - Q-values: [0.49507268 0.32651324 0.3596054  0.37174281]
State: (0, 2) - Q-values: [0.54198916 0.34742212 0.23641026 0.40073594]
State: (0, 3) - Q-values: [0.63133494 0.42311165 0.40308966 0.46904331]
State: (0, 4) - Q-values: [0.57232147 0.4578545  0.71185721 0.52160307]
State: (1, 0) - Q-values: [0.01546739 0.04315087 0.         0.32468693]
State: (1, 1) - Q-values: [0. 0. 0. 0.]
State: (1, 2) - Q-values: [0.05643965 0.         0.02429257 0.36786944]
State: (1, 3) - Q-values: [0.67497889 0.00145801 0.         0.0533865 ]
State: (1, 4) - Q-values: [0.62766764 0.53323226 0.79552496 0.51813145]
State: (2, 0) - Q-values: [0. 0. 0. 0.]
State: (2, 1) - Q-values: [0. 0. 0. 0.]
State: (2, 2) - Q-values: [0. 0. 0. 0.]
State: (2, 3) - Q-values: [0.71387529 0.         0.         0.0597241 ]
State: (2, 4) - Q-values: [0.65411319 0.46702935 0.88283955 0.60651625]
State: (3, 0) - Q-values: [0. 0. 0. 0.]
State: (