In [4]:
# Write a program in Python to demonstrate SARSA, an on-policy TD learning algorithm

# Name: A. N. Deshpande
# Class: B.E AIML
# Roll No: 43514

# TODO: to be checked and verified for correctness

In [3]:
import numpy as np

# Define the environment
class Environment:
    """One-dimensional corridor world with a terminal state at each end.

    States are the integers ``0 .. num_states - 1``. Entering state 0 ends the
    episode with reward -1, entering state ``num_states - 1`` ends it with
    reward +1, and every other transition yields reward 0.

    Args:
        num_states: Total number of states, including both terminal ends.
        start_state: Non-terminal state every episode starts from.

    Raises:
        ValueError: If the corridor has no interior state, or if
            ``start_state`` is not an interior (non-terminal) state.
    """

    def __init__(self, num_states=6, start_state=2):
        if num_states < 3:
            raise ValueError("num_states must be at least 3")
        if not 0 < start_state < num_states - 1:
            raise ValueError("start_state must be a non-terminal state")

        self.num_states = num_states
        self.num_actions = 2  # 0 = left, anything else = right
        self.start_state = start_state
        self.state = start_state

    def reset(self):
        """Put the agent back on the start state and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Action 0 moves one state to the left; any other value moves one state
        to the right. Terminal states are absorbing: stepping again after the
        episode has ended leaves the state unchanged and returns reward 0 with
        ``done=True``, so the state can never leave ``0 .. num_states - 1``.
        """
        last_state = self.num_states - 1

        # Already terminal: do not walk off the corridor (a state of -1 would
        # silently index the last row of a Q-table).
        if self.state <= 0 or self.state >= last_state:
            return self.state, 0, True

        # Transition function
        self.state += -1 if action == 0 else 1

        # Reward function
        if self.state == 0:
            return self.state, -1, True
        if self.state == last_state:
            return self.state, 1, True
        return self.state, 0, False

# SARSA agent
class SarsaAgent:
    """Tabular SARSA agent with an epsilon-greedy behaviour policy.

    Args:
        num_states: Number of rows in the Q-table.
        num_actions: Number of columns in the Q-table.
        epsilon: Probability of picking a uniformly random action.
        alpha: Learning rate of the TD update.
        gamma: Discount factor applied to the next state-action value.
        seed: Optional seed for a private random generator. When omitted the
            agent draws from NumPy's global random state, so ``np.random.seed``
            still controls it.
    """

    def __init__(self, num_states, num_actions, epsilon=0.1, alpha=0.1, gamma=0.9, seed=None):
        self.num_states = num_states
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q = np.zeros((num_states, num_actions))
        # The np.random module and RandomState expose the same uniform/choice
        # API, so either can serve as the source of randomness.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def choose_action(self, state):
        """Return an action for ``state`` using the epsilon-greedy policy."""
        if self._rng.uniform(0, 1) < self.epsilon:
            # Explore: uniformly random action index.
            return self._rng.choice(self.num_actions)
        # Exploit: np.argmax returns the first maximal index, so ties resolve
        # to the lowest-numbered action.
        return np.argmax(self.Q[state])

    def update_Q(self, state, action, reward, next_state, next_action, done=False):
        """Apply one SARSA update to ``Q[state, action]``.

        The TD target is ``reward + gamma * Q[next_state, next_action]``. When
        ``done`` is true the transition ended the episode, so the bootstrap
        term is dropped and the target is just ``reward``.
        """
        if done:
            target = reward
        else:
            target = reward + self.gamma * self.Q[next_state, next_action]
        self.Q[state, action] += self.alpha * (target - self.Q[state, action])

# Main loop
def main(num_episodes=1000, seed=None):
    """Train a SARSA agent on the corridor environment and print its Q-table.

    Args:
        num_episodes: Number of training episodes to run.
        seed: Optional seed for NumPy's global random state, which makes the
            run repeatable. Left unseeded when ``None``.

    Returns:
        The trained ``SarsaAgent``; its Q-table is available as ``agent.Q``.
    """
    if seed is not None:
        np.random.seed(seed)

    env = Environment()
    agent = SarsaAgent(env.num_states, env.num_actions)

    for _ in range(num_episodes):
        state = env.reset()
        action = agent.choose_action(state)

        done = False
        while not done:
            next_state, reward, done = env.step(action)
            # On-policy: the action used in the update target is the one that
            # will actually be executed on the next step.
            next_action = agent.choose_action(next_state)
            agent.update_Q(state, action, reward, next_state, next_action)

            state, action = next_state, next_action

    # Print the learned Q-values
    print("Learned Q-values:")
    print(agent.Q)

    return agent

# Run the training demo only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Learned Q-values:
[[ 0.          0.        ]
 [-0.271       0.70384518]
 [ 0.60368832  0.8010412 ]
 [ 0.68400089  0.88297971]
 [ 0.75455573  1.        ]
 [ 0.          0.        ]]
