In [1]:
import numpy as np
import random

In [5]:
# Environment Setup
class RandomEnvironment:
    """Tabular environment with randomly generated dynamics and rewards.

    ``transition_matrix[s, a]`` holds a probability distribution over next
    states (normalized along the last axis), and ``reward_matrix[s, a]`` is
    the reward handed back for taking action ``a`` in state ``s``. There is
    no terminal state; the caller decides when an episode ends.

    Args:
        num_states: Number of discrete states.
        num_actions: Number of discrete actions.
        seed: Optional seed. When given, the environment draws every random
            number from its own ``np.random.RandomState(seed)``, so the
            matrices and the trajectory are reproducible. When ``None``
            (the default) it draws from NumPy's global random state, which
            is the original behavior.
    """

    def __init__(self, num_states=5, num_actions=2, seed=None):
        self.num_states = num_states
        self.num_actions = num_actions
        # Private generator only when a seed is requested; otherwise fall
        # back to the global np.random functions (see _random()).
        self._rng = None if seed is None else np.random.RandomState(seed)
        rng = self._random()
        self.transition_matrix = rng.rand(num_states, num_actions, num_states)
        # Normalize so each (state, action) row sums to 1.
        self.transition_matrix /= self.transition_matrix.sum(axis=2, keepdims=True)
        self.reward_matrix = rng.rand(num_states, num_actions)  # Random rewards
        self.state = rng.randint(0, num_states)  # Start at a random state

    def _random(self):
        """Return the random source: the private generator, or ``np.random``."""
        # np.random (module) and RandomState expose the same rand/randint/choice API.
        return np.random if self._rng is None else self._rng

    def step(self, action):
        """Apply ``action`` in the current state.

        Samples the next state from ``transition_matrix[state, action]``,
        reads the reward for the (current state, action) pair, then moves
        to the sampled state.

        Returns:
            Tuple ``(next_state, reward)``.
        """
        next_state = self._random().choice(
            self.num_states, p=self.transition_matrix[self.state, action]
        )
        reward = self.reward_matrix[self.state, action]
        self.state = next_state
        return next_state, reward

    def reset(self):
        """Move to a uniformly random state and return it."""
        self.state = self._random().randint(0, self.num_states)
        return self.state

# Preprocessing
def preprocess_environment(env):
    """Read the state-space and action-space sizes off an environment.

    Args:
        env: Any object exposing ``num_states`` and ``num_actions`` attributes.

    Returns:
        Tuple ``(num_states, num_actions)``.
    """
    state_count = env.num_states
    action_count = env.num_actions
    return state_count, action_count

# Train-Test Split
def train_test_split(data, split_ratio=0.8):
    """Split a sequence into a leading train part and a trailing test part.

    The split is positional (no shuffling): the first
    ``int(len(data) * split_ratio)`` items form the train part and the
    remainder forms the test part.

    Args:
        data: Any sliceable sequence supporting ``len()``.
        split_ratio: Fraction of items assigned to the train part; must lie
            in ``[0, 1]``.

    Returns:
        Tuple ``(train, test)`` of slices of ``data``.

    Raises:
        ValueError: If ``split_ratio`` is outside ``[0, 1]``. A negative
            ratio would otherwise become a negative slice index and silently
            split from the wrong end of the sequence.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be in [0, 1], got {split_ratio!r}")
    split_idx = int(len(data) * split_ratio)
    return data[:split_idx], data[split_idx:]

In [6]:
# SARSA Algorithm Class
class SARSA:
    """Tabular SARSA agent with an epsilon-greedy behavior policy.

    Args:
        num_states: Number of rows in the Q-table.
        num_actions: Number of columns in the Q-table.
        alpha: Learning rate applied to each temporal-difference update.
        gamma: Discount factor applied to the next state-action value.
        epsilon: Probability of picking a uniformly random action.
        episodes: Number of episodes run by :meth:`train`.
    """

    def __init__(self, num_states, num_actions, alpha=0.1, gamma=0.9, epsilon=0.1, episodes=1000):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes
        self.q_table = np.zeros((num_states, num_actions))  # Initialize Q-table

    def choose_action(self, state):
        """Pick an action for ``state`` epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (ties resolve to the
        lowest index, per ``np.argmax``).
        """
        if random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.num_actions)  # Explore
        return np.argmax(self.q_table[state])  # Exploit

    def train(self, env, stop_prob=0.1):
        """Run ``self.episodes`` episodes of SARSA against ``env``.

        The environment has no terminal signal, so after every update the
        episode is ended with probability ``stop_prob``.

        Args:
            env: Object with ``reset() -> state`` and
                ``step(action) -> (next_state, reward)``.
            stop_prob: Per-step probability of ending the episode; must be
                in ``(0, 1]``. Defaults to the previously hard-coded 0.1.

        Raises:
            ValueError: If ``stop_prob`` is outside ``(0, 1]`` (a value of 0
                would never terminate an episode).
        """
        if not 0 < stop_prob <= 1:
            raise ValueError(f"stop_prob must be in (0, 1], got {stop_prob!r}")
        for _ in range(self.episodes):
            state = env.reset()
            action = self.choose_action(state)
            while True:
                next_state, reward = env.step(action)
                # On-policy: the bootstrap action is the one actually taken next.
                next_action = self.choose_action(next_state)
                # SARSA update rule
                td_target = reward + self.gamma * self.q_table[next_state, next_action]
                self.q_table[state, action] += self.alpha * (
                    td_target - self.q_table[state, action]
                )
                if random.uniform(0, 1) < stop_prob:  # Random stopping condition for demo
                    break
                state, action = next_state, next_action

    def evaluate(self):
        """Return the mean of all Q-table entries (a simple summary metric)."""
        return np.mean(self.q_table)

    def deploy_policy(self):
        """Return the greedy action (argmax over actions) for every state."""
        return np.argmax(self.q_table, axis=1)

In [7]:
# Main Execution
if __name__ == "__main__":
    # Seed both random sources used in this notebook (NumPy's global state
    # and the stdlib `random` module) so that a Restart & Run All reproduces
    # the same environment, training run and printed results.
    SEED = 42
    np.random.seed(SEED)
    random.seed(SEED)

    env = RandomEnvironment()
    num_states, num_actions = preprocess_environment(env)
    
    # Train SARSA Model
    sarsa_agent = SARSA(num_states, num_actions)
    sarsa_agent.train(env)
    
    # Evaluate Model: mean of the learned Q-table
    avg_q_value = sarsa_agent.evaluate()
    print(f"Average Q-Value: {avg_q_value}")
    
    # Deploy Policy: greedy action per state under the learned Q-table
    optimal_policy = sarsa_agent.deploy_policy()
    print(f"Optimal Policy: {optimal_policy}")

Average Q-Value: 6.6275134896880985
Optimal Policy: [0 1 1 0 1]
