# RL Lab Assignment - 4

### CS22B1093 Rohan G

----------------------

### Implement SARSA and Q-Learning using the appropriate update functions for the Frozen Lake environment.
### Additionally, implement the same using any environment of your choice (refer to the Gymnasium documentation).

#### Importing Necessary Libraries

In [17]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time

#### Frozen Lake

In [18]:
def epsilon_greedy(Q, state, epsilon, n_actions):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    One sample from ``np.random.rand()`` is compared against ``epsilon``.
    If the sample is smaller, a random action index drawn with
    ``np.random.randint(n_actions)`` is returned; otherwise the index of the
    largest entry of ``Q[state]`` is returned (``np.argmax`` picks the first
    index on ties).
    """
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(Q[state])  # Exploit: greedy action
    return np.random.randint(n_actions)  # Explore: random action

def sarsa(env, alpha=0.1, gamma=0.99, epsilon=0.1, episodes=100000):
    """Learn a tabular action-value function with on-policy SARSA.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the table dimensions, whose ``reset()`` returns
            ``(state, info)`` and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next value.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        episodes: Number of training episodes to run.

    Returns:
        Array of shape ``(observation_space.n, action_space.n)`` holding the
        learned Q-values.
    """
    n_actions = env.action_space.n
    Q = np.zeros((env.observation_space.n, n_actions))

    for _ in range(episodes):
        state, _info = env.reset()
        action = epsilon_greedy(Q, state, epsilon, n_actions)
        terminated = truncated = False

        # The episode is over on either signal. Continuing to step after the
        # environment reported truncation (e.g. a step limit) is outside the
        # Gymnasium contract, so both flags must stop the loop.
        while not (terminated or truncated):
            next_state, reward, terminated, truncated, _info = env.step(action)
            next_action = epsilon_greedy(Q, next_state, epsilon, n_actions)
            # Only a true terminal state has zero future value; a truncated
            # episode still bootstraps from the next state-action pair.
            bootstrap = 0.0 if terminated else Q[next_state, next_action]
            # SARSA update rule
            Q[state, action] += alpha * (reward + gamma * bootstrap - Q[state, action])
            state, action = next_state, next_action

    return Q

def q_learning(env, alpha=0.1, gamma=0.99, epsilon=0.1, episodes=100000):
    """Learn a tabular action-value function with off-policy Q-Learning.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the table dimensions, whose ``reset()`` returns
            ``(state, info)`` and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next value.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        episodes: Number of training episodes to run.

    Returns:
        Array of shape ``(observation_space.n, action_space.n)`` holding the
        learned Q-values.
    """
    n_actions = env.action_space.n
    Q = np.zeros((env.observation_space.n, n_actions))

    for _ in range(episodes):
        state, _info = env.reset()
        terminated = truncated = False

        # The episode is over on either signal. Continuing to step after the
        # environment reported truncation (e.g. a step limit) is outside the
        # Gymnasium contract, so both flags must stop the loop.
        while not (terminated or truncated):
            action = epsilon_greedy(Q, state, epsilon, n_actions)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # Only a true terminal state has zero future value; a truncated
            # episode still bootstraps from the best next-state value.
            bootstrap = 0.0 if terminated else np.max(Q[next_state])
            # Q-Learning update rule
            Q[state, action] += alpha * (reward + gamma * bootstrap - Q[state, action])
            state = next_state

    return Q

if __name__ == "__main__":
    # Hyperparameters shared by both FrozenLake training runs.
    frozen_lake_kwargs = dict(alpha=0.1, gamma=0.99, epsilon=0.1, episodes=100000)

    # Non-slippery FrozenLake instance used for the SARSA run.
    env = gym.make("FrozenLake-v1", is_slippery=False)

    print("Training SARSA on FrozenLake...")
    Q_sarsa = sarsa(env, **frozen_lake_kwargs)
    print("SARSA Q-Table:\n", Q_sarsa)

    # Build a new environment instance so Q-Learning does not reuse SARSA's.
    env = gym.make("FrozenLake-v1", is_slippery=False)

    print("\nTraining Q-Learning on FrozenLake...")
    Q_qlearning = q_learning(env, **frozen_lake_kwargs)
    print("Q-Learning Q-Table:\n", Q_qlearning)


Training SARSA on FrozenLake...
SARSA Q-Table:
 [[0.70192616 0.79955915 0.67809948 0.69903331]
 [0.73445844 0.         0.54812374 0.56956583]
 [0.55043742 0.32666845 0.09450037 0.44302905]
 [0.32747536 0.         0.         0.        ]
 [0.70153063 0.81834821 0.         0.72266045]
 [0.         0.         0.         0.        ]
 [0.         0.9201008  0.         0.44493749]
 [0.         0.         0.         0.        ]
 [0.78332527 0.         0.94038028 0.7401598 ]
 [0.82016871 0.9629409  0.81895418 0.        ]
 [0.92935132 0.98966197 0.         0.8635999 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.97451909 0.98478742 0.89674206]
 [0.97136031 0.98851343 1.         0.94985392]
 [0.         0.         0.         0.        ]]

Training Q-Learning on FrozenLake...
Q-Learning Q-Table:
 [[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.5116827  0.67801647]
 [0.80452842 0.         0.         0.        ]

#### Cliff Walking

In [19]:
def epsilon_greedy(Q, state, epsilon, n_actions):
    """Return an epsilon-greedy action index for ``state``.

    A single ``np.random.rand()`` draw decides the branch: when the draw is
    below ``epsilon`` the action comes from ``np.random.randint(n_actions)``;
    otherwise it is ``np.argmax(Q[state])``, i.e. the first index holding the
    row maximum.
    """
    draw = np.random.rand()
    # The random index is only sampled on the exploration branch.
    return np.random.randint(n_actions) if draw < epsilon else np.argmax(Q[state])

def sarsa(env, alpha=0.1, gamma=0.99, epsilon=0.1, episodes=50000, print_interval=1):
    """Learn a tabular action-value function with on-policy SARSA.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the table dimensions, whose ``reset()`` returns
            ``(state, info)`` and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next value.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        episodes: Number of training episodes to run.
        print_interval: Accepted for backward compatibility; this function
            does not read it and prints nothing.

    Returns:
        Array of shape ``(observation_space.n, action_space.n)`` holding the
        learned Q-values.
    """
    n_actions = env.action_space.n
    Q = np.zeros((env.observation_space.n, n_actions))

    for _ in range(episodes):
        state, _info = env.reset()
        action = epsilon_greedy(Q, state, epsilon, n_actions)
        terminated = truncated = False

        # The episode is over on either signal. Continuing to step after the
        # environment reported truncation (e.g. a step limit) is outside the
        # Gymnasium contract, so both flags must stop the loop.
        while not (terminated or truncated):
            next_state, reward, terminated, truncated, _info = env.step(action)
            next_action = epsilon_greedy(Q, next_state, epsilon, n_actions)
            # Only a true terminal state has zero future value; a truncated
            # episode still bootstraps from the next state-action pair.
            bootstrap = 0.0 if terminated else Q[next_state, next_action]
            # SARSA update rule:
            Q[state, action] += alpha * (reward + gamma * bootstrap - Q[state, action])
            state, action = next_state, next_action

    return Q

def q_learning(env, alpha=0.1, gamma=0.99, epsilon=0.1, episodes=50000, print_interval=1):
    """Learn a tabular action-value function with off-policy Q-Learning.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the table dimensions, whose ``reset()`` returns
            ``(state, info)`` and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next value.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        episodes: Number of training episodes to run.
        print_interval: Accepted for backward compatibility; this function
            does not read it and prints nothing.

    Returns:
        Array of shape ``(observation_space.n, action_space.n)`` holding the
        learned Q-values.
    """
    n_actions = env.action_space.n
    Q = np.zeros((env.observation_space.n, n_actions))

    for _ in range(episodes):
        state, _info = env.reset()
        terminated = truncated = False

        # The episode is over on either signal. Continuing to step after the
        # environment reported truncation (e.g. a step limit) is outside the
        # Gymnasium contract, so both flags must stop the loop.
        while not (terminated or truncated):
            action = epsilon_greedy(Q, state, epsilon, n_actions)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # Only a true terminal state has zero future value; a truncated
            # episode still bootstraps from the best next-state value.
            bootstrap = 0.0 if terminated else np.max(Q[next_state])
            # Q-Learning update rule:
            Q[state, action] += alpha * (reward + gamma * bootstrap - Q[state, action])
            state = next_state

    return Q

if __name__ == "__main__":
    # Hyperparameters shared by both CliffWalking training runs.
    cliff_walking_kwargs = dict(
        alpha=0.1, gamma=0.99, epsilon=0.1, episodes=50000, print_interval=1
    )

    # CliffWalking instance used for the SARSA run.
    env = gym.make("CliffWalking-v0")

    print("Training SARSA on CliffWalking...")
    Q_sarsa = sarsa(env, **cliff_walking_kwargs)
    print("Final SARSA Q-Table:\n", Q_sarsa)

    # Build a new environment instance so Q-Learning does not reuse SARSA's.
    env = gym.make("CliffWalking-v0")

    print("\nTraining Q-Learning on CliffWalking...")
    Q_qlearning = q_learning(env, **cliff_walking_kwargs)
    print("Final Q-Learning Q-Table:\n", Q_qlearning)


Training SARSA on CliffWalking...
Final SARSA Q-Table:
 [[ -15.37899782  -14.65329748  -16.68538982  -15.33825961]
 [ -14.37660531  -13.47268809  -15.80218594  -15.55899291]
 [ -13.50303413  -12.53734656  -13.42975245  -14.58210229]
 [ -12.35614642  -11.4540435   -12.30718624  -13.87359431]
 [ -11.42098761  -10.44402317  -11.72310666  -12.62525724]
 [ -10.41764245   -9.61452071  -10.61973609  -11.6653004 ]
 [  -9.50395937   -8.70745762   -9.04253092  -10.64137781]
 [  -8.43115768   -7.56530153   -7.86573275   -9.79052122]
 [  -7.39079848   -6.44801526   -6.66467794   -8.56249009]
 [  -6.38598471   -5.5448607    -6.09238931   -7.53796622]
 [  -5.32392208   -4.56923783   -4.45814472   -6.5490896 ]
 [  -4.32324294   -4.28404454   -3.50231496   -5.7500307 ]
 [ -15.2703113   -14.72309962  -18.13914245  -16.09325941]
 [ -14.69202988  -12.86031509  -32.77461631  -15.77509948]
 [ -13.43992087  -12.17633846  -17.21482228  -15.08268657]
 [ -12.4828454   -11.13094785  -18.90623887  -13.39076366]


------------