### TASKS 1. Implement the Q-learning algorithm using an OpenAI Gym environment.

## TASK-1 

In [None]:
import numpy as np
import random
import gym


def q_learning(env, num_episodes=10000, learning_rate=0.8, discount_factor=0.95, exploration_prob=0.2):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Both Gym calling conventions are accepted: ``reset()`` may return either
    the observation or an ``(observation, info)`` tuple, and ``step()`` may
    return either ``(obs, reward, done, info)`` or
    ``(obs, reward, terminated, truncated, info)``.

    Args:
        env: Environment whose ``observation_space`` and ``action_space``
            expose ``n``, and whose ``action_space`` provides ``sample()``.
        num_episodes: Number of training episodes to run.
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the best next-state value.
        exploration_prob: Probability of taking a random action.

    Returns:
        A NumPy array of shape ``(n_states, n_actions)`` holding the learned
        action values.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info); older ones return
        # the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        while not done:
            # Epsilon-greedy: explore with probability exploration_prob,
            # otherwise exploit the current estimates.
            if random.uniform(0, 1) < exploration_prob:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            # Move the estimate toward reward + discounted best next value.
            td_target = reward + discount_factor * np.max(q_table[next_state, :])
            q_table[state, action] = q_table[state, action] + learning_rate * (
                td_target - q_table[state, action])

            state = next_state

    return q_table

if __name__ == "__main__":
    # Train on FrozenLake, then measure how often the greedy policy
    # reaches the goal tile.
    env = gym.make('FrozenLake-v1')
    num_episodes = 10000
    q_table = q_learning(env, num_episodes=num_episodes)

    num_test_episodes = 100
    num_successes = 0
    # State index that this script counts as reaching the goal.
    goal_state = 15

    for _ in range(num_test_episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info); older ones return
        # the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        while not done:
            action = np.argmax(q_table[state, :])
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, _, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, _, done, _ = step_result
            state = next_state

            if done and state == goal_state:
                num_successes += 1

    success_rate = num_successes / num_test_episodes
    print(f"Success rate over {num_test_episodes} episodes: {success_rate}")


#####  TASK-2  The Smartcab's job is to pick up the passenger at one location and drop them off at another. The agent should receive a high positive reward for a successful drop-off because this behavior is highly desired. The agent should be penalized if it tries to drop off a passenger in a wrong location. The agent should get a slight negative reward for every time-step in which it has not yet reached the destination. The passenger can be in one of four possible locations: R, G, Y, B, which are represented in (row, column) coordinates as (0,0), (0,4), (4,0), (4,3) respectively. Additionally, we need to consider a fifth state where the passenger is already inside the taxi. Therefore, the number of possible states for the passenger's location is 5. The destination can be one of the four possible locations: R, G, Y, B, which are also represented in (row, column) coordinates. Therefore, the number of possible states for the destination is 4. We have six possible actions:
1. south
2. north
3. east
4. west
5. pickup
6. dropoff
#### Implement the above problem using the Gym environment called Taxi-v2 (the code below uses its current version, Taxi-v3).

In [None]:
import gym

# Module-level Taxi-v3 environment; the training call and the evaluation
# loop in the __main__ section below both use this instance.
env = gym.make('Taxi-v3')


def get_reward(state, next_state, done):
    """Return the custom training reward for one Taxi transition.

    Taxi observations are a single integer encoded as
    ``((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination``.
    A successful drop-off ends the episode with the passenger location equal
    to the destination index, so success is detected by decoding those two
    fields rather than by comparing against a fixed state number.

    Args:
        state: Encoded observation before the action.
        next_state: Encoded observation after the action.
        done: Whether the episode ended on this transition.

    Returns:
        100 when the episode ended with the passenger delivered, -10 when it
        ended any other way, -1 when the action left the state unchanged, and
        0 otherwise.
    """
    destination = next_state % 4
    passenger_location = (next_state // 4) % 5
    delivered = done and passenger_location == destination

    if delivered:
        return 100
    if done:
        # Episode ended without a delivery.
        return -10
    if state == next_state:
        # The action had no effect on the encoded state.
        return -1
    return 0


def q_learning(env, num_episodes=10000, learning_rate=0.8, discount_factor=0.95, exploration_prob=0.2):
    """Train a tabular Q-learning agent using the custom ``get_reward`` signal.

    The reward returned by ``env.step`` is ignored; each transition is scored
    by ``get_reward(state, next_state, done)`` instead. Both Gym calling
    conventions are accepted: ``reset()`` may return either the observation
    or an ``(observation, info)`` tuple, and ``step()`` may return either a
    4-tuple or a 5-tuple.

    Args:
        env: Environment whose ``observation_space`` and ``action_space``
            expose ``n``, and whose ``action_space`` provides ``sample()``.
        num_episodes: Number of training episodes to run.
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the best next-state value.
        exploration_prob: Probability of taking a random action.

    Returns:
        A NumPy array of shape ``(n_states, n_actions)`` holding the learned
        action values.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info); older ones return
        # the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        while not done:
            # Epsilon-greedy action selection.
            if random.uniform(0, 1) < exploration_prob:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, _, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, _, done, _ = step_result

            reward = get_reward(state, next_state, done)

            # Move the estimate toward reward + discounted best next value.
            td_target = reward + discount_factor * np.max(q_table[next_state, :])
            q_table[state, action] = q_table[state, action] + learning_rate * (
                td_target - q_table[state, action])

            state = next_state

    return q_table

if __name__ == "__main__":
    import numpy as np
    import random

    # Train on the module-level Taxi environment, then measure how often the
    # greedy policy delivers the passenger.
    num_episodes = 10000
    q_table = q_learning(env, num_episodes=num_episodes)

    num_test_episodes = 100
    num_successes = 0

    for _ in range(num_test_episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info); older ones return
        # the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        while not done:
            action = np.argmax(q_table[state, :])
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, _, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, _, done, _ = step_result
            state = next_state

            if done:
                # Taxi encodes a state as
                # ((row * 5 + col) * 5 + passenger_location) * 4 + destination;
                # after a successful drop-off the passenger location equals
                # the destination index.
                destination = state % 4
                passenger_location = (state // 4) % 5
                if passenger_location == destination:
                    num_successes += 1

    success_rate = num_successes / num_test_episodes
    print(f"Success rate over {num_test_episodes} episodes: {success_rate}")
