In [None]:
!pip install gymnasium matplotlib numpy

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/953.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m450.6/953.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m829.4/953.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [None]:
from google.colab import output
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
import numpy as np
import time

class GridEnvironment(gym.Env):
    """4x4 grid world in which an agent walks from (0, 0) to the goal at (3, 3).

    Observations are the flattened 4x4 grid (a length-16 float array) where
    the agent's cell holds 1 and the goal cell holds 0.5. The goal marker is
    written last, so it overwrites the agent marker when both share a cell;
    in either case the index of the largest entry is the agent's cell index.

    Actions: 0 = row + 1, 1 = row - 1, 2 = column + 1, 3 = column - 1.
    Any other action value leaves the position unchanged.

    NOTE: `reset` returns only the observation and `step` returns the 4-tuple
    `(observation, reward, terminated, truncated)`. The Gymnasium `Env` API
    additionally returns an `info` dict from both; the shorter forms are kept
    here so that code unpacking them keeps working.
    NOTE(review): `observation_space` is declared as `Discrete(16)` although
    the observations handed out are length-16 vectors.
    """

    _GRID_SHAPE = (4, 4)
    _MAX_INDEX = 3  # largest valid row/column index on the grid

    def __init__(self):
        """Set up the spaces, the time limit and the initial grid."""
        self.observation_space = spaces.Discrete(16)
        self.action_space = spaces.Discrete(4)
        self.max_timesteps = 50

        self.timestep = 0
        self.agent_pos = [0, 0]
        self.goal_pos = [3, 3]
        self.state = np.zeros(self._GRID_SHAPE)
        self._draw_state()

    def _agent_in_bounds(self):
        """Return True when both agent coordinates lie within [0, 3]."""
        # np.asarray lets this work whether agent_pos is still a list (clip
        # disabled) or already an ndarray (result of np.clip).
        pos = np.asarray(self.agent_pos)
        return bool(np.all(pos >= 0) and np.all(pos <= self._MAX_INDEX))

    def _draw_state(self):
        """Rebuild `self.state` from the current positions and return it flattened."""
        self.state = np.zeros(self._GRID_SHAPE)
        # An out-of-grid agent cannot be drawn: indexing would either raise
        # IndexError or wrap around for negative coordinates.
        if self._agent_in_bounds():
            self.state[tuple(self.agent_pos)] = 1
        self.state[tuple(self.goal_pos)] = 0.5
        return self.state.flatten()

    def reset(self, **kwargs):
        """Move the agent back to (0, 0), zero the step counter and return the observation.

        Keyword arguments are accepted and ignored.
        """
        self.agent_pos = [0, 0]
        self.timestep = 0
        return self._draw_state()

    def step(self, action):
        """Apply `action` and advance the episode by one timestep.

        Returns:
            observation: flattened 4x4 grid after the move.
            reward: 1 when the agent is on the goal cell, otherwise 0.
            terminated: True when the goal is reached or `max_timesteps`
                steps have elapsed (the time limit is reported here, not in
                `truncated`).
            truncated: True only when the agent's position lies outside the
                grid, which cannot happen while the clipping line below is
                active.
        """
        if action == 0:
            self.agent_pos[0] += 1
        elif action == 1:
            self.agent_pos[0] -= 1
        elif action == 2:
            self.agent_pos[1] += 1
            # uncomment for teleport
            # self.agent_pos=[2,2]
        elif action == 3:
            self.agent_pos[1] -= 1

        # Comment this to demonstrate the truncation condition.
        self.agent_pos = np.clip(self.agent_pos, 0, self._MAX_INDEX)

        # Previously this flag was True whenever both coordinates were <= 2,
        # i.e. on almost every ordinary step; it now reports an agent that
        # has left the grid.
        truncated = not self._agent_in_bounds()

        observation = self._draw_state()

        reached_goal = np.array_equal(self.agent_pos, self.goal_pos)
        reward = 1 if reached_goal else 0

        self.timestep += 1
        terminated = bool(self.timestep >= self.max_timesteps or reached_goal)

        return observation, reward, terminated, truncated

    def render(self):
        """Show the grid as an image, pause for a second, clear the cell output and print the grid."""
        plt.imshow(self.state)
        plt.show()
        time.sleep(1)
        output.clear()
        print(self.state)

import numpy as np

class QTableAgent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The environment passed in must expose `observation_space.n`,
    `action_space.n`, `reset()` returning an observation, `step(action)`
    returning a 4-tuple whose third element ends the episode, `render()`,
    and a `timestep` attribute (printed during `play`).
    Observations are mapped to a row of the Q-table with `np.argmax`.
    """

    def __init__(self, env):
        """Create a zero-initialised Q-table sized from the environment's spaces."""
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.q_table = np.zeros((self.observation_space.n, self.action_space.n))
        # hyper parameters tweaking
        self.learning_rate = 0.1
        self.discount_factor = 0.99
        self.epsilon = 0.1

    @staticmethod
    def _state_index(state):
        """Return the Q-table row for an observation (index of its largest entry)."""
        return np.argmax(state)

    def choose_action(self, state):
        """Pick a random action with probability `epsilon`, else the greedy one."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space.n)
        return np.argmax(self.q_table[self._state_index(state), :])

    def train(self, num_episodes):
        """Run `num_episodes` episodes on `self.env`, updating the Q-table after every step."""
        for _episode in range(num_episodes):
            # Use the agent's own environment rather than a module-level
            # global, so the agent works with whichever env it was given.
            state = self.env.reset()
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _truncated = self.env.step(action)

                # Q-value update using the Q-learning equation
                row = self._state_index(state)
                best_next = np.max(self.q_table[self._state_index(next_state), :])
                target = reward + self.discount_factor * best_next
                self.q_table[row, action] = (
                    (1 - self.learning_rate) * self.q_table[row, action]
                    + self.learning_rate * target
                )

                state = next_state

    def play(self, playSteps):
        """Play `playSteps` greedy episodes with rendering and return the summed reward.

        Before every step the environment is rendered and the current
        observation and the environment's timestep are printed.
        """
        total_reward = 0
        for _episode in range(playSteps):
            state = self.env.reset()
            done = False

            while not done:
                self.env.render()
                action = np.argmax(self.q_table[self._state_index(state), :])
                print(state)
                print(self.env.timestep)
                next_state, reward, done, _truncated = self.env.step(action)
                total_reward += reward
                state = next_state

        return total_reward


# Seed NumPy's global RNG: the agent's epsilon-greedy exploration draws from
# it, so without a seed every run trains a different Q-table.
SEED = 0
np.random.seed(SEED)

env = GridEnvironment()
agent = QTableAgent(env)
rewardSum = 0  # only referenced by the commented-out manual loop below

obs = env.reset()
print(obs)
terminated, truncated = False, False  # likewise only used by the manual loop below

# Train for 50 episodes, then play 5 greedy episodes and report the summed reward.
agent.train(50)
print("final reward", agent.play(5))

# while not terminated:
#   action = agent.train(350)
#   obs, reward, terminated, truncated = env.step(action)
#   rewardSum=rewardSum+reward;
#   env.render()
#   time.sleep(1)
#   output.clear()


[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  1.  0.5]]
[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.5]
5
final reward 5
