In [41]:
import math, time, pickle, random
from collections import namedtuple

import numpy as np
import matplotlib
from matplotlib import pyplot as plt

import gym
from gym import spaces

import torch
from torch import nn, optim, autograd
from torch.nn import functional as F

import torchvision
from torchvision import transforms as T

In [42]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
  DEVICE= torch.device('cuda')
else:
  DEVICE= torch.device('cpu')

In [43]:
# All positions are [row, col] cells of the 4x4 grid. They are kept as lists
# because Environment.reset() takes a `.copy()` of each one.

# Where the agent starts and where an episode ends
AGENT_POS= [0, 0]
GOAL_POS= [3, 3]

# Cells carrying a reward (diamond, coin) or a penalty (monster)
DIAMOND_POS= [1, 3]
COIN_POS= [3, 2]
MONSTER_POS= [2, 1]

In [44]:
# Upper bound on the number of steps in one episode
MAX_TIMESTEPS= 200

# One action per direction of movement
N_ACTIONS= 4

# One state per cell of the 4x4 grid
N_STATES= 4 * 4

In [45]:
# Names of the four actions, keyed by action id
ACTION_LOOKUP= {0: 'Down', 1: 'Up', 2: 'Right', 3: 'Left'}

# Maps a (row, col) cell of the 4x4 grid to its state index, numbered
# row by row: (0, 0) -> 0, (0, 1) -> 1, ..., (3, 3) -> 15.
# A comprehension is used so no loop counters are left behind at module level.
STATE_LOOKUP= {
  (row, col): row * 4 + col
  for row in range(4)
  for col in range(4)
}

In [46]:
class Environment(gym.Env):
  '''
  4x4 grid world containing a diamond, a coin, a monster and a goal cell

  Observations are the integer index of the agent's cell (see STATE_LOOKUP);
  actions are the integers 0-3 named in ACTION_LOOKUP. `reset` creates the
  positions and the grid, so it must be called before `step` or `render`.
  '''

  metadata= {'render.modes': []}

  # Side length of the square grid
  _GRID_SIZE= 4

  # Value painted into the grid for the agent's own cell
  _AGENT_MARKER= 1

  # Reward for landing on each special cell; also the value painted into the grid
  _DIAMOND_REWARD= 5
  _COIN_REWARD= 3
  _MONSTER_REWARD= -5
  _GOAL_REWARD= 10

  def __init__(self) -> None:
    '''
    Initializes the number of states, actions, & maximum timesteps of the environment
    '''

    super().__init__()

    self.observation_space= spaces.Discrete(N_STATES)
    self.action_space= spaces.Discrete(N_ACTIONS)
    self.max_timesteps= MAX_TIMESTEPS

  def _special_cells(self) -> list:
    '''
    Pairs the position of every special cell with its reward, in painting order
    '''

    return [
      (self.diamond_pos, self._DIAMOND_REWARD),
      (self.coin_pos, self._COIN_REWARD),
      (self.monster_pos, self._MONSTER_REWARD),
      (self.goal_pos, self._GOAL_REWARD),
    ]

  def _paint_grid(self) -> None:
    '''
    Rebuilds `self.state` (the 4x4 grid shown by `render`) from the current positions
    '''

    grid= np.zeros((self._GRID_SIZE, self._GRID_SIZE))

    # The agent is painted first, so a special cell it stands on keeps its own value
    grid[tuple(self.agent_pos)]= self._AGENT_MARKER
    for pos, value in self._special_cells():
      grid[tuple(pos)]= value

    self.state= grid

  def reset(self) -> int:
    '''
    Resets the environment to its default setup

    Output:
      observation: int - State index (from STATE_LOOKUP) of the agent's default position
    '''

    self.timestep= 0

    # Copies keep the module-level defaults untouched when positions are mutated
    self.agent_pos= AGENT_POS.copy()
    self.diamond_pos= DIAMOND_POS.copy()
    self.coin_pos= COIN_POS.copy()
    self.monster_pos= MONSTER_POS.copy()
    self.goal_pos= GOAL_POS.copy()

    self._paint_grid()

    return STATE_LOOKUP[tuple(self.agent_pos)]

  def step(self, action: int) -> tuple:
    '''
    1. Moves the agent as per the given action
    2. Repaints the grid with the agent and the special cells
    3. Calculates the reward of the new state

    Input:
      action: int - Action to be performed (0: Down, 1: Up, 2: Right, 3: Left);
                    any other value leaves the agent in place but still uses a timestep

    Output:
      observation: int - State index (from STATE_LOOKUP) of the agent's new position
      reward: int - Sum of the rewards of the special cells the agent is standing on
      done: bool - True once the goal is reached or the maximum timesteps are used up
      info: dict - Always empty
    '''

    # Move the agent; index 0 is the row, index 1 is the column
    if action == 0: # Go down one step
      self.agent_pos[0] += 1
    elif action == 1: # Go up one step
      self.agent_pos[0] -= 1
    elif action == 2: # Go right one step
      self.agent_pos[1] += 1
    elif action == 3: # Go left one step
      self.agent_pos[1] -= 1

    # To keep the agent within the confines of the environment.
    # np.clip returns an array, so agent_pos is a NumPy array from the first step onwards.
    self.agent_pos= np.clip(self.agent_pos, 0, self._GRID_SIZE - 1)

    self._paint_grid()

    # Calculates the reward for the action
    reward= 0
    for pos, value in self._special_cells():
      if np.array_equal(self.agent_pos, pos):
        reward += value

    # Timestep increment
    self.timestep += 1

    reached_goal= np.array_equal(self.agent_pos, self.goal_pos)
    done= bool(self.timestep >= self.max_timesteps or reached_goal)
    info= {}

    return (STATE_LOOKUP[tuple(self.agent_pos)], reward, done, info)

  def render(self) -> None:
    '''
    Draws the current grid (`self.state`) with matplotlib's imshow
    '''

    plt.imshow(self.state)