<a href="https://colab.research.google.com/github/srikarraju/GridWorld/blob/main/Reinforce_Gridworld.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
class Gridworld:
  """Deterministic square grid world whose goal is the bottom-right cell.

  States are flat integers in row-major order, ``state = row*grid_dim + col``.
  Actions are 0 = left, 1 = right, 2 = up, 3 = down.  Every transition has
  reward -1 except the one that enters the goal cell, which has reward 0 and
  sets the done flag to 1.  A move that would leave the grid keeps the agent
  where it is (reward -1, done 0).
  """

  def __init__(self,grid_dim=4):
    self.grid_dim = grid_dim
    # grid_dim**2 - 1; presumably the number of non-goal cells (not used
    # anywhere inside this class).
    self.number_states = self.grid_dim*self.grid_dim - 1
    self.curr_state = 0

  def reset(self):
    """Set the tracked state back to cell 0 (top-left) and return it."""
    self.curr_state = 0
    return self.curr_state

  def step(self,state,action):
    """Apply ``action`` in ``state`` and return ``(next_state, reward, done)``.

    The transition is computed purely from the ``state`` argument;
    ``self.curr_state`` is neither read nor updated here.

    Args:
      state: flat row-major index of the current cell.
      action: 0 (left), 1 (right), 2 (up) or 3 (down).

    Returns:
      A tuple ``(next_state, reward, done)``.  For an unknown action the
      method prints "Invalid Action" and returns ``(None, 0, 1)``.
    """
    # Translate the action id into a (row, column) offset.
    if action == 0:#Left
      d_row, d_col = 0, -1
    elif action ==1:#Right
      d_row, d_col = 0, 1
    elif action ==2:#Up
      d_row, d_col = -1, 0
    elif action ==3:#Down
      d_row, d_col = 1, 0
    else:
      print("Invalid Action")
      return None,0,1

    size = self.grid_dim
    new_row = state//size + d_row
    new_col = state%size + d_col

    # Bumping into a wall: stay put but still pay the step cost.
    if not (0 <= new_row < size and 0 <= new_col < size):
      return state,-1,0

    # Flatten with the real grid width (not a fixed 4) so that any
    # grid_dim produces the correct index.
    next_state = new_row*size + new_col
    if new_row == size-1 and new_col == size-1:
      return next_state,0,1
    return next_state,-1,0

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import gym
from collections import deque

In [None]:
class policy_net(nn.Module):
  """Single-hidden-layer network that outputs action probabilities.

  Args:
    state_dim: number of input features per state.
    hidden_dim: width of the hidden layer.
    action_dim: number of actions, i.e. size of the output distribution.
  """

  def __init__(self,state_dim,hidden_dim,action_dim):
    super().__init__()
    self.h = nn.Linear(state_dim,hidden_dim)
    self.out = nn.Linear(hidden_dim,action_dim)

  def forward(self,x):
    """Return action probabilities for ``x``.

    ``x`` has ``state_dim`` features on its last axis; the result has
    ``action_dim`` probabilities on its last axis, summing to 1.
    """
    hidden = F.relu(self.h(x))
    # Normalise over the last axis (the action logits).  For 2-D batched
    # input this equals dim=1, and it also accepts one unbatched state.
    return F.softmax(self.out(hidden),dim=-1)

In [None]:
# Environment and agent configuration.
grid_dim = 4
num_actions = 4  # left, right, up, down

env = Gridworld(grid_dim)

# The policy is fed the state as a (row, col) pair, hence two input features.
state_dim = 2
hidden_dim = 16
policy = policy_net(state_dim,hidden_dim,num_actions)
optimizer = torch.optim.Adam(policy.parameters())

# Rolling window holding at most the 100 most recent values appended to it.
returns = deque(maxlen=100)
gamma = 0.99  # discount factor

In [None]:
# REINFORCE training loop: sample one episode with the current policy, then
# take one gradient step on sum_t( -log pi(a_t | s_t) * G_t ).
n_episode = 1
max_episodes = 10000
max_steps = 100  # an episode is cut off after this many steps
losses = []
reinforce_returns = []
#reinforce_baseline_returns = []
while n_episode <= max_episodes:
  rewards,states,actions = [],[],[]
  state = env.reset()

  # --- Roll out one episode -------------------------------------------
  # Probabilities are recomputed in a single batch below, so no autograd
  # graph is needed while sampling.
  with torch.no_grad():
    for _ in range(max_steps):
      state_vec = np.asarray((state//grid_dim,state%grid_dim))
      probs = policy(torch.tensor(state_vec).unsqueeze(0).float())
      sampler = Categorical(probs)
      action = sampler.sample()
      new_state, reward, done = env.step(state,action.item())

      states.append(state_vec)
      actions.append(action.item())
      rewards.append(reward)

      state = new_state
      if done:
        break

  # --- Discounted reward-to-go ----------------------------------------
  # G_t = r_t + gamma * G_{t+1}, filled from the last step backwards so the
  # whole vector costs O(T) rather than O(T^2).
  rewards = np.array(rewards)
  reward_to_go = np.zeros(len(rewards))
  running = 0.0
  for t in range(len(rewards)-1,-1,-1):
    running = rewards[t] + gamma*running
    reward_to_go[t] = running
  R = torch.tensor(reward_to_go).float()

  # Stack into one ndarray first; building a tensor from a list of
  # ndarrays goes through a slow element-wise path.
  states = torch.tensor(np.array(states)).float()
  actions = torch.tensor(actions)
  #state_values = torch.flatten(value_fn(states).float())

  # --- Policy-gradient step -------------------------------------------
  probs = policy(states)
  sampler = Categorical(probs)
  # Note the sign: this holds NEGATIVE log-probabilities, so minimising
  # the loss below performs gradient ascent on the expected return.
  logprobs = -sampler.log_prob(actions)

  policy_net_loss = torch.sum(logprobs*R)
  #policy_net_loss = torch.sum(logprobs*(R-state_values))

  optimizer.zero_grad()
  policy_net_loss.backward()
  optimizer.step()

  # --- Bookkeeping ----------------------------------------------------
  losses.append(policy_net_loss.item())
  returns.append(np.sum(rewards))  # undiscounted episode return
  if n_episode%100==0:
    print("Episode: {:6d}\tAvg. Return: {:6.2f}".format(n_episode, np.mean(returns)))
  reinforce_returns.append(np.mean(returns))
  n_episode += 1

Episode:    100	Avg. Return: -19.90
Episode:    200	Avg. Return: -14.68
Episode:    300	Avg. Return: -11.34
Episode:    400	Avg. Return: -10.14
Episode:    500	Avg. Return:  -9.73
Episode:    600	Avg. Return:  -8.55
Episode:    700	Avg. Return:  -7.90
Episode:    800	Avg. Return:  -7.78
Episode:    900	Avg. Return:  -7.48
Episode:   1000	Avg. Return:  -7.35
Episode:   1100	Avg. Return:  -6.84
Episode:   1200	Avg. Return:  -7.07
Episode:   1300	Avg. Return:  -6.87
Episode:   1400	Avg. Return:  -6.83
Episode:   1500	Avg. Return:  -6.64
Episode:   1600	Avg. Return:  -6.26
Episode:   1700	Avg. Return:  -6.45
Episode:   1800	Avg. Return:  -6.31
Episode:   1900	Avg. Return:  -6.13
Episode:   2000	Avg. Return:  -6.24
Episode:   2100	Avg. Return:  -6.22
Episode:   2200	Avg. Return:  -5.80
Episode:   2300	Avg. Return:  -5.97
Episode:   2400	Avg. Return:  -6.07
Episode:   2500	Avg. Return:  -5.75
Episode:   2600	Avg. Return:  -5.63
Episode:   2700	Avg. Return:  -5.58
Episode:   2800	Avg. Return:

In [None]:
# Dump the current `states` and `actions` objects, one per line.
print(states, actions, sep="\n")

In [None]:
# Sample a single episode from the policy and trace every step.
state = env.reset()
rewards,states,actions = [],[],[]
max_eval_steps = 100  # same 100-step horizon the training loop uses
# Inference only: nothing is back-propagated from this rollout, so skip
# building the autograd graph.
with torch.no_grad():
  for _ in range(max_eval_steps):
    print("State: ",state)
    state_vec = np.asarray((state//grid_dim,state%grid_dim))
    probs = policy(torch.tensor(state_vec).unsqueeze(0).float())
    sampler = Categorical(probs)
    action = sampler.sample()
    print("Action: ",action.item())
    new_state, reward, done = env.step(state,action.item())

    states.append(state_vec)
    actions.append(action)
    rewards.append(reward)

    state = new_state
    if done:
      break
  else:
    # Only reached when the loop ran out of steps without `done` being set;
    # guards against an endless rollout under a poor policy.
    print("Step limit reached before the goal")
print(states)
print(actions)
print(rewards)

State:  0
Action:  1
State:  1
Action:  1
State:  2
Action:  3
State:  6
Action:  3
State:  10
Action:  1
State:  11
Action:  3
[array([0, 0]), array([0, 1]), array([0, 2]), array([1, 2]), array([2, 2]), array([2, 3])]
[tensor([1]), tensor([1]), tensor([3]), tensor([3]), tensor([1]), tensor([3])]
[-1, -1, -1, -1, -1, 0]


In [None]:
# Show the value `state` currently holds. After the rollout cell this is the
# last state returned by env.step; on the 4x4 grid the goal cell is index 15.
print(state)

15
