<a href="https://colab.research.google.com/github/srikarraju/GridWorld/blob/main/REINFORCE_MDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Size of the randomly generated MDP.
num_states, num_actions = 4, 2

# Number of successor states attached to every (state, action) pair.
branching_factor = 2

# Scale passed to the Gaussian reward draw inside take_action.
std_dev = 0.1

# Not referenced by any visible cell; presumably a "degree of
# non-stationarity" knob -- TODO confirm.
deg_non_stat = 0

In [None]:
import numpy as np
from numpy.random import default_rng

# NumPy Generator used to draw the successor-state table below.
rng = default_rng()

# For every (state, action) pair draw `branching_factor` distinct successor
# states (sampling without replacement from range(num_states)).
# Resulting shape: (num_states, num_actions, branching_factor).
possible_next_states = np.asarray([
  [
    rng.choice(num_states, size=branching_factor, replace=False)
    for _ in range(num_actions)
  ]
  for _ in range(num_states)
])
print(possible_next_states.shape)

(4, 2, 2)


In [None]:
# Random transition distribution for every (state, action) pair.
# The unit interval is cut at `branching_factor - 1` uniformly drawn points;
# the lengths of the resulting segments are the probabilities of the
# `branching_factor` successor slots (they are non-negative and sum to 1).
trans_probs = []

for i in range(num_states):
  state_probs = []
  for j in range(num_actions):
    cuts = np.sort(np.random.uniform(size=branching_factor - 1))
    # Pad with the interval end points so np.diff returns every segment
    # length. This also covers branching_factor == 1 (no cut points), where
    # the single successor gets probability 1.0 instead of an IndexError.
    segment_lengths = np.diff(np.concatenate(([0.0], cuts, [1.0])))
    state_probs.append(segment_lengths)
  trans_probs.append(state_probs)

# Shape: (num_states, num_actions, branching_factor).
trans_probs = np.asarray(trans_probs)
print(trans_probs.shape)

(4, 2, 2)


In [None]:
# Mean reward attached to every (state, action, successor-slot) triple, each
# drawn independently from a standard normal distribution.
# Resulting shape: (num_states, num_actions, branching_factor).
rewards_mean = np.asarray([
  [
    [np.random.normal(0, 1) for _ in range(branching_factor)]
    for _ in range(num_actions)
  ]
  for _ in range(num_states)
])
print(rewards_mean.shape)

(4, 2, 2)


In [None]:
def take_action(state,action):
  """Sample one transition of the random MDP.

  Reads the module-level tables `possible_next_states`, `trans_probs` and
  `rewards_mean` plus the scalars `branching_factor` and `std_dev`.

  Args:
    state: index of the current state.
    action: index of the action taken in `state`.

  Returns:
    (next_state, reward): the sampled successor state and a reward drawn
    from a normal distribution centred on the mean reward of the sampled
    transition with standard deviation `std_dev`.
  """
  next_possible_states = possible_next_states[state][action]
  next_state_probs = trans_probs[state][action]

  # Pick which successor slot is taken according to the transition
  # probabilities of this (state, action) pair.
  next_state_index = np.random.choice(a = np.arange(branching_factor), p = next_state_probs)
  next_state = next_possible_states[next_state_index]
  reward_mean = rewards_mean[state][action][next_state_index]

  actual_reward = np.random.normal(reward_mean,std_dev)

  # Return the noisy reward: previously the sampled value was discarded and
  # the noise-free mean was returned, which made `std_dev` a no-op.
  return next_state, actual_reward

# Smoke test: sample a single transition from state 2 under action 1.
print(take_action(state=2, action=1))


(0, 0.36335650872061775)


In [None]:
import numpy as np
from numpy.random import default_rng

rng = default_rng()


def generate_state_features(num_states,d,l):
  """Generate `num_states` distinct binary feature vectors.

  Each vector has length `d` with exactly `l` entries set to 1; the positions
  are drawn with the module-level generator `rng`.

  Args:
    num_states: number of distinct feature vectors to produce.
    d: length of each feature vector.
    l: number of ones in each feature vector.

  Returns:
    A list of `num_states` NumPy arrays of shape (d,), pairwise different.

  Raises:
    ValueError: if more vectors are requested than there are distinct
      ways to choose `l` positions out of `d`.
  """
  from math import comb  # local import keeps this cell self-contained

  # Fail fast instead of looping forever when the request cannot be met.
  if num_states > comb(d, l):
    raise ValueError(
        "cannot generate {} distinct vectors with d={} and l={}".format(
            num_states, d, l))

  state_features = []
  seen_index_sets = set()
  while len(state_features) < num_states:
    one_indices = rng.choice(d, size = l, replace=False)
    # Key uniqueness on the index set itself. The product of the indices is
    # not unique (every set containing index 0 has product 0), so using it
    # rejected valid vectors and could make this loop never terminate.
    key = frozenset(one_indices.tolist())
    if key in seen_index_sets:
      continue
    seen_index_sets.add(key)
    feature_vec = np.zeros(d)
    feature_vec[one_indices] = 1
    state_features.append(feature_vec)
  return state_features

# Show one sample of state features: length-4 vectors with two ones each.
print(np.asarray(generate_state_features(num_states, d=4, l=2)))

[[0. 0. 1. 1.]
 [1. 0. 0. 1.]
 [0. 1. 0. 1.]
 [0. 1. 1. 0.]]


In [None]:
def generate_state_action_features(state_vec, d, num_actions):
  """Build per-action feature vectors from a single state vector.

  Row `a` of the result is all zeros except for columns
  `a*d .. (a+1)*d - 1`, which hold the first `d` entries of `state_vec`.

  Args:
    state_vec: indexable state feature vector with at least `d` entries.
    d: number of state features copied into each row.
    num_actions: number of actions (rows of the result).

  Returns:
    A float NumPy array of shape (num_actions, d * num_actions).
  """
  features = np.zeros((num_actions, d * num_actions))
  for a in range(num_actions):
    start = a * d
    # Copy the state features into the column block owned by action `a`.
    features[a, start:start + d] = [state_vec[k] for k in range(d)]
  return features

# Example: an 8-dimensional state vector expanded for 4 actions.
print(generate_state_action_features(state_vec=[1,0,0,1,0,0,1,1], d=8, num_actions=4))

[[1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 1. 0. 0. 1. 1.]]


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import gym
from collections import deque
import numpy as np

In [None]:
class policy_net(nn.Module):
  """Two-layer softmax policy: Linear -> ReLU -> Linear -> softmax.

  Args:
    state_dim: size of the input state feature vector.
    hidden_dim: width of the hidden layer.
    action_dim: number of actions (size of the output distribution).
  """

  def __init__(self,state_dim,hidden_dim,action_dim):
    super().__init__()
    self.h = nn.Linear(state_dim,hidden_dim)
    self.out = nn.Linear(hidden_dim,action_dim)

  def forward(self,x):
    """Return action probabilities for `x`.

    `x` may be a batch of shape (batch, state_dim) or a single unbatched
    vector of shape (state_dim,); probabilities sum to 1 over the last axis.
    """
    x = F.relu(self.h(x))
    # Normalise over the last (action) axis. For 2-D batches this equals the
    # former dim=1, and it also works for a single 1-D input.
    x = F.softmax(self.out(x),dim=-1)
    return x

In [None]:
# State feature vectors have length d with l ones each.
d = 4
l = 2

# Softmax policy over the actions with one hidden layer of 20 units.
policy = policy_net(state_dim=d, hidden_dim=20, action_dim=num_actions)
optimizer = torch.optim.Adam(policy.parameters())

# Discount factor used for the return-to-go in the REINFORCE loop.
gamma = 0.99
# Sliding window holding the returns of the most recent 100 episodes.
returns = deque(maxlen=100)

In [None]:
# REINFORCE (Monte-Carlo policy gradient) on the random MDP.
# Relies on `policy`, `optimizer`, `gamma`, `returns`, `d` and `l` from the
# setup cell and on `take_action` / `generate_state_features` defined earlier.
losses = []             # policy loss of every episode
reinforce_returns = []  # running mean of `returns` after every episode

# Use the same feature dimensions the policy network was built with rather
# than repeating the literals.
state_features = generate_state_features(num_states,d=d,l=l)

for n_episode in range(1, 5001):
  rewards,states,actions = [],[],[]
  state = 0  # every episode starts in state 0

  # NOTE(review): the original condition `episode_len <= 50`, with the counter
  # incremented at the top of the body, runs 51 steps per episode. That length
  # is preserved here; confirm whether 50 was intended.
  for episode_len in range(1, 52):
    probs = policy(torch.tensor(state_features[state]).unsqueeze(0).float())
    sampler = Categorical(probs)
    # Convert to a plain int before indexing the NumPy MDP tables.
    action = sampler.sample().item()
    new_state, reward = take_action(state,action)

    states.append(state_features[state])
    actions.append(action)
    rewards.append(reward)

    state = new_state

  rewards = np.array(rewards)

  # Discounted return-to-go G_t = r_t + gamma * G_{t+1}, accumulated backwards
  # in a single O(n) pass.
  returns_to_go = np.zeros(len(rewards))
  running = 0.0
  for step in reversed(range(len(rewards))):
    running = rewards[step] + gamma * running
    returns_to_go[step] = running
  R = torch.tensor(returns_to_go).float()

  # Stack into one ndarray first: building a tensor from a list of ndarrays
  # is very slow.
  states = torch.tensor(np.array(states)).float()
  actions = torch.tensor(actions)

  probs = policy(states)
  sampler = Categorical(probs)
  # Negative log-likelihood of the actions actually taken.
  logprobs = -sampler.log_prob(actions)

  # Minimising sum(-log pi(a_t|s_t) * G_t) ascends the policy-gradient objective.
  policy_net_loss = torch.sum(logprobs*R)

  optimizer.zero_grad()
  policy_net_loss.backward()
  optimizer.step()

  losses.append(policy_net_loss.item())
  returns.append(np.sum(rewards))  # undiscounted episode return
  if n_episode%100==0:
    print("Episode: {:6d}\tAvg. Return: {:6.2f}".format(n_episode, np.mean(returns)))
  reinforce_returns.append(np.mean(returns))


Episode:    100	Avg. Return:   2.84
Episode:    200	Avg. Return:  11.67
Episode:    300	Avg. Return:  17.31
Episode:    400	Avg. Return:  18.91
Episode:    500	Avg. Return:  20.11
Episode:    600	Avg. Return:  21.08
Episode:    700	Avg. Return:  20.71
Episode:    800	Avg. Return:  21.20
Episode:    900	Avg. Return:  22.41
Episode:   1000	Avg. Return:  22.83
Episode:   1100	Avg. Return:  21.38
Episode:   1200	Avg. Return:  20.92
Episode:   1300	Avg. Return:  22.07
Episode:   1400	Avg. Return:  21.71
Episode:   1500	Avg. Return:  23.02
Episode:   1600	Avg. Return:  22.38
Episode:   1700	Avg. Return:  23.24
Episode:   1800	Avg. Return:  22.45
Episode:   1900	Avg. Return:  22.71
Episode:   2000	Avg. Return:  21.78
Episode:   2100	Avg. Return:  22.06
Episode:   2200	Avg. Return:  22.45
Episode:   2300	Avg. Return:  22.38
Episode:   2400	Avg. Return:  23.00
Episode:   2500	Avg. Return:  21.89
Episode:   2600	Avg. Return:  22.39
Episode:   2700	Avg. Return:  23.49
Episode:   2800	Avg. Return:

In [None]:
# Start a fresh sliding window over the last 100 episode returns.
returns = deque(maxlen=100)

# State feature vectors have length d with l ones each.
d = 4
l = 2

# Linear value weights (one per state feature) and linear policy weights
# (one per state-action feature), both initialised to zero.
weights_v = np.zeros(d)
weights_p = np.zeros(d * num_actions)

print(weights_v, weights_p, sep="\n")

[0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# Actor-critic style training loop with linear function approximation.
# weights_v (state-value weights) and weights_p (policy weights) are not
# created here; they are expected to exist already (the preceding cell
# initialises them to zero).
# NOTE(review): the weights_w recursion below looks like a natural
# actor-critic variant -- confirm against the intended algorithm.

# Auxiliary weight vector: updated from the TD error each step and then added,
# scaled by beta, to the policy weights.
weights_w = np.zeros(d*num_actions,dtype=float)

# alpha_0 / beta_0: initial step sizes; gamma: smoothing factor of the running
# reward average; epsilon: softmax temperature.
# NOTE: this rebinds the global `gamma` (0.99 in the REINFORCE setup cell).
alpha_0, beta_0, gamma, epsilon = 0.1, 0.01, 0.95, 0.1
# Constants controlling how fast alpha and beta decay with the step count t.
alpha_c, beta_c = 1000, 100000
# Global step counter across all episodes; drives the step-size schedules.
t = 0
n_episode = 1
# Every action taken, across all episodes (only appended to in this cell).
actions_list = []
# Running average of rewards; written below but never read in this cell.
avg_reward = 0

# NOTE(review): dimensions are hard-coded to 4 and 2 here instead of using the
# d / l variables -- keep in sync with weights_v / weights_p.
state_features = generate_state_features(num_states,d=4,l=2)

while n_episode <=5000:
  rewards,states,actions = [],[],[]
  # Every episode starts in state 0.
  state = 0

  episode_len = 0
  # NOTE(review): the counter is incremented at the top of the body, so this
  # condition yields 51 steps per episode -- confirm whether 50 was intended.
  while episode_len<=50:
    episode_len += 1
    state_action_features = generate_state_action_features(state_features[state],d,num_actions)
    t += 1
    # One linear preference score per action.
    probs = np.dot(weights_p,np.transpose(state_action_features))

    # Softmax with temperature epsilon; the max is subtracted and the exponent
    # clipped to keep np.exp in range.
    probs -= probs.max()
    probs = np.exp(np.clip(probs/epsilon, -500, 500))
    #probs = np.exp(probs)
    probs /= probs.sum()

    # Sample an action: first index whose cumulative probability reaches a
    # uniform random draw.
    probs2 = probs.cumsum()
    action = np.where(probs2 >= np.random.random())[0][0]
    #print(action)



    new_state, reward = take_action(state,action)


    # Linear state-value estimates for the current and the next state.
    value_curr = np.dot(weights_v,np.asarray(state_features[state]))
    value_next = np.dot(weights_v,np.asarray(state_features[new_state]))

    # Exponential average that weights the newest reward by gamma; the result
    # is unused because the td_error variant that subtracts it is commented out.
    avg_reward = (1 - gamma)*avg_reward + gamma * reward

    #td_error = reward + value_curr - value_next - avg_reward

    # NOTE(review): the usual TD error has the form r + V(s') - V(s); here the
    # two value terms carry the opposite sign -- confirm this is intended.
    td_error = reward + value_curr - value_next

    # Step sizes decay with the global step count t.
    beta = (beta_0 * beta_c) / (beta_c + t)
    alpha = (alpha_0 * alpha_c) / (alpha_c + t**(2/3))

    # Value-weight update along the current state's features (extra 0.01 factor).
    weights_v += 0.01*alpha * td_error * np.asarray(state_features[state])

    # (1 - pi(a)) * phi(s, a) for the chosen action.
    # NOTE(review): this looks like only the chosen action's block of the
    # softmax log-gradient and omits the 1/epsilon temperature factor --
    # confirm it is the intended approximation.
    grad_prob = (1 - probs[action]) * np.asarray(state_action_features[action])

    # w <- (I - alpha * g g^T) w + alpha * td_error * g, then move the policy
    # weights along w with step size beta.
    weights_w = np.dot((np.eye(d*num_actions) - alpha * np.outer(grad_prob,grad_prob) ),weights_w) + alpha * td_error * grad_prob
    weights_p += beta * weights_w


    # Per-episode trajectory bookkeeping (states / actions are not read later
    # in this cell).
    states.append(state)
    actions.append(action)
    rewards.append(reward)
    actions_list.append(action)

    state = new_state

  # Undiscounted episode return goes into the sliding window `returns`.
  returns.append(np.sum(rewards))
  #print(np.sum(rewards))
  if n_episode%100==0:
    print("Episode: {:6d}\tAvg. Return: {:6.2f}".format(n_episode, np.mean(returns)))
  n_episode += 1

Episode:    100	Avg. Return:  23.37
Episode:    200	Avg. Return:  22.80
Episode:    300	Avg. Return:  23.28
Episode:    400	Avg. Return:  23.12
Episode:    500	Avg. Return:  23.38
Episode:    600	Avg. Return:  23.58
Episode:    700	Avg. Return:  22.71
Episode:    800	Avg. Return:  23.56
Episode:    900	Avg. Return:  22.88
Episode:   1000	Avg. Return:  23.46
Episode:   1100	Avg. Return:  22.82
Episode:   1200	Avg. Return:  23.52
Episode:   1300	Avg. Return:  23.41
Episode:   1400	Avg. Return:  23.29
Episode:   1500	Avg. Return:  23.64
Episode:   1600	Avg. Return:  23.82
Episode:   1700	Avg. Return:  23.91
Episode:   1800	Avg. Return:  24.27
Episode:   1900	Avg. Return:  23.45
Episode:   2000	Avg. Return:  24.01
Episode:   2100	Avg. Return:  23.26
Episode:   2200	Avg. Return:  24.02
Episode:   2300	Avg. Return:  23.15
Episode:   2400	Avg. Return:  23.02
Episode:   2500	Avg. Return:  23.82
Episode:   2600	Avg. Return:  23.52
Episode:   2700	Avg. Return:  21.67
Episode:   2800	Avg. Return: