In [0]:
import numpy as np
import sys
import matplotlib.pyplot as plt
#Creates the world for this example
class grid_world():
    """Square grid world with two rewarding cells in the top row.

    The instance stores the grid side length as ``self.size``. Most methods
    also take ``n`` explicitly; only ``get_actions`` reads ``self.size``.
    """

    def __init__(self, n):
        """Remember the side length ``n`` of the grid."""
        self.size = n

    def build_world(self, n):
        """Return an ``n`` x ``n`` array of zeros with 1 at the top-left
        cell and 10 at the top-right cell."""
        world = np.zeros((n, n))
        world[0, 0] = 1
        world[0, n - 1] = 10
        return world

    def get_actions(self, i, j, n):
        """Return the list of ``(di, dj)`` moves that stay inside the grid
        when taken from cell ``(i, j)``.

        Moves that would leave the grid are filtered out here. The bounds
        come from ``self.size``; ``n`` is accepted for signature
        compatibility but is not used.
        NOTE(review): callers are presumably expected to pass
        ``n == self.size`` -- confirm.
        """
        last = self.size - 1
        moves = []
        if i != 0:
            moves.append((-1, 0))
        if i != last:
            moves.append((1, 0))
        if j != 0:
            moves.append((0, -1))
        if j != last:
            moves.append((0, 1))
        return moves

    def get_states(self, n):
        """Return ``np.meshgrid(range(n), range(n))``."""
        return np.meshgrid(range(n), range(n))

    def transition_actions(self, n):
        """Build the ``n`` x ``n`` nested list whose ``[i][j]`` entry is the
        list of moves available in cell ``(i, j)``."""
        # Must go through ``self``: ``get_actions`` is a method, not a
        # module-level function, so a bare call raises NameError.
        return [[self.get_actions(i, j, n) for j in range(n)] for i in range(n)]



In [0]:
def value_iteration(rewards,possible_actions,n,p_action, lbda):
    """Run value iteration on the ``n`` x ``n`` grid until the values settle.

    Parameters
    ----------
    rewards : n x n array-like
        ``rewards[i][j]`` is collected when the agent lands on cell (i, j).
    possible_actions : nested list
        ``possible_actions[i][j]`` is the list of ``(di, dj)`` moves
        available in cell (i, j).
    n : int
        Side length of the grid (expected to be >= 1).
    p_action : float
        Probability that the chosen move is the one actually executed.
    lbda : float
        Discount factor applied to the value of the successor cell.

    Returns
    -------
    (V, bottom_left, bottom_right)
        ``V`` is the n x n value array. ``bottom_left`` / ``bottom_right``
        are lists holding the value of cells (n-1, 0) and (n-1, n-1) after
        every sweep, each starting with an initial 0.

    The top-left and top-right cells are skipped, so their value stays 0.

    NOTE(review): the notebook cell left the backup itself blank. The slip
    model used here (the chosen move happens with probability ``p_action``,
    the remaining probability is shared equally by the other moves available
    in that state) and the reward-on-arrival convention are assumptions --
    confirm against the exercise's pseudo code.
    """
    rewards = np.asarray(rewards, dtype=float)
    V = np.zeros((n, n))
    V_new = np.zeros((n, n))
    theta = .0001  # convergence threshold on the summed absolute change
    terminal = {(0, 0), (0, n - 1)}
    bottom_left = [0]
    bottom_right = [0]

    converged = False
    while not converged:
        for i in range(n):
            for j in range(n):
                if (i, j) in terminal:
                    continue
                actions = possible_actions[i][j]
                if len(actions) == 0:
                    # Nothing to back up from; the cell keeps its value.
                    continue

                # Return obtained if each move were executed for certain.
                # The max/min clamp keeps the successor inside the grid.
                action_rewards = []
                for di, dj in actions:
                    ni = max(min(i + di, n - 1), 0)
                    nj = max(min(j + dj, n - 1), 0)
                    action_rewards.append(rewards[ni, nj] + lbda * V[ni, nj])

                # Expected return of *choosing* each move once slipping to
                # one of the other available moves is taken into account.
                k = len(action_rewards)
                total = sum(action_rewards)
                slip = (1 - p_action) / (k - 1) if k > 1 else 0.0
                stay = p_action if k > 1 else 1.0
                final_action_rewards = [
                    stay * r + slip * (total - r) for r in action_rewards
                ]

                V_new[i, j] = max(final_action_rewards)

        converged = np.sum(abs(V - V_new)) < theta
        V = V_new.copy()
        bottom_left.append(V[n - 1][0])
        bottom_right.append(V[n - 1][n - 1])
    return (V, bottom_left, bottom_right)




In [0]:
def eval_policy(policy,rewards,possible_actions,n,p_action,lbda,num_iterations):
    """Compute the state values of ``policy`` on the ``n`` x ``n`` grid.

    Parameters
    ----------
    policy : nested list
        ``policy[i][j][0]`` is the list of ``(di, dj)`` moves for cell
        (i, j) and ``policy[i][j][1]`` the matching selection probabilities.
    rewards : n x n array-like
        ``rewards[i][j]`` is collected when the agent lands on cell (i, j).
    possible_actions : nested list
        Accepted for signature compatibility; the moves are read from
        ``policy`` instead.
    n : int
        Side length of the grid.
    p_action : float
        Probability that the selected move is the one actually executed.
    lbda : float
        Discount factor applied to the value of the successor cell.
    num_iterations : int
        Negative: sweep until the summed absolute change drops below the
        internal threshold. Zero or positive: perform exactly that many
        sweeps (modified policy evaluation).

    Returns
    -------
    numpy.ndarray
        The n x n value array. The top-left and top-right cells are skipped,
        so their value stays 0.

    NOTE(review): the notebook cell left the backup blank ("fill in from
    pseudo code"). The slip model used here (selected move with probability
    ``p_action``, remainder shared equally by the other moves of that state)
    and the reward-on-arrival convention are assumptions -- confirm.
    The value is the policy-weighted sum of the per-move returns; for a
    one-hot policy with non-negative returns this equals the ``max`` the
    stub took over the weighted terms.
    """
    rewards = np.asarray(rewards, dtype=float)
    theta = .1  # convergence threshold on the summed absolute change
    terminal = {(0, 0), (0, n - 1)}

    def sweep(V):
        """One synchronous backup of every non-terminal cell from ``V``."""
        V_new = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                if (i, j) in terminal:
                    continue
                actions = policy[i][j][0]
                action_probs = policy[i][j][1]
                if len(actions) == 0:
                    continue

                # Return if each move were executed for certain; the
                # max/min clamp keeps the successor inside the grid.
                action_rewards = []
                for di, dj in actions:
                    ni = max(min(i + di, n - 1), 0)
                    nj = max(min(j + dj, n - 1), 0)
                    action_rewards.append(rewards[ni, nj] + lbda * V[ni, nj])

                # Expected return of selecting each move, slip included.
                k = len(action_rewards)
                total = sum(action_rewards)
                slip = (1 - p_action) / (k - 1) if k > 1 else 0.0
                stay = p_action if k > 1 else 1.0
                final_action_rewards = [
                    stay * r + slip * (total - r) for r in action_rewards
                ]

                # Weight by how likely the policy is to select each move.
                V_new[i, j] = sum(
                    prob * value
                    for prob, value in zip(action_probs, final_action_rewards)
                )
        return V_new

    V = np.zeros((n, n))
    if num_iterations < 0:
        converged = False
        while not converged:
            V_new = sweep(V)
            converged = np.sum(abs(V - V_new)) < theta
            V = V_new
    else:
        for _ in range(num_iterations):
            V = sweep(V)
    return V


#Initializes a policy based on a set of actions
def initialize_policy(n,possible_actions):
    """Build a random deterministic policy for the ``n`` x ``n`` grid.

    Returns a nested list where ``policy[i][j][0]`` is
    ``possible_actions[i][j]`` (the same list object, not a copy) and
    ``policy[i][j][1]`` is a float array of the same length that is 1 at one
    index drawn with ``np.random.randint`` and 0 elsewhere.

    A state with no available moves gets an empty probability array instead
    of raising ``ValueError`` from ``randint(0, 0)``.
    """
    policy = [[[[] for _ in range(2)] for _ in range(n)] for _ in range(n)]
    for i in range(n):
        for j in range(n):
            actions = possible_actions[i][j]
            state_policy = np.zeros(len(actions))
            if len(actions) > 0:
                state_policy[np.random.randint(0, len(actions))] = 1
            policy[i][j][0] = actions
            policy[i][j][1] = state_policy
    return policy

def policy_iteration(rewards,possible_actions,n,p_action, lbda):
    """Alternate policy evaluation and greedy improvement until the policy
    stops changing.

    Parameters
    ----------
    rewards : n x n array-like
        ``rewards[i][j]`` is collected when the agent lands on cell (i, j).
    possible_actions : nested list
        ``possible_actions[i][j]`` is the list of ``(di, dj)`` moves
        available in cell (i, j).
    n : int
        Side length of the grid (expected to be >= 1).
    p_action : float
        Probability that the chosen move is the one actually executed.
    lbda : float
        Discount factor applied to the value of the successor cell.

    Returns
    -------
    (policy, bottom_left, bottom_right)
        ``policy`` has the layout produced by ``initialize_policy``.
        ``bottom_left`` / ``bottom_right`` are lists holding the evaluated
        value of cells (n-1, 0) and (n-1, n-1) for every iteration, each
        starting with an initial 0.

    The top-left and top-right cells are skipped and keep their initial
    random policy entry.

    NOTE(review): the notebook cell left the improvement step blank. The
    slip model used here (chosen move with probability ``p_action``,
    remainder shared equally by the other moves of that state) and the
    reward-on-arrival convention are assumptions and must agree with what
    ``eval_policy`` implements -- confirm.
    """
    rewards = np.asarray(rewards, dtype=float)
    policy = initialize_policy(n, possible_actions)
    terminal = {(0, 0), (0, n - 1)}
    bottom_left = [0]
    bottom_right = [0]

    converged = False
    while not converged:
        # Assume stable until some state changes its greedy move.
        converged = True
        # Values of the current policy (initially a random one).
        V = eval_policy(policy, rewards, possible_actions, n, p_action, lbda,
                        num_iterations=-1)
        for i in range(n):
            for j in range(n):
                if (i, j) in terminal:
                    continue
                actions = possible_actions[i][j]
                if len(actions) == 0:
                    continue

                # Return if each move were executed for certain; the
                # max/min clamp keeps the successor inside the grid.
                action_rewards = []
                for di, dj in actions:
                    ni = max(min(i + di, n - 1), 0)
                    nj = max(min(j + dj, n - 1), 0)
                    action_rewards.append(rewards[ni, nj] + lbda * V[ni, nj])

                # Expected return of choosing each move, slip included.
                k = len(action_rewards)
                total = sum(action_rewards)
                slip = (1 - p_action) / (k - 1) if k > 1 else 0.0
                stay = p_action if k > 1 else 1.0
                final_action_rewards = [
                    stay * r + slip * (total - r) for r in action_rewards
                ]

                # Greedy improvement: put all probability on the best move.
                best = int(np.argmax(final_action_rewards))
                if best != np.argmax(policy[i][j][1]):
                    converged = False
                state_policy = np.zeros(len(final_action_rewards))
                state_policy[best] = 1
                policy[i][j][1] = state_policy
        bottom_left.append(V[n - 1][0])
        bottom_right.append(V[n - 1][n - 1])
    return (policy, bottom_left, bottom_right)