In [1]:
import copy

class CliffWalkingEnv:
    def __init__(self, ncol = 12, nrow = 4):
        self.ncol = ncol
        self.nrow = nrow 
        self.P = self.createP()

    def createP(self):
        P = [[[] for j in range(4)] for i in range(self.nrow * self.ncol)]
        # four actions: change[0] for up, change[1] for down, change[2] for left, change[3] for right
        change = [[0,-1],[0,1],[-1,0],[0,1]]
        for i in range(self.nrow):
            for j in range(self.ncol):
                for a in range(4):
                    # if position is at cliff or target, no interation anymore
                    if i == self.nrow -1 and j > 0:
                        P[i * self.ncol + j][a] = [(1,i*self.ncol+j,0,True)]
                        continue

                    next_x = min(self.ncol - 1, max(0, j + change[a][0]))
                    next_y = min(self.nrow - 1, max(0, i + change[a][1]))
                    next_state = next_y * self.ncol + next_x
                    reward = -1 
                    done = False

                    #if next position is at cliff or target
                    if next_y == self.nrow - 1 and next_x > 0:
                        done = True
                        if next_x != self.ncol - 1: #next position at cliff 
                            reward = -100
                    P[i * self.ncol + j][a] = [(1,next_state,reward, done)]

        return P 


# Policy Iteration

In [2]:
class PolicyIteration:
    def __init__(self,env,theta,gamma):
        self.env = env
        self.v = [0] * self.env.ncol * self.env.nrow #intialise value as 0 
        self.pi = [[0.25,0.25,0.25,0.25] for i in range(self.env.ncol * self.env.nrow)]

        self.theta = theta 
        self.gamma = gamma

    def policy_evaluation(self):
        cnt = 1
        while 1:
            max_diff = 0
            new_v = [0] * self.env.ncol * self.env.nrow
            for s in range(self.env.ncol * self.env.nrow):
                qsa_list = []
                for a in range(4):
                    qsa = 0 
                    for res in self.env.P[s][a]:
                        p,next_state,r,done = res
                        qsa += p * (r + self.gamma * self.v[next_state] * (1 - done))
                    qsa_list.append(self.pi[s][a] * qsa)
                new_v[s] = sum(qsa_list)
                max_diff = max(max_diff, abs(new_v[s] - self.v[s]))
            self.v = new_v
            if max_diff < self.theta: break
            cnt += 1
        print("Policy evaluation finished in %d rounds" %cnt)

    def policy_improvement(self):
        for s in range(self.env.nrow * self.env.ncol):
            qsa_list = []
            for a in range(4):
                qsa = 0
                for res in self.env.P[s][a]:
                    p, next_state, r, done = res
                    qsa += p * (r + self.gamma * self.v[next_state] * (1 - done))
                qsa_list.append(qsa)
            maxq = max(qsa_list)
            cntq = qsa_list.count(maxq)
            self.pi[s] = [1 / cntq if q == maxq else 0 for q in qsa_list]
        print('policy improvement finish')
        return self.pi

    def run(self):
        while 1:
            self.policy_evaluation()
            old_pi = copy.deepcopy(self.pi)
            new_pi = self.policy_improvement()
            if old_pi == new_pi:
                break

        

In [3]:
def print_agent(agent, action_meaning, disaster=[], end = []):
    print('state value: ')
    for i in range(agent.env.nrow):
        for j in range(agent.env.ncol):
            print('%6.6s' % ('%.3f' %agent.v[i * agent.env.ncol +j]), end = '  ')
        print()

    print("policy: ")
    for i in range(agent.env.nrow):
        for j in range(agent.env.ncol):
            if (i * agent.env.ncol + j) in disaster:
                print('****', end=' ')
            elif (i * agent.env.ncol + j) in end:
                print('EEEE', end=' ')
            else:
                a = agent.pi[i * agent.env.ncol + j]
                pi_str = ''
                for k in range(len(action_meaning)):
                    pi_str += action_meaning[k] if a[k] > 0 else 'o'
                print(pi_str, end=' ')
        print()


env = CliffWalkingEnv()
action_meaning = ['^', 'v', '<', '>']
theta = 0.001
gamma = 0.9
agent = PolicyIteration(env, theta, gamma)
agent.run()
print_agent(agent, action_meaning, list(range(37, 47)), [47])

Policy evaluation finished in 67 rounds
policy improvement finish
Policy evaluation finished in 13 rounds
policy improvement finish
Policy evaluation finished in 1 rounds
policy improvement finish
state value: 
-9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -2.710  
-9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -1.900  
-9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -9.998  -1.000  
-9.998   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000  
policy: 
^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ovo> 
^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ovo> 
^v<> ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ovo> 
^v<> **** **** **** **** **** **** **** **** **** **** EEEE 


# Value Iteration

In [14]:
class ValueIteration:
    def __init__(self,env,theta,gamma):
        self.env = env
        self.theta = theta
        self.v = [0] * (self.env.ncol * self.env.nrow)
        self.gamma = gamma
        self.pi = [None for i in range(self.env.ncol * self.env.nrow)]

    def value_iteration(self):
        cnt = 0
        while 1:
            max_diff = 0
            new_v = [0] * (self.env.ncol * self.env.nrow)
            for s in range(self.env.ncol * self.env.nrow):
                qsa_list = []
                for a in range(4):
                    qsa = 0
                    for res in self.env.P[s][a]:
                        p,next_state,r,done = res
                        qsa += p * (r + self.gamma * self.v[next_state] * (1 - done))
                    qsa_list.append(qsa)
                new_v[s] = max(qsa_list)
                max_diff = max(max_diff, abs(new_v[s] - self.v[s]))
            self.v = new_v
            if max_diff < self.theta: break
            cnt += 1
        print("Value Iteration Finished in %d rounds" %cnt)
        self.get_policy()

    def get_policy(self):
        for s in range(self.env.nrow * self.env.ncol):
            qsa_list = []
            for a in range(4):
                qsa = 0
                for res in self.env.P[s][a]:
                    p, next_state,r,done = res
                    qsa += p * (r + self.gamma * self.v[next_state] * (1 - done))
                qsa_list.append(qsa)
            maxq = max(qsa_list)
            cntq = qsa_list.count(maxq)
            self.pi[s] = [1/cntq if q==maxq else 0 for q in qsa_list]

env = CliffWalkingEnv()
action_meaning = ['^', 'v', '<', '>']
theta = 0.001
gamma = 0.9
agent = ValueInteration(env,theta,gamma)
agent.value_iteration()
print_agent(agent,action_meaning,list(range(37,47)),[47])

Value Iteration Finished in 66 rounds
state value: 
-9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -2.710  
-9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -1.900  
-9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -9.991  -1.000  
-9.991   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000   0.000  
policy: 
^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ovo> 
^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ^v<> ovo> 
^v<> ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ^o<o ovo> 
^v<> **** **** **** **** **** **** **** **** **** **** EEEE 


In [16]:
import gym
import warnings
warnings.filterwarnings("ignore")
env = gym.make("FrozenLake-v1")
env = env.unwrapped
env.render()

holes = set()
ends = set()

for s in env.P:
    for a in env.P[s]:
        for s_a in env.P[s][a]:
            if s_a[2] == 1.0:
                ends.add(s_a[1])
            if s_a[3] == True:
                holes.add(s_a[1])
holes = holes - ends
print('holes are located at: ', holes)
print('targets are located at: ', ends) 

for a in env.P[14]:
    print (env.P[14][a])

holes are located at:  {11, 12, 5, 7}
targets are located at:  {15}
[(0.3333333333333333, 10, 0.0, False), (0.3333333333333333, 13, 0.0, False), (0.3333333333333333, 14, 0.0, False)]
[(0.3333333333333333, 13, 0.0, False), (0.3333333333333333, 14, 0.0, False), (0.3333333333333333, 15, 1.0, True)]
[(0.3333333333333333, 14, 0.0, False), (0.3333333333333333, 15, 1.0, True), (0.3333333333333333, 10, 0.0, False)]
[(0.3333333333333333, 15, 1.0, True), (0.3333333333333333, 10, 0.0, False), (0.3333333333333333, 13, 0.0, False)]


In [17]:
action_meaning = ['<', 'v', '>', '^']
theta = 1e-5
gamma = 0.9
agent = PolicyIteration(env, theta, gamma)
agent.run()
print_agent(agent, action_meaning, [5, 7, 11, 12], [15])

Policy evaluation finished in 25 rounds
policy improvement finish
Policy evaluation finished in 58 rounds
policy improvement finish
state value: 
 0.069   0.061   0.074   0.056  
 0.092   0.000   0.112   0.000  
 0.145   0.247   0.300   0.000  
 0.000   0.380   0.639   0.000  
policy: 
<ooo ooo^ <ooo ooo^ 
<ooo **** <o>o **** 
ooo^ ovoo <ooo **** 
**** oo>o ovoo EEEE 


In [18]:
action_meaning = ['<', 'v', '>', '^']
theta = 1e-5
gamma = 0.9
agent = ValueIteration(env, theta, gamma)
agent.value_iteration()
print_agent(agent, action_meaning, [5, 7, 11, 12], [15])

Value Iteration Finished in 60 rounds
state value: 
 0.069   0.061   0.074   0.056  
 0.092   0.000   0.112   0.000  
 0.145   0.247   0.300   0.000  
 0.000   0.380   0.639   0.000  
policy: 
<ooo ooo^ <ooo ooo^ 
<ooo **** <o>o **** 
ooo^ ovoo <ooo **** 
**** oo>o ovoo EEEE 
