In [1]:
import numpy as np
from new import NewWorld
import matplotlib.pyplot as plt
import time

In [2]:
env = NewWorld()

env.state

array([0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625,
       0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625])

In [3]:
n_states = 4*4

In [29]:
# start with a simple policy 

def evaluate_policy(policy,env):
    
    discount_factor = 0.9
    theta = 0.00001
    
    V = np.zeros(n_states)
    
    while True:
        
        delta = 0
        for s in range(n_states):
            
            value = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    value += action_prob * prob * (reward + discount_factor * V[next_state]) # value function 
            
            delta = max(delta, np.abs(value - V[s]))
            V[s] = value
            
        if delta < theta:
            break
    return V
 


In [32]:
policy = {}
for i in range(n_states):
    
    policy[i] = np.ones(4)/4
    

output = evaluate_policy(policy,env)



In [33]:
output

array([9.99994911, 9.99995517, 9.99995876, 9.99996037, 9.99995517,
       9.99996051, 9.99996367, 9.99996509, 9.99995876, 9.99996367,
       9.99996658, 9.99996789, 9.99996037, 9.99996509, 9.99996789,
       9.99996914])

In [82]:
def improve_policy(env,policy_eval):
    
    discount_factor = 0.9

    
    def lookahead(state,value):
        
        A = np.zeros(4)
        
        for a in range(4):
            
            for prob,next_state,reward,done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
                
        return A
    
    # random_policy
    
    policy = np.ones((n_states,4)) / 4
    
    i = 0
    
    while True:
        
        V = policy_eval(policy,env)
        policy_stable = True
        
        for s in range(n_states):
            
            # best action current policy
            
            best_action_current = np.argmax(policy[s])
            
            action_val = lookahead(s,V)
            # best action lookahead
            best_action_look = np.argmax(action_val)
            
            print("state =>",s,"current action =>",best_action_current,"best action =>",best_action_look)
                        
            if best_action_current != best_action_look: # if stable == optimal
                
                policy_stable = False
            policy[s] = np.eye(4)[best_action_look] # update the policy
        
        i+=1
                
        if policy_stable:
            break
        
    return policy, V
                
                

In [83]:
pol,val = improve_policy(env,evaluate_policy)

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 3
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 3
state => 9 current action => 0 best action => 3
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 3
state => 13 current action => 0 best action => 3
state => 14 current action => 0 best action => 3
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 3 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 b

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 b

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 be

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 0 best action => 1
state => 15 current action => 0 best action => 1
state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 0 current action => 1 best action => 0
state => 1 current action => 1 best action => 0
state => 2 current action => 1 best action => 0
state => 3 current action => 1 best action => 0
state => 4 current action => 1 best action => 0
state => 5 current action => 1 best action => 0
state => 6 current action => 1 best action => 0
state => 7 current action => 1 best action => 0
state => 8 current action => 1 best action => 0
state => 9 current action => 1 best action => 0
state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 bes

state => 10 current action => 1 best action => 0
state => 11 current action => 1 best action => 0
state => 12 current action => 1 best action => 0
state => 13 current action => 1 best action => 0
state => 14 current action => 1 best action => 0
state => 15 current action => 1 best action => 0
state => 0 current action => 0 best action => 1
state => 1 current action => 0 best action => 1
state => 2 current action => 0 best action => 1
state => 3 current action => 0 best action => 1
state => 4 current action => 0 best action => 1
state => 5 current action => 0 best action => 1
state => 6 current action => 0 best action => 1
state => 7 current action => 0 best action => 1
state => 8 current action => 0 best action => 1
state => 9 current action => 0 best action => 1
state => 10 current action => 0 best action => 1
state => 11 current action => 0 best action => 1
state => 12 current action => 0 best action => 1
state => 13 current action => 0 best action => 1
state => 14 current action => 

KeyboardInterrupt: 

In [68]:
pol

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [1]:
for i in range(2):
    state = env.reset()
    for i in range(10):
        
        move = np.argmax(pol[state])
        
        observation,reward,done,_ = env.step(move)
        #env.render()
        print(move,state)
        state = observation
        
        

NameError: name 'env' is not defined