In [1]:
''' environment'''
import gym 

'''storage '''
import collections

''' writer '''
from tensorboardX import SummaryWriter 

In [2]:
''' agent  '''
GAMMA = 0.9


class Agent:

    ############ train and accumulate matrices ############################
    def __init__(self):
        self.env = gym.make('FrozenLake-v0')
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)

    def play_n_random(self, count):
        ''' plays count no of random steps in environment for initial data preparation'''
        for _ in range(count):
            action = self.env.action_space.sample()
            new_state, reward, is_done, info = self.env.step(action)

            self.rewards[(self.state, action,new_state)] = reward  # add to reward
            # add to counter
            self.transits[(self.state, action)][new_state] += 1
            self.state = self.env.reset() if is_done else new_state

    def calc_action_value(self, state, action):
        target_counts = self.transits[(state, action)]
        total = sum(target_counts.values())

        action_value = 0.0
        for tgt_state, count in target_counts.items():
            reward = self.rewards[(state, action, tgt_state)]
            action_value += (count/total) * (reward + GAMMA * self.values[tgt_state])
        return action_value

    def value_iteration(self):
        for state in range(self.env.observation_space.n):
            state_values = [
                self.calc_action_value(state, action)
                for action in range(self.env.action_space.n)
            ]
            self.values[state] = max(state_values)


################## test time ##############################################

    def select_action(self, state):
        best_action, best_value = None, None

        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)

            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, is_done, _ = env.step(action)

            self.rewards[(state, action, new_state)] = reward
            self.transits[(state, action)][new_state] += 1
            total_reward += reward

            if is_done:
                break

            state = new_state
        return total_reward

**Env**\
[frozen lake description](https://gym.openai.com/envs/FrozenLake-v0/)

SFFF       (S: starting point, safe)\
FHFH       (F: frozen surface, safe)\
FFFH       (H: hole, fall to your doom)\
HFFG       (G: goal, where the frisbee is located)\

The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise. 

In [3]:
Gamma = 0.9 

class Agent():
    
    def __init__(self):
        self.env = gym.make('FrozenLake-v0')
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float) 
        self.transists = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float) 
        
        
    ## collect initial data 
    def play_n_random(self, count):
        for _ in range(count):
            # run on env 
            #print(self.env.render())
            action = self.env.action_space.sample()
            next_state, reward, is_done, _ = self.env.step(action)
            
            # populate the matrices
            self.rewards[(self.state, action, next_state)] = reward
            self.transists[(self.state, action)][next_state] += 1 
            
            if is_done:
                self.state = self.env.reset()
                break 
            self.state = next_state  
            
    def value_iteration(self):
        
        for state in range(self.env.observation_space.n): 
            state_values = [self.calculate_action_value(state, action)
                            for action in range(self.env.action_space.n)]
            self.values[state] = max(state_values)
            
    def calculate_action_value(self, state, action):
        target_counts = self.transists[(state, action)]
        total = sum(target_counts.values())
        
        action_value = 0.0 
        for nxt_state, count in target_counts.items():
            reward = self.rewards[(state,action, nxt_state)]
            value = self.values[nxt_state]
            action_value += (count/total)*(reward + Gamma*value)
            
        return action_value  
    
    def select_action(self, state):
        best_action, best_value = None, None 
        
        for action in range(self.env.action_space.n):
            action_value = self.calculate_action_value(state, action)
            
            if best_value is None or best_value < action_value:
                best_value = action_value 
                best_action = action 
        return best_action  
    
    def play_episode(self, env, render = False):
        total_reward = 0.0 
        state = env.reset()
        
        while True:
            if render:
                env.render()
                
            action = self.select_action(state)
            new_state, reward, is_done, _ = env.step(action)
            
            self.rewards[(state, action, new_state)] = reward 
            self.transists[(state, action)][new_state] += 1 
            total_reward += reward 
            
            if is_done:
                break 
            state = new_state 
        return total_reward  

In [4]:
agent = Agent()
agent.play_n_random(100)

print(agent.rewards)
print()
print(agent.transists)
print()
print(agent.values)
print()
print(agent.state)
print()
agent.env.render()

defaultdict(<class 'float'>, {(0, 3, 1): 0.0, (1, 0, 1): 0.0, (1, 3, 0): 0.0, (0, 2, 1): 0.0, (1, 1, 0): 0.0, (0, 3, 0): 0.0, (0, 2, 0): 0.0, (0, 0, 4): 0.0, (4, 0, 0): 0.0, (0, 2, 4): 0.0, (4, 2, 0): 0.0, (4, 2, 5): 0.0})

defaultdict(<class 'collections.Counter'>, {(0, 3): Counter({1: 1, 0: 1}), (1, 0): Counter({1: 1}), (1, 3): Counter({0: 1}), (0, 2): Counter({1: 1, 0: 1, 4: 1}), (1, 1): Counter({0: 1}), (0, 0): Counter({4: 2}), (4, 0): Counter({0: 1}), (4, 2): Counter({0: 1, 5: 1})})

defaultdict(<class 'float'>, {})

0


[41mS[0mFFF
FHFH
FFFH
HFFG


In [7]:
agent = Agent()
writer = SummaryWriter('valueBased')
test_env = gym.make('FrozenLake-v0')
iter_no = 0 
best_reward = 0.0 
    
while True:
    iter_no += 1 
    agent.play_n_random(100)
    agent.value_iteration()
    
    reward = 0.0 
    test_episodes = 10
    for _ in range(test_episodes):
        reward += agent.play_episode(test_env, render = False)
    
    reward /= test_episodes 
    writer.add_scalar("reward", reward, iter_no)
    
    if reward > best_reward:
        print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
        best_reward = reward
    if reward > 0.80:
        print("Solved in %d iterations!" % iter_no)
        break
        
agent.play_episode(test_env, render = True) 
writer.close()

Best reward updated 0.000 -> 0.300
Best reward updated 0.300 -> 0.500
Best reward updated 0.500 -> 0.600
Best reward updated 0.600 -> 0.700
Best reward updated 0.700 -> 0.800
Best reward updated 0.800 -> 0.900
Solved in 142 iterations!

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Left)
SF[41mF[0mF

In [9]:
def play_s(agent, env):
    state = env.reset()
    render_list = []
    steps = 0
    while True:
        render_list.append(env.render())
        
        action = agent.select_action(state)
        next_state, reward, is_done, _ = env.step(action)
        steps += 1 
        if reward == 1:
            env.render()
            print("successfully played in steps ",steps)
            break 
            
        if is_done and reward != 1:
            print("not solved: trying again ")
            steps = 0
            state = env.reset() 
            render_list.clear()
            
        state = next_state 
        
    return reward 

reward = play_s(agent, test_env)
print(reward)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41