In [30]:
import math
import sys
# Print the path of the running interpreter so the reader can see which
# Python environment the kernel is using.
print(sys.executable)

from collections import namedtuple, defaultdict, Counter
import numpy as np

import gym

from torch import FloatTensor, LongTensor
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

# Gym environment ids for the small and the big (8x8) FrozenLake variants.
ENV_NAME_LAKE_SMALL = "FrozenLake-v0"
ENV_NAME_LAKE_BIG = "FrozenLake8x8-v0"

# Discount factor applied to the value of the successor state.
GAMMA = 0.9
# Number of evaluation episodes averaged after every training iteration.
TEST_EPISODES = 20

C:\ProgramData\Anaconda3\envs\py36\python.exe


In [45]:
class Agent:
    """Base class for tabular agents that learn from observed transitions.

    The agent owns its own environment for random exploration and keeps
    three tables:

    * ``rewards``  -- most recently observed reward per
      ``(state, action, new_state)``.
    * ``transits`` -- how often each ``new_state`` followed a
      ``(state, action)`` pair.
    * ``values``   -- value per state (value iteration) or per
      ``(state, action)`` (Q iteration), depending on the subclass.

    Subclasses must implement ``calc_action_value`` and ``iteration``.
    """

    def __init__(self, envname):
        """Create the Gym environment ``envname`` and empty tables."""
        self.env = gym.make(envname)
        self.state = self.env.reset()
        # rewards per (state, action, new_state)
        self.rewards = defaultdict(float)
        # count transitions from each (state, action)
        self.transits = defaultdict(Counter)
        # calculated value per state in value iteration and
        # value per (state, action) in Q iteration
        self.values = defaultdict(float)

    def _record_transition(self, state, action, new_state, reward):
        """Store one observed transition in the reward and count tables."""
        self.rewards[(state, action, new_state)] = reward
        self.transits[(state, action)][new_state] += 1

    def calc_action_value(self, state, action):
        """Return the estimated value of taking ``action`` in ``state``.

        Raises:
            NotImplementedError: always; subclasses provide the estimate.
        """
        raise NotImplementedError(
            "%s must implement calc_action_value()" % type(self).__name__)

    def iteration(self):
        """Run one sweep that refreshes ``self.values``.

        Raises:
            NotImplementedError: always; subclasses provide the sweep.
        """
        raise NotImplementedError(
            "%s must implement iteration()" % type(self).__name__)

    def play_n_random_steps(self, count):
        """Take ``count`` random actions in ``self.env`` and record them.

        The environment is reset whenever an episode finishes, so the
        exploration continues across episode boundaries.
        """
        for _ in range(count):
            action = self.env.action_space.sample()
            new_state, reward, is_done, _ = self.env.step(action)
            self._record_transition(self.state, action, new_state, reward)
            self.state = self.env.reset() if is_done else new_state

    def play_episode(self, env):
        """Play one greedy episode in ``env`` until it is done.

        Transitions seen along the way are recorded in the tables too.

        Returns:
            The sum of the rewards collected during the episode.
        """
        total_reward = 0.0
        state = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, is_done, _ = env.step(action)
            self._record_transition(state, action, new_state, reward)
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward

    def select_action(self, state):
        """Return the action with the highest ``calc_action_value``.

        Ties are resolved in favour of the lowest action index because a
        later action only replaces the current best when strictly better.
        """
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

class AgentV(Agent):
    """Value-iteration agent: ``self.values`` is keyed by state."""

    def calc_action_value(self, state, action):
        """Estimate the value of taking ``action`` in ``state``.

        Each observed successor contributes
        ``reward + GAMMA * values[successor]``, weighted by how often it
        followed this (state, action) pair. The result is 0.0 when the
        pair has no recorded transitions.
        """
        seen = self.transits[(state, action)]
        n_seen = sum(seen.values())
        return sum(
            ((hits / n_seen)
             * (self.rewards[(state, action, nxt)] + GAMMA * self.values[nxt])
             for nxt, hits in seen.items()),
            0.0,
        )

    def iteration(self):
        """Run one value-iteration sweep over every state.

        Values are overwritten in place, so states later in the sweep
        already read the values refreshed earlier in the same sweep.
        """
        actions = range(self.env.action_space.n)
        for state in range(self.env.observation_space.n):
            self.values[state] = max(
                self.calc_action_value(state, action) for action in actions
            )
    
class AgentQ(Agent):
    """Q-iteration agent: ``self.values`` is keyed by (state, action)."""

    def calc_action_value(self, state, action):
        """Return the stored Q-value for the pair (0.0 if never set)."""
        return self.values[(state, action)]

    def iteration(self):
        """Run one Q-iteration sweep over every (state, action) pair.

        For each observed successor the backup is the recorded reward plus
        ``GAMMA`` times the Q-value of the greedy action in that successor,
        weighted by the successor's observed frequency. Q-values are
        overwritten in place during the sweep.
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        for state in range(n_states):
            for action in range(n_actions):
                seen = self.transits[(state, action)]
                n_seen = sum(seen.values())
                q_value = 0.0
                for nxt, hits in seen.items():
                    greedy = self.select_action(nxt)
                    backup = (self.rewards[(state, action, nxt)]
                              + GAMMA * self.values[(nxt, greedy)])
                    q_value += (hits / n_seen) * backup
                self.values[(state, action)] = q_value

In [46]:
def train(envname, v_or_q="v", random_steps=100, reward_bound=0.80):
    """Train a tabular agent on ``envname`` until it beats ``reward_bound``.

    Every iteration explores ``random_steps`` random steps, runs one
    planning sweep (``agent.iteration()``) and then averages the reward of
    ``TEST_EPISODES`` greedy episodes played on a separate test
    environment. The mean reward is logged to TensorBoard under the tag
    ``"reward"``.

    Args:
        envname: Gym environment id passed to ``gym.make``.
        v_or_q: ``"v"`` selects ``AgentV``; any other value selects
            ``AgentQ``.
        random_steps: number of random exploration steps per iteration.
        reward_bound: training stops once the mean test reward is strictly
            greater than this value.

    Note:
        The loop has no iteration cap; it runs until the bound is exceeded
        or the call is interrupted.
    """
    test_env = gym.make(envname)
    agent = AgentV(envname) if v_or_q == "v" else AgentQ(envname)
    writer = SummaryWriter(comment="-%s-iteration" % v_or_q)

    iter_no = 0
    best_reward = 0.0
    # try/finally: the loop is unbounded and is typically stopped by
    # interrupting the kernel, so the writer must still be closed then.
    try:
        while True:
            iter_no += 1
            agent.play_n_random_steps(random_steps)
            agent.iteration()

            reward = 0.0
            for _ in range(TEST_EPISODES):
                reward += agent.play_episode(test_env)
            reward /= TEST_EPISODES
            writer.add_scalar("reward", reward, iter_no)
            if reward > best_reward:
                print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
                best_reward = reward
            if reward > reward_bound:
                print("Solved in %d iterations!" % iter_no)
                break
    finally:
        writer.close()
        test_env.close()

# SMALL FROZEN LAKE => VALUE ITERATION

In [43]:
# Value iteration on the small lake; "v" is also the default of v_or_q.
train(ENV_NAME_LAKE_SMALL, v_or_q="v")

Best reward updated 0.000 -> 0.500
Best reward updated 0.500 -> 0.800
Best reward updated 0.800 -> 0.850
Solved in 39 iterations!


# BIG FROZEN LAKE => VALUE ITERATION

In [49]:
# Value iteration on the big lake; "v" is also the default of v_or_q.
train(ENV_NAME_LAKE_BIG, v_or_q="v")

Best reward updated 0.000 -> 0.350
Best reward updated 0.350 -> 0.400
Best reward updated 0.400 -> 0.500
Best reward updated 0.500 -> 0.650
Best reward updated 0.650 -> 0.850
Solved in 268 iterations!


# SMALL FROZEN LAKE => Q ITERATION

In [47]:
# Q iteration on the small lake.
train(envname=ENV_NAME_LAKE_SMALL, v_or_q="q")

Best reward updated 0.000 -> 0.100
Best reward updated 0.100 -> 0.400
Best reward updated 0.400 -> 0.600
Best reward updated 0.600 -> 0.750
Best reward updated 0.750 -> 0.900
Solved in 15 iterations!


# BIG FROZEN LAKE => Q ITERATION

In [48]:
# Q iteration on the big lake.
train(envname=ENV_NAME_LAKE_BIG, v_or_q="q")

Best reward updated 0.000 -> 0.050
Best reward updated 0.050 -> 0.200
Best reward updated 0.200 -> 0.350
Best reward updated 0.350 -> 0.500
Best reward updated 0.500 -> 0.550
Best reward updated 0.550 -> 0.650
Best reward updated 0.650 -> 0.700
Best reward updated 0.700 -> 0.800
Best reward updated 0.800 -> 0.850
Solved in 438 iterations!
