In [44]:
import numpy as np
import sys
from six import StringIO, b

from gym import utils
from gym.envs.toy_text import discrete

# Integer action codes, numbered the way FrozenLakeEnv interprets them
# (0 = left, 1 = down, 2 = right, 3 = up).
LEFT, DOWN, RIGHT, UP = range(4)

# Grid legend: S = start, F = frozen (safe), H = hole, G = goal.
_LAKE_4X4 = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG",
]

_LAKE_8X8 = [
    "SFFFFFFF",
    "FFFFFFFF",
    "FFFHFFFF",
    "FFFFFHFF",
    "FFFHFFFF",
    "FHHFFFHF",
    "FHFFHFHF",
    "FFFHFFFG",
]

# Built-in layouts, looked up by name when no explicit grid is passed to the env.
MAPS = {"4x4": _LAKE_4X4, "8x8": _LAKE_8X8}

class FrozenLakeEnv(discrete.DiscreteEnv):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following
        SFFF
        FHFH
        FFFH
        HFFG
    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located
    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.
    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, desc=None, map_name="4x4",is_slippery=True):
        """Build the transition table for the lake and hand it to DiscreteEnv.

        Args:
            desc: optional grid as a sequence of equal-length strings over the
                letters S/F/H/G. When given it is used instead of ``map_name``.
            map_name: key into ``MAPS``; only consulted when ``desc`` is None.
            is_slippery: if True, every action lands on the intended direction or
                on one of its two neighbours in the (left, down, right, up) cycle,
                each with probability 1/3. If False, moves are deterministic.

        Raises:
            ValueError: if both ``desc`` and ``map_name`` are None, or if the grid
                contains no 'S' tile (the start distribution would be undefined).
            KeyError: if ``map_name`` is not a key of ``MAPS``.
        """
        if desc is None and map_name is None:
            raise ValueError('Must provide either desc or map_name')
        elif desc is None:
            desc = MAPS[map_name]
        # dtype 'c' turns each string into a row of single-byte cells.
        self.desc = desc = np.asarray(desc,dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape
        self.reward_range = (0, 1)

        nA = 4
        nS = nrow * ncol

        # Initial state distribution: uniform over every 'S' cell.
        isd = np.array(desc == b'S').astype('float64').ravel()
        start_tiles = isd.sum()
        if start_tiles == 0:
            # Dividing by zero here would silently yield an all-NaN distribution.
            raise ValueError("desc must contain at least one 'S' (start) tile")
        isd /= start_tiles

        # P[state][action] -> list of (probability, next_state, reward, done).
        P = {s : {a : [] for a in range(nA)} for s in range(nS)}

        def to_s(row, col):
            """Flatten a (row, col) grid position into a state index."""
            return row*ncol + col

        def inc(row, col, a):
            """Move one cell in direction ``a``, clamped at the grid border."""
            if a==0: # left
                col = max(col-1,0)
            elif a==1: # down
                row = min(row+1,nrow-1)
            elif a==2: # right
                col = min(col+1,ncol-1)
            elif a==3: # up
                row = max(row-1,0)
            return (row, col)

        def outcome(row, col, direction, prob):
            """Transition tuple for actually moving in ``direction`` from (row, col)."""
            newrow, newcol = inc(row, col, direction)
            newletter = desc[newrow, newcol]
            done = bytes(newletter) in b'GH'
            rew = float(newletter == b'G')
            return (prob, to_s(newrow, newcol), rew, done)

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                # Goal and hole cells are absorbing: every action stays put.
                terminal = bytes(desc[row, col]) in b'GH'
                for a in range(nA):
                    li = P[s][a]
                    if terminal:
                        li.append((1.0, s, 0, True))
                    elif is_slippery:
                        # Intended direction plus its two neighbours, 1/3 each.
                        for direction in [(a-1)%nA, a, (a+1)%nA]:
                            li.append(outcome(row, col, direction, 1.0/3.0))
                    else:
                        li.append(outcome(row, col, a, 1.0))

        super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)

    def render(self, mode='human'):
        """Write the grid with the current cell highlighted in red.

        Reads ``self.s`` and ``self.lastaction``, which are not assigned in this
        class (presumably maintained by DiscreteEnv -- confirm).

        Args:
            mode: 'ansi' writes into a fresh StringIO; any other value writes to
                ``sys.stdout``.

        Returns:
            None when ``mode`` is 'human'; otherwise the stream that was written to.
        """
        outfile = StringIO() if mode == 'ansi' else sys.stdout

        row, col = divmod(self.s, self.ncol)
        desc = [[c.decode('utf-8') for c in line] for line in self.desc.tolist()]
        desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
        if self.lastaction is not None:
            outfile.write("  ({})\n".format(["Left","Down","Right","Up"][self.lastaction]))
        else:
            outfile.write("\n")
        outfile.write("\n".join(''.join(line) for line in desc)+"\n")

        if mode != 'human':
            return outfile

In [204]:
import random

# Random tie-breaking demo: collect every position that holds the maximum,
# then draw one of them at random.
vector = [3, 2, 3, 3]
x = np.amax(vector)
# The index itself is what gets used as the (random) direction.
indices = np.flatnonzero(np.asarray(vector) == x)
random.choice(indices)

3

In [109]:
import gym
import random as rand
import numpy as np

# Gym id under which this notebook registers (and later creates) the
# deterministic, is_slippery=False FrozenLake variant.
FROZENLAKE_NOT_SLIPPERY = 'FrozenLakeNotSlippery-v0'
# Not referenced by any code visible in this notebook.
# NOTE(review): presumably the id of gym's stock slippery FrozenLake -- confirm.
SLIPPERY='FrozenLake-v0'

def register_frozen_lake_not_slippery(name):
    """Register a deterministic 4x4 FrozenLake environment with gym.

    Args:
        name: the environment id to register it under.

    The spec points at gym's own ``FrozenLakeEnv`` with ``is_slippery=False``
    and limits episodes to 100 steps.
    """
    from gym.envs.registration import register

    spec = dict(
        id=name,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        # Original note: optimum = .8196
        # NOTE(review): that optimum looks like it was carried over from the
        # slippery variant -- confirm it is meaningful for deterministic moves.
        reward_threshold=0.78,
    )
    register(**spec)
    
# Flip to True to get per-episode rendering and log lines from debug().
DEBUG=False
def debug(env, log):
    """Render ``env`` (when given) and print ``log``, but only if DEBUG is on.

    Args:
        env: object with a ``render()`` method, or None to skip rendering.
        log: value to print after the optional render.
    """
    if not DEBUG:
        return
    # Identity test: `!= None` would dispatch to a user-defined __ne__.
    if env is not None:
        env.render()
    print(log)
        
def info(env, log):
    """Always render ``env`` (when given) and then print ``log``.

    Args:
        env: object with a ``render()`` method, or None to skip rendering
            (same convention as ``debug``).
        log: value to print.
    """
    if env is not None:
        env.render()
    print(log)
    
def greedy_action(vector):
    """Return the index of the largest entry, breaking ties at random.

    Args:
        vector: array-like of action values. It is flattened first, so a
            ``(1, n)`` row vector yields a column (action) index rather than
            the row index 0.

    Returns:
        One of the positions holding the maximum, drawn with ``rand.choice``.

    Raises:
        ValueError: if ``vector`` is empty (raised by ``np.amax``).
    """
    values = np.asarray(vector).ravel()
    best = np.flatnonzero(values == np.amax(values))
    return rand.choice(best)

def random_action():
    """Pick one of the four action codes uniformly at random."""
    return rand.choice((LEFT, DOWN, RIGHT, UP))

def exists_env(name):
    """Tell whether gym's registry already holds an environment with id ``name``.

    Returns:
        True if any entry from ``envs.registry.all()`` has a matching ``id``,
        False otherwise.
    """
    from gym import envs
    return any(spec.id == name for spec in envs.registry.all())

# Register the deterministic variant once; re-running the cell must not register twice.
if exists_env(FROZENLAKE_NOT_SLIPPERY):
    print("Already Regitered")
else:
    register_frozen_lake_not_slippery(FROZENLAKE_NOT_SLIPPERY)
    print("Registered")

Already Regitered


In [110]:
# Deterministic lake registered under FROZENLAKE_NOT_SLIPPERY.
env = gym.make(FROZENLAKE_NOT_SLIPPERY)

# Action codes, in the same order as the Q-table columns printed below.
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

# NOTE: lr and y are assigned here, but the update rule in this cell uses neither.
lr, y = .8, .95
num_episodes = 3000
answers = []
total = 0.0
Q = np.zeros([env.observation_space.n, env.action_space.n])

for episode in range(num_episodes):
    prev_state = env.reset()
    state_list = []
    while True:
        state_list.append(prev_state)
        # Greedy w.r.t. the current table; ties are broken at random.
        action = greedy_action(Q[prev_state, :])
        curr_state, reward, doomed, _ = env.step(action)
        reached_goal = reward == 1.0
        if reached_goal:
            debug(env, "[SUCC][%d][%d->%d] : %f - %a" % (episode, prev_state, curr_state, reward, doomed))
        elif doomed:
            # Episode ended without reward: leave the table untouched.
            debug(env, "[FAIL][%d][%d->%d] : %f\n" % (episode, prev_state, curr_state, reward))
            break
        # Overwrite Q(s, a) with r + max_a' Q(s', a') -- no discount, no learning rate.
        Q[prev_state, action] = reward + np.max(Q[curr_state, :])
        prev_state = curr_state
        total += reward
        if doomed:
            break

print("left, down, right, up")
for state, action_values in enumerate(Q):
    print("[%d] %s" % (state, action_values))
print("total reward is %d" % total)

left, down, right, up
[0] [ 0.  1.  0.  0.]
[1] [ 0.  0.  0.  0.]
[2] [ 0.  0.  0.  0.]
[3] [ 0.  0.  0.  0.]
[4] [ 0.  1.  0.  0.]
[5] [ 0.  0.  0.  0.]
[6] [ 0.  0.  0.  0.]
[7] [ 0.  0.  0.  0.]
[8] [ 0.  0.  1.  0.]
[9] [ 0.  1.  0.  0.]
[10] [ 0.  0.  0.  0.]
[11] [ 0.  0.  0.  0.]
[12] [ 0.  0.  0.  0.]
[13] [ 0.  0.  1.  0.]
[14] [ 0.  0.  1.  0.]
[15] [ 0.  0.  0.  0.]
total reward is 2806


In [4]:
# Initialize table with all zeros
Q = np.zeros([env.observation_space.n,env.action_space.n])
# Set learning parameters: lr is the step size, y the discount factor
lr = .8
y = .95
num_episodes = 2000
# Total reward collected in each episode
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    # The Q-Table learning algorithm; each episode is capped at 99 steps
    for j in range(1, 100):
        # Choose an action greedily from the Q table, with Gaussian noise whose
        # scale shrinks as 1/(i+1) over the episodes
        a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        # Get new state and reward from environment
        s1,r,d,_ = env.step(a)
        # Move Q(s, a) a fraction lr towards the target r + y * max_a' Q(s1, a')
        Q[s,a] += lr*(r + y*np.max(Q[s1,:]) - Q[s,a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)


print("Final Q-Table Values")
print(Q)

Final Q-Table Values
[[  5.82698888e-03   8.10405079e-03   2.05724807e-01   6.80835127e-03]
 [  5.77328482e-04   3.51814810e-04   6.66929863e-05   4.72895880e-02]
 [  8.91247325e-04   2.91184848e-03   3.56435794e-03   7.79812719e-03]
 [  0.00000000e+00   1.34757663e-03   3.30683431e-04   5.61992823e-03]
 [  3.60935889e-01   2.81086590e-03   2.03756917e-03   6.33905578e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  5.38297333e-02   3.06025282e-09   6.57390573e-06   2.11311507e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  5.96605729e-04   1.09666568e-04   2.94370580e-04   6.24011457e-01]
 [  3.19626281e-04   3.39689390e-01   9.44021307e-04   1.28309177e-03]
 [  8.11139968e-01   5.36157168e-04   4.76236270e-05   1.16402982e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.76219483e-03   9.34150118e-01   0

In [29]:
# Sanity check: an int compares equal to the float of the same value, so a
# test such as `reward == 1.0` also matches an integer reward of 1.
print(1 == 1.0)

True
