In [1]:
import numpy as np
import gym

In [2]:
from gym.envs.registration import register
# Register a 4x4 FrozenLake variant with slipping disabled under its own id.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    max_episode_steps=100,
    # Original note: optimum = .8196
    # NOTE(review): threshold/optimum look carried over from the stock slippery
    # registration — confirm they are intended for this variant.
    reward_threshold=0.78,
    kwargs={'map_name': '4x4', 'is_slippery': False},
)


In [8]:
# Instantiate the custom FrozenLake variant and read the table dimensions from
# the environment's public space objects instead of the env-internal nS / nA
# attributes (which are only reachable through wrapper attribute forwarding).
env = gym.make('FrozenLakeNotSlippery-v0')
state_size = env.observation_space.n   # number of discrete states
action_size = env.action_space.n       # number of discrete actions

In [7]:
# Settings for the sample-and-keep-best policy search.
samples = 100                    # episodes sampled per trial
trial = 100                      # number of policy-update rounds
learning_rate = 0.1              # blend weight given to the best episodes' action frequencies
keep_best = int(samples * 0.2)   # number of top-reward episodes kept per update (20%)

In [9]:
# Start from a uniform policy: every action equally likely in every state.
policy_array = np.full((state_size, action_size), 1.0 / action_size)

In [12]:
def run_sample(env, policy, action_size):
    """Play one episode by sampling actions from a tabular policy.

    Args:
        env: Environment exposing reset() -> state and
            step(action) -> (state, reward, done, info).
        policy: 2-D table indexed [state, action]; each row is passed to
            np.random.choice as the action probabilities for that state.
        action_size: Number of actions to sample from.

    Returns:
        (total_reward, action_count): the summed episode reward and an array
        shaped like `policy` counting how often each (state, action) pair
        was taken during the episode.
    """
    visit_counts = np.zeros(np.shape(policy))
    episode_return = 0
    current_state = env.reset()
    finished = False
    while not finished:
        # Draw the next action from the current state's probability row.
        chosen = np.random.choice(action_size, p=policy[current_state])
        visit_counts[current_state, chosen] += 1
        current_state, step_reward, finished, _info = env.step(chosen)
        episode_return += step_reward
    return episode_return, visit_counts

In [13]:
def update_policy(policy, s_list, learning_rate, keep):
    """Blend the policy toward the action frequencies of the best episodes.

    Args:
        policy: 2-D array indexed [state, action] of action probabilities.
        s_list: Sequence of (total_reward, action_count) entries, where
            action_count has the same shape as `policy`.
        learning_rate: Weight given to the best episodes' action
            frequencies; the current policy keeps weight (1 - learning_rate).
        keep: Number of highest-reward entries of `s_list` to use.

    Returns:
        A new array shaped like `policy` with every row renormalised to sum
        to 1. Neither `policy` nor `s_list` is modified.
    """
    # sorted() builds a new list, so the caller's sample order is untouched;
    # ties keep their original relative order, exactly as list.sort() would.
    elite = sorted(s_list, key=lambda x: x[0], reverse=True)[:keep]

    elite_counts = np.zeros(np.shape(policy))
    for s in elite:
        elite_counts += s[1]

    # Work on a copy so the caller's array is not partially updated in place.
    new_policy = np.array(policy, copy=True)
    for i in range(len(elite_counts)):
        total_actions = np.sum(elite_counts[i])
        # States never visited by the kept episodes retain their current row.
        if total_actions > 0:
            new_policy[i] = (elite_counts[i] / total_actions) * learning_rate + new_policy[i] * (1. - learning_rate)

    return new_policy / np.sum(new_policy, axis=1)[:, None]
    

In [15]:
# Each trial: sample a batch of episodes with the current policy, then shift
# the policy toward the actions taken in the highest-reward episodes.
for t in range(trial):
    sample_list = [
        run_sample(env, policy_array, action_size)
        for s in range(samples)
    ]
    policy_array = update_policy(policy_array, sample_list, learning_rate, keep_best)

In [16]:
# Evaluate the learned table greedily: always take the most probable action.
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length, done = 0, 0, False
    while not done:
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    # Record the finished episode's totals.
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average reward: {} Average Length {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average reward: 1.0 Average Length 6.0


In [18]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
# Display the selected torch device as the cell output.
device

device(type='cuda')

In [20]:
# Fresh environment and settings for the torch-based variant.
# Table dimensions come from the environment's public space objects instead of
# the env-internal nS / nA attributes (only reachable via wrapper forwarding).
env = gym.make('FrozenLakeNotSlippery-v0')
state_size = env.observation_space.n   # number of discrete states
action_size = env.action_space.n       # number of discrete actions

trial = 100                      # number of policy-update rounds
samples = 100                    # episodes sampled per trial
learning_rate = 0.1              # blend weight given to the best episodes
keep_best = int(0.2 * samples)   # number of top-reward episodes kept per update

In [23]:
def run_sample_tensor(env, policy, state_size, action_size, device):
    """Play one episode, tallying (state, action) visits in a torch tensor.

    Args:
        env: Environment exposing reset() -> state and
            step(action) -> (state, reward, done, info).
        policy: Table indexed by state; each row is passed to
            np.random.choice as the action probabilities for that state.
        state_size: Number of rows in the visit-count tensor.
        action_size: Number of actions to sample from, and the number of
            columns in the visit-count tensor.
        device: Torch device the returned tensors are moved to.

    Returns:
        (total_reward, action_count): a one-element tensor holding the summed
        episode reward and a (state_size, action_size) tensor of visit counts.
    """
    visit_counts = torch.zeros(state_size, action_size).to(device)
    episode_return = torch.zeros(1).to(device)
    current_state = env.reset()
    finished = False
    while not finished:
        # Draw the next action from the current state's probability row.
        chosen = np.random.choice(action_size, p=policy[current_state])
        visit_counts[current_state, chosen] += 1
        current_state, step_reward, finished, _info = env.step(chosen)
        episode_return += step_reward
    return episode_return, visit_counts

In [None]:
# NOTE(review): this cell re-defines `update_policy` from an earlier cell, while
# the training loop that follows calls `update_policy_tensor` — confirm whether
# this cell was meant to become the tensor version.
def update_policy(policy, s_list, learning_rate, keep):
    """Blend the policy toward the action frequencies of the best episodes.

    Args:
        policy: 2-D array indexed [state, action] of action probabilities.
        s_list: Sequence of (total_reward, action_count) entries, where
            action_count has the same shape as `policy`.
        learning_rate: Weight given to the best episodes' action
            frequencies; the current policy keeps weight (1 - learning_rate).
        keep: Number of highest-reward entries of `s_list` to use.

    Returns:
        A new array shaped like `policy` with every row renormalised to sum
        to 1. Neither `policy` nor `s_list` is modified.
    """
    # sorted() builds a new list, so the caller's sample order is untouched;
    # ties keep their original relative order, exactly as list.sort() would.
    elite = sorted(s_list, key=lambda x: x[0], reverse=True)[:keep]

    elite_counts = np.zeros(np.shape(policy))
    for s in elite:
        elite_counts += s[1]

    # Work on a copy so the caller's array is not partially updated in place.
    new_policy = np.array(policy, copy=True)
    for i in range(len(elite_counts)):
        total_actions = np.sum(elite_counts[i])
        # States never visited by the kept episodes retain their current row.
        if total_actions > 0:
            new_policy[i] = (elite_counts[i] / total_actions) * learning_rate + new_policy[i] * (1. - learning_rate)

    return new_policy / np.sum(new_policy, axis=1)[:, None]

In [None]:
# Torch-based variant of the training loop, starting from a uniform policy.
# NOTE(review): `update_policy_tensor` and `smoothing_factor` are not defined in
# the visible cells — confirm they exist before running this cell.
policy_tensor = torch.ones((state_size, action_size)).to(device) / action_size

for t in range(trial):
    # run_sample_tensor samples with np.random.choice, so hand it a NumPy copy.
    policy_array = policy_tensor.cpu().numpy()
    reward_tensor = torch.zeros((samples)).to(device)
    sample_tensor = torch.zeros((samples, state_size, action_size)).to(device)

    for s in range(samples):
        # Store each episode's reward and visit counts in slot s.
        reward_tensor[s], sample_tensor[s] = run_sample_tensor(env, policy_array, state_size, action_size, device)
    policy_tensor = update_policy_tensor(policy_tensor, sample_tensor, reward_tensor, learning_rate, smoothing_factor, action_size, keep_best, device)