In [46]:
import torch
import torch.nn as nn
from collections import namedtuple
import gym
import numpy as np
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

In [47]:
HIDDEN_SIZE = 128
BATCH_SIZE = 100 #Batch size is increased
PERCENTILE = 70
MAX_ITER = 10000000
GAMMA = 0.9

In [48]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space,
                          gym.spaces.Discrete)
        shape = (env.observation_space.n, )
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, shape, dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

In [49]:
class Net(nn.Module):
    def __init__(self, num_features:int, hidden_size:int, num_actions:int):
        super(Net, self).__init__()
        # Define two linear layers
        self.linear1 = nn.Linear(in_features=num_features, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=num_actions)

        # Define a ReLU activation function
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.linear1(x)  # First linear layer
        x = self.relu(x)     # ReLU activation
        x = self.linear2(x)  # Second linear layer
        return x

In [50]:
Episode = namedtuple('Episode', field_names=['total_rewards', 'steps'])
episodeStep = namedtuple('episodeStep', field_names=['state', 'action'])

In [51]:
def create_episode(env:gym.Env, net:Net):
    net.eval()
    total_rewards = 0
    steps = []
    sm = nn.Softmax(dim=0)
    
    current_state = env.reset()[0]
    
    while True:
        current_state_tensor = torch.FloatTensor(current_state)
        action_prob = sm(net(current_state_tensor))
        action_prob = action_prob.detach().numpy()
        action = np.random.choice(env.action_space.n, p=action_prob)
        
        next_state, reward, terminated, _, info = env.step(action=action)
        total_rewards += reward
        current_step = episodeStep(state=current_state, action=action)
        steps.append(current_step)
        
        if terminated:
            e = Episode(total_rewards=total_rewards, steps=steps)
            return e
            
        
        current_state = next_state
    

In [52]:
def create_batch(env:gym.Env, net:Net, batch_size:int):
    batch = []
    for i in range(batch_size):
        episode = create_episode(env, net)
        batch.append(episode)
        
    return batch
    

In [53]:
def filter_batch(batch, percentile):
    discounted_rewards = list(map(lambda s:s.total_rewards * (GAMMA**len(s.steps)), batch))  #Discounted reward is introduced
    reward_percentile = np.percentile(discounted_rewards, percentile)
    mean_reward = np.mean(discounted_rewards)
    
    training_states = []
    training_actions = []
    new_batch = []
    for episode, discounted_rewards in zip(batch, discounted_rewards):
        steps = episode.steps
        if discounted_rewards< reward_percentile:
            continue
        training_states.extend(map(lambda step: step.state, steps))
        training_actions.extend(map(lambda step: step.action, steps))
        new_batch.append(episode)
        
    return new_batch, training_states, training_actions, reward_percentile, mean_reward
        
        
        

In [54]:
if __name__=="__main__":
    env =  DiscreteOneHotWrapper(gym.make("FrozenLake-v1"))
    
    num_features = env.observation_space.shape[0]
    num_actions = env.action_space.n
    net = Net(num_features=num_features, hidden_size=HIDDEN_SIZE, num_actions=num_actions)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(params = net.parameters(), lr = 0.001)  #Learning rate is decreased
    
    writer = SummaryWriter(log_dir='frozen_lake_tb')
    prev_batch = []
    for i in range(MAX_ITER):
        batch = create_batch(env, net, BATCH_SIZE)
        new_batch, training_states, training_actions, reward_percentile, mean_reward = filter_batch(batch=batch + prev_batch, percentile=PERCENTILE)
        training_states = torch.FloatTensor(training_states)
        training_actions = torch.LongTensor(training_actions)
        # Zero the parameter gradients
        net.train()
        optimizer.zero_grad()

        # Forward pass
        outputs = net(training_states)
        outputs = outputs.float()
        loss = criterion(outputs, training_actions)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        print('Current iteration: {}, mean reward: {}'.format(i, mean_reward))
        
        writer.add_scalar("loss", loss.item(), i)
        writer.add_scalar("reward_bound", reward_percentile, i)
        writer.add_scalar("mean_reward", mean_reward, i)
        
        prev_batch = new_batch
        if mean_reward > 0.8:
            break
        
    writer.close()
        

Current iteration: 0, mean reward: 0.0
Current iteration: 1, mean reward: 0.0017433922005000006
Current iteration: 2, mean reward: 0.005690315934000001
Current iteration: 3, mean reward: 0.004267736950500001
Current iteration: 4, mean reward: 0.004275123980400001
Current iteration: 5, mean reward: 0.0048827824771500005
Current iteration: 6, mean reward: 0.005962867475387145
Current iteration: 7, mean reward: 0.00521750904096375
Current iteration: 8, mean reward: 0.00463778581419
Current iteration: 9, mean reward: 0.0045226856728710005
Current iteration: 10, mean reward: 0.004111532429882728
Current iteration: 11, mean reward: 0.004626427694753251
Current iteration: 12, mean reward: 0.004809957168303001
Current iteration: 13, mean reward: 0.004808029441995643
Current iteration: 14, mean reward: 0.004943619500592367
Current iteration: 15, mean reward: 0.004634643281805344
Current iteration: 16, mean reward: 0.0043620172064050295
Current iteration: 17, mean reward: 0.004353548694843398
Cu

KeyboardInterrupt: 