In [2]:
import random
import time
from collections import namedtuple

import gym
import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
# from tensorboardX import SummaryWriter

### $ The \ cross-entropy \ method $

##### $ Classes \ and \ Functions $ 

In [3]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a discrete state index into a one-hot vector.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space with ``n`` states. The wrapper advertises a ``Box(0.0, 1.0, (n,))``
    float32 space instead and converts every observation accordingly.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        # Replace the advertised observation space with the one-hot layout.
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a fresh vector of the space's lower bound with slot `observation` set to 1.0."""
        # `low` is the Box lower bound (built from 0.0 above); copy it so the
        # space's own array is never mutated.
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

class NeuralNetwork(nn.Module):
    """Policy network mapping an observation vector to one raw score per action.

    The network is a single hidden layer MLP: Linear -> ReLU -> Linear.
    The output is NOT normalised; callers apply a softmax (for sampling) or
    feed the raw scores to ``nn.CrossEntropyLoss`` (for training).

    Parameters:
    - obs_size: int, number of input features (length of the observation vector)
    - hidden_size: int, number of units in the hidden layer
    - n_actions: int, number of output scores (one per action)
    """

    def __init__(self, obs_size: int, hidden_size: int, n_actions: int):
        super().__init__()
        # Kept under the attribute name `net` so state_dict keys stay `net.0.*` / `net.2.*`.
        self.net = nn.Sequential(
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=n_actions),
        )

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Return the raw action scores for the observation batch `X`.

        `X` is passed straight into the first ``nn.Linear``, so its last
        dimension must equal ``obs_size``.
        """
        return self.net(X)
    

# Batches Generator
# Records describing the collected experience. They are declared next to the
# function that builds them so this cell does not rely on a later cell having run.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def iterate_batches(env: "gym.Env", Net: nn.Module, batch_size: int):
    """Play episodes with the current policy and yield them in batches.

    This is an infinite generator: it keeps stepping `env`, sampling each action
    from the softmax of `Net`'s output, and yields a list of `batch_size`
    finished episodes every time that many have been collected. The caller is
    expected to `break` out of the loop.

    Parameters:
    - env: environment whose `reset()` returns `(observation, info)` and whose
      `step(action)` returns `(observation, reward, terminated, truncated, info)`
    - Net: nn.Module, maps a batch of observations to one raw score per action
    - batch_size: int, number of finished episodes per yielded batch

    Yields:
    - list of `Episode(reward, steps)`, where `reward` is the undiscounted sum of
      step rewards and `steps` is a list of `EpisodeStep(observation, action)`;
      `observation` is the state the action was chosen in.
    """

    batch = []
    episode_reward = 0.0
    episode_step = []
    obs = env.reset()[0]  # reset() returns (observation, info); keep the observation only
    sm = nn.Softmax(dim=1)

    while True:
        # Add a leading batch dimension of 1: the network works on batches.
        obs_v = torch.FloatTensor(np.array([obs]))
        # Acting needs no gradients; training recomputes the forward pass on the
        # elite steps, so do not build an autograd graph here.
        with torch.no_grad():
            act_prob_v = sm(Net(obs_v))
        act_prob = act_prob_v.numpy()[0]  # drop the batch dimension -> one probability per action
        # Sample an action index in [0, n_actions) according to the policy's probabilities.
        action = np.random.choice(len(act_prob), p=act_prob)

        next_obs, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        episode_step.append(EpisodeStep(observation=obs, action=action))

        # An episode ends either because the task terminated or because the
        # environment truncated it; ignoring `truncated` would keep stepping
        # past the end of the episode.
        if terminated or truncated:
            batch.append(Episode(reward=episode_reward, steps=episode_step))
            episode_reward = 0.0
            episode_step = []
            next_obs = env.reset()[0]

            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

    
def filter_batches (batch:list, percentile:int)-> tuple:
    """Select the elite episodes of a batch and turn them into training tensors.

    An episode is kept when its total reward is greater than or equal to the
    `percentile`-th percentile of the batch's rewards; all steps of the kept
    episodes are flattened into one observation tensor and one action tensor.

    Parameters:
    - batch: list of episodes, each exposing `.reward` and `.steps`, where every
      step exposes `.observation` and `.action`
    - percentile: int, percentile (0-100) of the reward distribution used as cut-off

    Returns a tuple `(train_obs_v, train_act_v, reward_bound, reward_mean)`:
    - train_obs_v: float32 tensor with one row per elite step
    - train_act_v: int64 tensor with the action taken at each elite step
    - reward_bound: the reward cut-off computed by `np.percentile`
    - reward_mean: float, mean reward over the whole (unfiltered) batch
    """

    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for example in batch:
        # Skip episodes whose reward is below the bound; ties with the bound are kept.
        if example.reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in example.steps)
        train_act.extend(step.action for step in example.steps)

    # Stack through NumPy first: building a tensor directly from a list of
    # ndarrays is the slow path PyTorch warns about.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return train_obs_v, train_act_v, reward_bound, reward_mean

##### $ CartPole $ 

In [4]:
# Hyperparameters for the CartPole run.
# HIDDEN_LAYERS is passed as `hidden_size`, i.e. it is the number of units in the
# single hidden layer, not a number of layers.
HIDDEN_LAYERS = 50
BATCH_SIZE = 16  # episodes collected before each optimisation step
PERCENTILE = 70  # reward percentile used by filter_batches as the elite cut-off

# Records used to keep track of each episode and of its steps
Episode = namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names= ['observation', 'action'])

# Create the environment
env = gym.make("CartPole-v1")
obs_size_ = env.observation_space.shape[0] # length of the observation vector, used as the network's input size
n_actions_ = env.action_space.n  # number of discrete actions (presumably left / right for CartPole), used as the network's output size

# Create the neural network, the loss function and the optimizer
net = NeuralNetwork(obs_size = obs_size_, hidden_size= HIDDEN_LAYERS, n_actions = n_actions_)
objective = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)
# TensorBoard logging is disabled in this cell:
# writer = SummaryWriter(comment="-cartpole")


# Training loop: iterate_batches never stops by itself, so the loop ends only at the `break` below
for iter_no, batch in enumerate(iterate_batches(env=env, Net=net, batch_size=BATCH_SIZE)):

    # Keep only the steps of the elite episodes (reward >= the percentile bound)
    obs_v, acts_v, reward_b, reward_m = filter_batches(batch=batch, percentile=PERCENTILE) # elite observations/actions, reward bound, batch mean reward
    optimizer.zero_grad() # clear gradients left over from the previous iteration
    action_score_v = net(obs_v)

    loss_v = objective(action_score_v, acts_v) # cross-entropy between the network's scores and the actions the elite episodes actually took
    loss_v.backward() # back-propagation: compute the gradients
    optimizer.step() # apply the gradient update to the network parameters
    # Optional rendering, disabled:
    # if reward_b > 400:
        # time.sleep(0.02) 
        # env.render()
    print(f"iter_no ; {iter_no}, loss : {loss_v.item()}, Reward Mean : {reward_m}, Reward bound : {reward_b}")
    # writer.add_scalar("loss", loss_v.item(), iter_no)
    # writer.add_scalar("reward_bound", reward_b, iter_no)
    # writer.add_scalar("reward_mean", reward_m, iter_no)

    # Stop once the mean reward of a whole batch exceeds 200
    if reward_m > 200:
        print("solved!")
        break 
    # writer.close()


  train_obs_v = torch.FloatTensor(train_obs)


iter_no ; 0, loss : 0.6824373006820679, Reward Mean : 15.5625, Reward bound : 17.5
iter_no ; 1, loss : 0.6878387331962585, Reward Mean : 19.3125, Reward bound : 18.5
iter_no ; 2, loss : 0.6904373168945312, Reward Mean : 19.5, Reward bound : 23.5
iter_no ; 3, loss : 0.6856494545936584, Reward Mean : 20.125, Reward bound : 26.5
iter_no ; 4, loss : 0.646310031414032, Reward Mean : 23.6875, Reward bound : 27.5
iter_no ; 5, loss : 0.702922523021698, Reward Mean : 16.375, Reward bound : 18.0
iter_no ; 6, loss : 0.688866138458252, Reward Mean : 24.375, Reward bound : 28.5
iter_no ; 7, loss : 0.6686075925827026, Reward Mean : 28.75, Reward bound : 27.0
iter_no ; 8, loss : 0.6799336671829224, Reward Mean : 28.6875, Reward bound : 32.5
iter_no ; 9, loss : 0.6793533563613892, Reward Mean : 33.6875, Reward bound : 29.5
iter_no ; 10, loss : 0.6730258464813232, Reward Mean : 37.5625, Reward bound : 44.0
iter_no ; 11, loss : 0.6673963069915771, Reward Mean : 34.3125, Reward bound : 36.5
iter_no ; 12,

##### $ Frozen \ Lake \ - \ Naive$

In [10]:
# Hyperparameters for the naive FrozenLake run
HIDDEN_SIZE = 128     # units in the hidden layer
BATCH_SIZE = 16       # episodes collected before each optimisation step
PERCENTILE = 70       # reward percentile used by filter_batches as the elite cut-off
MAX_ITERATIONS = 50   # give up once iter_no reaches this value

# Records used to keep track of each episode and of its steps
Episode = namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names= ['observation', 'action'])

# Create the environment; the wrapper turns the discrete state index into a one-hot vector
env = DiscreteOneHotWrapper(gym.make('FrozenLake-v1'))
obs_size_ = env.observation_space.shape[0] # length of the one-hot observation vector
n_actions_ = env.action_space.n # number of discrete actions

# Create the neural network, the loss function and the optimizer
net = NeuralNetwork(obs_size=obs_size_, hidden_size=HIDDEN_SIZE, n_actions=n_actions_)
objective = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr = 0.001)
writer = SummaryWriter(comment='-frozenlake-naive')

# Training loop. The writer and the environment are released in `finally`, so
# they are closed exactly once, whether the loop ends by `break`, by an error
# or by interrupting the kernel. (Closing the writer inside the loop body would
# close it on every iteration and never on the final one.)
try:
    for iter_no, batch in enumerate(iterate_batches(env=env, Net=net, batch_size=BATCH_SIZE)):

        # Keep only the steps of the elite episodes (reward >= the percentile bound)
        obs_v, acts_v, reward_b, reward_m = filter_batches(batch=batch, percentile=PERCENTILE)
        optimizer.zero_grad() # clear gradients left over from the previous iteration
        action_score_v = net(obs_v)

        loss_v = objective(action_score_v, acts_v) # cross-entropy between the network's scores and the elite actions
        loss_v.backward() # back-propagation: compute the gradients
        optimizer.step() # apply the gradient update to the network parameters

        print(f"iter_no ; {iter_no}, loss : {loss_v.item()}, Reward Mean : {reward_m}, Reward bound : {reward_b}")
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)

        if reward_m > 0.8:
            print("solved!")
            break
        # Hitting the iteration cap is not a success, so report it as such.
        if iter_no == MAX_ITERATIONS:
            print(f"stopped after {iter_no} iterations without solving")
            break
finally:
    writer.close()
    env.close()



iter_no ; 0, loss : 1.3898205757141113, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 1, loss : 1.3752325773239136, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 2, loss : 1.3727778196334839, Reward Mean : 0.125, Reward bound : 0.0
iter_no ; 3, loss : 1.3739104270935059, Reward Mean : 0.0625, Reward bound : 0.0
iter_no ; 4, loss : 1.3824213743209839, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 5, loss : 1.3741878271102905, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 6, loss : 1.3988438844680786, Reward Mean : 0.0625, Reward bound : 0.0
iter_no ; 7, loss : 1.3869667053222656, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 8, loss : 1.3656502962112427, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 9, loss : 1.3733731508255005, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 10, loss : 1.3804473876953125, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 11, loss : 1.358634114265442, Reward Mean : 0.0, Reward bound : 0.0
iter_no ; 12, loss : 1.365053415298462, Reward Mean : 0

##### $ Frozen \ Lake \ - \ Tweaked $

In [15]:
def filter_batches_and_gamma (batch:namedtuple, gamma_penalty:float, percentile:int)-> list:
    """Pick the elite episodes of a batch using a length-discounted reward.

    Each episode is scored as ``reward * gamma_penalty ** len(steps)``.
    Episodes whose score is strictly greater than the `percentile`-th
    percentile of all scores are kept.

    Parameters:
    - batch: sequence of episodes, each exposing `.reward` and `.steps`, where
      every step exposes `.observation` and `.action`
    - gamma_penalty: float, base of the per-step discount applied to the reward
    - percentile: int, percentile of the discounted scores used as cut-off

    Returns four values: the list of elite episodes, the flat list of their
    step observations, the flat list of their step actions, and the cut-off.
    """

    # Score every episode: total reward discounted by the episode length.
    discounted_reward = [
        episode.reward * (gamma_penalty ** len(episode.steps)) for episode in batch
    ]
    reward_bound = np.percentile(discounted_reward, percentile)

    # Strict comparison: an episode that merely equals the bound is dropped.
    elite_batch = [
        episode
        for episode, score in zip(batch, discounted_reward)
        if score > reward_bound
    ]

    # Flatten the elite episodes step by step, preserving episode order.
    train_obs = [step.observation for episode in elite_batch for step in episode.steps]
    train_act = [step.action for episode in elite_batch for step in episode.steps]

    return elite_batch, train_obs, train_act, reward_bound

In [21]:
# Hyperparameters for the tweaked, non-slippery FrozenLake run
HIDDEN_SIZE = 128   # units in the hidden layer
BATCH_SIZE = 100    # episodes collected before each optimisation step
PERCENTILE = 30     # percentile of the discounted rewards used as the elite cut-off
GAMMA = 0.9         # per-step discount applied by filter_batches_and_gamma
SEED = 104

# Seed every random source this cell relies on: actions are sampled with
# np.random.choice and the network weights are initialised by PyTorch, so
# seeding Python's `random` alone has no effect on the run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Records used to keep track of each episode and of its steps
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

# Create the environment; the wrapper turns the discrete state index into a one-hot vector
env = DiscreteOneHotWrapper(gym.make('FrozenLake-v1', is_slippery = False))#,render_mode = 'human'))
obs_size_ = env.observation_space.shape[0]
n_actions_ = env.action_space.n

# Create the neural network, the loss function and the optimizer
net= NeuralNetwork(obs_size=obs_size_, hidden_size=HIDDEN_SIZE, n_actions=n_actions_)
objective = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr = 0.001)
writer = SummaryWriter(comment="-frozenlake-tweaked-no-slippery")

# Elite episodes carried over from previous iterations; they are re-filtered
# together with every new batch.
full_batch = []

# The writer and the environment are released in `finally`, so they are closed
# exactly once, whether the loop ends by `break`, by an error or by
# interrupting the kernel. (A close() at the end of the loop body is skipped by
# both `continue` and `break`, and otherwise runs on every iteration.)
try:
    for iter_no , batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        # Mean reward of the freshly played episodes only (carried-over elites excluded)
        reward_mean = float(np.mean([episode.reward for episode in batch]))
        full_batch, obs, acts, reward_bound = filter_batches_and_gamma(batch = full_batch + batch, gamma_penalty=GAMMA, percentile=PERCENTILE)
        if not full_batch:
            # No episode beat the bound: nothing to train on, play another batch
            continue
        # Stack through NumPy first: building a tensor directly from a list of
        # ndarrays is the slow path PyTorch warns about.
        obs_v = torch.FloatTensor(np.array(obs))
        act_v = torch.LongTensor(acts)
        full_batch = full_batch[-500:]  # cap the carried-over elite buffer at the 500 most recent episodes

        # Optimizing and updating the neural network
        optimizer.zero_grad()
        action_score_v = net(obs_v)
        loss_v = objective(action_score_v, act_v)
        loss_v.backward()
        optimizer.step()

        print(f"iter_no ; {iter_no}, loss : {loss_v.item()}, Reward Mean : {reward_mean}, Reward bound : {reward_bound}")
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_mean", reward_mean, iter_no)
        writer.add_scalar("reward_bound", reward_bound, iter_no)

        if reward_mean > 0.8:
            print("Solved!")
            break
finally:
    writer.close()
    env.close()


iter_no ; 0, loss : 1.3785868883132935, Reward Mean : 0.01, Reward bound : 0.0
iter_no ; 1, loss : 1.378678560256958, Reward Mean : 0.02, Reward bound : 0.0
iter_no ; 2, loss : 1.3657974004745483, Reward Mean : 0.02, Reward bound : 0.0
iter_no ; 3, loss : 1.360249400138855, Reward Mean : 0.01, Reward bound : 0.0
iter_no ; 4, loss : 1.3615710735321045, Reward Mean : 0.03, Reward bound : 0.0
iter_no ; 5, loss : 1.357996940612793, Reward Mean : 0.02, Reward bound : 0.0
iter_no ; 6, loss : 1.3532918691635132, Reward Mean : 0.02, Reward bound : 0.0
iter_no ; 7, loss : 1.3488185405731201, Reward Mean : 0.01, Reward bound : 0.0
iter_no ; 8, loss : 1.3447781801223755, Reward Mean : 0.01, Reward bound : 0.0
iter_no ; 9, loss : 1.3390018939971924, Reward Mean : 0.01, Reward bound : 0.0
iter_no ; 10, loss : 1.339638113975525, Reward Mean : 0.05, Reward bound : 0.0
iter_no ; 11, loss : 1.3393833637237549, Reward Mean : 0.02, Reward bound : 0.0
iter_no ; 12, loss : 1.3383861780166626, Reward Mean :