In [1]:
import gym
import torch
import numpy as np
import torch.nn as nn
from collections import namedtuple
from tensorboardX import SummaryWriter

#### $ The \ cross-entropy \ method \ on \ CartPole $ 

In [16]:
class InnerNet(nn.Module):
    """ Internal mapper decision function for the agent to choose a 
    action to accumulate the higher possible rewards
    """

    def __init__(self, obs_size:int, hidden_size:int, n_actions:int):
        super(InnerNet, self).__init__()
        self.net = nn.Sequential(
            # In here build the entire Neural Network 
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=n_actions)
        )

    def forward(self, X: torch.FloatType):
        return self.net(X)
    
# Keep tracking of each episode and Steps
Episode = namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names= ['observation', 'action'])


# Batches Generator
def iterate_batches(env:gym.make, Net: nn.Module, batch_size :int):
    """ Iterate through batches and Run the Environments"""

    batch = []
    episode_reward = 0.0
    episode_step = []
    obs = env.reset()  # This will take the Initial State from the environment 
    print(obs.shape)
    sm = nn.Softmax(dim=1)
 
    while True:
        obs_v = torch.FloatTensor([obs]) # Transform the observation into Tensors in Torch
        act_prob_v = sm(Net(obs_v)) # Calculate the probability at each iteration of the Neural Network
        act_prob = act_prob_v.data.numpy()[0] # we need to unpack this by accessing the tensor.data field and then converting the tensor into a NumPy array
        action = np.random.choice(len(act_prob), p = act_prob ) # look for why Len(act_prob)

        next_obs, reward, is_done, _ , _= env.step(action) # it returns (observation, reward, terminated, truncated, info)
        episode_reward += reward
        episode_step.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps= episode_step))
            episode_reward = 0.0
            episode_step = []
            next_obs = env.reset()

            if len(batch) == batch_size:
                yield batch 
                batch = []
        obs = next_obs

    
def filter_batches (batch:namedtuple, percentile:int)-> list:
    """ This function filter the elite or best Episode to retrain the NN
    Parameters:
    - batch: namedtuple, containing the Rewards and Steps
    - percentile: int, percentile to filter the batches """

    # Filtering Rewards
    rewards = list(map(lambda s: s.rewards, batch))
    reward_bound = np.percentile(rewards, percentile) # Get those reward that are above the percentile 
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for example in batch:
        if example.reward < reward_bound: # Filter the batches which reward is above the rewards_bound
            continue
        train_obs.extend(map(lambda step:step.observation, example.steps))
        train_act.extend(map(lambda step:step.action, example.steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean


In [17]:
# Create the Neural Network 
HIDDEN_LAYERS = 50
BATCH_SIZE = 16
PERCENTILE = 70

# Initiating Environment
env = gym.make("CartPole-v1")
obs_size_ = env.observation_space.shape[0] # number of output in the environment ->  ndarray with shape (1,) which takes values {0,1} where 0, push cart to the left, and 1, push cart to the right  
n_actions_ = env.action_space.n  #left , Right

# Initiate Neural Network , Loss Functions and Optimizer
net = InnerNet(obs_size = obs_size_, hidden_size= HIDDEN_LAYERS, n_actions = n_actions_)
objective = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)
writer = SummaryWriter(comment="-cartpole")


# Start Training 
for iter_no, batch in enumerate(iterate_batches(env=env, Net=net, batch_size=BATCH_SIZE)):

    # Applying optimization
    obs_v, acts_v, reward_b, reward_m = filter_batches(batch=batch, percentile=PERCENTILE) # Take the best Scenarios
    optimizer.zero_grad() # Reset the Gradient 
    action_score_v = net(obs_v)

    loss_v = objective(action_score_v, acts_v) # compare the action output vs the winning action 
    loss_v.backward()
    optimizer.step() # Apply back Propagation 

    print(f"iter_no ; {iter_no}, loss : {loss_v.item()}, Reward Mean : {reward_m}, Reward bound : {reward_b}")
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)

    if reward_m > 199:
        print("solved!")
        break 
    writer.close()


ValueError: expected sequence of length 4 at dim 2 (got 0)

In [22]:
env = gym.make("CartPole-v1")
obs_size = env.observation_space.shape[0]

In [24]:
env.action_space.n

2

In [21]:
torch.FloatTensor(np.array([ 0.02712901,  0.03728559, -0.00959169,  0.02476843]))

tensor([ 0.0271,  0.0373, -0.0096,  0.0248])