In [3]:
import time
import torch 
import collections
import numpy as np
import torch.nn as nn
import gymnasium as gym
from collections import namedtuple
from tensorboardX import SummaryWriter

#### $ Functions \ and \ Classes $

In [19]:
class PolicyNeuralNetwork(nn.Module):
    """ Internal NN Policy mapper decision for the agent to choose an action """
    def __init__(self, obs_size:int, hidden_size:int, n_actions:int):
        super(PolicyNeuralNetwork, self).__init__()

        self.policy = nn.Sequential(
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=n_actions)
        )

    def forward(self, X:torch.FloatTensor)-> torch.FloatTensor:
        return torch.tanh(self.policy(X))*2 # Tanh activation maps the output to the range (-1, 1) , # Scale the output to the range (-2, 2)   

#### $ Solving \ the \ Pendulum \ Problem \ with \ Cross \ Entropy $

In [20]:
def iterate_batches(env:gym.make, Policy:nn.Module, batch_size:int, max_episode_steps:int=20):
    """ Iterate thorugh batches and run the enviroment 
    Parameters:
    - env: enviroment 
    - policy: Policy Neural Network 
    - batch_size: batch size"""

    batch = []
    episode_reward = 0.0
    step_count = 0 
    episode_step = []
    obs = env.reset()[0] # ouput three value, x value, y value and Angular Velocity 

    while True:
        abs_vector = torch.FloatTensor(np.array(obs))
        agent_policy_action = Policy(abs_vector).detach().numpy() # Feed Neural Network
        
        # Passing Action into environment 
        next_obs, reward, is_done, _ , _ = env.step(agent_policy_action)
        episode_reward += reward
        episode_step.append(EpisodeStep(observation=obs, action=agent_policy_action))
    
        if is_done or step_count >= max_episode_steps: # 
            batch.append(Episode(reward = episode_reward, steps = episode_step))
            episode_reward = 0.0
            episode_step = []
            next_obs =  env.reset()[0]

            if len(batch) == batch_size:
                yield batch 
                batch = []
        
        obs = next_obs
        step_count += 1 


def filter_batches (batches:collections.namedtuple , percentile:int)-> list:
    """ Filter the elite or best Episode to retrain the NN
    - batch : namedtuple, conataining the Rewards and Steps
    - percentile: int, percentile to filter the batches"""

    # Filtering Rewards
    rewards = list(map(lambda s: s.reward, batches))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for sample in batches:
        if sample.reward >= reward_bound:
            continue
        train_obs.extend(map(lambda step:step.observation, sample.steps))
        train_act.extend(map(lambda step:step.action, sample.steps))

    train_obs_vector = torch.FloatTensor(train_obs)
    train_act_vector = torch.FloatTensor(train_act)

    return train_obs_vector, train_act_vector, reward_bound, reward_mean



In [23]:
# Training Agent 
HIDDEN_LAYER = 150
BATCH_SIZE = 16
PERCENTILE = 70

# Keep tracking of each episode and Steps
Episode = collections.namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = collections.namedtuple('EpisodeStep', field_names= ['observation', 'action'])

# Initiate Environment
env = gym.make('Pendulum-v1', render_mode = 'human')
obs_size = env.observation_space.shape[0]
n_actions_ = env.action_space.shape[0]

# Intiate Neural Network , Loss Function and Optimizer
Net = PolicyNeuralNetwork(obs_size=obs_size, hidden_size=HIDDEN_LAYER, n_actions=n_actions_)
objective = nn.MSELoss()
optimizer = torch.optim.Adam(params=Net.parameters(), lr=0.001)

# Training NN and Agent 
for iter_no, batch in enumerate(iterate_batches(env=env, Policy=Net, batch_size=BATCH_SIZE)):

    # Applying Optimization 
    obs_vector, action_vector, reward_boundary, reward_mean = filter_batches(batches = batch, percentile=PERCENTILE)
    optimizer.zero_grad()
    action_score_vector = Net(obs_vector) # trainining only on the best

    loss_vector = objective(action_score_vector, action_vector)
    loss_vector.backward() # back propagation
    optimizer.step() # applying and update 
    time.sleep(1/10)
    env.render()
    print(f"iter_no ; {iter_no}, loss : {loss_vector.item()}, Reward Mean : {reward_mean}, Reward bound : {reward_boundary}")

    if reward_mean > -1.5:
        print("solved")
        break
env.close()


iter_no ; 0, loss : 8.681227970915115e-15, Reward Mean : -9.400084480318185, Reward bound : -0.44820316193696963
iter_no ; 1, loss : 5.24832678486638e-16, Reward Mean : -3.7799627481683435, Reward bound : -1.087118357544834
iter_no ; 2, loss : 1.0698513167766485e-15, Reward Mean : -3.9609341324854763, Reward bound : -1.5008150293345128
iter_no ; 3, loss : 2.3213753188715833e-15, Reward Mean : -1.7397613233320461, Reward bound : -0.09917973205346153
iter_no ; 4, loss : 6.459479608508399e-16, Reward Mean : -2.968583558231043, Reward bound : -1.1275535664720029
iter_no ; 5, loss : 6.661338147750939e-16, Reward Mean : -5.246190551247389, Reward bound : -3.995646054127965
iter_no ; 6, loss : 8.679925657758712e-16, Reward Mean : -2.527811020138029, Reward bound : -0.5120119591508759
iter_no ; 7, loss : 8.478066589120579e-16, Reward Mean : -2.395224914280303, Reward bound : -1.0744417778791282
iter_no ; 8, loss : 1.0900371707009025e-15, Reward Mean : -2.752302679789458, Reward bound : -0.7885

#### $ Solving \ the \ Mountain \ Car \ Problem \ with \ Cross \ Entropy \ - \ Functions $

In [4]:

def transform_reward (position:int) -> int:
    """ Function that transform the reward according to the the Min position from observation 
    Args:
        position : min position given by the environment

    Return 
        reward  float, 
    """

    reward_values = {-0.1000000:6,0.1100000:5,-0.1300000:4,-0.1500000:3,0.1700000:2,-0.2000000:1}
    for location, reward in reward_values.items():
        if position >= location:
            return reward
        
    return -1


class NeuralNetwork(nn.Module):
    """ Internal NN Policy mapper decision for the agent to choose an action """
    def __init__(self, obs_size:int, hidden_size:int, n_actions:int):
        super(NeuralNetwork, self).__init__()

        self.policy = nn.Sequential(
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.LeakyReLU (),
            nn.Linear(in_features=hidden_size, out_features=550),
            nn.LeakyReLU (),
            nn.Linear(in_features=550, out_features=n_actions),
        )

    def forward(self, X:torch.FloatTensor)-> torch.FloatTensor:
        return self.policy(X)
    

def batches_iterator (env:gym.make, AgentPolicy: nn.Module, batch_size:int):
    """
    Iterate through batches of episodes.

    Args:
        env: The gym environment to use.
        policy_network: The neural network that defines the agent's policy.
        batch_size: The number of episodes per batch.

    Yields:
        A batch of episodes, represented as a list of Episode objects.
    """

    batch , episode_step = [], []
    episode_reward, step_count = 0.0 , 0
    obs = env.reset()[0] # the new version return the output as tuple 
    activation_function = nn.Softmax(dim=1) # activation function 

    while True:
        obs_vector = torch.FloatTensor(np.array([obs]))
        act_prob_vector = activation_function(AgentPolicy(obs_vector)) # get the output probabilities from Neural Network
        act_prob = act_prob_vector.detach().numpy()[0] 
        action = np.random.choice(len(act_prob), p= act_prob)

        # Environment
        next_obs, rewards , is_done, _, _ = env.step(action)
        episode_reward += transform_reward(rewards)

        # Track the best steps 
        episode_step.append(EpisodeStep(observation = obs, action = action))
        
        # Reset
        if is_done or step_count > EPISODES:
            batch.append(Episode(reward=episode_reward, steps = episode_step))
            episode_reward = 0.0 
            next_obs = env.reset()[0]
            episode_step = []

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs
        step_count += 1 

def filter_batches (batch:namedtuple, percentile:int)-> list:
    """ This function filter the elite or best Episode to retrain the NN
    Parameters:
    - batch: namedtuple, containing the Rewards and Steps
    - percentile: int, percentile to filter the batches """

    # Filtering Rewards
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile) # Get those reward that are above the percentile 
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for example in batch:
        if example.reward > reward_bound: # Filter the batches which reward is above the rewards_bound
            continue
        train_obs.extend(map(lambda step:step.observation, example.steps))
        train_act.extend(map(lambda step:step.action, example.steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean


    