In [2]:
import time
import torch 
import collections
import numpy as np
import torch.nn as nn
import gymnasium as gym
from collections import namedtuple
from tensorboardX import SummaryWriter

#### $ Functions \ and \ Classes $

In [19]:
class PolicyNeuralNetwork(nn.Module):
    """ Internal NN Policy mapper decision for the agent to choose an action """
    def __init__(self, obs_size:int, hidden_size:int, n_actions:int):
        super(PolicyNeuralNetwork, self).__init__()

        self.policy = nn.Sequential(
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=n_actions)
        )

    def forward(self, X:torch.FloatTensor)-> torch.FloatTensor:
        return torch.tanh(self.policy(X))*2 # Tanh activation maps the output to the range (-1, 1) , # Scale the output to the range (-2, 2)   

#### $ Solving \ the \ Pendulum \ Problem \ with \ Cross \ Entropy $

In [20]:
def iterate_batches(env:gym.make, Policy:nn.Module, batch_size:int, max_episode_steps:int=20):
    """ Iterate thorugh batches and run the enviroment 
    Parameters:
    - env: enviroment 
    - policy: Policy Neural Network 
    - batch_size: batch size"""

    batch = []
    episode_reward = 0.0
    step_count = 0 
    episode_step = []
    obs = env.reset()[0] # ouput three value, x value, y value and Angular Velocity 

    while True:
        abs_vector = torch.FloatTensor(np.array(obs))
        agent_policy_action = Policy(abs_vector).detach().numpy() # Feed Neural Network
        
        # Passing Action into environment 
        next_obs, reward, is_done, _ , _ = env.step(agent_policy_action)
        episode_reward += reward
        episode_step.append(EpisodeStep(observation=obs, action=agent_policy_action))
    
        if is_done or step_count >= max_episode_steps: # 
            batch.append(Episode(reward = episode_reward, steps = episode_step))
            episode_reward = 0.0
            episode_step = []
            next_obs =  env.reset()[0]

            if len(batch) == batch_size:
                yield batch 
                batch = []
        
        obs = next_obs
        step_count += 1 


def filter_batches (batches:collections.namedtuple , percentile:int)-> list:
    """ Filter the elite or best Episode to retrain the NN
    - batch : namedtuple, conataining the Rewards and Steps
    - percentile: int, percentile to filter the batches"""

    # Filtering Rewards
    rewards = list(map(lambda s: s.reward, batches))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for sample in batches:
        if sample.reward >= reward_bound:
            continue
        train_obs.extend(map(lambda step:step.observation, sample.steps))
        train_act.extend(map(lambda step:step.action, sample.steps))

    train_obs_vector = torch.FloatTensor(train_obs)
    train_act_vector = torch.FloatTensor(train_act)

    return train_obs_vector, train_act_vector, reward_bound, reward_mean



In [23]:
# Training Agent 
HIDDEN_LAYER = 150
BATCH_SIZE = 16
PERCENTILE = 70

# Keep tracking of each episode and Steps
Episode = collections.namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = collections.namedtuple('EpisodeStep', field_names= ['observation', 'action'])

# Initiate Environment
env = gym.make('Pendulum-v1', render_mode = 'human')
obs_size = env.observation_space.shape[0]
n_actions_ = env.action_space.shape[0]

# Intiate Neural Network , Loss Function and Optimizer
Net = PolicyNeuralNetwork(obs_size=obs_size, hidden_size=HIDDEN_LAYER, n_actions=n_actions_)
objective = nn.MSELoss()
optimizer = torch.optim.Adam(params=Net.parameters(), lr=0.001)

# Training NN and Agent 
for iter_no, batch in enumerate(iterate_batches(env=env, Policy=Net, batch_size=BATCH_SIZE)):

    # Applying Optimization 
    obs_vector, action_vector, reward_boundary, reward_mean = filter_batches(batches = batch, percentile=PERCENTILE)
    optimizer.zero_grad()
    action_score_vector = Net(obs_vector) # trainining only on the best

    loss_vector = objective(action_score_vector, action_vector)
    loss_vector.backward() # back propagation
    optimizer.step() # applying and update 
    time.sleep(1/10)
    env.render()
    print(f"iter_no ; {iter_no}, loss : {loss_vector.item()}, Reward Mean : {reward_mean}, Reward bound : {reward_boundary}")

    if reward_mean > -1.5:
        print("solved")
        break
env.close()


iter_no ; 0, loss : 8.681227970915115e-15, Reward Mean : -9.400084480318185, Reward bound : -0.44820316193696963
iter_no ; 1, loss : 5.24832678486638e-16, Reward Mean : -3.7799627481683435, Reward bound : -1.087118357544834
iter_no ; 2, loss : 1.0698513167766485e-15, Reward Mean : -3.9609341324854763, Reward bound : -1.5008150293345128
iter_no ; 3, loss : 2.3213753188715833e-15, Reward Mean : -1.7397613233320461, Reward bound : -0.09917973205346153
iter_no ; 4, loss : 6.459479608508399e-16, Reward Mean : -2.968583558231043, Reward bound : -1.1275535664720029
iter_no ; 5, loss : 6.661338147750939e-16, Reward Mean : -5.246190551247389, Reward bound : -3.995646054127965
iter_no ; 6, loss : 8.679925657758712e-16, Reward Mean : -2.527811020138029, Reward bound : -0.5120119591508759
iter_no ; 7, loss : 8.478066589120579e-16, Reward Mean : -2.395224914280303, Reward bound : -1.0744417778791282
iter_no ; 8, loss : 1.0900371707009025e-15, Reward Mean : -2.752302679789458, Reward bound : -0.7885

#### $ Solving \ the \ Mountain \ Car \ Problem \ with \ Cross \ Entropy \ - \ Functions $

In [39]:
class NeuralNetwork(nn.Module):
    """ Internal NN Policy mapper decision for the agent to choose an action """
    def __init__(self, obs_size:int, hidden_size:int, n_actions:int):
        super(NeuralNetwork, self).__init__()

        self.policy = nn.Sequential(
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.LeakyReLU (),
            nn.Linear(in_features=hidden_size, out_features=550),
            nn.LeakyReLU (),
            nn.Linear(in_features=550, out_features=n_actions),
        )

    def forward(self, X:torch.FloatTensor)-> torch.FloatTensor:
        return self.policy(X)
    

def batches_iterator (env:gym.make, AgentPolicy: nn.Module, batch_size:int):
    """
    Iterate through batches of episodes.

    Args:
        env: The gym environment to use.
        policy_network: The neural network that defines the agent's policy.
        batch_size: The number of episodes per batch.

    Yields:
        A batch of episodes, represented as a list of Episode objects.
    """

    batch , episode_step = [], []
    episode_reward, step_count = 0.0 , 0
    obs = env.reset()[0] # the new version return the output as tuple 
    activation_function = nn.Softmax(dim=1) # activation function 

    while True:
        obs_vector = torch.FloatTensor(np.array([obs]))
        act_prob_vector = activation_function(AgentPolicy(obs_vector)) # get the output probabilities from Neural Network
        act_prob = act_prob_vector.detach().numpy()[0] 
        action = np.random.choice(len(act_prob), p= act_prob)

        # Environment
        next_obs, rewards , is_done, _, _ = env.step(action)
        episode_reward += rewards * 0.95

        # Track the best steps 
        episode_step.append(EpisodeStep(observation = obs, action = action))
        
        # Reset
        if is_done or step_count > EPISODES:
            batch.append(Episode(reward=episode_reward, steps = episode_step))
            episode_reward = 0.0 
            next_obs = env.reset()[0]
            episode_step = []

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs
        step_count += 1 

def filter_batches (batch:namedtuple, percentile:int)-> list:
    """ This function filter the elite or best Episode to retrain the NN
    Parameters:
    - batch: namedtuple, containing the Rewards and Steps
    - percentile: int, percentile to filter the batches """

    # Filtering Rewards
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile) # Get those reward that are above the percentile 
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for example in batch:
        if example.reward < reward_bound: # Filter the batches which reward is above the rewards_bound
            continue
        train_obs.extend(map(lambda step:step.observation, example.steps))
        train_act.extend(map(lambda step:step.action, example.steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean
    

##### $ Mountain \ Car $  

In [41]:
# Create the Neural Network 
HIDDEN_LAYERS = 400
BATCH_SIZE = 32
PERCENTILE = 90
EPISODES = 100

# Keep tracking of each episode and Steps
Episode = namedtuple('Episode', field_names= ['reward','steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names= ['observation', 'action'])

# Initiating Environment
env = gym.make("MountainCar-v0", render_mode = 'human')
obs_size_ = env.observation_space.shape[0] # number of output in the environment ->  ndarray with shape (1,) which takes values {0,1} where 0, push cart to the left, and 1, push cart to the right  
n_actions_ = env.action_space.n  #accelerate , stop, dont accelerate

# Initiate Neural Network , Loss Functions and Optimizer
net = NeuralNetwork(obs_size = obs_size_, hidden_size= HIDDEN_LAYERS, n_actions = n_actions_)
objective = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.1)
writer = SummaryWriter(comment="-mountain")

# Start Training 
for iter_no, batch in enumerate(batches_iterator(env=env, AgentPolicy=net, batch_size=BATCH_SIZE)):

    # Applying optimization
    obs_v, acts_v, reward_b, reward_m = filter_batches(batch=batch, percentile=PERCENTILE) # Take the best Scenarios
    optimizer.zero_grad() # Reset the Gradient 
    action_score_v = net(obs_v)

    loss_v = objective(action_score_v, acts_v) # compare the action output vs the winning action 
    loss_v.backward()
    optimizer.step() # Apply back Propagation

    print(f"iter_no ; {iter_no}, loss : {loss_v.item()}, Reward Mean : {reward_m}, Reward bound : {reward_b}")
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)

    if reward_m >= -.10:
        print(reward_m)
        print("solved!")
        break 
writer.close()
env.close()

iter_no ; 0, loss : 1.0993001461029053, Reward Mean : -2.078125, Reward bound : -0.5
iter_no ; 1, loss : 2.868468527594814e-06, Reward Mean : -0.5, Reward bound : -0.5
iter_no ; 2, loss : 0.0, Reward Mean : -0.5, Reward bound : -0.5
iter_no ; 3, loss : 0.0, Reward Mean : -0.5, Reward bound : -0.5
iter_no ; 4, loss : 0.0, Reward Mean : -0.5, Reward bound : -0.5
iter_no ; 5, loss : 0.0, Reward Mean : -0.5, Reward bound : -0.5


KeyboardInterrupt: 

In [45]:
env = gym.make("MountainCar-v0", render_mode = 'human')
env.reset()
for _ in range(1000):
    env.step(2)
    env.step(0)

KeyboardInterrupt: 

[+] Playing Random Games
episode = 0, Reward=1880
episode = 1, Reward=0
episode = 2, Reward=0
episode = 3, Reward=0
episode = 4, Reward=0
episode = 5, Reward=0
episode = 6, Reward=0
episode = 7, Reward=0
episode = 8, Reward=0
episode = 9, Reward=0
episode = 10, Reward=0
episode = 11, Reward=0
episode = 12, Reward=0
episode = 13, Reward=0
episode = 14, Reward=0
episode = 15, Reward=0
episode = 16, Reward=0
episode = 17, Reward=0
episode = 18, Reward=0
episode = 19, Reward=0
episode = 20, Reward=0
episode = 21, Reward=0
episode = 22, Reward=0
episode = 23, Reward=0
episode = 24, Reward=0
episode = 25, Reward=0
episode = 26, Reward=0
episode = 27, Reward=0
episode = 28, Reward=0
episode = 29, Reward=0
episode = 30, Reward=0
episode = 31, Reward=0
episode = 32, Reward=0
episode = 33, Reward=0
episode = 34, Reward=0
episode = 35, Reward=0
episode = 36, Reward=0
episode = 37, Reward=0
episode = 38, Reward=0
episode = 39, Reward=0
episode = 40, Reward=0
episode = 41, Reward=0
episode = 42, Re

AttributeError: 'tuple' object has no attribute 'reshape'

In [11]:
env.close()

In [3]:
import gym
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from collections import namedtuple
from time import sleep

RIGHT_CMD = [0, 1]
LEFT_CMD = [1, 0]

# Define Reward Config
BEST_GAMES_TO_EVOLVE = 10

# Define Game Commands
GAME_ACTIONS_MAPPING_TO_ARRAY = [
    [1, 0, 0],  # Movement 0
    [0, 1, 0],  # Movement 1
    [0, 0, 1]   # Movement 2
]

# Initialize Game Environment
env = gym.make('MountainCar-v0')

# Define Structures
GameData = namedtuple('GameData', field_names = ['reward','data'])


def compute_reward(position):
    """
    Compute Reward for Current Position.
    :param position:
    :return:
    """
    # Update Best Position
    if position >= -0.1000000:
        return 6
    if position >= -0.1100000:
        return 5
    if position >= -0.1300000:
        return 4
    if position >= -0.1500000:
        return 3
    if position >= -0.1700000:
        return 2
    if position >= -0.2000000:
        return 1

    return -1


def play_random_games(games=100):
    """
    Play Random Games to Get Some Observations
    :param games:
    :return:
    """

    # Storage for All Games Movements
    all_movements = []

    for episode in range(games):

        # Reset Game Reward
        episode_reward = 0

        # Define Storage for Current Game Data
        current_game_data = []

        # Reset Game Environment
        env.reset()

        # Get First Random Movement
        action = env.action_space.sample()

        while True:

            # Play
            observation, reward, done, info, _ = env.step(action)  # observation=position, velocity

            # Update Reward Value
            reward = compute_reward(observation[[0]])

            # Get Random Action (On Real, its get a "Next" movement to compensate Previous Movement)
            action = env.action_space.sample()

            # Store Observation Data and Action Taken
            current_game_data.append(
                np.hstack((observation, GAME_ACTIONS_MAPPING_TO_ARRAY[action]))
            )

            if done:
                break

            episode_reward += reward

        # Compute Reward
        if episode_reward > -199.0:
            print(f'Reward={episode_reward}')

            # Save All Data
            all_movements.append(
                GameData(episode_reward, current_game_data)
            )

    # Sort Movements Array
    all_movements.sort(key=lambda item: item.reward, reverse=True)

    # Filter the best N games
    all_movements = all_movements[BEST_GAMES_TO_EVOLVE] if len(all_movements) > BEST_GAMES_TO_EVOLVE else all_movements

    # Retrieve only the Game Movements
    movements_only = []
    for single_game_movements in all_movements:
        movements_only.extend([item for item in single_game_movements.data])

    # Create DataFrame
    dataframe = pd.DataFrame(
        movements_only,
        columns=['position', 'velocity', 'action_0', 'action_1', 'action_2']
    )

    return dataframe


def generate_ml(dataframe):
    """
    Train and Generate NN Model
    :param dataframe:
    :return:
    """

    # Define Neural Network Topology
    model = Sequential()
    model.add(Dense(64, input_dim=2, activation='relu'))
    # model.add(Dense(128,  activation='relu'))
    # model.add(Dense(128,  activation='relu'))
    model.add(Dense(64,  activation='relu'))
    model.add(Dense(32,  activation='relu'))
    model.add(Dense(3,  activation='sigmoid'))

    # Compile Neural Network
    model.compile(optimizer='adam', loss='categorical_crossentropy')

    # Fit Model with Data
    model.fit(
        dataframe[['position', 'velocity']],
        dataframe[['action_0', 'action_1', 'action_2']],
        epochs=80
    )

    return model


def play_game(ml_model, games=100):
    """
    Play te Game
    :param ml_model:
    :param games:
    :return:
    """

    for i_episode in range(games):

        # Define Reward Var
        episode_reward = 0

        # Reset Env for the Game
        observation = env.reset()

        while True:
            render = env.render()
            sleep(0.01)

            # Predict Next Movement
            current_action_pred = ml_model.predict(observation.reshape(1, 2))[0]

            # Define Movement
            current_action = np.argmax(current_action_pred)

            # Make Movement
            observation, reward, done, info, _ = env.step(current_action)

            # Update Reward Value
            episode_reward += compute_reward(observation[[0]])

            if done:
                print(f"Episode finished after {i_episode+1} steps", end='')
                break

        print(f" Score = {episode_reward}")


print("[+] Playing Random Games")
df = play_random_games(games=10)

print("[+] Training NN Model")
ml_model = generate_ml(df)

print("[+] Playing Games with NN")
play_game(ml_model=ml_model, games=30)

[+] Playing Random Games
Reward=11026
Reward=1381
Reward=108
Reward=11518
Reward=6968
Reward=7287
Reward=7754
Reward=236
[+] Training NN Model
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
  57/7515 [..............................] - ETA: 13s - loss: 1.0991

KeyboardInterrupt: 