In [1]:
!pip install tensorflow  # Install or update TensorFlow





In [2]:
!pip install tensorflow-probability  # Install TensorFlow Probability




In [3]:
!pip install gym  # Install OpenAI Gym




In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp


  if (distutils.version.LooseVersion(tf.__version__) <


In [3]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [4]:
# Policy Network for PPO
class PolicyNetwork(tf.keras.Model):
    """Gaussian policy network: maps states to a per-action mean and std.

    Two 64-unit ReLU layers feed two parallel output layers of `num_actions`
    units each: a tanh layer for the mean (so every mean lies in [-1, 1]) and
    a softplus layer for the standard deviation.

    Args:
        num_actions: Number of units in each output layer, i.e. the number of
            action dimensions.
        min_std: Small positive constant added to the softplus output.
            Softplus can underflow to exactly 0 for very negative
            pre-activations, and a zero scale makes the Normal log-density
            non-finite; the floor keeps the std strictly positive.
    """

    def __init__(self, num_actions, min_std=1e-6):
        super().__init__()
        self.min_std = min_std
        self.dense1 = Dense(64, activation='relu')
        self.dense2 = Dense(64, activation='relu')
        self.dense3_mean = Dense(num_actions, activation='tanh')
        self.dense3_std = Dense(num_actions, activation='softplus')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Batch of states fed to the first Dense layer.

        Returns:
            Tuple `(mean, std)`, each with `num_actions` entries on the last
            axis; `std` is strictly positive.
        """
        x = self.dense1(inputs)
        x = self.dense2(x)
        mean = self.dense3_mean(x)
        # Floor the std so downstream Normal(mean, std) is never degenerate.
        std = self.dense3_std(x) + self.min_std
        return mean, std

# PPO Agent Implementation
class PPOAgent:
    """PPO agent with a Gaussian policy and a separate value (critic) network.

    The actor is a `PolicyNetwork` that outputs a mean and std per action
    dimension. The critic is a small MLP with a single linear output; it is
    only trained when `returns` targets are supplied to `compute_loss` /
    `train_step`.

    Args:
        num_actions: Number of action dimensions.
        state_shape: Shape of a single state. Stored for reference only; the
            networks infer their input size on first call.
        lr_actor: Learning rate of the policy optimizer.
        lr_critic: Learning rate of the value-network optimizer.
        gamma: Discount factor. Stored for callers that compute returns or
            advantages; no method of this class reads it.
        clip_ratio: PPO clipping range epsilon for the probability ratio.
        value_coef: Weight of the critic loss in the total loss.
        entropy_coef: Weight of the entropy bonus in the total loss.
    """

    def __init__(self, num_actions, state_shape, lr_actor=0.0003, lr_critic=0.001, gamma=0.99, clip_ratio=0.2, value_coef=0.5, entropy_coef=0.01):
        self.state_shape = state_shape
        self.policy_network = PolicyNetwork(num_actions)
        # Dedicated state-value estimator. The action mean of the policy is
        # not a value estimate and must not be used as one.
        self.value_network = tf.keras.Sequential([
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(1),
        ])
        self.optimizer = Adam(learning_rate=lr_actor)
        self.critic_optimizer = Adam(learning_rate=lr_critic)
        self.gamma = gamma
        self.clip_ratio = clip_ratio
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

    def choose_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: A single state (rank 1) or a batch of states (rank >= 2).
                A rank-1 state gets a leading batch dimension of size 1.

        Returns:
            Tuple `(action, log_prob)`: the sampled action(s) with the same
            shape as the policy mean, and the log-probability of each sample
            summed over the action dimensions.
        """
        state = tf.convert_to_tensor(state, dtype=tf.float32)  # Ensure input is tensor
        if state.shape.rank == 1:
            # Dense layers need a batch axis; a raw environment observation has none.
            state = tf.expand_dims(state, axis=0)
        mean, std = self.policy_network(state)
        action = tf.random.normal(tf.shape(mean), mean, std)
        log_prob = self.log_probability(action, mean, std)
        return action, log_prob

    def log_probability(self, action, mean, std):
        """Log-density of `action` under independent Normal(mean, std) per dimension.

        Returns:
            The per-dimension log-probabilities summed over the last axis.
        """
        dist = tfp.distributions.Normal(mean, std)
        log_prob = dist.log_prob(action)
        # Sum over the last (action) axis, whatever the number of leading axes.
        return tf.reduce_sum(log_prob, axis=-1)

    def compute_loss(self, states, actions, advantages, old_log_probs, returns=None):
        """Compute the PPO clipped-surrogate loss.

        Args:
            states: Batch of states.
            actions: Actions taken in `states`.
            advantages: Advantage estimate per sample.
            old_log_probs: Log-probabilities of `actions` under the policy
                that collected them.
            returns: Optional value targets per sample. When given, the
                squared error between these and the value network's output
                is added, weighted by `value_coef`. When omitted, the critic
                term is zero.

        Returns:
            Scalar tensor: actor loss + value_coef * critic loss
            - entropy_coef * policy entropy.
        """
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)

        # One forward pass serves both the log-probabilities and the entropy.
        mean, std = self.policy_network(states)
        dist = tfp.distributions.Normal(mean, std)
        log_probs = tf.reduce_sum(dist.log_prob(actions), axis=-1)

        # Align per-sample inputs with log_probs so that e.g. a (batch, 1)
        # array cannot silently broadcast into a (batch, batch) matrix.
        # They are data, not functions of the parameters: block gradients.
        sample_shape = tf.shape(log_probs)
        advantages = tf.stop_gradient(
            tf.reshape(tf.cast(advantages, tf.float32), sample_shape))
        old_log_probs = tf.stop_gradient(
            tf.reshape(tf.cast(old_log_probs, tf.float32), sample_shape))

        ratio = tf.exp(log_probs - old_log_probs)
        clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
        actor_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))

        if returns is None:
            critic_loss = 0.0
        else:
            values = tf.squeeze(self.value_network(states), axis=-1)
            returns = tf.stop_gradient(
                tf.reshape(tf.cast(returns, tf.float32), tf.shape(values)))
            critic_loss = tf.reduce_mean(tf.square(returns - values))

        # Analytic entropy of the current policy, summed over action
        # dimensions; -log_prob of stored actions is not the policy entropy.
        entropy = tf.reduce_mean(tf.reduce_sum(dist.entropy(), axis=-1))
        total_loss = actor_loss + self.value_coef * critic_loss - self.entropy_coef * entropy
        return total_loss

    def train_step(self, states, actions, advantages, old_log_probs, returns=None):
        """Apply one gradient update and return the loss.

        The policy network is updated with `self.optimizer`. The value
        network is updated with `self.critic_optimizer` only when `returns`
        is provided, since otherwise it does not take part in the loss.

        Returns:
            The scalar loss computed before the update.
        """
        with tf.GradientTape() as tape:
            loss = self.compute_loss(states, actions, advantages, old_log_probs, returns)
        # Variables are read after the forward pass so both networks are built.
        actor_vars = list(self.policy_network.trainable_variables)
        critic_vars = list(self.value_network.trainable_variables) if returns is not None else []
        gradients = tape.gradient(loss, actor_vars + critic_vars)
        num_actor = len(actor_vars)
        self.optimizer.apply_gradients(zip(gradients[:num_actor], actor_vars))
        if critic_vars:
            self.critic_optimizer.apply_gradients(zip(gradients[num_actor:], critic_vars))
        return loss


In [5]:
# Test the Policy Network and PPO Agent
# Smoke test: build both objects, run one forward pass / one action sample,
# and assert on shapes and value ranges so the success message means something.
STATE_DIM = 4    # assumed state dimensionality for this sample input
NUM_ACTIONS = 2  # example number of action dimensions

sample_state = tf.random.normal([1, STATE_DIM])  # Sample input: a batch of one state

# Test Policy Network
policy_net = PolicyNetwork(num_actions=NUM_ACTIONS)
mean, std = policy_net(sample_state)
assert tuple(mean.shape) == (1, NUM_ACTIONS), f"unexpected mean shape {mean.shape}"
assert tuple(std.shape) == (1, NUM_ACTIONS), f"unexpected std shape {std.shape}"
assert bool(tf.reduce_all(tf.abs(mean) <= 1.0)), "tanh mean must lie in [-1, 1]"
assert bool(tf.reduce_all(std > 0.0)), "std must be strictly positive"

# Test PPO Agent
ppo_agent = PPOAgent(num_actions=NUM_ACTIONS, state_shape=[STATE_DIM])
action, log_prob = ppo_agent.choose_action(sample_state)
assert tuple(action.shape) == (1, NUM_ACTIONS), f"unexpected action shape {action.shape}"
assert tuple(log_prob.shape) == (1,), f"unexpected log_prob shape {log_prob.shape}"
assert bool(tf.reduce_all(tf.math.is_finite(log_prob))), "log_prob must be finite"

print("Policy Network and PPO Agent tests successful.")


Policy Network and PPO Agent tests successful.
