In [None]:
# Write a program to demonstrate the REINFORCE algorithm for policy gradient methods

# Name: Sharvari Pramod Jape
# Class: B.E AIML
# Roll No: 43526

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple policy network
class PolicyNetwork(nn.Module):
    """Two-layer feed-forward policy mapping a state to action probabilities.

    Args:
        input_size: Number of features in a state vector.
        hidden_size: Width of the single hidden layer.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Compute action probabilities for one state or a batch of states.

        Args:
            x: Float tensor whose last dimension has size ``input_size``,
                e.g. ``(input_size,)`` or ``(batch, input_size)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``output_size``; values along the last
            dimension are non-negative and sum to 1.
        """
        hidden = torch.relu(self.fc1(x))
        # Normalise over the last (action) axis instead of a hard-coded dim=1:
        # identical for 2-D batches, but also valid for unbatched 1-D states
        # and for inputs with extra leading dimensions.
        return torch.softmax(self.fc2(hidden), dim=-1)

# Define the REINFORCE algorithm
class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete actions.

    Args:
        input_size: Number of features in a state vector.
        hidden_size: Width of the policy network's hidden layer.
        output_size: Number of discrete actions.
        learning_rate: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to future rewards. Defaults to 0.9.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate, gamma=0.9):
        self.policy_network = PolicyNetwork(input_size, hidden_size, output_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)
        self.gamma = gamma

    def select_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: 1-D state vector (NumPy array, sequence of numbers, or tensor).

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a tensor of shape ``(1,)`` that is still attached
            to the autograd graph, so it can be used in ``update_policy``.
        """
        # as_tensor accepts arrays, lists and tensors; the leading batch axis
        # of size 1 is added because the network is fed a batch of one state.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probabilities = self.policy_network(state)
        action_distribution = torch.distributions.Categorical(probabilities)
        action = action_distribution.sample()
        return action.item(), action_distribution.log_prob(action)

    def _discounted_returns(self, rewards):
        """Return the discounted return of every step as a float32 tensor."""
        returns = []
        running_return = 0.0
        # Walk the episode backwards so each return reuses the next step's one.
        for reward in reversed(rewards):
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        # Appending then reversing once is linear; insert(0, ...) is quadratic.
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

    def update_policy(self, rewards, log_probs):
        """Take one policy-gradient step from a finished episode.

        Args:
            rewards: Sequence of per-step rewards, in time order.
            log_probs: Per-step log-probability tensors returned by
                ``select_action``, in the same order as ``rewards``.

        Returns:
            None. An empty episode leaves the network untouched.
        """
        returns = self._discounted_returns(rewards)
        # The sample standard deviation of a single value is NaN, which would
        # poison the loss and the weights; only standardise with >= 2 returns.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        # reshape(-1) lets both 0-d and (1,)-shaped log-probs be concatenated.
        step_losses = [
            -log_prob.reshape(-1) * step_return
            for log_prob, step_return in zip(log_probs, returns)
        ]
        if not step_losses:
            # Nothing to learn from; torch.cat would raise on an empty list.
            return

        self.optimizer.zero_grad()
        policy_loss = torch.cat(step_losses).sum()
        policy_loss.backward()
        self.optimizer.step()

# Main function for training
def main(num_episodes=1000, episode_length=100, log_every=50, seed=None):
    """Train a REINFORCE agent on a dummy environment with random rewards.

    The "environment" is a placeholder: one random state is drawn per episode
    and reused for every step, and each reward is a uniform random number that
    does not depend on the chosen action.

    Args:
        num_episodes: Number of episodes to run.
        episode_length: Number of steps per episode.
        log_every: Print the episode's total reward every this many episodes;
            0 or None disables printing.
        seed: Optional integer used to seed NumPy and PyTorch so that a run
            can be reproduced. None leaves both generators unseeded.
    """
    input_size = 4  # Example state size
    hidden_size = 128
    output_size = 2  # Example action space
    learning_rate = 0.01

    # Seed before building the agent so the weight initialisation is covered.
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    reinforce_agent = REINFORCE(input_size, hidden_size, output_size, learning_rate)

    # Dummy environment loop
    for episode in range(num_episodes):
        state = np.random.rand(input_size)  # Example state generation
        rewards = []
        log_probs = []
        for _ in range(episode_length):
            # The action is sampled but unused: the dummy reward ignores it.
            action, log_prob = reinforce_agent.select_action(state)
            reward = np.random.rand()  # Example reward generation
            rewards.append(reward)
            log_probs.append(log_prob)

        reinforce_agent.update_policy(rewards, log_probs)
        if log_every and episode % log_every == 0:
            print(f"Episode {episode}, Total Reward: {sum(rewards)}")

# Run the training demo only when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Episode 0, Total Reward: 53.01022616416517
Episode 50, Total Reward: 52.88188490162602
Episode 100, Total Reward: 45.091438747599156
Episode 150, Total Reward: 53.070941480811456
Episode 200, Total Reward: 50.73842513894322
Episode 250, Total Reward: 43.041316052144616
Episode 300, Total Reward: 50.07343437131227
Episode 350, Total Reward: 48.43099674090892
Episode 400, Total Reward: 50.09940421629273
Episode 450, Total Reward: 44.40280792373482
Episode 500, Total Reward: 50.92515033604575
Episode 550, Total Reward: 48.491610880485844
Episode 600, Total Reward: 44.98059683515049
Episode 650, Total Reward: 43.875799910852635
Episode 700, Total Reward: 52.39833338397134
Episode 750, Total Reward: 54.51912816113648
Episode 800, Total Reward: 48.394859130038064
Episode 850, Total Reward: 47.10746518209345
Episode 900, Total Reward: 52.25245937329162
Episode 950, Total Reward: 49.78302628939642
