In [24]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.distributions as dist
import numpy as np
import random
import torch.nn.functional as F
from torch.distributions import Categorical

In [25]:
# Step 1: Environment
class DummyEnv:
    """Toy environment that emits random observations, rewards and terminations.

    Nothing here depends on the action taken: every observation is uniform
    noise in [0, 1), every reward is a standard-normal draw, and an episode
    ends whenever a uniform draw exceeds ``done_threshold``. It exists only to
    exercise a training loop end to end.

    Args:
        state_space: Length of the observation vector.
        action_space: Number of discrete actions the agent may choose from.
        done_threshold: An episode terminates on a step when a uniform [0, 1)
            draw is strictly greater than this value (0.95 -> about 5% of steps).
        seed: Optional seed. When given, the environment owns a private
            ``np.random.RandomState`` so its stream is reproducible and
            isolated. When ``None`` it draws from NumPy's global generator,
            so ``np.random.seed(...)`` still controls it.
    """

    def __init__(self, state_space=4, action_space=2, done_threshold=0.95, seed=None):
        self.state_space = state_space  # Observation dimension
        self.action_space = action_space  # Number of discrete actions
        self.done_threshold = done_threshold
        # Kept as None (rather than storing the np.random module) so the
        # instance stays picklable/copyable when no seed is supplied.
        self._rng = None if seed is None else np.random.RandomState(seed)

    def _random(self):
        """Return the random source: the private generator or NumPy's global one."""
        return np.random if self._rng is None else self._rng

    def reset(self):
        """Start a new episode and return a random initial observation."""
        return self._random().rand(self.state_space)

    def step(self, action):
        """Advance one step; ``action`` is accepted but does not affect the outcome.

        Returns:
            A ``(next_state, reward, done, info)`` tuple where ``info`` is an
            empty dict.
        """
        rng = self._random()
        # Draw order (state, reward, done) is fixed so seeded runs are stable.
        next_state = rng.rand(self.state_space)
        reward = rng.randn()
        done = rng.rand() > self.done_threshold
        return next_state, reward, done, {}

In [26]:
# Step 2: Preprocessing (simulate with random data)
def preprocess_data(data):
    """Standardize ``data`` along axis 0 (zero mean, unit spread per column).

    A small constant (1e-9) is added to each standard deviation so that a
    constant column divides by a tiny number instead of by zero.
    """
    column_mean = np.mean(data, axis=0)
    column_scale = np.std(data, axis=0) + 1e-9
    centered = data - column_mean
    return centered / column_scale

# Step 3: Train-Test Split
def train_test_split(data, split_ratio=0.8):
    """Cut ``data`` into a leading and a trailing part without shuffling.

    The first ``int(len(data) * split_ratio)`` items form the first part and
    the remainder forms the second part.

    Returns:
        A ``(first_part, second_part)`` tuple of slices of ``data``.
    """
    n_items = len(data)
    cutoff = int(n_items * split_ratio)
    first_part = data[:cutoff]
    second_part = data[cutoff:]
    return first_part, second_part

# Policy Network
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to action probabilities.

    Architecture: Linear(input_dim -> 16) -> ReLU -> Linear(16 -> output_dim)
    -> softmax over the last dimension.
    """

    def __init__(self, input_dim=4, output_dim=2):
        super().__init__()
        # Layers are created in this order so parameter names and seeded
        # initialisation stay the same.
        self.fc = nn.Linear(input_dim, 16)
        self.output = nn.Linear(16, output_dim)

    def forward(self, state):
        """Return a probability vector over actions for ``state``."""
        hidden = F.relu(self.fc(state))
        logits = self.output(hidden)
        # Softmax over the last axis yields non-negative values that sum to 1.
        return F.softmax(logits, dim=-1)

In [39]:
def _discounted_returns(rewards, gamma):
    """Return the discounted return G_t for each step of one episode (float32 tensor)."""
    running = 0.0
    reversed_returns = []
    # Walk backwards so each G_t = r_t + gamma * G_{t+1} is one multiply-add;
    # append + reverse keeps this linear instead of list.insert(0, ...)'s O(T^2).
    for reward in reversed(rewards):
        running = reward + gamma * running
        reversed_returns.append(running)
    reversed_returns.reverse()
    return torch.tensor(reversed_returns, dtype=torch.float32)


def train_reinforce(env, policy, optimizer, num_episodes=100, gamma=0.99, verbose=True):
    """Train ``policy`` with the REINFORCE (Monte Carlo policy gradient) update.

    Each episode is rolled out to termination by sampling actions from the
    categorical distribution produced by ``policy``; one optimizer step is
    then taken on ``-sum(log_prob_t * G_t)``.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        policy: Callable mapping a float32 state tensor to action probabilities.
        optimizer: Optimizer over the policy's parameters.
        num_episodes: Number of episodes (and gradient steps) to run.
        gamma: Discount factor applied to future rewards.
        verbose: When True (default), print the loss after every episode.

    Returns:
        List with one float loss value per episode, in order.
    """
    loss_history = []
    for episode in range(num_episodes):
        log_probs = []
        rewards = []
        state = torch.tensor(env.reset(), dtype=torch.float32)

        done = False
        while not done:
            distribution = Categorical(policy(state))
            action = distribution.sample()

            # Keep the log-probability attached to the graph for the update.
            log_probs.append(distribution.log_prob(action))
            next_state, reward, done, _ = env.step(action.item())
            rewards.append(reward)
            state = torch.tensor(next_state, dtype=torch.float32)

        returns = _discounted_returns(rewards, gamma)

        # Standardize returns to reduce gradient variance, but only when that
        # is well defined: more than one step and a non-degenerate spread.
        if len(returns) > 1:
            std_val = returns.std(unbiased=False)
            if std_val > 1e-8:  # Avoid division by zero
                returns = (returns - returns.mean()) / std_val

        # Minimising the negative return-weighted log-likelihood.
        loss = -torch.sum(torch.stack(log_probs) * returns)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        loss_history.append(loss_value)
        if verbose:
            print(f"Episode {episode + 1:03d} | Loss: {loss_value:.4f}")

    return loss_history

In [40]:
# Evaluation function
def evaluate_policy(env, policy, num_episodes=10):
    """Roll out ``policy`` greedily and report the mean episode reward.

    At every step the action with the highest probability is taken (argmax,
    no sampling). Rollouts run under ``torch.no_grad()`` because no gradients
    are needed for evaluation.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        policy: Callable mapping a float32 state tensor to action probabilities.
        num_episodes: Number of evaluation episodes to average over.

    Returns:
        The average total reward per episode (also printed).
    """
    total_reward = 0
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(num_episodes):
            state = torch.tensor(env.reset(), dtype=torch.float32)
            done = False
            episode_reward = 0
            while not done:
                action_probs = policy(state)
                action = torch.argmax(action_probs).item()
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                state = torch.tensor(next_state, dtype=torch.float32)
            total_reward += episode_reward

    average_reward = total_reward / num_episodes
    print(f"Average Reward over {num_episodes} episodes: {average_reward:.2f}")
    return average_reward

# Dummy Deployment
def deploy_policy(policy):
    """Placeholder deployment step: only prints a confirmation message.

    ``policy`` is accepted for interface completeness but is not used.
    """
    status_message = "Policy deployed successfully!"
    print(status_message)

In [41]:
# Main Function
def main(seed=None):
    """Run the demo pipeline: data prep, split, REINFORCE training, evaluation, deploy.

    The preprocessed data and its train/test split are only reported here;
    training and evaluation interact with ``DummyEnv`` directly.

    Args:
        seed: Optional integer. When given, Python's ``random``, NumPy's global
            generator and PyTorch are all seeded so a run can be repeated.
            ``None`` (default) leaves every generator untouched.
    """
    if seed is not None:
        # Seed every RNG the pipeline can touch: the environment and data use
        # NumPy, network initialisation and action sampling use PyTorch.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Step 1: Environment
    env = DummyEnv()

    # Step 2: Generate random data and preprocess
    data = np.random.rand(100, env.state_space)
    processed_data = preprocess_data(data)

    # Step 3: Train-Test Split
    train_data, test_data = train_test_split(processed_data)
    print(f"Data split: {len(train_data)} training samples, {len(test_data)} testing samples.\n")

    # Initialize Policy and Optimizer
    policy = PolicyNetwork(input_dim=env.state_space, output_dim=env.action_space)
    optimizer = optim.Adam(policy.parameters(), lr=0.01)

    # Step 4: Train
    print("Training policy using REINFORCE...\n")
    train_reinforce(env, policy, optimizer, num_episodes=100)

    # Step 5: Evaluate
    print("\nEvaluating the trained policy...")
    evaluate_policy(env, policy, num_episodes=10)

    # Step 6: Deploy Policy
    deploy_policy(policy)

# Entry point: run the pipeline only when this module is the top-level program
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Data split: 80 training samples, 20 testing samples.

Training policy using REINFORCE...

Episode 001 | Loss: 1.6313
Episode 002 | Loss: -0.6457
Episode 003 | Loss: -0.6906
Episode 004 | Loss: 0.4211
Episode 005 | Loss: 0.5189
Episode 006 | Loss: -0.9433
Episode 007 | Loss: -0.4621
Episode 008 | Loss: 0.0372
Episode 009 | Loss: -0.3840
Episode 010 | Loss: 0.9301
Episode 011 | Loss: 0.3736
Episode 012 | Loss: 1.4039
Episode 013 | Loss: 0.8700
Episode 014 | Loss: -0.9375
Episode 015 | Loss: -0.1025
Episode 016 | Loss: -1.0211
Episode 017 | Loss: -0.2897
Episode 018 | Loss: 0.0635
Episode 019 | Loss: 1.3083
Episode 020 | Loss: 1.2413
Episode 021 | Loss: -0.0265
Episode 022 | Loss: 1.3342
Episode 023 | Loss: -2.4558
Episode 024 | Loss: 0.2154
Episode 025 | Loss: -0.5026
Episode 026 | Loss: 0.8787
Episode 027 | Loss: -0.3820
Episode 028 | Loss: -0.2196
Episode 029 | Loss: -0.0451
Episode 030 | Loss: -0.5375
Episode 031 | Loss: 0.3166
Episode 032 | Loss: -1.0678
Episode 033 | Loss: -0.7930
E