In [1]:
import numpy as np
import gymnasium as gym
import random

In [2]:
# Environment Setup
def create_environment(env_id="CartPole-v1"):
    """Create a Gymnasium environment.

    Args:
        env_id: Id string passed to ``gym.make``. Defaults to
            "CartPole-v1", the simple environment used throughout this
            notebook, so existing zero-argument calls behave as before.

    Returns:
        The environment object produced by ``gym.make(env_id)``.
    """
    return gym.make(env_id)

# Preprocessing
def preprocess_state(state):
    """Convert an observation into a new ``float32`` NumPy array.

    Args:
        state: Array-like observation (anything ``np.array`` accepts).

    Returns:
        A freshly allocated ``np.ndarray`` with dtype ``np.float32``.
    """
    target_dtype = np.float32
    observation = np.array(state, dtype=target_dtype)
    return observation

# Train-Test Split (Simulating Episode Splits)
def split_data(episodes=1000, train_ratio=0.8, val_ratio=0.1):
    """Split an episode budget into train / validation / test counts.

    Train and validation sizes are the floor of ``episodes * ratio``; the
    test split receives whatever is left, so the three sizes always sum to
    ``episodes``.

    Args:
        episodes: Total number of episodes to distribute (non-negative).
        train_ratio: Fraction of episodes assigned to training.
        val_ratio: Fraction of episodes assigned to validation.

    Returns:
        Tuple ``(train_size, val_size, test_size)`` of ints.

    Raises:
        ValueError: If ``episodes`` or a ratio is negative, or if the two
            ratios sum to more than 1 (which would make the test split
            negative).
    """
    if episodes < 0:
        raise ValueError(f"episodes must be non-negative, got {episodes}")
    if train_ratio < 0 or val_ratio < 0:
        raise ValueError("train_ratio and val_ratio must be non-negative")
    # Small tolerance so ratio pairs that sum to 1 only up to float error pass.
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    # Rounding to 9 decimals before truncating removes binary-float artefacts
    # (100 * 0.29 evaluates to 28.999999999999996) while keeping floor
    # semantics for genuinely fractional products such as 7.5.
    train_size = int(round(episodes * train_ratio, 9))
    val_size = int(round(episodes * val_ratio, 9))
    test_size = episodes - train_size - val_size
    return train_size, val_size, test_size

In [3]:
# Base Model Training (REINFORCE - Simulated with Random Actions, Not Fully Implemented)
def train_reinforce(env, episodes=100):
    """Roll out placeholder "REINFORCE" episodes and print each return.

    No policy is learned here: every action comes from
    ``env.action_space.sample()``, so this only exercises the rollout loop.

    Args:
        env: Environment whose ``reset()`` result has the observation at
            index 0 and whose ``step()`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        episodes: Number of episodes to run.
    """
    for episode in range(episodes):
        # The random placeholder never reads `state`; it is tracked so a
        # real policy can consume it later.
        state = preprocess_state(env.reset()[0])
        done = False
        total_reward = 0
        while not done:
            action = env.action_space.sample()  # Random action
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode is over on either signal. Looking only at
            # `terminated` keeps stepping an environment that has already
            # been truncated (e.g. by a time limit) without resetting it.
            done = terminated or truncated
            total_reward += reward
            state = preprocess_state(next_state)
        print(f"REINFORCE Episode {episode + 1}: Total Reward = {total_reward}")

# TRPO Training (Simulated - Not Fully Implemented)
def train_trpo(env, episodes=100):
    """Roll out placeholder "TRPO" episodes and print each return.

    TRPO itself is not implemented: actions are sampled uniformly from
    ``env.action_space`` as a stand-in for the policy.

    Args:
        env: Environment whose ``reset()`` result has the observation at
            index 0 and whose ``step()`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        episodes: Number of episodes to run.
    """
    for episode in range(episodes):
        # `state` is not read by the random placeholder; it is kept for the
        # eventual TRPO policy.
        state = preprocess_state(env.reset()[0])
        done = False
        total_reward = 0
        while not done:
            action = env.action_space.sample()  # Placeholder for TRPO policy action
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Stop on termination OR truncation; ignoring `truncated` would
            # keep stepping past a time-limit cut-off without a reset.
            done = terminated or truncated
            total_reward += reward
            state = preprocess_state(next_state)
        print(f"TRPO Episode {episode + 1}: Total Reward = {total_reward}")

In [4]:
# Evaluation
def evaluate_policy(env, episodes=10):
    """Run evaluation episodes with a random policy and report the mean return.

    Args:
        env: Environment whose ``reset()`` result has the observation at
            index 0 and whose ``step()`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        episodes: Number of evaluation episodes.

    Returns:
        The mean total reward over the evaluated episodes, or ``nan`` when
        ``episodes`` is 0. The same value is also printed.
    """
    total_rewards = []
    for episode in range(episodes):
        # `state` is unused while the policy is random; kept for a real policy.
        state = preprocess_state(env.reset()[0])
        done = False
        total_reward = 0
        while not done:
            action = env.action_space.sample()  # Random policy for now
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Either flag ends the episode; dropping `truncated` would keep
            # stepping an environment that has hit its time limit.
            done = terminated or truncated
            total_reward += reward
            state = preprocess_state(next_state)
        total_rewards.append(total_reward)
    # np.mean([]) emits a RuntimeWarning; produce the same nan quietly instead.
    avg_reward = np.mean(total_rewards) if total_rewards else float("nan")
    print(f"Evaluation Average Reward: {avg_reward}")
    return avg_reward

# Deployment (Simulated - Placeholder)
def deploy_policy(env, episodes=5):
    """Simulate deployment by announcing it and running an evaluation pass.

    Note: although the message mentions TRPO, this simply delegates to
    ``evaluate_policy`` on the given environment.

    Args:
        env: Environment forwarded to ``evaluate_policy``.
        episodes: Number of episodes forwarded to ``evaluate_policy``.
    """
    banner = "Deploying TRPO Policy..."
    print(banner)
    evaluate_policy(env, episodes=episodes)

In [5]:
# Main Execution
if __name__ == "__main__":
    env = create_environment()
    try:
        # Only the train and test sizes are consumed below; the validation
        # size is computed and reported but not yet used.
        train_size, val_size, test_size = split_data()
        print(f"Train: {train_size}, Validation: {val_size}, Test: {test_size}")
        train_reinforce(env, train_size)
        train_trpo(env, train_size)
        evaluate_policy(env, test_size)
        deploy_policy(env, 5)
    finally:
        # Release the environment's resources even if a stage above raises.
        env.close()

Train: 800, Validation: 100, Test: 100
REINFORCE Episode 1: Total Reward = 59.0
REINFORCE Episode 2: Total Reward = 44.0
REINFORCE Episode 3: Total Reward = 21.0
REINFORCE Episode 4: Total Reward = 30.0
REINFORCE Episode 5: Total Reward = 33.0
REINFORCE Episode 6: Total Reward = 32.0
REINFORCE Episode 7: Total Reward = 20.0
REINFORCE Episode 8: Total Reward = 15.0
REINFORCE Episode 9: Total Reward = 15.0
REINFORCE Episode 10: Total Reward = 41.0
REINFORCE Episode 11: Total Reward = 19.0
REINFORCE Episode 12: Total Reward = 21.0
REINFORCE Episode 13: Total Reward = 25.0
REINFORCE Episode 14: Total Reward = 10.0
REINFORCE Episode 15: Total Reward = 12.0
REINFORCE Episode 16: Total Reward = 13.0
REINFORCE Episode 17: Total Reward = 20.0
REINFORCE Episode 18: Total Reward = 16.0
REINFORCE Episode 19: Total Reward = 17.0
REINFORCE Episode 20: Total Reward = 23.0
REINFORCE Episode 21: Total Reward = 26.0
REINFORCE Episode 22: Total Reward = 30.0
REINFORCE Episode 23: Total Reward = 11.0
REIN