In [1]:
import numpy as np
import random

In [2]:
# ----------------------------
# Environment Setup
# ----------------------------
class SimpleEnvironment:
    def __init__(self, goal_state=10, max_steps=20):
        self.state = 0
        self.goal_state = goal_state
        self.max_steps = max_steps
        self.steps = 0

    def reset(self):
        self.state = 0
        self.steps = 0
        return self.state

    def step(self, action):
        self.state += action
        self.steps += 1
        reward = 1 if self.state == self.goal_state else -0.01
        done = self.state == self.goal_state or self.steps >= self.max_steps
        return self.state, reward, done

# ----------------------------
# Preprocessing
# ----------------------------
def preprocess_state(state):
    return np.array([state / 10])  # Normalize by goal_state for simplicity

# ----------------------------
# Train-Test Split
# ----------------------------
def generate_data(env, num_episodes=100):
    data = []
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        episode = []
        while not done:
            action = random.choice([-1, 1])
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward, next_state))
            state = next_state
        data.append(episode)
    split_idx = int(0.8 * len(data))
    return data[:split_idx], data[split_idx:]

# ----------------------------
# Monte Carlo Tree Search (MCTS)
# ----------------------------
class MCTSNode:
    def __init__(self, state, parent=None):
        self.state = state
        self.parent = parent
        self.children = {}
        self.visits = 0
        self.value = 0

    def is_fully_expanded(self):
        return len(self.children) == 2  # actions: -1, 1

    def best_child(self, c_param=1.4):
        choices = [child for child in self.children.values()]
        return max(choices, key=lambda x: x.value / (x.visits + 1e-4) + c_param * np.sqrt(np.log(self.visits + 1) / (x.visits + 1e-4)))

def mcts_search(env, state, num_simulations=20):
    root = MCTSNode(state)

    for _ in range(num_simulations):
        node = root
        sim_env = SimpleEnvironment()
        sim_env.state = state

        # Selection & Expansion
        while node.is_fully_expanded() and node.children:
            node = node.best_child()
            sim_env.step(node.state - node.parent.state)

        if not node.is_fully_expanded():
            action = random.choice([-1, 1])
            next_state, reward, done = sim_env.step(action)
            new_node = MCTSNode(next_state, parent=node)
            node.children[action] = new_node
            node = new_node

        # Simulation
        total_reward = 0
        for _ in range(5):
            if sim_env.state == sim_env.goal_state:
                total_reward += 1
                break
            action = random.choice([-1, 1])
            _, reward, done = sim_env.step(action)
            total_reward += reward
            if done:
                break

        # Backpropagation
        while node:
            node.visits += 1
            node.value += total_reward
            node = node.parent

    return max(root.children.items(), key=lambda item: item[1].value / (item[1].visits + 1e-4))[0]

# ----------------------------
# Fine-Tune Model
# ----------------------------
def fine_tune(env, num_episodes=50):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = mcts_search(env, state, num_simulations=10)  # Reduced simulations for faster performance
            next_state, reward, done = env.step(action)
            state = next_state

# ----------------------------
# Evaluation
# ----------------------------
def evaluate(env, num_episodes=20):
    success_count = 0
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done:
            action = mcts_search(env, state, num_simulations=5)
            state, _, done = env.step(action)
            steps += 1
        if env.state == env.goal_state:
            success_count += 1
    print(f"Success Rate: {success_count / num_episodes * 100:.2f}%")

# ----------------------------
# Deploy Policy
# ----------------------------
def deploy_policy(env):
    state = env.reset()
    done = False
    print("Deployment Run:")
    while not done:
        action = mcts_search(env, state, num_simulations=5)
        state, reward, done = env.step(action)
        print(f"State: {state}, Action: {action}, Reward: {reward}")

In [3]:
# ----------------------------
# Main Flow
# ----------------------------
if __name__ == "__main__":
    env = SimpleEnvironment(goal_state=5, max_steps=10)

    # Preprocessing example
    preprocessed_state = preprocess_state(env.reset())
    print(f"Preprocessed State: {preprocessed_state}")

    # Train-Test Split
    train_data, test_data = generate_data(env)
    print(f"Training Episodes: {len(train_data)}, Testing Episodes: {len(test_data)}")

    # Train Base Model
    print("Training base model...")
    fine_tune(env)

    # Evaluate
    print("Evaluating model...")
    evaluate(env)

    # Deploy
    print("Deploying policy...")
    deploy_policy(env)

Preprocessed State: [0.]
Training Episodes: 80, Testing Episodes: 20
Training base model...
Evaluating model...
Success Rate: 25.00%
Deploying policy...
Deployment Run:
State: -1, Action: -1, Reward: -0.01
State: -2, Action: -1, Reward: -0.01
State: -3, Action: -1, Reward: -0.01
State: -4, Action: -1, Reward: -0.01
State: -3, Action: 1, Reward: -0.01
State: -4, Action: -1, Reward: -0.01
State: -3, Action: 1, Reward: -0.01
State: -2, Action: 1, Reward: -0.01
State: -1, Action: 1, Reward: -0.01
State: -2, Action: -1, Reward: -0.01
