# Задача о Маятнике

Наша цель — обучить несколько моделей для этой [задачи](https://gymnasium.farama.org/environments/classic_control/pendulum/).

## Подготовка данных

In [None]:
import gymnasium as gym

In [None]:
env = gym.make("Pendulum-v1", render_mode="rgb_array", g=9.81)

In [None]:
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>

In [None]:
# Reset with a fixed seed so the initial observation is reproducible.
# NOTE(review): Pendulum-v1's reset appears to read the `x_init` / `y_init`
# option keys; confirm whether `low` / `high` have any effect here.
env.reset(seed=123, options={"low": -0.7, "high": 0.5})

(array([ 0.4123625 ,  0.91101986, -0.89235795], dtype=float32), {})

In [None]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

## Обучение с подкреплением на основе DQN (предсказание положения через n шагов)

### Обучение (n=5, дискретных действий = 21)

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Create the environment (rendering disabled).
env = gym.make('Pendulum-v1', render_mode=None)

# Hyperparameters
BATCH_SIZE = 128  # transitions sampled per optimisation step
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value
EPS_START = 1.0  # initial epsilon of the epsilon-greedy policy
EPS_END = 0.01  # lower bound for epsilon
EPS_DECAY = 0.995  # multiplicative epsilon decay, applied once per episode
LR = 0.0005  # learning rate of the DQN optimizer
TARGET_UPDATE = 10  # the target network is synced every TARGET_UPDATE episodes
MEMORY_SIZE = 20000  # replay buffer capacity
HIDDEN_SIZE = 128  # width of the DQN hidden layers
N_FUTURE_STEPS = 5  # look-ahead offset of the secondary network's target state
SEQUENCE_LENGTH = 10  # hidden representations per secondary-network input window

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используется устройство: {device}")

# One replay-buffer record; `done` marks transitions that must not bootstrap.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    """Fixed-capacity FIFO store of ``Transition`` records for experience replay."""

    def __init__(self, capacity):
        # A bounded deque drops its oldest entry once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from the positional arguments and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored entries picked at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of entries currently stored."""
        stored = self.memory
        return len(stored)

class DQN(nn.Module):
    """Two-hidden-layer Q-network that also exposes its last hidden activation.

    ``forward`` returns ``(q_values, hidden_representation)``; the hidden
    representation is the (dropout-regularised) output of the second layer.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation yields the same weights.
        self.fc1 = nn.Linear(in_features=state_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.dropout = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=action_size)

    def forward(self, x):
        first_layer = F.relu(self.fc1(x))
        features = self.dropout(F.relu(self.fc2(first_layer)))
        q_values = self.fc3(features)
        return q_values, features

class EnhancedSecondaryNetwork(nn.Module):
    """LSTM encoder followed by an MLP head that maps a sequence to one state vector.

    The input is indexed as ``[:, -1, :]`` after the LSTM, so it must be a
    batched 3-D tensor (batch first); only the last time step feeds the head.
    """

    def __init__(self, hidden_size, state_size):
        super().__init__()
        lstm_width = 128
        head_width = 256
        self.lstm = nn.LSTM(hidden_size, lstm_width, batch_first=True)
        head_layers = [
            nn.Linear(lstm_width, head_width),
            nn.ReLU(),
            nn.LayerNorm(head_width),
            nn.Linear(head_width, state_size),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        outputs, _ = self.lstm(x)
        last_step = outputs[:, -1, :]
        return self.fc(last_step)

def discretize_action(action_idx, action_size, low=-2.0, high=2.0):
    """Map a discrete action index onto an evenly spaced continuous action value.

    Args:
        action_idx: Index into the grid of ``action_size`` values.
        action_size: Number of evenly spaced values between ``low`` and ``high``
            (both ends included).
        low: Smallest action value; defaults to the previously hard-coded -2.0.
        high: Largest action value; defaults to the previously hard-coded 2.0.

    Returns:
        A one-element NumPy array holding the selected action value.
    """
    action_grid = np.linspace(low, high, action_size)
    return np.array([action_grid[action_idx]])

def select_action(state, policy_net, action_size, epsilon):
    """Pick an action index epsilon-greedily.

    With probability ``epsilon`` a uniformly random index in
    ``[0, action_size)`` is returned; otherwise the index with the largest
    Q-value from ``policy_net(state)``. The result is a ``(1, 1)`` tensor.
    """
    explore = random.random() < epsilon
    if explore:
        random_idx = random.randrange(action_size)
        return torch.tensor([[random_idx]], device=device, dtype=torch.long)

    with torch.no_grad():
        q_values, _ = policy_net(state)
        _, greedy_idx = q_values.max(1)
        return greedy_idx.view(1, 1)

def optimize_model(memory, policy_net, target_net, optimizer):
    """Run one DQN optimisation step on a random replay batch.

    Args:
        memory: Replay buffer supporting ``len()`` and ``sample(batch_size)``.
        policy_net: Network being trained; called as ``policy_net(states)`` and
            expected to return ``(q_values, hidden)``.
        target_net: Network providing bootstrap values; same return convention.
        optimizer: Optimizer that updates ``policy_net``'s parameters.

    Returns:
        The scalar Huber loss of the step, or ``0`` when the buffer holds fewer
        than ``BATCH_SIZE`` transitions (no update is performed).
    """
    if len(memory) < BATCH_SIZE:
        return 0

    batch = Transition(*zip(*memory.sample(BATCH_SIZE)))

    # Transitions flagged `done` must not bootstrap from their next state.
    keep_flags = [not bool(d) for d in batch.done]
    non_final_mask = torch.tensor(keep_flags, device=device, dtype=torch.bool)
    non_final_next_states = [s for s, keep in zip(batch.next_state, keep_flags) if keep]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    q_values, _ = policy_net(state_batch)
    state_action_values = q_values.gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the target pass when every
    # sampled transition is terminal; their bootstrap value stays at zero.
    if non_final_next_states:
        with torch.no_grad():
            next_q_values, _ = target_net(torch.cat(non_final_next_states))
            next_state_values[non_final_mask] = next_q_values.max(1)[0]

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Element-wise clamp of gradients to [-1, 1]; unlike a manual loop this
    # skips parameters that received no gradient.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1.0)
    optimizer.step()

    return loss.item()

def collect_trajectory_data(policy_net, num_episodes=200):
    """Roll out the policy and build (hidden-sequence, future-state) training pairs.

    Each episode runs for at most 200 steps with a near-greedy policy
    (epsilon 0.01). For every window start ``i`` the sample is the
    ``SEQUENCE_LENGTH`` hidden representations starting at ``i`` paired with
    the observation at index ``i + SEQUENCE_LENGTH + N_FUTURE_STEPS``.

    NOTE(review): that target lies ``N_FUTURE_STEPS + 1`` steps after the last
    element of the window - confirm this is the intended horizon.

    Puts ``policy_net`` into eval mode and leaves it there.

    Returns:
        Tuple ``(hidden_reps, targets)`` of NumPy arrays.
    """
    window_inputs = []
    window_targets = []
    lookahead = SEQUENCE_LENGTH + N_FUTURE_STEPS

    policy_net.eval()
    with torch.no_grad():
        for _episode in tqdm(range(num_episodes), desc="Сбор данных"):
            obs, _ = env.reset()
            visited = []
            features = []

            for _step in range(200):
                obs_tensor = torch.tensor(obs, device=device, dtype=torch.float32).unsqueeze(0)
                _, hidden = policy_net(obs_tensor)

                visited.append(obs)
                features.append(hidden.cpu().numpy()[0])

                chosen = select_action(obs_tensor, policy_net, action_size, epsilon=0.01)
                torque = discretize_action(chosen.item(), action_size)
                obs, _, done, _, _ = env.step(torque)

                if done:
                    break

            # Episodes shorter than the look-ahead yield an empty range here.
            for start in range(len(features) - lookahead):
                window = features[start:start + SEQUENCE_LENGTH]
                window_inputs.append(np.array(window))
                window_targets.append(visited[start + lookahead])

    return np.array(window_inputs), np.array(window_targets)

def train_secondary_network(hidden_reps, targets):
    """Train the secondary (state-prediction) network and return the best epoch.

    Inputs and targets are standardised with freshly fitted ``StandardScaler``
    objects. The weights of the epoch with the lowest mean training loss are
    kept in memory and restored before returning; a checkpoint with the same
    contents as before is still written to ``best_secondary_net.pth``.

    Args:
        hidden_reps: Array of hidden-representation windows; the last axis is
            the feature axis.
        targets: Array of target states, one per window.

    Returns:
        Tuple ``(secondary_net, scaler_x, scaler_y)`` with the network in eval mode.

    Raises:
        ValueError: If ``hidden_reps`` contains no samples.
    """
    import copy  # local import: only needed to snapshot the best weights

    if len(hidden_reps) == 0:
        raise ValueError("hidden_reps is empty; nothing to train on")

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    # The scaler works on 2-D data, so flatten the windows and restore the shape.
    X = scaler_x.fit_transform(hidden_reps.reshape(-1, hidden_reps.shape[-1])).reshape(hidden_reps.shape)
    y = scaler_y.fit_transform(targets)

    X_tensor = torch.tensor(X, dtype=torch.float32, device=device)
    y_tensor = torch.tensor(y, dtype=torch.float32, device=device)

    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    secondary_net = EnhancedSecondaryNetwork(HIDDEN_SIZE, state_size).to(device)
    optimizer = optim.AdamW(secondary_net.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    criterion = nn.HuberLoss()

    best_loss = float('inf')
    best_state = None
    secondary_net.train()

    for epoch in range(100):
        epoch_loss = 0
        for batch_x, batch_y in loader:
            optimizer.zero_grad()

            predictions = secondary_net(batch_x)
            loss = criterion(predictions, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(secondary_net.parameters(), 1.0)
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(loader)
        scheduler.step(epoch_loss)

        if epoch % 20 == 0:
            print(f'Epoch {epoch}: Loss = {epoch_loss:.5f}')

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            # state_dict() returns live references, so take a deep copy.
            best_state = copy.deepcopy(secondary_net.state_dict())
            # The checkpoint also pickles the scalers, so loading it back with
            # torch.load needs weights_only=False on PyTorch >= 2.6. We therefore
            # restore from memory below instead of re-reading this file.
            torch.save({
                'model_state_dict': secondary_net.state_dict(),
                'scaler_x': scaler_x,
                'scaler_y': scaler_y
            }, 'best_secondary_net.pth')

    # best_state stays None only if no epoch produced a finite, improving loss.
    if best_state is not None:
        secondary_net.load_state_dict(best_state)
    secondary_net.eval()
    return secondary_net, scaler_x, scaler_y

class SecondaryPolicy:
    """Policy that acts on the state predicted by the secondary network.

    A rolling window of the policy network's hidden representations is fed to
    the secondary network; the predicted state is then passed through the
    policy network to choose the action. Until the window is full the action
    is chosen from the current state directly.

    Args:
        policy_net: Network returning ``(q_values, hidden)``.
        secondary_net: Network mapping a scaled hidden-representation window
            to a scaled state.
        scaler_x: Fitted scaler for hidden representations.
        scaler_y: Fitted scaler for states.
        horizon: Kept for backward compatibility and stored as ``self.horizon``.
        sequence_length: Length of the hidden-representation window. Defaults
            to ``SEQUENCE_LENGTH`` so inference uses the same window length the
            secondary network is trained on (previously ``horizon`` was used).
    """

    def __init__(self, policy_net, secondary_net, scaler_x, scaler_y, horizon=5, sequence_length=None):
        self.policy_net = policy_net
        self.secondary_net = secondary_net
        self.scaler_x = scaler_x
        self.scaler_y = scaler_y
        self.horizon = horizon
        # Resolved lazily so the module-level constant is only needed when the
        # caller does not pass an explicit window length.
        self.sequence_length = SEQUENCE_LENGTH if sequence_length is None else sequence_length
        self.history = []

        self.policy_net.eval()
        self.secondary_net.eval()

    def reset(self):
        """Forget the hidden-representation window (call at episode start)."""
        self.history = []

    def act(self, state):
        """Return the chosen action index (a plain ``int``) for ``state``."""
        with torch.no_grad():
            state_tensor = torch.tensor(state, device=device, dtype=torch.float32).unsqueeze(0)

            # One forward pass yields both the fallback Q-values and the
            # hidden representation for the window.
            q_values, hidden = self.policy_net(state_tensor)
            self.history.append(hidden.cpu().numpy()[0])
            if len(self.history) > self.sequence_length:
                self.history.pop(0)

            if len(self.history) == self.sequence_length:
                seq = np.array(self.history)
                seq_scaled = self.scaler_x.transform(seq.reshape(-1, seq.shape[-1])).reshape(seq.shape)
                seq_tensor = torch.tensor(seq_scaled, dtype=torch.float32, device=device).unsqueeze(0)

                predicted_scaled = self.secondary_net(seq_tensor)
                predicted = self.scaler_y.inverse_transform(predicted_scaled.cpu().numpy())
                predicted_tensor = torch.tensor(predicted, dtype=torch.float32, device=device)

                q_values, _ = self.policy_net(predicted_tensor)

            return q_values.argmax().item()

def test_model(model, num_episodes=50, is_secondary=False, policy_net=None, secondary_net=None, scaler_x=None, scaler_y=None):
    """Evaluate a policy on the global ``env`` and return per-episode rewards.

    Args:
        model: Q-network used when ``is_secondary`` is false (epsilon 0.01).
            It is not used in secondary mode.
        num_episodes: Number of evaluation episodes (at most 200 steps each).
        is_secondary: When true, actions come from a ``SecondaryPolicy`` built
            from the remaining arguments.
        policy_net, secondary_net, scaler_x, scaler_y: Components of the
            secondary policy; only read when ``is_secondary`` is true.

    Returns:
        List with the total reward of each episode.
    """
    rewards = []

    policy = None
    if is_secondary:
        policy = SecondaryPolicy(policy_net, secondary_net, scaler_x, scaler_y)

    for _ in tqdm(range(num_episodes), desc="Тестирование"):
        state, _ = env.reset()
        total_reward = 0

        if policy is not None:
            policy.reset()

        for _ in range(200):
            if policy is not None:
                # SecondaryPolicy.act already returns a plain int.
                action_idx = policy.act(state)
            else:
                state_tensor = torch.tensor(state, device=device, dtype=torch.float32).unsqueeze(0)
                # select_action returns a (1, 1) tensor; unwrap it here so both
                # branches hand an int to discretize_action.
                action_idx = select_action(state_tensor, model, action_size, epsilon=0.01).item()

            action = discretize_action(action_idx, action_size)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            if terminated or truncated:
                break

        rewards.append(total_reward)

    return rewards

# Main DQN training
state_size = 3
action_size = 21
num_episodes = 500

policy_net = DQN(state_size, action_size, HIDDEN_SIZE).to(device)
target_net = DQN(state_size, action_size, HIDDEN_SIZE).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = ReplayMemory(MEMORY_SIZE)

episode_rewards = []
losses = []
epsilon = EPS_START

for episode in range(num_episodes):
    state, _ = env.reset()
    # Build the (1, state_size) tensor straight from the array; wrapping the
    # ndarray in a Python list first is the slow conversion path.
    state = torch.tensor(state, device=device, dtype=torch.float32).unsqueeze(0)
    total_reward = 0
    episode_loss = 0
    steps = 0
    done = False
    truncated = False

    while not (done or truncated):
        action_idx = select_action(state, policy_net, action_size, epsilon)
        action = discretize_action(action_idx.item(), action_size)
        next_state, reward, done, truncated, _ = env.step(action)
        total_reward += reward

        reward = torch.tensor([reward], device=device, dtype=torch.float32)
        next_state = torch.tensor(next_state, device=device, dtype=torch.float32).unsqueeze(0)
        # Only genuine termination stops bootstrapping. A time-limit truncation
        # ends the episode loop but the next state still has value, so it is
        # stored as a non-terminal transition.
        done_tensor = torch.tensor([done], device=device, dtype=torch.bool)

        memory.push(state, action_idx, next_state, reward, done_tensor)
        state = next_state

        # optimize_model returns 0 until the buffer holds a full batch.
        loss = optimize_model(memory, policy_net, target_net, optimizer)
        if loss > 0:
            episode_loss += loss
            steps += 1

    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(EPS_END, epsilon * EPS_DECAY)
    episode_rewards.append(total_reward)
    losses.append(episode_loss / steps if steps > 0 else 0)

    if episode % 10 == 0:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"Эпизод {episode}/{num_episodes}, Средняя награда: {avg_reward:.2f}, Epsilon: {epsilon:.2f}")
# Secondary-network training.
# The definition comes first: a `def` placed after the call would leave the
# call bound to whichever version of the function was defined earlier.
def train_secondary_network(hidden_reps, targets):
    """Train the secondary network fully in memory and return the best epoch.

    No checkpoint file is written or read, so no unpickling is involved.

    Args:
        hidden_reps: Array of hidden-representation windows; the last axis is
            the feature axis.
        targets: Array of target states, one per window.

    Returns:
        Tuple ``(secondary_net, scaler_x, scaler_y)``: the network (eval mode)
        carrying the weights of the lowest-loss epoch, and the fitted scalers.
    """
    import copy  # local import: only needed to snapshot the best weights

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    # The scaler works on 2-D data, so flatten the windows and restore the shape.
    X = scaler_x.fit_transform(hidden_reps.reshape(-1, hidden_reps.shape[-1])).reshape(hidden_reps.shape)
    y = scaler_y.fit_transform(targets)

    X_tensor = torch.tensor(X, dtype=torch.float32, device=device)
    y_tensor = torch.tensor(y, dtype=torch.float32, device=device)

    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    secondary_net = EnhancedSecondaryNetwork(HIDDEN_SIZE, state_size).to(device)
    optimizer = optim.AdamW(secondary_net.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    criterion = nn.HuberLoss()

    best_loss = float('inf')
    best_state = None
    secondary_net.train()

    for epoch in range(100):
        epoch_loss = 0
        for batch_x, batch_y in loader:
            optimizer.zero_grad()

            predictions = secondary_net(batch_x)
            loss = criterion(predictions, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(secondary_net.parameters(), 1.0)
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(loader)
        scheduler.step(epoch_loss)

        if epoch % 20 == 0:
            print(f'Epoch {epoch}: Loss = {epoch_loss:.5f}')

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            # Snapshot only the weights; the scalers never change after
            # fitting, so there is no need to rebuild or refit anything here.
            best_state = copy.deepcopy(secondary_net.state_dict())

    # best_state stays None only if no epoch produced a finite, improving loss.
    if best_state is not None:
        secondary_net.load_state_dict(best_state)
    secondary_net.eval()
    return secondary_net, scaler_x, scaler_y


print("\nОбучение вторичной сети...")
hidden_reps, targets = collect_trajectory_data(policy_net)
secondary_net, scaler_x, scaler_y = train_secondary_network(hidden_reps, targets)
# Evaluate both policies
print("\nТестирование DQN...")
dqn_rewards = test_model(policy_net)

print("\nТестирование вторичной сети...")
# The first positional argument (`model`) is not used in secondary mode;
# the networks and scalers are passed through the keyword arguments.
secondary_rewards = test_model(
    secondary_net,
    is_secondary=True,
    policy_net=policy_net,
    secondary_net=secondary_net,
    scaler_x=scaler_x,
    scaler_y=scaler_y
)

# Visualisation
plt.figure(figsize=(14, 6))
# Left panel: per-episode rewards of both policies.
plt.subplot(1, 2, 1)
plt.plot(dqn_rewards, label='DQN')
plt.plot(secondary_rewards, label='Вторичная сеть')
plt.title('Награды по эпизодам')
plt.xlabel('Эпизод')
plt.ylabel('Награда')
plt.legend()
plt.grid(True)

# Right panel: reward distributions as box plots.
plt.subplot(1, 2, 2)
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# confirm against the installed version.
plt.boxplot([dqn_rewards, secondary_rewards], labels=['DQN', 'Вторичная сеть'])
plt.title('Распределение наград')
plt.ylabel('Награда')
plt.grid(True)

plt.tight_layout()
plt.show()

Используется устройство: cuda
Эпизод 0/500, Средняя награда: -1368.69, Epsilon: 0.99
Эпизод 10/500, Средняя награда: -1276.42, Epsilon: 0.95
Эпизод 20/500, Средняя награда: -1342.92, Epsilon: 0.90
Эпизод 30/500, Средняя награда: -1196.09, Epsilon: 0.86
Эпизод 40/500, Средняя награда: -1342.37, Epsilon: 0.81
Эпизод 50/500, Средняя награда: -1181.01, Epsilon: 0.77
Эпизод 60/500, Средняя награда: -1410.05, Epsilon: 0.74
Эпизод 70/500, Средняя награда: -1318.57, Epsilon: 0.70
Эпизод 80/500, Средняя награда: -1158.15, Epsilon: 0.67
Эпизод 90/500, Средняя награда: -941.05, Epsilon: 0.63
Эпизод 100/500, Средняя награда: -913.89, Epsilon: 0.60
Эпизод 110/500, Средняя награда: -940.77, Epsilon: 0.57
Эпизод 120/500, Средняя награда: -755.72, Epsilon: 0.55
Эпизод 130/500, Средняя награда: -686.66, Epsilon: 0.52
Эпизод 140/500, Средняя награда: -652.03, Epsilon: 0.49
Эпизод 150/500, Средняя награда: -533.90, Epsilon: 0.47
Эпизод 160/500, Средняя награда: -512.78, Epsilon: 0.45
Эпизод 170/500, Сред

Сбор данных: 100%|██████████| 200/200 [00:25<00:00,  7.81it/s]


Epoch 0: Loss = 0.02621
Epoch 20: Loss = 0.00130
Epoch 40: Loss = 0.00082
Epoch 60: Loss = 0.00078
Epoch 80: Loss = 0.00078
Epoch 100: Loss = 0.00078
Epoch 120: Loss = 0.00078
Epoch 140: Loss = 0.00078
Epoch 160: Loss = 0.00078
Epoch 180: Loss = 0.00078
Epoch 200: Loss = 0.00078
Epoch 220: Loss = 0.00078
Epoch 240: Loss = 0.00078
Epoch 260: Loss = 0.00078
Epoch 280: Loss = 0.00078


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL sklearn.preprocessing._data.StandardScaler was not an allowed global by default. Please use `torch.serialization.add_safe_globals([StandardScaler])` or the `torch.serialization.safe_globals([StandardScaler])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

### Метрики

In [None]:
# Replacement for the earlier DQN training plots
plt.figure(figsize=(15, 6))

# Reward curve with a moving average
plt.subplot(1, 2, 1)
window_size = max(1, len(episode_rewards)//10)
smoothed_rewards = np.convolve(episode_rewards, np.ones(window_size)/window_size, mode='valid')
# A 'valid' convolution yields len - window_size + 1 points; the k-th point
# averages episodes k .. k + window_size - 1, so anchor it at the window's
# last episode to keep the smoothed curve aligned with the raw one.
smoothed_x = np.arange(window_size - 1, len(episode_rewards))
plt.plot(episode_rewards, alpha=0.3, label='Сырые награды')
plt.plot(smoothed_x, smoothed_rewards, color='red', linewidth=2, label=f'Скользящее среднее (окно={window_size})')
plt.title('Динамика наград DQN')
plt.xlabel('Эпизод')
plt.ylabel('Награда')
plt.legend()
plt.grid(True)

# Loss curve on a logarithmic scale
plt.subplot(1, 2, 2)
plt.plot(losses, alpha=0.8)
plt.yscale('log')  # log scale for readability
plt.title('Потери DQN (log scale)')
plt.xlabel('Эпизод')
plt.ylabel('Потеря')
plt.grid(True)

plt.tight_layout()
plt.savefig('improved_dqn_training.png')
plt.show()

In [None]:
def test_model(model, env, num_episodes=50, is_secondary=False):
    """Evaluate a greedy policy on ``env`` and return per-episode rewards.

    Args:
        model: Q-network returning ``(q_values, hidden)``; used when
            ``is_secondary`` is false.
        env: Environment to evaluate on.
        num_episodes: Number of evaluation episodes.
        is_secondary: When true, actions are chosen by a ``SecondaryPolicy``
            built from the module-level ``policy_net``, ``secondary_net``,
            ``scaler_x`` and ``scaler_y``.

    Returns:
        List with the total reward of each episode.
    """
    rewards = []
    policy_net.eval()  # evaluation mode

    # The secondary network indexes its LSTM output as [:, -1, :], so it needs
    # a batched window of *scaled* hidden representations. SecondaryPolicy
    # builds that window and applies both scalers, so reuse it here instead of
    # feeding a single raw 2-D hidden vector.
    secondary_policy = None
    if is_secondary:
        secondary_net.eval()
        secondary_policy = SecondaryPolicy(policy_net, secondary_net, scaler_x, scaler_y)

    with torch.no_grad():  # no gradients needed for evaluation
        for _ in range(num_episodes):
            state, _ = env.reset()
            if secondary_policy is not None:
                secondary_policy.reset()
            total_reward = 0
            done = False
            truncated = False
            steps = 0

            while not (done or truncated):
                if secondary_policy is None:
                    # Plain greedy DQN policy
                    state_tensor = torch.tensor(state, device=device, dtype=torch.float32)
                    q_values, _ = model(state_tensor.unsqueeze(0))
                    action_idx = q_values.max(1)[1].item()
                else:
                    # Act on the state predicted by the secondary network
                    action_idx = secondary_policy.act(state)

                action = discretize_action(action_idx, action_size)
                state, reward, done, truncated, _ = env.step(action)
                total_reward += reward
                steps += 1

                # Safety stop in case the environment never ends the episode
                if steps > 500:
                    break

            rewards.append(total_reward)

    return rewards

# Ускоренное тестирование с прогресс-баром
from tqdm import tqdm

def fast_test_model(model, num_episodes=50, is_secondary=False):
    """Evaluate a greedy policy on the global ``env`` with a progress bar.

    Args:
        model: Q-network returning ``(q_values, hidden)``; used when
            ``is_secondary`` is false.
        num_episodes: Number of evaluation episodes (at most 200 steps each).
        is_secondary: When true, actions are chosen by a ``SecondaryPolicy``
            built from the module-level ``policy_net``, ``secondary_net``,
            ``scaler_x`` and ``scaler_y``.

    Returns:
        List with the total reward of each episode.
    """
    rewards = []
    policy_net.eval()

    # The secondary network needs a batched window of scaled hidden
    # representations (its forward pass indexes [:, -1, :]); SecondaryPolicy
    # prepares that input and undoes the target scaling, so delegate to it
    # rather than passing one raw 2-D hidden vector.
    secondary_policy = None
    if is_secondary:
        secondary_net.eval()
        secondary_policy = SecondaryPolicy(policy_net, secondary_net, scaler_x, scaler_y)

    with torch.no_grad():
        for _ in tqdm(range(num_episodes), desc="Тестирование"):
            state, _ = env.reset()
            if secondary_policy is not None:
                secondary_policy.reset()
            total_reward = 0

            for _ in range(200):  # at most 200 steps per episode
                if secondary_policy is None:
                    state_tensor = torch.tensor(state, device=device, dtype=torch.float32)
                    q_values, _ = model(state_tensor.unsqueeze(0))
                    action_idx = q_values.argmax().item()
                else:
                    action_idx = secondary_policy.act(state)

                action = discretize_action(action_idx, action_size)
                state, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                if done or truncated:
                    break

            rewards.append(total_reward)

    return rewards

# Compare the two policies
print("Тестирование DQN модели...")
dqn_rewards = fast_test_model(policy_net, num_episodes=50, is_secondary=False)

print("\nТестирование модели с вторичной сетью...")
secondary_rewards = fast_test_model(policy_net, num_episodes=50, is_secondary=True)

# Visualise the results
plt.figure(figsize=(14, 6))

# Reward distribution. Matplotlib's own violin plot is used because no
# `sns` name is imported anywhere in this notebook; positions 0 and 1 match
# the tick locations set below.
plt.subplot(1, 2, 1)
plt.violinplot([dqn_rewards, secondary_rewards], positions=[0, 1])
plt.xticks([0, 1], ['DQN', 'Вторичная сеть'])
plt.title('Распределение наград')
plt.ylabel('Награда за эпизод')

# График скользящего среднего
plt.subplot(1, 2, 2)
window_size

In [None]:
# Make sure both networks are in evaluation mode (disables the DQN's dropout).
policy_net.eval()
secondary_net.eval()

# Тестирование только вторичной сети с детальной диагностикой
def analyze_secondary_network(num_episodes=20):
    """Run the secondary-network policy and collect per-episode diagnostics.

    At every step the function records the MSE between the current
    observation and the state predicted by the secondary network, and whether
    the action chosen from the predicted state differs from the one chosen
    from the current state. The environment is always stepped with the action
    derived from the predicted state.

    NOTE(review): the error compares the *current* observation with a
    prediction trained on a *future* state - confirm this is the intended metric.

    Args:
        num_episodes: Number of episodes to run (at most 200 steps each).

    Returns:
        Tuple ``(all_rewards, state_errors, action_discrepancies)`` of lists
        with one entry per episode: total reward, mean state MSE, and the
        fraction (0..1) of steps where the two actions differ.
    """
    all_rewards = []
    state_errors = []
    action_discrepancies = []

    with torch.no_grad():
        for ep in range(num_episodes):
            state, _ = env.reset()
            # Rolling window of hidden representations. The secondary network
            # is trained on windows of SEQUENCE_LENGTH; shorter windows are
            # used while the episode is still warming up.
            history = deque(maxlen=SEQUENCE_LENGTH)
            episode_reward = 0
            episode_errors = []
            episode_discrepancies = []

            for step in range(200):  # maximum episode length
                state_tensor = torch.tensor(state, device=device, dtype=torch.float32).unsqueeze(0)

                # One forward pass gives both the main-policy Q-values and the
                # hidden representation.
                q_main, hidden_rep = policy_net(state_tensor)
                history.append(hidden_rep.cpu().numpy()[0])

                # The network expects a scaled (batch, time, features) input;
                # a bare 2-D hidden vector breaks its [:, -1, :] indexing.
                seq_scaled = scaler_x.transform(np.array(list(history)))
                seq_tensor = torch.tensor(seq_scaled, dtype=torch.float32, device=device).unsqueeze(0)
                predicted_state_scaled = secondary_net(seq_tensor)
                predicted_state = scaler_y.inverse_transform(predicted_state_scaled.cpu().numpy())
                predicted_state_tensor = torch.tensor(predicted_state, device=device, dtype=torch.float32)

                # State-prediction error
                current_error = mean_squared_error(state, predicted_state[0])
                episode_errors.append(current_error)

                # Action disagreement between the main and the secondary policy
                action_main = q_main.argmax().item()

                q_secondary, _ = policy_net(predicted_state_tensor)
                action_secondary = q_secondary.argmax().item()
                episode_discrepancies.append(1 if action_main != action_secondary else 0)

                # Step the environment with the secondary policy's action
                action = discretize_action(action_secondary, action_size)
                state, reward, done, truncated, _ = env.step(action)
                episode_reward += reward

                if done or truncated:
                    break

            all_rewards.append(episode_reward)
            state_errors.append(np.mean(episode_errors))
            action_discrepancies.append(np.mean(episode_discrepancies))

            print(f"Эпизод {ep+1}: Награда={episode_reward:.1f}, "
                  f"Средняя ошибка состояния={np.mean(episode_errors):.4f}, "
                  f"Несовпадение действий={np.mean(episode_discrepancies)*100:.1f}%")

    return all_rewards, state_errors, action_discrepancies

# Run the analysis
secondary_rewards, state_errors, action_diffs = analyze_secondary_network(num_episodes=20)

# Visualise the diagnostics
plt.figure(figsize=(15, 5))

# Rewards
plt.subplot(1, 3, 1)
plt.plot(secondary_rewards, 'o-')
plt.title('Награды вторичной сети')
plt.xlabel('Эпизод')
plt.ylabel('Награда')
plt.grid(True)

# State-prediction error
plt.subplot(1, 3, 2)
plt.plot(state_errors, 'o-', color='orange')
plt.title('Ошибка предсказания состояния')
plt.xlabel('Эпизод')
plt.ylabel('MSE')
plt.grid(True)

# Action disagreement. The collected values are fractions in [0, 1]; convert
# them to percent so the bars match the axis label (and the summary below).
plt.subplot(1, 3, 3)
plt.bar(range(len(action_diffs)), np.asarray(action_diffs) * 100, color='red')
plt.title('Расхождение действий с основной сетью')
plt.xlabel('Эпизод')
plt.ylabel('% несовпадений')
plt.grid(True)

plt.tight_layout()
plt.show()

# Comparison with the main network
print("\nСравнение средних показателей:")
print(f"Средняя награда DQN: {np.mean(dqn_rewards):.1f}")
print(f"Средняя награда вторичной сети: {np.mean(secondary_rewards):.1f}")
print(f"Средняя ошибка предсказания: {np.mean(state_errors):.4f}")
print(f"Среднее расхождение действий: {np.mean(action_diffs)*100:.1f}%")