In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random

class TemperatureControlEnv(gym.Env):
    """One-dimensional temperature-control task.

    Observation: a float32 array of shape (1,) holding the temperature, kept
    inside ``observation_space`` (15.0 to 30.0).
    Actions: 0 lowers the temperature by 0.25, 1 raises it by 0.25.
    Reward: -1 per step while the temperature is more than ``threshold`` away
    from ``target_temp``; 0 on the step that brings it within the threshold,
    which also terminates the episode. Episodes are truncated after
    ``max_steps`` steps.
    """

    def __init__(self):
        super().__init__()

        self.action_space = spaces.Discrete(2)  # 0, 1
        self.observation_space = spaces.Box(low=15.0, high=30.0, shape=(1,), dtype=np.float32)

        self.state = None
        self.target_temp = 22.5
        self.threshold = 0.5
        self.steps = 0
        self.max_steps = 100

    def reset(self, seed=None, options=None):
        """Start a new episode at a uniformly random temperature.

        The start temperature is drawn from ``self.np_random`` so that the
        ``seed`` argument actually controls it, and the state is stored as a
        float32 array of shape (1,) so it matches ``observation_space``.

        Returns:
            (observation, info) where observation is a fresh float32 array.
        """
        super().reset(seed=seed)
        low = float(self.observation_space.low[0])
        high = float(self.observation_space.high[0])
        self.state = np.array([self.np_random.uniform(low, high)], dtype=np.float32)
        self.steps = 0
        return self.state.copy(), {}

    def step(self, action):
        """Apply ``action`` and return (observation, reward, terminated, truncated, info)."""
        self.steps += 1
        if action == 0:
            delta = -0.25  # move left
        elif action == 1:
            delta = 0.25  # move right
        else:
            delta = 0.0
        # Build a new array instead of updating in place: observations handed
        # out earlier must not change under the caller's feet.
        moved = self.state + np.float32(delta)
        self.state = np.clip(moved, self.observation_space.low, self.observation_space.high).astype(np.float32)

        temp_diff = np.abs(self.state - self.target_temp).item()
        terminated = temp_diff <= self.threshold
        reward = 0 if terminated else -1
        truncated = self.steps >= self.max_steps

        return self.state.copy(), reward, terminated, truncated, {}

    def render(self):
        """Print the current temperature, the target and the step counter."""
        print(f"Current temperature: {self.state[0]:.2f}°C, Target: {self.target_temp:.2f}°C, Steps: {self.steps}")

# Make the environment constructible through gym.make('TemperatureControl-v0'),
# registered with max_episode_steps=100.
gym.register(id='TemperatureControl-v0', entry_point='__main__:TemperatureControlEnv', max_episode_steps=100)

In [None]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import csv
from datetime import datetime

def set_seed(seed):
    """Seed NumPy's global generator and PyTorch (CPU and all CUDA devices).

    Also sets the cuDNN flags ``deterministic=True`` and ``benchmark=False``.
    Python's ``random`` module is not touched.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

class Policy(nn.Module):
    """Softmax policy whose logits are linear in the supplied features."""

    def __init__(self, num_features):
        super().__init__()
        # One learnable weight per feature, initialised uniformly in [0, 1).
        initial_theta = torch.rand(num_features)
        self.theta = nn.Parameter(initial_theta)

    def forward(self, features):
        """Return a softmax over the second axis of ``features``.

        ``features`` is indexed (i, j, k); the last axis is contracted with
        ``theta`` and the softmax runs over j.
        """
        scores = torch.einsum('ijk,k->ij', features, self.theta)
        return torch.softmax(scores, dim=-1)

class RandomNNFeatures(nn.Module):
    """Random cosine/sine feature map with Gaussian, Cauchy or Laplace frequencies.

    Kernel types ending in ``1`` produce ``num_features`` outputs of the form
    ``sqrt(2 / num_features) * cos(x W + b)`` with a random phase ``b`` drawn
    uniformly from [0, 2*pi). Kernel types ending in ``2`` draw
    ``num_features // 2`` frequencies and concatenate the cosine and sine of
    the projection, so ``num_features`` should be even for them (an odd value
    yields ``num_features - 1`` outputs).

    Args:
        input_dim: size of the last axis of the input.
        num_features: requested number of output features.
        sigma: Gaussian weights use std ``sqrt(1 / sigma)``; Cauchy and
            Laplace weights use ``sigma`` as their scale.
        kernel_type: one of 'gaussian1', 'gaussian2', 'cauchy1', 'cauchy2',
            'laplace1', 'laplace2'.
        seed: passed to ``set_seed`` before any weight is drawn (this reseeds
            the global generators).

    Raises:
        ValueError: if ``kernel_type`` is not one of the supported names.
    """

    def __init__(self, input_dim, num_features, sigma=1.0, kernel_type='gaussian1', seed=42):
        super().__init__()
        self.input_dim = input_dim
        self.num_features = num_features
        self.sigma = sigma
        self.kernel_type = kernel_type

        set_seed(seed)

        half = num_features // 2

        if kernel_type == 'gaussian1':
            self.linear = nn.Linear(input_dim, num_features, bias=True)
            # Plain Python float for std: nn.init expects a number, not a tensor.
            nn.init.normal_(self.linear.weight, std=(1.0 / sigma) ** 0.5)
            nn.init.uniform_(self.linear.bias, 0, 2 * torch.pi)
        elif kernel_type == 'gaussian2':
            self.linear = nn.Linear(input_dim, half, bias=False)
            nn.init.normal_(self.linear.weight, std=(1.0 / sigma) ** 0.5)
        elif kernel_type == 'cauchy1':
            self.W = self._frozen(self._cauchy(input_dim, num_features, sigma))
            self.b = self._frozen(torch.rand(num_features) * 2 * torch.pi)
        elif kernel_type == 'cauchy2':
            self.W = self._frozen(self._cauchy(input_dim, half, sigma))
        elif kernel_type == 'laplace1':
            self.W = self._frozen(self._laplace(input_dim, num_features, sigma))
            self.b = self._frozen(torch.rand(num_features) * 2 * torch.pi)
        elif kernel_type == 'laplace2':
            self.W = self._frozen(self._laplace(input_dim, half, sigma))
        else:
            raise ValueError(f"Unsupported kernel type: {kernel_type}")

    @staticmethod
    def _frozen(tensor):
        """Wrap ``tensor`` as a parameter that is saved with the module but never trained."""
        return nn.Parameter(tensor, requires_grad=False)

    @staticmethod
    def _cauchy(rows, cols, scale):
        """Draw a (rows, cols) matrix from Cauchy(0, scale).

        torch has no ``torch.cauchy`` function; the in-place ``Tensor.cauchy_``
        sampler is the supported way to draw these values.
        """
        return torch.empty(rows, cols).cauchy_(median=0.0, sigma=scale)

    @staticmethod
    def _laplace(rows, cols, scale):
        """Draw a (rows, cols) matrix from Laplace(0, scale).

        torch has no ``torch.laplace`` function; ``torch.distributions.Laplace``
        is used instead.
        """
        return torch.distributions.Laplace(0.0, float(scale)).sample((rows, cols))

    def forward(self, x):
        """Map ``x`` (last axis of size ``input_dim``) to random features."""
        # Python float: multiplies tensors on any device without creating a CPU tensor.
        scale = (2.0 / self.num_features) ** 0.5
        kernel = self.kernel_type

        if kernel == 'gaussian1':
            return scale * torch.cos(self.linear(x))
        if kernel in ('cauchy1', 'laplace1'):
            return scale * torch.cos(x @ self.W + self.b)

        if kernel == 'gaussian2':
            projection = self.linear(x)
        elif kernel in ('cauchy2', 'laplace2'):
            projection = x @ self.W
        else:
            raise ValueError(f"Unsupported kernel type: {self.kernel_type}")

        paired = torch.cat([torch.cos(projection), torch.sin(projection)], dim=-1)
        return scale * paired

class DataCollector:
    """Rolls out the current softmax policy and turns transitions into feature matrices.

    Args:
        env_name: id passed to ``gym.make``.
        kernel_type: forwarded to ``RandomNNFeatures``.
        num_features: number of random features (also the size of the policy weights).
        seed: seeds torch and NumPy and is forwarded to ``RandomNNFeatures``.
        sigma: forwarded to ``RandomNNFeatures``.
    """

    def __init__(self, env_name='TemperatureControl-v0', kernel_type='gaussian1', num_features=8, seed=42, sigma=1.0):
        self.env = gym.make(env_name)
        self.num_features = num_features
        self.num_actions = self.env.action_space.n
        self.sigma = sigma
        self.kernel_type = kernel_type

        # Read the goal band from the environment when it exposes it; the
        # fallbacks are the values this notebook's environment uses.
        base_env = self.env.unwrapped
        self.target_temp = getattr(base_env, 'target_temp', 22.5)
        self.threshold = getattr(base_env, 'threshold', 0.5)

        torch.manual_seed(seed)
        np.random.seed(seed)

        # Feature input = flattened observation followed by a one-hot action.
        state_dim = int(np.prod(self.env.observation_space.shape))
        self.rnf = RandomNNFeatures(
            input_dim=state_dim + int(self.num_actions),
            num_features=num_features,
            sigma=sigma,
            kernel_type=kernel_type,
            seed=seed,
        )
        self.policy = Policy(num_features)

    def get_features(self, state, action=None):
        """Random features of state-action pairs.

        Args:
            state: tensor of states; a 1-D tensor is treated as a single row.
            action: ``None`` to get features for every action (result has an
                extra action axis at position 1), or an int / integer tensor
                selecting one action per row.
        """
        if state.dim() == 1:
            state = state.unsqueeze(0)

        if action is None:
            per_action = []
            for a in range(self.num_actions):
                one_hot = F.one_hot(torch.tensor(a), num_classes=self.num_actions).float().expand(state.shape[0], -1)
                per_action.append(self.rnf(torch.cat([state, one_hot], dim=-1)))
            return torch.stack(per_action, dim=1)

        # as_tensor accepts Python ints, NumPy integers and tensors alike.
        action = torch.as_tensor(action, dtype=torch.long)
        one_hot = F.one_hot(action, num_classes=self.num_actions).float()
        if one_hot.dim() == 1:
            one_hot = one_hot.unsqueeze(0)
        return self.rnf(torch.cat([state, one_hot], dim=-1))

    def generate_optimal_actions(self, states):
        """Return 1 where a state is below ``target_temp - threshold`` and 0 elsewhere.

        The result is a long tensor with the same shape as ``states``.
        """
        if not isinstance(states, torch.Tensor):
            states = torch.tensor(states, dtype=torch.float32)
        return (states < (self.target_temp - self.threshold)).long()

    def sample_action(self, state):
        """Sample one action index from the policy's distribution at ``state``."""
        # Only the sampled index leaves this method, so no autograd graph is needed.
        with torch.no_grad():
            state_tensor = torch.as_tensor(np.asarray(state), dtype=torch.float32).reshape(1, -1)
            probs = self.policy(self.get_features(state_tensor)).squeeze(0)
        return torch.multinomial(probs, num_samples=1).item()

    def collect_dataset(self, num_timesteps):
        """Collect ``num_timesteps`` transitions (must be positive) with the current policy.

        Returns:
            (initial_states, states, actions, rewards, next_states, dones, X, X_opt)
            where ``X`` holds the features of every visited (state, action)
            pair and ``X_opt`` holds the features of each state outside the
            goal band paired with the action ``generate_optimal_actions``
            assigns to that same state.
        """
        states, actions, rewards, next_states, initial_states, dones = [], [], [], [], [], []
        total_steps = 0

        while total_steps < num_timesteps:
            state, _ = self.env.reset()
            # Copy every observation: an environment that updates its state
            # array in place would otherwise rewrite entries already stored.
            state = np.array(state, dtype=np.float32)
            initial_states.append(state)

            while True:
                action = self.sample_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_state = np.array(next_state, dtype=np.float32)
                done = terminated or truncated

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)

                state = next_state
                total_steps += 1

                if done or total_steps >= num_timesteps:
                    break

        initial_states = torch.tensor(np.array(initial_states), dtype=torch.float32)
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.bool)

        # Keep only states outside the goal band, and pair each of them with
        # its own optimal action (not with an unrelated row of `states`).
        lower = self.target_temp - self.threshold
        upper = self.target_temp + self.threshold
        outside = ((states < lower) | (states > upper)).reshape(states.shape[0], -1).any(dim=1)
        filtered_states = states[outside]
        optimal_actions = self.generate_optimal_actions(filtered_states).reshape(-1)

        with torch.no_grad():
            X = self.get_features(states, actions)
            if filtered_states.shape[0] > 0:
                X_opt = self.get_features(filtered_states, optimal_actions)
            else:
                X_opt = X.new_zeros((0, X.shape[1]))

        return initial_states, states, actions, rewards, next_states, dones, X, X_opt

class PolicyOptimizer:
    """Gradient ascent on a least-squares estimate ``J`` of the policy's return.

    The policy and feature map are shared with ``collector``. ``kernel_type``,
    ``num_features``, ``sigma`` and ``seed`` are accepted for call-site
    compatibility but are not used here.
    """

    def __init__(self, collector, env_name='TemperatureControl-v0', kernel_type='gaussian1', num_features=8, sigma=0.1, learning_rate=0.01, gamma=0.99, regularizer=0.001, seed=42):
        self.env = gym.make(env_name)  # used only by evaluate_policy
        self.collector = collector
        self.policy = self.collector.policy
        self.rnf = self.collector.rnf
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.regularizer = regularizer

    def compute_Y(self, next_states):
        """Policy-weighted features of each next state (one row per state)."""
        features = self.collector.get_features(next_states)
        probs = self.policy(features)
        return torch.einsum('ijk,ij->ik', features, probs)

    def compute_W(self, initial_states):
        """Policy-weighted features averaged over the initial states."""
        features = self.collector.get_features(initial_states)
        probs = self.policy(features)
        W = torch.einsum('ijk,ij->k', features, probs)
        return W / initial_states.shape[0]

    def compute_J(self, X, next_states, initial_states, rewards):
        """Estimate the objective ``A (I - gamma M)^-1 W`` as a 0-dim tensor.

        With ``C = X^T X + regularizer * I``: ``A = C^-1 X^T rewards``,
        ``M = C^-1 X^T Y`` where ``Y`` comes from ``compute_Y`` and ``W`` from
        ``compute_W``. The result is differentiable with respect to the policy
        weights through ``Y`` and ``W``.
        """
        # NOTE(review): next-state features are used for every transition,
        # including those that ended an episode; confirm this is intended.
        latent_dim = X.shape[1]
        identity = torch.eye(latent_dim, dtype=X.dtype, device=X.device)
        C_lambda = X.T @ X + self.regularizer * identity

        Y = self.compute_Y(next_states)
        W = self.compute_W(initial_states)

        # A is 1-D, so no transpose is needed (``.T`` on 1-D tensors is deprecated).
        A = torch.linalg.solve(C_lambda, X.T @ rewards)
        M = torch.linalg.solve(C_lambda, X.T @ Y)

        # Solve the linear system against W directly instead of forming the inverse.
        return A @ torch.linalg.solve(identity - self.gamma * M, W)

    def evaluate_policy(self, num_episodes=100):
        """Mean undiscounted episode reward when sampling actions from the policy."""
        total_reward = []
        for _ in range(num_episodes):
            state, _ = self.env.reset()
            done = False
            episode_reward = 0
            while not done:
                action = self.collector.sample_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                episode_reward += reward
                state = next_state
            total_reward.append(episode_reward)
        return np.mean(total_reward)

    def optimize(self, num_epochs, num_timesteps=10000, batch_size=1000):
        """Alternate data collection and ``num_epochs`` Adam steps until ``num_timesteps`` are collected.

        Each round collects up to ``batch_size`` new transitions with the
        current policy, appends them to all data gathered so far, runs the
        gradient steps on ``-J`` and then evaluates the policy.

        Returns:
            (history, optimal_theta, J_history): the evaluation reward after
            each round, the least-squares weights from the last round (``None``
            if no round ran), and ``J`` before every gradient step.

        Raises:
            ValueError: if ``batch_size`` is not positive (the collection loop
                could never make progress).
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        history = []
        J_history = []
        optimal_theta = None
        optimizer = optim.Adam(self.policy.parameters(), lr=self.learning_rate)

        total_data = 0
        X_total = None
        next_states_total = None
        initial_states_total = None
        rewards_total = None

        while total_data < num_timesteps:
            # Collect a batch of data
            current_batch_size = min(batch_size, num_timesteps - total_data)
            initial_states, _, _, rewards, next_states, _, X, X_opt = self.collector.collect_dataset(current_batch_size)
            total_data += current_batch_size

            # Stored data is treated as constant during optimisation.
            X = X.detach()
            X_opt = X_opt.detach()
            if X_total is None:
                X_total = X
                next_states_total = next_states.detach()
                initial_states_total = initial_states.detach()
                rewards_total = rewards.detach()
            else:
                X_total = torch.cat((X_total, X), dim=0)
                next_states_total = torch.cat((next_states_total, next_states.detach()), dim=0)
                initial_states_total = torch.cat((initial_states_total, initial_states.detach()), dim=0)
                rewards_total = torch.cat((rewards_total, rewards.detach()), dim=0)

            # Least-squares weights fitted on all behaviour features plus the
            # latest batch's optimal-action features, with a target of one.
            X_opt_total = torch.cat((X_total, X_opt), dim=0)
            optimal_theta = torch.linalg.solve(X_opt_total.T @ X_opt_total, X_opt_total.T @ torch.ones(X_opt_total.shape[0]))

            # Perform policy optimization
            for epoch in tqdm(range(num_epochs)):
                optimizer.zero_grad()
                J = self.compute_J(X_total, next_states_total, initial_states_total, rewards_total)
                J_history.append(J.item())

                loss = -J  # maximise J
                loss.backward()
                optimizer.step()

            with torch.no_grad():
                avg_reward = self.evaluate_policy()
            history.append(avg_reward)

        return history, optimal_theta, J_history

def evaluate_policy(env_name, collector, num_episodes=100):
    """Mean undiscounted episode reward of the greedy (argmax) policy.

    A fresh environment is created from ``env_name``; at every step the action
    with the highest probability under ``collector.policy`` is taken.
    """
    env = gym.make(env_name)
    episode_returns = []
    for _ in range(num_episodes):
        observation, _ = env.reset()
        episode_return = 0
        finished = False
        while not finished:
            obs_batch = torch.tensor(observation, dtype=torch.float32).unsqueeze(0)
            action_probs = collector.policy(collector.get_features(obs_batch)).squeeze(0)
            greedy_action = torch.argmax(action_probs).item()
            observation, reward, terminated, truncated, _ = env.step(greedy_action)
            episode_return += reward
            finished = terminated or truncated
        episode_returns.append(episode_return)
    return np.mean(episode_returns)


# Environment id shared by the experiment cells.
env_name = 'TemperatureControl-v0'

def run_experiment(env_name, kernel_type, num_features, num_epochs, sigma, learning_rate, gamma, regularizer, num_timesteps):
    """Train a policy, then evaluate the least-squares ``optimal_theta`` greedily.

    ``sigma`` is forwarded to the ``DataCollector`` as well as the optimizer,
    so the caller's value is no longer silently dropped.

    Returns:
        (history, optimal_reward, J_history): per-round evaluation rewards from
        ``PolicyOptimizer.optimize``, the mean reward of the greedy policy
        after its weights are replaced by ``optimal_theta``, and the recorded
        ``J`` values.
    """
    collector = DataCollector(env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma)

    # Run optimization
    optimizer = PolicyOptimizer(collector, env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma, learning_rate=learning_rate, gamma=gamma, regularizer=regularizer)
    history, optimal_theta, J_history = optimizer.optimize(num_epochs, num_timesteps)

    # Overwrite the trained weights with the least-squares solution before evaluating.
    collector.policy.theta.data = optimal_theta.detach()
    optimal_reward = evaluate_policy(env_name, collector)

    return history, optimal_reward, J_history

def run_experiment2(env_name, kernel_type, num_features, num_epochs, sigma, learning_rate, gamma, regularizer, num_timesteps, batch_size):
    """Repeat the batched optimisation ``num_timesteps // batch_size`` times and save a CSV.

    Every repetition builds a fresh collector/optimizer pair and runs a full
    ``optimize(num_epochs, num_timesteps, batch_size)``. The CSV holds the
    reward curve of the last repetition, labelled with the number of timesteps
    collected when each value was measured.

    Returns:
        (all_history, all_J_history): the reward and ``J`` curves of all
        repetitions concatenated in order.
    """
    iterations = int(num_timesteps / batch_size)
    all_history = []
    all_J_history = []
    history = []  # stays empty (header-only CSV) when no repetition runs

    for _ in range(iterations):
        collector = DataCollector(env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma)
        optimizer = PolicyOptimizer(collector, env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma, learning_rate=learning_rate, gamma=gamma, regularizer=regularizer)
        history, _, J_history = optimizer.optimize(num_epochs, num_timesteps, batch_size)

        all_history.extend(history)
        all_J_history.extend(J_history)

    # history[k] is measured after (k + 1) batches have been collected.
    steps = [min((k + 1) * batch_size, num_timesteps) for k in range(len(history))]
    final_results = list(zip(steps, history))

    # Save results to CSV file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{kernel_type}_tempcontrol_results_{timestamp}.csv"

    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Epoch', 'Average Reward'])  # Write header row
        csvwriter.writerows(final_results)

    print(f"Results saved to {filename}")

    return all_history, all_J_history





In [None]:
def run_experiment2(env_name, kernel_type, num_features, num_epochs, sigma, learning_rate, gamma, regularizer, num_timesteps, batch_size):
    """Run one batched optimisation and return ``(history, J_history)``.

    This redefinition replaces the earlier ``run_experiment2`` in the notebook
    namespace. ``sigma`` is forwarded to the ``DataCollector`` as well as the
    optimizer so the caller's value is not dropped.
    """
    collector = DataCollector(env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma)
    optimizer = PolicyOptimizer(collector, env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma, learning_rate=learning_rate, gamma=gamma, regularizer=regularizer)
    history, _, J_history = optimizer.optimize(num_epochs, num_timesteps, batch_size)

    return history, J_history

feature_maps = ['gaussian2']

# Define experiment parameters
num_epochs = 25
num_timesteps = 20000
batch_size = 1000  # the value the runs actually use; step labels below are derived from it
num_runs = 3

# Initialize dictionaries to store results
reward_history = {}
J_history = {}
results = {}

# Run experiments for each kernel
for kernel_type in feature_maps:
    print(f"Running experiment for kernel type: {kernel_type}")
    reward_runs = []
    J_runs = []
    for i in range(num_runs):
        reward, J = run_experiment2(env_name, kernel_type, num_features=30, num_epochs=num_epochs, sigma=0.1, learning_rate=0.01, gamma=0.99, regularizer=0.001, num_timesteps=num_timesteps, batch_size=batch_size)
        reward_runs.append(reward)
        J_runs.append(J)

    # Store average reward and J for the kernel
    results[kernel_type] = {
        'avg_reward': torch.tensor(reward_runs).mean(dim=0),
        'avg_J': torch.tensor(J_runs).mean(dim=0)
    }

    # Pair every averaged reward with the number of timesteps collected when it
    # was measured; .tolist() writes plain numbers instead of tensor reprs.
    avg_reward_values = results[kernel_type]['avg_reward'].tolist()
    steps = [min((k + 1) * batch_size, num_timesteps) for k in range(len(avg_reward_values))]
    final_results = list(zip(steps, avg_reward_values))

    # Save results to CSV file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{kernel_type}_tempcontrol_results_{timestamp}.csv"

    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Step', 'Average Reward'])  # Write header row
        csvwriter.writerows(final_results)

    print(f"Results saved to {filename}")


# Plotting average rewards
plt.figure(figsize=(10, 6))
for kernel_type, data in results.items():
    # Each reward is measured after another equal share of num_timesteps has
    # been collected, so the spacing is derived from the curve length.
    num_points = len(data['avg_reward'])
    x_values = (np.arange(num_points) + 1) * (num_timesteps / num_points)
    plt.plot(x_values, data['avg_reward'], label=f'Avg Reward for {kernel_type}')
plt.xlabel('Environment timesteps collected')
plt.ylabel('Average Reward')
plt.title('Average Reward Comparison for Different Kernels')
plt.legend()
plt.grid(True)
plt.show()

# Plotting average J values
plt.figure(figsize=(10, 6))
for kernel_type, data in results.items():
    plt.plot(data['avg_J'], label=f'Avg J for {kernel_type}')
plt.xlabel('Num of epochs')
plt.ylabel('Average J Value')
plt.title('J Value Comparison for Different Kernels')
plt.legend()
plt.grid(True)
plt.show()

Running experiment for kernel type: gaussian2


100%|██████████| 25/25 [00:00<00:00, 114.28it/s]
100%|██████████| 25/25 [00:00<00:00, 153.30it/s]
100%|██████████| 25/25 [00:00<00:00, 115.71it/s]
100%|██████████| 25/25 [00:00<00:00, 94.74it/s]
100%|██████████| 25/25 [00:00<00:00, 73.88it/s]
100%|██████████| 25/25 [00:00<00:00, 68.50it/s]
100%|██████████| 25/25 [00:00<00:00, 66.71it/s]
100%|██████████| 25/25 [00:00<00:00, 57.12it/s]
100%|██████████| 25/25 [00:00<00:00, 54.88it/s]
100%|██████████| 25/25 [00:00<00:00, 35.03it/s]
100%|██████████| 25/25 [00:00<00:00, 45.04it/s]
100%|██████████| 25/25 [00:00<00:00, 42.00it/s]
100%|██████████| 25/25 [00:00<00:00, 38.53it/s]
100%|██████████| 25/25 [00:00<00:00, 31.13it/s]
100%|██████████| 25/25 [00:00<00:00, 33.06it/s]
100%|██████████| 25/25 [00:00<00:00, 32.79it/s]
100%|██████████| 25/25 [00:00<00:00, 29.89it/s]
100%|██████████| 25/25 [00:00<00:00, 28.50it/s]
100%|██████████| 25/25 [00:00<00:00, 26.55it/s]
100%|██████████| 25/25 [00:01<00:00, 24.26it/s]
100%|██████████| 25/25 [00:00<00:00, 

In [None]:
def run_experiment2(env_name, kernel_type, num_features, num_epochs, sigma, learning_rate, gamma, regularizer, num_timesteps, batch_size):
    """Run one batched optimisation and return ``(history, J_history)``.

    This cell redefines ``run_experiment2`` again; the most recently executed
    definition is the one later cells call. ``sigma`` is forwarded to the
    ``DataCollector`` as well as the optimizer so the caller's value is not
    dropped.
    """
    collector = DataCollector(env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma)
    optimizer = PolicyOptimizer(collector, env_name, kernel_type=kernel_type, num_features=num_features, sigma=sigma, learning_rate=learning_rate, gamma=gamma, regularizer=regularizer)
    history, _, J_history = optimizer.optimize(num_epochs, num_timesteps, batch_size)

    return history, J_history

feature_maps = ['gaussian2']

# Define experiment parameters
num_epochs = 25
num_timesteps = 20000
batch_size = 1000  # the value the runs actually use; step labels below are derived from it
num_runs = 3

# Initialize dictionaries to store results
reward_history = {}
J_history = {}
results = {}

# Run experiments for each kernel
for kernel_type in feature_maps:
    print(f"Running experiment for kernel type: {kernel_type}")
    reward_runs = []
    J_runs = []
    for i in range(num_runs):
        reward, J = run_experiment2(env_name, kernel_type, num_features=30, num_epochs=num_epochs, sigma=0.1, learning_rate=0.01, gamma=0.99, regularizer=0.001, num_timesteps=num_timesteps, batch_size=batch_size)
        reward_runs.append(reward)
        J_runs.append(J)

    # Store average reward and J for the kernel
    results[kernel_type] = {
        'avg_reward': torch.tensor(reward_runs).mean(dim=0),
        'avg_J': torch.tensor(J_runs).mean(dim=0)
    }

    # Pair every averaged reward with the number of timesteps collected when it
    # was measured; .tolist() writes plain numbers instead of tensor reprs.
    avg_reward_values = results[kernel_type]['avg_reward'].tolist()
    steps = [min((k + 1) * batch_size, num_timesteps) for k in range(len(avg_reward_values))]
    final_results = list(zip(steps, avg_reward_values))

    # Save results to CSV file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{kernel_type}_tempcontrol_results_{timestamp}.csv"

    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Step', 'Average Reward'])  # Write header row
        csvwriter.writerows(final_results)

    print(f"Results saved to {filename}")


# Plotting average rewards
plt.figure(figsize=(10, 6))
for kernel_type, data in results.items():
    # Each reward is measured after another equal share of num_timesteps has
    # been collected, so the spacing is derived from the curve length.
    num_points = len(data['avg_reward'])
    x_values = (np.arange(num_points) + 1) * (num_timesteps / num_points)
    plt.plot(x_values, data['avg_reward'], label=f'Avg Reward for {kernel_type}')
plt.xlabel('Environment timesteps collected')
plt.ylabel('Average Reward')
plt.title('Average Reward Comparison for Different Kernels')
plt.legend()
plt.grid(True)
plt.show()

# Plotting average J values
plt.figure(figsize=(10, 6))
for kernel_type, data in results.items():
    plt.plot(data['avg_J'], label=f'Avg J for {kernel_type}')
plt.xlabel('Num of epochs')
plt.ylabel('Average J Value')
plt.title('J Value Comparison for Different Kernels')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Show the averaged reward curve of every kernel that was actually run, rather
# than a hard-coded key that is missing from `results` on a fresh run.
for kernel_type, data in results.items():
    print(kernel_type)
    print(data['avg_reward'])

tensor([-39.1320, -42.1590, -40.6780, -37.2000, -35.4380, -33.8320, -32.7620,
        -32.0380, -33.0840, -30.2030, -29.6730, -27.5660, -30.3230, -31.1230,
        -32.6730, -34.2290, -31.9810, -32.4060, -32.9190, -33.7960],
       dtype=torch.float64)


In [None]:
!pip install optuna plotly kaleido

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m18

In [None]:
import optuna
import plotly

def objective(trial):
    """Optuna objective: the last evaluation reward for a sampled configuration.

    Two hyperparameters are tuned, ``batch_size`` (500 to 5000) and
    ``epochs_per_batch`` (5 to 50); every other setting is fixed below.
    """
    # Search space
    sampled_batch_size = trial.suggest_int('batch_size', 500, 5000)
    sampled_epochs = trial.suggest_int('epochs_per_batch', 5, 50)

    # Settings held constant across trials
    fixed_env = 'TemperatureControl-v0'
    fixed_kernel = 'gaussian2'  # You can change this to 'cauchy1' if desired
    fixed_num_features = 30
    fixed_sigma = 0.1
    fixed_lr = 0.01
    fixed_gamma = 0.99
    fixed_regularizer = 0.001
    fixed_num_timesteps = 10000

    reward_curve, _ = run_experiment2(
        fixed_env, fixed_kernel, fixed_num_features, sampled_epochs, fixed_sigma,
        fixed_lr, fixed_gamma, fixed_regularizer, fixed_num_timesteps, sampled_batch_size,
    )

    # The study maximises the reward measured after the final batch.
    return reward_curve[-1]

def run_optimization_and_plot():
    """Run a 100-trial maximisation study, print the best trial and draw the contour plot."""
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)  # Adjust the number of trials as needed

    print("Best trial:")
    best = study.best_trial
    print(f"  Value: {best.value}")
    print("  Params: ")
    for name in best.params:
        print(f"    {name}: {best.params[name]}")

    # Create and display the contour plot
    plot_contour(study)

def plot_contour(study):
    """Save a batch_size / epochs_per_batch contour plot of ``study`` to 'contour_plot.png'.

    Nothing is drawn (an error message is printed instead) when the study's
    best parameters are not exactly those two names.
    """
    expected_params = ['batch_size', 'epochs_per_batch']
    tuned_params = set(study.best_params.keys())

    if tuned_params == set(expected_params):
        figure = optuna.visualization.plot_contour(study, params=expected_params)

        # In a notebook, figure.show() would display it inline; here the image
        # is written to disk instead.
        figure.write_image("contour_plot.png")
        print("Contour plot saved as 'contour_plot.png'")
    else:
        print("Error: The study parameters do not match what's required for the contour plot.")

# Start the hyperparameter study when this code runs as the main module.
if __name__ == "__main__":
    run_optimization_and_plot()

[I 2024-10-09 03:28:32,955] A new study created in memory with name: no-name-56200538-5155-4587-9287-017489080090
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  A = torch.linalg.solve(C_lambda, E).T
100%|██████████| 17/17 [00:00<00:00, 57.53it/s]
100%|██████████| 17/17 [00:00<00:00, 32.62it/s]
100%|██████████| 17/17 [00:00<00:00, 89.00it/s]
100%|██████████| 17/17 [00:00<00:00, 70.92it/s]
100%|██████████| 17/17 [00:00<00:00, 62.45it/s]
100%|██████████| 17/17 [00:00<00:00, 35.37it/s]
100%|██████████| 17/17 [00:00<00:00, 175.35it/s]
100%|██████████| 17/17 [00:00<00:00, 121.92it/s]
100%|██████████| 17/17 [00:00<00:00, 90.70it/s]
100%|██████████| 17/17 [00:00<00:00, 47.93it/s]
100%|██████████| 17/17 [00:00<00:00, 60.03it/s]
100%|██████████| 17/17 [00:00<00:00, 54.01it/s]
100%|██████████| 17/17 [00:00<00:00, 149.09it/s]
100%|██████████| 17/17 [00:00<00:00, 84.41it/s]
100%|██████████| 17/17 [00:00<00:00, 94.99it/s]
100%|██████████| 17/17 [00:00<00:00, 68.60it/s]