In [9]:
import pandas as pd
import gym
from gym import spaces
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import optuna

# Step 1: Load the Dataset
process_data_path = r"C:\Users\RAJJY\Downloads\process_data.csv"
df = pd.read_csv(process_data_path)
print(df.head())

# Step 2: Define the Custom CPU Scheduling Environment
class SarahcustomRLEnv(gym.Env):
    def __init__(self, process_data):
        super(SarahcustomRLEnv, self).__init__()
        self.process_data = process_data
        self.process_index = 0
        self.current_time = 0

        self.action_space = spaces.Discrete(len(self.process_data))
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(3,), dtype=np.float32)

    def step(self, action):
        selected_process = self.process_data.iloc[action]
        burst_time = selected_process['Burst time']
        self.current_time += burst_time
        reward = -burst_time

        self.process_index += 1
        done = self.process_index >= len(self.process_data)
        next_state = (
            np.array([self.current_time, self.process_data.iloc[self.process_index]['Burst time'], self.process_data.iloc[self.process_index]['Resources']])
            if not done else np.array([0, 0, 0])
        )

        return next_state, reward, done, {}

    def reset(self):
        self.current_time = 0
        self.process_index = 0
        initial_process = self.process_data.iloc[self.process_index]
        return np.array([self.current_time, initial_process['Burst time'], initial_process['Resources']])

    def render(self, mode='human', close=False):
        pass

    def seed(self, seed=None):
        pass

env = SarahcustomRLEnv(process_data=df)

def optimize_ppo(trial):
    # Define the hyperparameters search space
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    gamma = trial.suggest_uniform('gamma', 0.9, 0.999)
    n_steps = trial.suggest_categorical('n_steps', [2048, 4096, 8192])
    
    # Vectorize the environment for parallel training
    vec_env = make_vec_env(lambda: SarahcustomRLEnv(process_data=df), n_envs=4)

    # Create the PPO model
    model = PPO('MlpPolicy', vec_env, verbose=0, learning_rate=learning_rate, batch_size=batch_size, gamma=gamma, n_steps=n_steps)
    
    # Train the model
    model.learn(total_timesteps=10000)

    # Evaluate the model
    rewards = []
    for _ in range(10):
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        rewards.append(total_reward)

    return np.mean(rewards)

# Step 3: Create Optuna Study
study = optuna.create_study(direction='maximize')
study.optimize(optimize_ppo, n_trials=50)

# Step 4: Print the Best Hyperparameters
print('Best hyperparameters: ', study.best_params)

# Train the final model with the best hyperparameters
best_params = study.best_params
vec_env = make_vec_env(lambda: SarahcustomRLEnv(process_data=df), n_envs=4)
model = PPO('MlpPolicy', vec_env, verbose=1, **best_params)

model.learn(total_timesteps=10000)
model.save("Sarah_custom_model")

# Load the model (if needed)
model = PPO.load("Sarah_custom_model")

# Evaluate the Trained Agent
evaluation_metrics = {
    'total_rewards': [],
    'average_burst_time': [],
    'average_waiting_time': [],
    'average_turnaround_time': [],
    'resource_utilization': [],
    'throughput': []
}

for episode in range(10):
    obs = env.reset()
    total_reward = 0
    done = False
    process_times = []
    waiting_times = []
    turnaround_times = []
    resource_usage = 0
    start_time = 0

    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward

        process_times.append(env.process_data.iloc[action]['Burst time'])
        waiting_times.append(env.current_time - env.process_data.iloc[action]['Arrival Time'])
        turnaround_times.append(env.current_time - env.process_data.iloc[action]['Arrival Time'] + env.process_data.iloc[action]['Burst time'])
        resource_usage += env.process_data.iloc[action]['Resources']

    evaluation_metrics['total_rewards'].append(total_reward)
    evaluation_metrics['average_burst_time'].append(np.mean(process_times))
    evaluation_metrics['average_waiting_time'].append(np.mean(waiting_times))
    evaluation_metrics['average_turnaround_time'].append(np.mean(turnaround_times))
    evaluation_metrics['resource_utilization'].append(resource_usage / len(process_times))
    evaluation_metrics['throughput'].append(len(process_times) / (env.current_time - start_time))

# Print the averaged metrics
for metric, values in evaluation_metrics.items():
    print(f"{metric}: {np.mean(values)}")

env.reset()


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-07-14 12:26:38,808] A new study created in memory with name: no-name-81119d68-c666-4378-ba58-7022afafe39a


   Job Id  Burst time  Arrival Time  Prremptive  Resources
0     247         199        0.4100           0          8
1      29         193        0.5925           1          2
2     170          75        0.3600           1          4
3     164          42        0.9725           0          8
4     312         257        0.6125           0          4


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
  gamma = trial.suggest_uniform('gamma', 0.9, 0.999)
[I 2024-07-14 12:27:36,465] Trial 0 finished with value: -92664.4 and parameters: {'learning_rate': 0.008739813379879766, 'batch_size': 128, 'gamma': 0.9202638901137492, 'n_steps': 8192}. Best is trial 0 with value: -92664.4.
[I 2024-07-14 12:28:02,329] Trial 1 finished with value: -101716.0 and parameters: {'learning_rate': 0.00036198599807089676, 'batch_size': 256, 'gamma': 0.9244548302168227, 'n_steps': 4096}. Best is trial 0 with value: -92664.4.
[I 2024-07-14 12:28:34,074] Trial 2 finished with value: -97240.8 and parameters: {'learning_rate': 0.0038330730701084175, 'batch_size': 128, 'gamma': 0.989765221196286, 'n_steps': 4096}. Best is trial 0 with value: -92664.4.
[I 2024-07-14 12:29:19,583] Trial 3 finished with value: -101841.6 and parameters: {'learning_rate': 0.00017056381666650668, 'batch_size': 64, 'gamma': 0.9857073855695322, 'n_steps': 4096}. Best

Best hyperparameters:  {'learning_rate': 0.009090908217104239, 'batch_size': 32, 'gamma': 0.9394634895402538, 'n_steps': 2048}
Using cpu device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 500       |
|    ep_rew_mean     | -1.03e+05 |
| time/              |           |
|    fps             | 1071      |
|    iterations      | 1         |
|    time_elapsed    | 7         |
|    total_timesteps | 8192      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | -9.9e+04    |
| time/                   |             |
|    fps                  | 411         |
|    iterations           | 2           |
|    time_elapsed         | 39          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.024353597 |
|    clip_fraction        | 0.527       |
| 

array([  0., 199.,   8.])