<a href="https://colab.research.google.com/github/priyanshupriyank04/AI-ML-OS/blob/main/AI_ML%2BOS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#basic import utilities
import random  #for generating random process burst times and arrival times
import numpy as np #for numerical operations in the data prep section
import pandas as pd #for storing training data and process logs

from copy import deepcopy #for deep copying process objects during simulations

import math #for all math related operations

from sklearn.ensemble import RandomForestRegressor  #ML model to predict burst time
from sklearn.model_selection import train_test_split #for data split
from sklearn.metrics import mean_absolute_error, mean_squared_error #for evaluation
import warnings
from sklearn.exceptions import DataConversionWarning, NotFittedError
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names")

import pickle #for saving and loading trained ML models

import matplotlib.pyplot as plt #for plotting results

In [None]:
# Process class
# This class represents a single process in the system.
# It stores all the information regarding CPU + I/O simulation.

class Process:
    """
    One simulated process: an arrival time, a priority, a list of CPU bursts,
    a list of I/O bursts, and the bookkeeping fields the simulator updates.
    """

    def __init__(self, pid, arrival_time, cpu_bursts, io_bursts, priority=1):
        """
        Store the workload description and reset all runtime bookkeeping.

        cpu_bursts must contain at least one entry: its first element seeds
        remaining_burst_time.
        """
        # Static description of the process
        self.pid = pid
        self.arrival_time = arrival_time
        self.priority = priority
        self.cpu_bursts = cpu_bursts
        self.io_bursts = io_bursts

        # ML flags: neutral defaults here, assigned later by the generator
        self.process_type = None
        self.is_cpu_bound = self.is_io_bound = False

        # Execution cursor: which CPU burst is active and how much of it is left
        self.current_burst_index = 0
        self.remaining_burst_time = cpu_bursts[0]

        # Lifecycle state; starts out as "READY"
        self.state = "READY"

        # Timestamps that stay None until the simulator fills them in
        self.start_time = self.completion_time = self.response_time = None

        # Running counters
        self.waiting_time = 0
        self.turnaround_time = 0
        self.executed_time = 0

        # Extra history kept for ML feature logging
        self.total_wait_time = 0
        self.past_bursts = []

    def is_completed(self):
        """
        Tell whether the burst cursor has moved past the last CPU burst.
        """
        return not self.current_burst_index < len(self.cpu_bursts)

    def move_to_next_burst(self):
        """
        Advance the burst cursor by one and reload remaining_burst_time
        (0 once there is no further CPU burst).
        """
        self.current_burst_index += 1
        cursor = self.current_burst_index

        has_more = cursor < len(self.cpu_bursts)
        self.remaining_burst_time = self.cpu_bursts[cursor] if has_more else 0

    def get_next_io_burst(self):
        """
        Return io_bursts[current_burst_index], or None when the cursor is
        beyond the end of the I/O burst list.
        """
        cursor = self.current_burst_index
        if cursor >= len(self.io_bursts):
            return None
        return self.io_bursts[cursor]

    def __repr__(self):
        """
        Short debug string showing pid, arrival time and state.
        """
        return "Process(pid={}, arrival={}, state={})".format(
            self.pid, self.arrival_time, self.state
        )


In [None]:

# SECTION 3: WORKLOAD GENERATOR

# This section generates synthetic processes with:
#   - Random arrival times
#   - Random CPU burst sequences
#   - Random I/O burst sequences
# Optionally tags processes as CPU-bound or IO-bound.


def generate_random_cpu_bursts(num_bursts):
    """
    Build a list of `num_bursts` random CPU burst lengths.
    Every length is an integer drawn from 1..20 (both ends included).
    """
    bursts = []
    for _ in range(num_bursts):
        bursts.append(random.randint(1, 20))
    return bursts


def generate_random_io_bursts(num_bursts):
    """
    Build the I/O bursts that sit between CPU bursts.
    N CPU bursts yield N-1 I/O bursts (none at all when N <= 1).
    Every I/O burst is an integer drawn from 5..30 (both ends included).
    """
    io_count = max(num_bursts - 1, 0)
    return [random.randint(5, 30) for _ in range(io_count)]


def generate_processes(num_processes, tag_process_type=True):
    """
    Create `num_processes` Process objects with random attributes.

    Parameters:
    num_processes: how many processes to create (pids run from 1 upward)
    tag_process_type: when True, each process is randomly labelled
                      CPU_BOUND or IO_BOUND and its bursts are redrawn from
                      the ranges used for that label

    Returns:
    The processes as a list ordered by arrival time.
    """
    generated = []

    for pid in range(1, num_processes + 1):
        arrival_time = random.randint(0, 50)
        num_bursts = random.randint(2, 5)

        # Generic bursts are always drawn first; tagging replaces them below.
        cpu_bursts = generate_random_cpu_bursts(num_bursts)
        io_bursts = generate_random_io_bursts(num_bursts)

        process_type = "UNSPECIFIED"
        if tag_process_type:
            process_type = random.choice(["CPU_BOUND", "IO_BOUND"])
            cpu_heavy = process_type == "CPU_BOUND"

            # CPU-bound: long CPU bursts, short I/O. IO-bound: the reverse.
            cpu_low, cpu_high = (10, 20) if cpu_heavy else (1, 8)
            io_low, io_high = (5, 15) if cpu_heavy else (15, 30)

            cpu_bursts = [random.randint(cpu_low, cpu_high) for _ in range(num_bursts)]
            io_bursts = [random.randint(io_low, io_high) for _ in range(num_bursts - 1)]

        process = Process(
            pid=pid,
            arrival_time=arrival_time,
            cpu_bursts=cpu_bursts,
            io_bursts=io_bursts,
            priority=random.randint(1, 5)
        )

        # Type label and the boolean flags read by the ML schedulers
        process.process_type = process_type
        process.is_cpu_bound = process_type == "CPU_BOUND"
        process.is_io_bound = process_type == "IO_BOUND"

        generated.append(process)

    # Earliest arrival first (stable for equal arrival times)
    return sorted(generated, key=lambda proc: proc.arrival_time)


def export_processes_to_csv(process_list, filename="process_workload.csv"):
    """
    Optional helper: write the workload to `filename` as CSV (one row per
    process, no index column) and return the DataFrame that was written.
    """
    rows = [
        {
            "PID": proc.pid,
            "Arrival_Time": proc.arrival_time,
            "CPU_Bursts": proc.cpu_bursts,
            "IO_Bursts": proc.io_bursts,
            "Priority": proc.priority,
            "Process_Type": proc.process_type,
        }
        for proc in process_list
    ]

    frame = pd.DataFrame(rows)
    frame.to_csv(filename, index=False)
    return frame


In [None]:
# Smoke test: build a small tagged workload and print every process together
# with its CPU bursts, I/O bursts and type label.
processes = generate_processes(5)
for p in processes:
    print(p, p.cpu_bursts, p.io_bursts, p.process_type)


Process(pid=1, arrival=9, state=READY) [13, 16, 12, 16, 14] [14, 6, 12, 13] CPU_BOUND
Process(pid=3, arrival=17, state=READY) [17, 12, 15, 15] [7, 9, 12] CPU_BOUND
Process(pid=4, arrival=20, state=READY) [7, 6] [19] IO_BOUND
Process(pid=5, arrival=41, state=READY) [11, 15] [15] CPU_BOUND
Process(pid=2, arrival=44, state=READY) [6, 1, 2, 1] [23, 27, 28] IO_BOUND


In [None]:
def select_process_ml_extended(ready_queue, model):
    """
    Select the process with the lowest *predicted next CPU burst*.

    Parameters:
    ready_queue: list of Process objects currently waiting for the CPU
    model: fitted regressor exposing predict(); it receives one 13-value row
           per process, in the same column order that log_training_sample
           writes (prev1, prev2, arrival_time, process_type, priority,
           total_cpu_used, total_waited, avg_past, var_past,
           remaining_bursts, io_ratio, is_cpu_bound, is_io_bound)

    Returns:
    The process with the smallest prediction (the earliest one in the queue
    wins ties), or None when the queue is empty.
    """

    if not ready_queue:
        return None

    rows = []
    for p in ready_queue:
        history = p.past_bursts

        # Burst history features (0 when there is no history yet)
        prev1 = history[-1] if history else 0
        prev2 = history[-2] if len(history) >= 2 else 0
        avg_past = np.mean(history) if history else 0
        var_past = np.var(history) if len(history) > 1 else 0

        remaining_bursts = len(p.cpu_bursts) - p.current_burst_index - 1
        total_cpu_used = p.executed_time
        total_waited = p.total_wait_time

        total_io_so_far = sum(p.io_bursts[:p.current_burst_index]) if p.current_burst_index > 0 else 0
        io_ratio = (total_io_so_far / total_cpu_used) if total_cpu_used > 0 else 0

        is_cpu_bound = 1 if p.is_cpu_bound else 0
        is_io_bound = 1 if p.is_io_bound else 0

        # Type code must match the training encoding:
        # 1 = CPU-bound, 2 = IO-bound, 0 = neither. Previously an untagged
        # process was encoded as 2, i.e. mislabelled as IO-bound.
        if p.is_cpu_bound:
            proc_type = 1
        elif p.is_io_bound:
            proc_type = 2
        else:
            proc_type = 0

        rows.append([
            prev1, prev2, p.arrival_time, proc_type, p.priority,
            total_cpu_used, total_waited,
            avg_past, var_past, remaining_bursts,
            io_ratio, is_cpu_bound, is_io_bound
        ])

    # One batched predict() call for the whole queue instead of one per process.
    raw_predictions = model.predict(np.array(rows, dtype=float))

    # Round to whole ticks and clamp to at least 1.
    predicted = [max(1, int(round(float(value)))) for value in raw_predictions]

    # Choose process with minimum predicted burst time (first one on ties)
    best_index = min(range(len(predicted)), key=predicted.__getitem__)
    return ready_queue[best_index]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

def train_ml_models(df):
    """
    Fit a Random Forest and an XGBoost regressor on the same dataset.

    Every column of `df` except "next_burst" is used as a feature;
    "next_burst" is the regression target.

    Returns:
    (rf, xgb) - the two fitted models, in that order.
    """
    features = df.drop(columns=["next_burst"]).values
    target = df["next_burst"].values

    # Random Forest (old)
    rf_params = {
        "n_estimators": 200,
        "max_depth": 10,
        "random_state": 42,
    }
    rf = RandomForestRegressor(**rf_params)
    rf.fit(features, target)

    # XGBoost (new)
    xgb_params = {
        "n_estimators": 250,
        "learning_rate": 0.08,
        "max_depth": 6,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_alpha": 1.5,
        "reg_lambda": 1.0,
        "objective": "reg:squarederror",
    }
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(features, target)

    print("Models trained: RF + XGB")

    return rf, xgb


In [None]:

# SECTION 4: CPU SCHEDULING SIMULATOR (WITH ML LOGGING SUPPORT)

# This section implements:
#   - Ready queue
#   - Waiting (I/O) queue
#   - Global clock
#   - Simulation loop
#   - FCFS, SJF, SRTF scheduling policies
#   - ML training data logging inside step 6.6



def add_arriving_processes(clock, all_processes, ready_queue):
    """
    Append to ready_queue every process that is in state "READY" and whose
    arrival_time equals the current clock tick.
    """
    ready_queue.extend(
        candidate
        for candidate in all_processes
        if candidate.arrival_time == clock and candidate.state == "READY"
    )


def update_waiting_queue(waiting_queue, ready_queue):
    """
    Advance every pending I/O burst by one tick.

    Parameters:
    waiting_queue: list of (process, remaining_io) tuples; rewritten in place
    ready_queue: list that receives processes whose I/O has finished

    A process whose remaining I/O reaches zero (or was already zero or
    negative) is set to state "READY" and appended to ready_queue.
    Entries whose remaining_io is None are dropped from the waiting queue
    without being moved anywhere.
    """
    still_waiting = []

    for process, remaining_io in waiting_queue:
        if remaining_io is None:
            continue

        remaining_io -= 1  # One tick of I/O completed

        # "<= 0" rather than "== 0": a zero-length burst would otherwise
        # drop to -1 and never be released from the waiting queue.
        if remaining_io <= 0:
            process.state = "READY"
            ready_queue.append(process)
        else:
            still_waiting.append((process, remaining_io))

    # Keep the caller's list object, replace only its contents.
    waiting_queue[:] = still_waiting




# SCHEDULER SELECTION FUNCTIONS


def select_process_fcfs(ready_queue):
    """First-Come First-Served: the head of the queue, or None if it is empty."""
    return ready_queue[0] if ready_queue else None


def select_process_sjf(ready_queue):
    """
    Shortest Job First (non-preemptive): the queued process with the smallest
    remaining_burst_time, earliest entry winning ties; None for an empty queue.
    """
    shortest = None
    for candidate in ready_queue or ():
        if shortest is None or candidate.remaining_burst_time < shortest.remaining_burst_time:
            shortest = candidate
    return shortest


def select_process_srtf(ready_queue):
    """
    Shortest Remaining Time First (preemptive): the queued process with the
    least remaining_burst_time, or None when nothing is queued.
    """
    def time_left(proc):
        return proc.remaining_burst_time

    return min(ready_queue, key=time_left) if ready_queue else None




# MAIN SIMULATION FUNCTION


# Policy names accepted by run_scheduler.
_SCHEDULING_ALGORITHMS = ("fcfs", "sjf", "srtf", "ml-rf", "ml-xgb")


def _pick_next_process(algorithm, ready_queue):
    """Return the process `algorithm` selects from ready_queue (None if nothing is ready)."""
    if algorithm == "fcfs":
        return select_process_fcfs(ready_queue)
    if algorithm == "sjf":
        return select_process_sjf(ready_queue)
    if algorithm == "srtf":
        return select_process_srtf(ready_queue)
    # The ML policies read the module-level models rf_model / xgb_model.
    if algorithm == "ml-rf":
        return select_process_ml_extended(ready_queue, rf_model)
    if algorithm == "ml-xgb":
        return select_process_ml_extended(ready_queue, xgb_model)
    raise ValueError(f"Unknown scheduling algorithm: {algorithm!r}")


def _start_running(process, clock):
    """Mark `process` as RUNNING and record start/response time on its first dispatch."""
    if process.start_time is None:
        process.start_time = clock

    if process.response_time is None:
        process.response_time = clock - process.arrival_time

    process.state = "RUNNING"


def run_scheduler(process_list, algorithm="fcfs"):
    """
    Run a tick-by-tick CPU scheduling simulation.
    Also logs one ML sample (via log_training_sample) whenever a CPU burst completes.

    Parameters:
    process_list: Process objects to simulate. The list is deep-copied, so
                  the caller's objects are not modified. Arrivals are matched
                  with "arrival_time == clock", so arrival times must be
                  non-negative integer ticks.
    algorithm: one of "fcfs", "sjf", "srtf", "ml-rf", "ml-xgb". The two ML
               policies use the module-level models rf_model and xgb_model.

    Returns:
    A dict with "avg_waiting_time", "avg_turnaround_time",
    "avg_response_time" and "context_switches" (number of dispatches).
    For an empty process_list all four values are zero.

    Raises:
    ValueError: if `algorithm` is not one of the supported names
                (previously this made the loop run forever).
    """

    if algorithm not in _SCHEDULING_ALGORITHMS:
        raise ValueError(
            f"Unknown scheduling algorithm: {algorithm!r}; "
            f"expected one of {_SCHEDULING_ALGORITHMS}"
        )

    processes = deepcopy(process_list)
    total_processes = len(processes)

    # Nothing to simulate: avoid dividing by zero in the averages below.
    if total_processes == 0:
        return {
            "avg_waiting_time": 0.0,
            "avg_turnaround_time": 0.0,
            "avg_response_time": 0.0,
            "context_switches": 0
        }

    ready_queue = []
    waiting_queue = []
    clock = 0

    current_process = None
    context_switches = 0
    terminated_count = 0

    while terminated_count < total_processes:

        # Step 6.1: Add newly arrived processes
        add_arriving_processes(clock, processes, ready_queue)

        # Step 6.2: Update waiting(I/O) queue
        update_waiting_queue(waiting_queue, ready_queue)

        # Step 6.3: If CPU idle, choose next process
        if current_process is None:
            current_process = _pick_next_process(algorithm, ready_queue)

            if current_process is not None:
                ready_queue.remove(current_process)
                context_switches += 1
                _start_running(current_process, clock)

        # Preemption check (SRTF only)
        elif algorithm == "srtf" and ready_queue:
            shortest_ready = select_process_srtf(ready_queue)

            if shortest_ready.remaining_burst_time < current_process.remaining_burst_time:
                current_process.state = "READY"
                ready_queue.append(current_process)

                ready_queue.remove(shortest_ready)
                current_process = shortest_ready
                context_switches += 1
                _start_running(current_process, clock)

        # Process that must rejoin the ready queue after this tick's
        # waiting-time accounting (it was running, not waiting, this tick).
        requeue_after_tick = None

        # Step 6.4: Run CPU for 1 time unit
        if current_process is not None:
            current_process.remaining_burst_time -= 1
            current_process.executed_time += 1

            # Step 6.6: Burst completion.
            # "<= 0" also covers a zero-length burst, which takes one tick;
            # this replaces the clamp that only the "ml-xgb" branch applied.
            if current_process.remaining_burst_time <= 0:

                # Index and length of the burst that just finished
                finished_index = current_process.current_burst_index
                just_completed_burst = current_process.cpu_bursts[finished_index]
                current_process.past_bursts.append(just_completed_burst)

                # Next burst length is the ML target (None after the last burst)
                if finished_index + 1 < len(current_process.cpu_bursts):
                    next_burst_target = current_process.cpu_bursts[finished_index + 1]
                else:
                    next_burst_target = None

                # Log ML training sample (before the burst index advances)
                log_training_sample(current_process, next_burst_target)

                # Read the I/O burst that follows the finished CPU burst
                # BEFORE advancing the index. Reading it afterwards skipped
                # io_bursts[0] and ended every process one CPU burst early.
                next_io = current_process.get_next_io_burst()

                current_process.move_to_next_burst()

                if current_process.is_completed():
                    current_process.state = "TERMINATED"
                    current_process.completion_time = clock + 1
                    terminated_count += 1
                elif next_io is not None and next_io > 0:
                    current_process.state = "WAITING"
                    waiting_queue.append((current_process, next_io))
                else:
                    # More CPU work but no I/O in between: back to ready.
                    current_process.state = "READY"
                    requeue_after_tick = current_process

                current_process = None  # CPU idle

        # Step 6.7: Update waiting timers
        for p in ready_queue:
            p.waiting_time += 1
            p.total_wait_time += 1

        if requeue_after_tick is not None:
            ready_queue.append(requeue_after_tick)

        clock += 1

    # Final metrics
    total_wait = sum(p.waiting_time for p in processes)
    total_turnaround = sum(p.completion_time - p.arrival_time for p in processes)
    total_response = sum(p.response_time for p in processes)

    metrics = {
        "avg_waiting_time": total_wait / total_processes,
        "avg_turnaround_time": total_turnaround / total_processes,
        "avg_response_time": total_response / total_processes,
        "context_switches": context_switches
    }

    return metrics


In [None]:

# SECTION 5: LOGGING TRAINING DATA FOR ML

# Logs training samples every time a CPU burst completes.
# Now includes extended feature set.


ml_training_data = []   # Global dataset storage (one dict per logged sample)


def log_training_sample(process, next_burst):
    """
    Append one ML training row to ml_training_data when a CPU burst finishes.

    Parameters:
        process     : Process object whose burst just completed
        next_burst  : Length of the following CPU burst (target label);
                      None is stored as 0
    """

    history = process.past_bursts
    history_len = len(history)
    kind = process.process_type
    burst_index = process.current_burst_index
    cpu_used = process.executed_time

    # I/O time of the bursts before the current index, relative to CPU time used
    io_so_far = sum(process.io_bursts[:burst_index]) if burst_index > 0 else 0
    io_ratio = (io_so_far / cpu_used) if cpu_used > 0 else 0

    # Key order defines the column order of the saved dataset.
    ml_training_data.append({
        # OLD features (order unchanged)
        "prev1": history[-1] if history_len >= 1 else 0,
        "prev2": history[-2] if history_len >= 2 else 0,
        "arrival_time": process.arrival_time,
        # Numeric type code: CPU_BOUND -> 1, IO_BOUND -> 2, anything else -> 0
        "process_type": 1 if kind == "CPU_BOUND" else 2 if kind == "IO_BOUND" else 0,
        "priority": process.priority,
        "total_cpu_used": cpu_used,
        "total_waited": process.total_wait_time,

        # NEW extended features
        "avg_past": np.mean(history) if history_len > 0 else 0,
        "var_past": np.var(history) if history_len > 1 else 0,
        "remaining_bursts": len(process.cpu_bursts) - burst_index - 1,
        "io_ratio": io_ratio,
        "is_cpu_bound": int(kind == "CPU_BOUND"),
        "is_io_bound": int(kind == "IO_BOUND"),

        # TARGET
        "next_burst": 0 if next_burst is None else next_burst
    })



def save_ml_dataset(filename="ml_training_dataset.csv"):
    """
    Turn the samples collected in ml_training_data into a DataFrame,
    write it to `filename` as CSV (no index column), and return it so it
    can be used for training straight away.
    """
    dataset = pd.DataFrame(ml_training_data)
    dataset.to_csv(filename, index=False)
    return dataset


In [None]:
# Compare the three classic policies on one shared workload.
# run_scheduler deep-copies its input, so each policy starts from the same
# untouched processes. Every run also appends samples to ml_training_data.
processes = generate_processes(5)

print("FCFS:", run_scheduler(processes, algorithm="fcfs"))
print("SJF :", run_scheduler(processes, algorithm="sjf"))
print("SRTF:", run_scheduler(processes, algorithm="srtf"))


FCFS: {'avg_waiting_time': 70.8, 'avg_turnaround_time': 130.8, 'avg_response_time': 15.6, 'context_switches': 15}
SJF : {'avg_waiting_time': 44.0, 'avg_turnaround_time': 104.0, 'avg_response_time': 26.2, 'context_switches': 15}
SRTF: {'avg_waiting_time': 43.2, 'avg_turnaround_time': 103.2, 'avg_response_time': 18.4, 'context_switches': 17}


In [None]:
ml_training_data = []  # reset dataset so only the run below contributes samples

# One FCFS simulation over 15 processes; each completed CPU burst logs a sample.
processes = generate_processes(15)
run_scheduler(processes, "fcfs")

# Write the samples to the default CSV file and preview the first rows.
df = save_ml_dataset()
df.head()


Unnamed: 0,prev1,prev2,arrival_time,process_type,priority,total_cpu_used,total_waited,avg_past,var_past,remaining_bursts,io_ratio,is_cpu_bound,is_io_bound,next_burst
0,14,0,6,1,5,14,0,14.0,0.0,3,0.0,1,0,12
1,20,0,13,1,4,20,7,20.0,0.0,4,0.0,1,0,17
2,7,0,21,2,3,7,19,7.0,0.0,1,0.0,0,1,5
3,6,0,23,2,2,6,24,6.0,0.0,1,0.0,0,1,5
4,3,0,25,2,4,3,28,3.0,0.0,4,0.0,0,1,1


In [None]:

# SECTION 6: ML MODEL TRAINING


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Features and target.
# Use every logged feature column rather than only the first seven, so the
# model is fitted on the same 13 values that extract_features supplies at
# prediction time.
X = df.drop(columns=["next_burst"])

y = df["next_burst"]

# Train-test split: 80% train / 20% test, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Train ML model: a Random Forest regressor (200 trees, depth capped at 10,
# fixed seed) fitted on the training split from the previous cell.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf_model.fit(X_train, y_train)


In [None]:
# Predictions on the held-out test split
preds = rf_model.predict(X_test)

# Mean absolute error, and root of the mean squared error
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("Model Performance:")
print("MAE :", mae)
print("RMSE:", rmse)


Model Performance:
MAE : 2.0642857142857145
RMSE: 2.419321038166346


In [None]:
# Persist the fitted Random Forest to disk with joblib.
import joblib
joblib.dump(rf_model, "trained_scheduler_model.pkl")


['trained_scheduler_model.pkl']

In [None]:
def extract_features(process):
    """
    Build the 13-value ML input row for one process.
    The order must match the feature columns of the training dataset:
    prev1, prev2, arrival_time, process_type, priority, total_cpu_used,
    total_waited, avg_past, var_past, remaining_bursts, io_ratio,
    is_cpu_bound, is_io_bound.
    """
    history = process.past_bursts
    history_len = len(history)
    kind = process.process_type
    burst_index = process.current_burst_index
    cpu_used = process.executed_time

    # Numeric type code: CPU_BOUND -> 1, IO_BOUND -> 2, anything else -> 0
    type_code = 1 if kind == "CPU_BOUND" else 2 if kind == "IO_BOUND" else 0

    # I/O time of the bursts before the current index, relative to CPU time used
    io_so_far = sum(process.io_bursts[:burst_index]) if burst_index > 0 else 0
    io_ratio = (io_so_far / cpu_used) if cpu_used > 0 else 0

    # OLD fields first (order unchanged), extended fields after them
    return [
        history[-1] if history_len >= 1 else 0,
        history[-2] if history_len >= 2 else 0,
        process.arrival_time,
        type_code,
        process.priority,
        cpu_used,
        process.total_wait_time,
        np.mean(history) if history_len > 0 else 0,
        np.var(history) if history_len > 1 else 0,
        len(process.cpu_bursts) - burst_index - 1,
        io_ratio,
        int(kind == "CPU_BOUND"),
        int(kind == "IO_BOUND"),
    ]


In [None]:

# SECTION 8: ML-BASED PROCESS SELECTOR


def select_process_ml(ready_queue, model):
    """
    ML-based selection: return the queued process whose predicted next CPU
    burst is smallest (earliest entry wins ties), or None for an empty queue.
    Predictions are truncated to int and clamped to a minimum of 1.
    """
    if not ready_queue:
        return None

    best_process = None
    best_burst = None

    for candidate in ready_queue:
        raw_prediction = model.predict([extract_features(candidate)])[0]

        # Never let a prediction fall below one tick
        burst = max(1, int(raw_prediction))

        if best_burst is None or burst < best_burst:
            best_process, best_burst = candidate, burst

    return best_process



In [None]:

# STEP 17: Generate LARGE ML training dataset


print("STEP 17 → Generating large ML dataset...")

ml_training_data = []   # reset dataset

NUM_BATCHES = 30            # number of independent simulations
PROCESSES_PER_BATCH = 50     # number of processes per simulation

# Each batch is a fresh random workload; every completed CPU burst in the
# simulation appends one sample to ml_training_data.
for b in range(NUM_BATCHES):
    processes = generate_processes(PROCESSES_PER_BATCH)

    # Run scheduler in FCFS mode (good for learning burst behavior)
    run_scheduler(processes, "fcfs")

print("Large ML dataset created!")
print("Total samples collected:", len(ml_training_data))


STEP 17 → Generating large ML dataset...
Large ML dataset created!
Total samples collected: 3674


In [None]:

# STEP 18: Save dataset & Train the Random Forest


print("\nSTEP 18 → Training Random Forest on large dataset...")

df = save_ml_dataset()   # convert logged samples to DataFrame (also writes the CSV)

# Every column except the target is used as a feature.
X = df.drop("next_burst", axis=1)
y = df["next_burst"]

from sklearn.model_selection import train_test_split

# 80% train / 20% test, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Replaces the earlier rf_model with one trained on the larger dataset.
rf_model = RandomForestRegressor(
    n_estimators=250,
    max_depth=12,
    n_jobs=-1,
    random_state=42
)

rf_model.fit(X_train, y_train)

print("Training complete!")



STEP 18 → Training Random Forest on large dataset...
Training complete!


In [None]:

# STEP 19: Evaluate predictive accuracy


from sklearn.metrics import mean_absolute_error, r2_score

# Score the retrained model on the held-out test split.
pred = rf_model.predict(X_test)

print("\nMODEL PERFORMANCE")
print("MAE:", mean_absolute_error(y_test, pred))
print("R² :", r2_score(y_test, pred))



MODEL PERFORMANCE
MAE: 2.4850318480067894
R² : 0.7630251743387675


In [None]:

# # STEP 20: Benchmark ML vs FCFS, SJF, SRTF


# print("\nSTEP 20 → Benchmarking ML vs classic schedulers")

# # Generate a brand new workload for fair comparison
# test_processes = generate_processes(25)

# # Run all schedulers on IDENTICAL deep copies
# import copy
# fcfs_result = run_scheduler(copy.deepcopy(test_processes), "fcfs")
# sjf_result  = run_scheduler(copy.deepcopy(test_processes), "sjf")
# srtf_result = run_scheduler(copy.deepcopy(test_processes), "srtf")
# ml_result   = run_scheduler(copy.deepcopy(test_processes), "ml")

# print("\n=== FINAL COMPARISON ===")
# print("FCFS :", fcfs_result)
# print("SJF  :", sjf_result)
# print("SRTF :", srtf_result)
# print("ML   :", ml_result)


In [None]:

# FULL PIPELINE: DATA GENERATION → MODEL TRAINING → BENCHMARKING


print("STEP 1: Reset ML dataset ")
ml_training_data = []



# STEP 2: Generate large ML dataset by running schedulers

print("STEP 2: Generating ML training dataset")

NUM_WORKLOADS = 50      # You can increase to 100 or 200 if needed
PROCS_PER_WORKLOAD = 25

for w in range(NUM_WORKLOADS):
    procs = generate_processes(PROCS_PER_WORKLOAD)

    # Run classic schedulers (these log ML samples).
    # Each call deep-copies procs, so all three policies see the same workload.
    run_scheduler(procs, "fcfs")
    run_scheduler(procs, "sjf")
    run_scheduler(procs, "srtf")

    # Progress message every tenth workload
    if w % 10 == 0:
        print(f"  → Processed workload {w}/{NUM_WORKLOADS}")

print("Dataset collection complete.")
print(f"Total ML samples: {len(ml_training_data)}")


# -------------------------------------------------------------
# STEP 3: Save Dataset
# -------------------------------------------------------------
print("\n STEP 3: Saving dataset to CSV")
df = save_ml_dataset("ml_training_dataset.csv")
print(df.head())


# -------------------------------------------------------------
# STEP 4: Train RF + XGB Models
# -------------------------------------------------------------
# The resulting rf_model / xgb_model globals are the models that
# run_scheduler uses for the "ml-rf" and "ml-xgb" policies.
print("\n STEP 4: Training ML models (RF & XGB)")
rf_model, xgb_model = train_ml_models(df)
print("Models trained successfully.")





=== STEP 1: Reset ML dataset ===
=== STEP 2: Generating ML training dataset... ===
  → Processed workload 0/50
  → Processed workload 10/50
  → Processed workload 20/50
  → Processed workload 30/50
  → Processed workload 40/50
Dataset collection complete.
Total ML samples: 9228

=== STEP 3: Saving dataset to CSV ===
   prev1  prev2  arrival_time  process_type  priority  total_cpu_used  \
0      5      0             0             2         4               5   
1      3      0             4             2         1               3   
2     20      0             6             1         2              20   
3     16      0            12             1         5              16   
4      3      0            15             2         1               3   

   total_waited  avg_past  var_past  remaining_bursts  io_ratio  is_cpu_bound  \
0             0       5.0       0.0                 4       0.0             0   
1             1       3.0       0.0                 3       0.0             0   


In [None]:

# STEP 5: Benchmark All Schedulers

print("\n STEP 5: Benchmarking all schedulers ")

# A fresh workload, separate from the workloads used to build the dataset
TEST_PROCESSES = 20
test_procs = generate_processes(TEST_PROCESSES)

# Every policy runs on its own copy of the same workload. run_scheduler
# deep-copies its input as well, so the explicit deepcopy is redundant but harmless.
# "ml-rf" and "ml-xgb" rely on the rf_model / xgb_model globals being defined.
results = {
    "FCFS": run_scheduler(deepcopy(test_procs), "fcfs"),
    "SJF": run_scheduler(deepcopy(test_procs), "sjf"),
    "SRTF": run_scheduler(deepcopy(test_procs), "srtf"),
    "ML-RF": run_scheduler(deepcopy(test_procs), "ml-rf"),
    "ML-XGB": run_scheduler(deepcopy(test_procs), "ml-xgb"),
}

# Pretty-print results
print("\n FINAL COMPARISON ")
for name, metrics in results.items():
    print(f"{name:6} : {metrics}")

print("\nDONE")


=== STEP 5: Benchmarking all schedulers ===

=== FINAL COMPARISON ===
FCFS   : {'avg_waiting_time': 242.0, 'avg_turnaround_time': 288.5, 'avg_response_time': 75.75, 'context_switches': 52}
SJF    : {'avg_waiting_time': 132.25, 'avg_turnaround_time': 178.75, 'avg_response_time': 95.05, 'context_switches': 52}
SRTF   : {'avg_waiting_time': 131.95, 'avg_turnaround_time': 178.45, 'avg_response_time': 93.75, 'context_switches': 64}
ML-RF  : {'avg_waiting_time': 159.25, 'avg_turnaround_time': 205.75, 'avg_response_time': 69.4, 'context_switches': 52}
ML-XGB : {'avg_waiting_time': 172.25, 'avg_turnaround_time': 218.75, 'avg_response_time': 66.75, 'context_switches': 52}

=== DONE ===
