### Imports

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from siml import BinaryClassificationSimulation
import pandas as pd
import numpy as np

### Parameter definitions

In [2]:
# Set numpy seed
np.random.seed(42)

# Parameters for data generation
n_samples = 1000
n_features = 20
n_classes = 2
n_informative = 5
n_redundant = 2
test_training_split = 0.02

# Parameters for the scheduling
n_jobs = int(n_samples * test_training_split)
jobs = list(range(n_jobs))
weights = np.random.randint(10, 100, size=n_classes)
durations = np.random.randint(1, 10, size=n_jobs)

### Functions related to the scheduling problem

In [3]:
# Solve the scheduling problem using the STP rule
def solve_scheduling(job_class):
    job_weight = [weights[job_class[j]] for j in jobs]
    ratios = [durations[j] / job_weight[j] for j in jobs]
    solution = [x for _, x in sorted(zip(ratios, jobs), reverse=True)]
    return solution

# Compute the weighted completion time of a solution
def compute_weighted_completion_time(job_class, solution):
    job_weight = [weights[job_class[j]] for j in jobs]
    completion_time = 0
    weighted_completion_time = 0
    for j in solution:
        completion_time += durations[j]
        weighted_completion_time += job_weight[j] * completion_time
    return weighted_completion_time


### Functions related to the confusion matrices

In [4]:
# Compute TPR and FPR from confusion matrix
def compute_TPR_FPR(conf_matrix):
    tp, fn, fp, tn = conf_matrix.ravel()
    TPR_m = tp / (tp + fn) if (tp + fn) != 0 else 0
    FPR_m = fp / (fp + tn) if (fp + tn) != 0 else 0
    return TPR_m, FPR_m

# Generate all sum of n with n non-negative integers
def generate_ordered_partitions(n, m):
    result = []
    def backtrack(path, remaining, depth):
        if depth == m:
            if remaining == 0:
                result.append(path[:])
            return
        for i in range(remaining + 1):  # allow zero
            path.append(i)
            backtrack(path, remaining - i, depth + 1)
            path.pop()

    backtrack([], n, 0)
    return result

# Generate all valid confusion matrices for a given number of classes and samples
def generate_valid_confusion_matrices(n_classes, n_per_class):
    possible_row_values = []
    for c in range(n_classes):
        possible_row_values.append(generate_ordered_partitions(n_per_class[c], n_classes))

    matrices = []
    for row_0 in possible_row_values[0]:
        for row_1 in possible_row_values[1]:
            matrices.append(np.array([row_0, row_1]))

    return matrices

### Generate synthetic dataset and get the true optimal objective value

In [5]:
# Generate a multiclass classification dataset
X, y = make_classification(n_samples=n_samples,
                           n_features=n_features,
                           n_classes=n_classes,
                           n_informative=n_informative,
                           n_redundant=n_redundant,
                           class_sep=0.5,  # less separation between classes
                           flip_y=0.2,  # label noise
                           random_state=42)

X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(n_features)])
y = pd.Series(y, name="class_label")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_training_split, random_state=42)
y_test = y_test.reset_index(drop=True)
all_classes = np.unique(y)
n_per_class = ()
for cls in all_classes:
    n_per_class += (y_test.value_counts()[cls],)

# Solve the optimization problem using y_test
true_solution = solve_scheduling(y_test)
true_optimal_makespan = compute_weighted_completion_time(y_test, true_solution)

### Generate all valid confusion matrices and simulate their predictions

In [6]:
# Try to simulate all possible confusion n_classes x n_classes matrices for the y_test instances
matrices = generate_valid_confusion_matrices(n_classes, n_per_class)

for m, matrix in enumerate(matrices):

    # Calculate macro TPR and FPR for the generated matrix
    TPR_m, FPR_m = compute_TPR_FPR(matrix)

    # Simulate predictions using the generated TPR and FPR
    TPR_sims = []
    FPR_sims = []
    gaps = []
    obj_vals = []
    for seed in range(10):
        siml = BinaryClassificationSimulation(seed=seed)
        y_simulated = [siml.simulate(tc, TPR_m, FPR_m) for tc in y_test]
        conf_matrix_sim = confusion_matrix(y_test, y_simulated, labels=all_classes)
        TPR_sim, FPR_sim = compute_TPR_FPR(conf_matrix_sim)

        TPR_sims.append(TPR_sim)
        FPR_sims.append(FPR_sim)

        # Solve the optimization problem using y_simulated
        sim_solution = solve_scheduling(y_simulated)
        sim_makespan = compute_weighted_completion_time(y_test, sim_solution)
        gap = (true_optimal_makespan - sim_makespan) / true_optimal_makespan
        gaps.append(gap)
        obj_vals.append(sim_makespan)

    print(f"{m};{TPR_m};{np.mean(TPR_sims)};{np.std(TPR_sims)};{np.min(TPR_sims)};{np.max(TPR_sims)};{FPR_m};{np.mean(FPR_sims)};{np.std(FPR_sims)};{np.min(FPR_sims)};{np.max(FPR_sims)};{np.mean(obj_vals)};{np.std(obj_vals)};{np.mean(gaps)};{np.std(gaps)}")

0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;62541.0;0.0;0.0661480342237685;0.0
1;0.0;0.0;0.0;0.0;0.0;0.1111111111111111;0.07777777777777777;0.111665284679121;0.0;0.3333333333333333;61660.8;1165.852975293197;0.07929103641874842;0.017408325622929283
2;0.0;0.0;0.0;0.0;0.0;0.2222222222222222;0.1777777777777778;0.17356110390903676;0.0;0.5555555555555556;60733.0;1439.1219545264398;0.09314479401532005;0.021488733250607572
3;0.0;0.0;0.0;0.0;0.0;0.3333333333333333;0.2888888888888889;0.1507184440694504;0.0;0.5555555555555556;59462.0;2048.326585288586;0.1121231577847128;0.030585276989870038
4;0.0;0.0;0.0;0.0;0.0;0.4444444444444444;0.4;0.17356110390903678;0.1111111111111111;0.6666666666666666;57799.8;2946.253512513816;0.13694285586298546;0.043992974757937266
5;0.0;0.0;0.0;0.0;0.0;0.5555555555555556;0.5222222222222223;0.1724908299584447;0.2222222222222222;0.7777777777777778;56608.2;2885.9829798527917;0.15473563184064743;0.04309302503849117
6;0.0;0.0;0.0;0.0;0.0;0.6666666666666666;0.6333333333333333;0

### Train actual models on the dataset

In [8]:
# Train XGB and solve
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_xgb = xgb.predict(X_test)
conf_matrix_xgb = confusion_matrix(y_test, y_xgb, labels=all_classes)
TPR_xgb, FPR_xgb = compute_TPR_FPR(conf_matrix_xgb)
xgb_solution = solve_scheduling(y_xgb)
xgb_makespan = compute_weighted_completion_time(y_test, xgb_solution)
xgb_gap = (true_optimal_makespan - xgb_makespan) / true_optimal_makespan
print(f"XGB;{TPR_xgb};{FPR_xgb};{xgb_makespan};{xgb_gap}")

# Train LR and solve
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
conf_matrix_lr = confusion_matrix(y_test, y_lr, labels=all_classes)
TPR_lr, FPR_lr = compute_TPR_FPR(conf_matrix_lr)
lr_solution = solve_scheduling(y_lr)
lr_makespan = compute_weighted_completion_time(y_test, lr_solution)
lr_gap = (true_optimal_makespan - lr_makespan) / true_optimal_makespan
print(f"LR;{TPR_lr};{FPR_lr};{lr_makespan};{lr_gap}")

# Train RF and solve
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
conf_matrix_rf = confusion_matrix(y_test, y_rf, labels=all_classes)
TPR_rf, FPR_rf= compute_TPR_FPR(conf_matrix_rf)
rf_solution = solve_scheduling(y_rf)
rf_makespan = compute_weighted_completion_time(y_test, rf_solution)
rf_gap = (true_optimal_makespan - rf_makespan) / true_optimal_makespan
print(f"RF;{TPR_rf};{FPR_rf};{rf_makespan};{rf_gap}")

XGB;0.8181818181818182;0.1111111111111111;64806;0.03232742530348957
LR;0.36363636363636365;0.3333333333333333;59327;0.11413895566737842
RF;0.8181818181818182;0.2222222222222222;63751;0.04808051246061728
