In [1]:
import gurobipy as gp
import math
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter, PercentFormatter
import numpy as np
import pandas as pd
import re
import os
import shutil

In [2]:
# output generation for paper 2

In [3]:
# get input paths
test_set = "miplib_2017_5000_paper2"
instance_fldr = os.path.join("instances", test_set)
test_set_fldr = os.path.join("test_sets", test_set)
results_fldr = os.path.join("results", test_set)
out_fldr = os.path.join("outputs", test_set)

# set filters
seed_idxs = [0]  
max_indices = 100
degrees = [-1, 1]  # todo update this as needed
term_list = [4, 64]
filter_cbc = False
max_base_std = 1e10
min_termination_time = 10  # todo update this as needed - fitler wins on default runs that took at least 10 seconds
short, medium, long = 60, 600, 3600
remove_status_changes = False
win_threshold = .1
filter_redundant = True

generators = ["None", "New", "Farkas", "All", "Disjunction",  "NoDisjunction"]  # ,"Matrix", "Term", "Basis", "NoMatrix", "NoTerm", "NoBasis"]

# set up some mappings
cat_map_new_lines = {
    "None": "Default",
    "Farkas": "Param Disj,\nParam Cuts",
    "Old": "Param Disj,\nCalc Cuts",
    "New": "Calc Disj,\nCalc Cuts",
    "All": "Prune and\nSupport",
    "Disjunction": "Prune\nDisjunction",
    "Matrix": "Support\nMatrix",
    "Term": "Support\nTerm",
    "Basis": "Support\nBasis",
    "NoDisjunction": "Support"
}
cat_map = {
    "None": "Default",
    "Farkas": "Param Disj, Param Cuts",
    "Old": "Param Disj, Calc Cuts",
    "New": "Calc Disj, Calc Cuts",
    "All": "Prune and Support",
    "Disjunction": "Prune Disjunction",
    "Matrix": "Strengthen Matrix",
    "Term": "Strengthen Term",
    "Basis": "Strengthen Basis",
    "NoDisjunction": "Support"
}
perturbation_map = {
    "matrix": "Coefficient Matrix",
    "rhs": "Right Hand Side",
    "objective": "Objective"
}
label = {
    "postRootTime": "Time after Processing Root nodes",
    "rootDualBoundTimeSansVpc": "Root Processing Time (Minus VPC Generation)",
    "terminationTimeSansVpc": "Time (Minus VPC Generation)",
    "terminationTime": "Time",
    "nodes": "Nodes Processed",
    "iterations": "LP iterations",
}
unit = {
    "postRootTime": "(seconds)",
    "rootDualBoundTimeSansVpc": "(seconds)",
    "terminationTimeSansVpc": "(seconds)",
    "terminationTime": "(seconds)",
    "nodes": "(1000 nodes)",
    "iterations": "(1000 iterations)",
}
limits = {
    "postRootTime": 7200,
    "terminationTimeSansVpc": 7200,
    "terminationTime": 7200,
    "rootDualBoundTimeSansVpc": 5,
    "nodes": 10000,
    "iterations": 37500
}
bracket_bounds = {
    "short": (min_termination_time, short),
    "medium": (short, medium),
    "long": (medium, long)
}
param_map = {
    "degree": "Degree of Perturbation",
    "terms": "Number of Disjunctive Terms",
}

In [4]:
# matplotlib settings
plt.rc('text', usetex=True)  # use latex fonts
plt.rcParams['font.size'] = 18
plt.rcParams['figure.titlesize'] = 24
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
plt.rcParams['legend.fontsize'] = 14

## Check run failures

In [5]:
# check if each folder in test_set_fldr has a corresponding .mps file in instance_fldr
# for instance in os.listdir(test_set_fldr):
#     if not os.path.isdir(os.path.join(test_set_fldr, instance)):
#         continue
#     if not os.path.exists(os.path.join(instance_fldr, f"{instance}.mps")):
#         # remove the folder if the instance is missing
#         # shutil.rmtree(os.path.join(test_set_fldr, instance))
#         print(f"Removed {instance} from test set")

In [6]:
# running list of strings contained by different error codes
# last two are catchalls
err = {
    "walltime": [],
    "bad_alloc": [],
    "out of memory": [],
    "vmem": [],
    "takeoffcuts": [],
    "solver is dual infeasible": [],
    "solver must be optimal": [],
    "segmentation fault": [],
    "no vpcs were made from a new disjunction": [],
    "must have primalbound >= root lp objective": [],
    "objective at parent nodes": [],
    "failed to optimize mip": [],
    "disjunction does not represent a full binary tree": [],
    "solver not proven optimal for nodes": [],
    "unable to open": [],
    "license": [],
    "dot product with obj differs from solver": [],
    "gurobi: error during callback: addCut": [],
    "cglvpc::setupconstraints: objective at disjunctive term": [],
    "unable to read file": [],
    "stats.id == stats_vec": [],
    "size of our disjunction is not what we expected it to be": [],
    "dimension must stay fixed": [],
    "vpcgenerator must be": [],
    "objective values must match": [],
    "objective at disjunctive term": [],
}

# read in cbc acceptable instances from cbc.txt
with open("cbc.txt", "r") as f:
    cbc_instances = f.read().split("\n")

# runs that errored out with new error code
other = []

# runs that had no errors
empty = []

# runs that only had warnings
warn_strs = ["warning", "prlp is primal infeasible", "farkas", "x:", "x[", "b:",
             "b[", "v:", "v[", "cut:", "A_i . x", "dot product with obj differs from solver"]
warning = []

# series that didn't run
no_go = []

# track sizes of instances
rows, cols, density = {}, {}, {}

# map the names
names = {}

# counts
count_series = 0
count_instances = 0
number_instances = {}

# iterate over all expected runs
for instance in os.listdir(test_set_fldr):
    if not os.path.isdir(os.path.join(test_set_fldr, instance)):
        continue
    # only look at cbc instances if we ran with cbc
    if instance not in cbc_instances and "gurobi" not in test_set and filter_cbc:
        continue
        
    # get the number of rows and columns in the instance
    mdl = gp.read(os.path.join(instance_fldr, f"{instance}.mps"))
    rows[instance] = mdl.NumConstrs
    cols[instance] = mdl.NumVars
    density[instance] = mdl.NumNZs / (mdl.NumConstrs * mdl.NumVars)
        
    for perturbation in os.listdir(os.path.join(test_set_fldr, instance)):
        if not os.path.isdir(os.path.join(test_set_fldr, instance, perturbation)):
            continue
        # only look at perturbations that were run
        p, d = perturbation.split("_")
        if int(d) not in degrees or p not in perturbation_map:
            continue
        for terms in term_list:
            for generator in generators:
                for seed_idx in seed_idxs:

                    # set variables for this iterations
                    count_series += 1
                    stem = f"{instance}_{perturbation}_{terms}_{generator}_{seed_idx}"
                    file_pth = os.path.join(results_fldr, f"{stem}.err")
                    series_fldr = os.path.join(test_set_fldr, instance, perturbation)
                    current_count = len([f for f in os.listdir(series_fldr) if f.endswith(".mps")])
                    count_instances += current_count
                    names[stem] = instance
                    number_instances[stem] = {
                        "expected": current_count,
                        "recorded": 0,
                        "generator": generator,
                        "error": "N/A"
                    }
    
                    # check if the series wasn't run
                    if not os.path.exists(file_pth):
                        number_instances[stem]["error"] = "no go"
                        no_go.append(stem)
                    
                    # check if the series ran with no errors or warnings
                    elif os.path.getsize(file_pth) == 0:
                        number_instances[stem]["error"] = "empty"
                        empty.append(stem)
                    
                    # track which error codes were thrown
                    else:
                        # read the file
                        with open(file_pth, "r") as f:
                            text = f.read().lower()
                        
                        # assign the error file to the appropriate list
                        found_code = False
                        for code in err:
                            if code in text:
                                if code == "dot product with obj differs from solver":
                                    pattern = r"obj viol from solver: (-?\d+\.\d+)\. calculated: (-?\d+\.\d+)"
                                    s, c = re.findall(pattern, text)[-1]
                                    # if we didn't terminate, this isn't an error, so keep going
                                    if abs(float(s) - float(c)) < 1e-3:
                                        continue
                                err[code].append(stem)
                                found_code = True
                                number_instances[stem]["error"] = code
                                break
                        if not found_code:
                            if all(not line or any(w in line for w in warn_strs) for line in text.splitlines()):
                                warning.append(stem)
                                number_instances[stem]["error"] = "warning"
                            else:
                                other.append(stem)
                                number_instances[stem]["error"] = "other"

Set parameter Username
Academic license - for non-commercial use only - expires 2025-08-21
Read MPS format model from file instances/miplib_2017_5000_paper2/bienst2.mps
Reading time = 0.00 seconds
bienst2: 576 rows, 505 columns, 2184 nonzeros
Read MPS format model from file instances/miplib_2017_5000_paper2/set3-15.mps
Reading time = 0.00 seconds
set3-15: 3747 rows, 4019 columns, 13747 nonzeros
Read MPS format model from file instances/miplib_2017_5000_paper2/f2gap801600.mps
Reading time = 0.00 seconds
f2gap801600: 80 rows, 1600 columns, 3200 nonzeros
Read MPS format model from file instances/miplib_2017_5000_paper2/stein15inf.mps
Reading time = 0.00 seconds
stein15inf: 37 rows, 15 columns, 135 nonzeros
Read MPS format model from file instances/miplib_2017_5000_paper2/neos-3610173-itata.mps
Reading time = 0.00 seconds
neos-3610173-itata: 747 rows, 844 columns, 2130 nonzeros
Read MPS format model from file instances/miplib_2017_5000_paper2/10teams.mps
Reading time = 0.00 seconds
10TEAMS

In [7]:
# check which series didn't run
print(no_go)

[]


In [8]:
# get the proportion of series that at least got started
1 - (len(no_go) / count_series)

1.0

In [9]:
# out of time - got hung up in code somewhere - ok
print(err["walltime"])
len(err["walltime"]) / count_series

['cod105_rhs_1_64_New_0', 'cod105_rhs_1_64_Farkas_0', 'cod105_rhs_1_64_All_0', 'cod105_rhs_1_64_Disjunction_0', 'cod105_rhs_1_64_NoDisjunction_0', 'cod105_objective_1_64_New_0', 'cod105_objective_1_64_Farkas_0', 'cod105_objective_1_64_All_0', 'cod105_matrix_1_64_New_0', 'cod105_matrix_1_64_Farkas_0', 'cod105_matrix_1_64_All_0', 'cod105_matrix_1_64_Disjunction_0', 'cod105_matrix_-1_64_All_0', 'cod105_matrix_-1_64_Disjunction_0', 'cod105_objective_-1_64_New_0', 'cod105_objective_-1_64_All_0', 'neos-1605061_rhs_1_64_New_0', 'neos-1605061_rhs_1_64_All_0']


0.001286449399656947

In [10]:
# out of memory - memory is maxed already - this is what it is
# todo: figure out where we ran short on memory so we can explain why we dropped them
print(err["bad_alloc"] + err["out of memory"] + err["vmem"])
len(err["bad_alloc"] + err["out of memory"] + err["vmem"]) / count_series

['f2gap801600_objective_1_64_New_0', '10teams_objective_-1_64_New_0', 'piperout-d27_objective_1_64_New_0', 'piperout-d27_objective_1_64_Farkas_0', 'piperout-d27_objective_1_64_All_0', 'piperout-d27_objective_1_64_Disjunction_0', 'piperout-d27_objective_1_64_NoDisjunction_0', 'piperout-d27_objective_-1_64_New_0', 'piperout-d27_objective_-1_64_Farkas_0', 'piperout-d27_objective_-1_64_All_0', 'piperout-d27_objective_-1_64_Disjunction_0', 'piperout-d27_objective_-1_64_NoDisjunction_0', 'piperout-d20_objective_1_64_New_0', 'piperout-d20_objective_1_64_All_0', 'piperout-d20_objective_1_64_Disjunction_0', 'piperout-d20_objective_1_64_NoDisjunction_0', 'piperout-d20_objective_-1_64_New_0', 'piperout-d20_objective_-1_64_Farkas_0', 'piperout-d20_objective_-1_64_All_0', 'piperout-d20_objective_-1_64_Disjunction_0', 'nexp-150-20-1-5_matrix_1_64_NoDisjunction_0', 'nexp-150-20-1-5_rhs_-1_64_NoDisjunction_0', 'qnet1_matrix_1_64_New_0', 'neos-2328163-agri_objective_1_64_New_0', 'neos-2328163-agri_matr

0.02444253859348199

In [11]:
# rerun this if want to give more memory to some instances
# bad_alloc_names = set(n.split("_")[0] for n in err["bad_alloc"])
# mem = pd.read_csv("more_memory.csv", index_col=0)
# mem["reason"] = "hard solve" 
# 
# for n in bad_alloc_names:
#     if f"{n}.mps" not in mem.index:
#         new_row = pd.DataFrame([{'file_name': f"{n}.mps", 'memory': 16.0, 'reason': 'big disjunction'}]).set_index('file_name')
#         mem = pd.concat([mem, new_row])
#     else:
#         mem.loc[f'{n}.mps', 'memory'] = 16.0
# 
# mem.to_csv("more_memory.csv")

In [12]:
# this is an issue with John's bookkeeping - not much we can do here
print(err["takeoffcuts"])
len(err["takeoffcuts"]) / count_series

[]


0.0

In [13]:
print(err["solver is dual infeasible"])
len(err["solver is dual infeasible"]) / count_series

[]


0.0

In [14]:
# these are usually issues with CLP finding optimality - not much we can do here
print(err["solver must be optimal"])
len(err["solver must be optimal"]) / count_series

[]


0.0

In [15]:
print(err["segmentation fault"])
len(err["segmentation fault"]) / count_series

['neos-3665875-lesum_rhs_1_64_New_0', 'neos-3665875-lesum_matrix_1_64_All_0', 'neos-3665875-lesum_objective_-1_64_Disjunction_0']


0.0002144082332761578

In [16]:
# seg_err = {
#     "Bad image at line": [],
# }
# 
# seg_other = []
# 
# for stem in err["segmentation fault"]:
#     file_pth = os.path.join(results_fldr, f"{stem}.out")
# 
#     with open(file_pth, "r") as f:
#         text = f.read()
#     
#     # assign the error file to the appropriate list
#     found_code = False
#     for code in seg_err:
#         if code in text:
#             seg_err[code].append(stem)
#             found_code = True
#             break
#     if not found_code:
#         seg_other.append(stem)

In [17]:
# print(seg_err["Bad image at line"])
# len(seg_err["Bad image at line"]) / len(err["segmentation fault"]) if err["segmentation fault"] else 0

In [18]:
# print(seg_other)
# len(seg_other)/len(err["segmentation fault"]) if err["segmentation fault"] else 0

In [19]:
# # get breakdown of why vpc generation failed - mostly from lack of provisioning
# for code, exps in seg_err.items():
#     print(f"{code}: {len(exps) / len(err['segmentation fault']) if err['segmentation fault'] else 0}")
# 
# print(f"other: {len(seg_other) / len(err['segmentation fault']) if err['segmentation fault'] else 0}")

In [20]:
# todo: check aleks' removals and drop those below for similar reasons
# todo: check size of disjunctions and decide what to do with those that are too big
# these should all be from the problem being too big and hitting the time limit or integer solutions
print(err["no vpcs were made from a new disjunction"])
missing_4_term = [n for n in err["no vpcs were made from a new disjunction"] if "_4_" in n]
missing_64_term = [n for n in err["no vpcs were made from a new disjunction"] if "_64_" in n]
print(f'4 term: {len(missing_4_term) / count_series}')
print(f'64 term: {len(missing_64_term) / count_series}')

['bienst2_rhs_1_64_New_0', 'bienst2_rhs_1_64_Farkas_0', 'bienst2_rhs_1_64_All_0', 'bienst2_rhs_1_64_NoDisjunction_0', 'bienst2_objective_1_64_New_0', 'bienst2_objective_1_64_Farkas_0', 'bienst2_objective_1_64_All_0', 'bienst2_objective_1_64_Disjunction_0', 'bienst2_matrix_-1_64_New_0', 'bienst2_matrix_-1_64_Farkas_0', 'bienst2_matrix_-1_64_All_0', 'bienst2_matrix_-1_64_Disjunction_0', 'bienst2_matrix_-1_64_NoDisjunction_0', 'bienst2_objective_-1_64_New_0', 'bienst2_objective_-1_64_Farkas_0', 'bienst2_objective_-1_64_All_0', 'bienst2_objective_-1_64_NoDisjunction_0', 'neos-555343_rhs_1_4_New_0', 'neos-555343_rhs_1_4_Farkas_0', 'neos-555343_rhs_1_4_All_0', 'neos-555343_rhs_1_4_Disjunction_0', 'neos-555343_rhs_1_4_NoDisjunction_0', 'neos-555343_rhs_1_64_New_0', 'neos-555343_rhs_1_64_Farkas_0', 'neos-555343_rhs_1_64_All_0', 'neos-555343_rhs_1_64_Disjunction_0', 'neos-555343_rhs_1_64_NoDisjunction_0', 'neos-555343_objective_1_4_New_0', 'neos-555343_objective_1_4_Farkas_0', 'neos-555343_obje

In [21]:
# vpc_err = {
#     "CglVPC: Finishing with exit reason: PRLP_TIME_LIMIT": [],
#     "CglVPC: Finishing with exit reason: TIME_LIMIT": [],
#     "CglVPC: Finishing with exit reason: NO_CUTS_LIKELY": [],
#     "CglVPC: Finishing with exit reason: PRLP_INFEASIBLE": [],
#     "CglVPC: Finishing with exit reason: SUCCESS": [],
#     "CglVPC: Finishing with exit reason: OPTIMAL_SOLUTION_FOUND": [],
#     "CglVPC: Finishing with exit reason: FAIL_LIMIT": [],
#     "CglVPC: Finishing with exit reason: NO_DISJUNCTION": [],
# }
# 
# vpc_other = []
# 
# for stem in err["no vpcs were made from a new disjunction"]:
#     file_pth = os.path.join(results_fldr, f"{stem}.out")
# 
#     with open(file_pth, "r") as f:
#         text = f.read()
#     
#     # assign the error file to the appropriate list
#     found_code = False
#     for code in vpc_err:
#         if code in text:
#             vpc_err[code].append(stem)
#             found_code = True
#             break
#     if not found_code:
#         vpc_other.append(stem)

In [22]:
# print(vpc_err["CglVPC: Finishing with exit reason: PRLP_TIME_LIMIT"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: PRLP_TIME_LIMIT"]) / len(err["no vpcs were made from a new disjunction"])

In [23]:
# print(vpc_err["CglVPC: Finishing with exit reason: TIME_LIMIT"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: TIME_LIMIT"]) / len(err["no vpcs were made from a new disjunction"])

In [24]:
# print(vpc_err["CglVPC: Finishing with exit reason: NO_CUTS_LIKELY"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: NO_CUTS_LIKELY"]) / len(err["no vpcs were made from a new disjunction"])

In [25]:
# print(vpc_err["CglVPC: Finishing with exit reason: PRLP_INFEASIBLE"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: PRLP_INFEASIBLE"]) / len(err["no vpcs were made from a new disjunction"])

In [26]:
# print(vpc_err["CglVPC: Finishing with exit reason: SUCCESS"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: SUCCESS"]) / len(err["no vpcs were made from a new disjunction"])

In [27]:
# print(vpc_err["CglVPC: Finishing with exit reason: OPTIMAL_SOLUTION_FOUND"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: OPTIMAL_SOLUTION_FOUND"]) / len(err["no vpcs were made from a new disjunction"])

In [28]:
# print(vpc_err["CglVPC: Finishing with exit reason: FAIL_LIMIT"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: FAIL_LIMIT"]) / len(err["no vpcs were made from a new disjunction"])

In [29]:
# print(vpc_err["CglVPC: Finishing with exit reason: NO_DISJUNCTION"])
# if err["no vpcs were made from a new disjunction"]:
#     len(vpc_err["CglVPC: Finishing with exit reason: NO_DISJUNCTION"]) / len(err["no vpcs were made from a new disjunction"])

In [30]:
# vpc_other

In [31]:
# # get breakdown of why vpc generation failed - mostly from lack of provisioning/problem being too large
# if err["no vpcs were made from a new disjunction"]:
#     for code, exps in vpc_err.items():
#         print(f"{code}: {len(exps) / len(err['no vpcs were made from a new disjunction'])}")
#     
#     print(f"other: {len(vpc_other) / len(err['no vpcs were made from a new disjunction'])}")

In [32]:
print(err["must have primalbound >= root lp objective"])
len(err["must have primalbound >= root lp objective"]) / count_series

['neos4_objective_1_4_None_0', 'neos4_objective_1_64_None_0']


0.0001429388221841052

In [33]:
# LP relaxation objective is not going to match root nodes objective when warm starting 
print(err["objective at parent nodes"])
len(err["objective at parent nodes"]) / count_series

[]


0.0

In [34]:
# not enough tolerance added to bound (or we hit time limit) - element 2 from 5 and 4 from 4
print(err["failed to optimize mip"])
len(err["failed to optimize mip"]) / count_series

[]


0.0

In [35]:
# todo: figure out why
print(err["disjunction does not represent a full binary tree"])
len(err["disjunction does not represent a full binary tree"]) / count_series

[]


0.0

In [36]:
# again issue with not getting through vpc generation in time
# todo: handle this gracefully
print(err["solver not proven optimal for nodes"])
len(err["solver not proven optimal for nodes"]) / count_series

[]


0.0

In [37]:
print(err["unable to open"])
len(err["unable to open"]) / count_series

[]


0.0

In [38]:
print(err["license"])
len(err["license"]) / count_series

[]


0.0

In [39]:
print(warning)
len(warning) / count_series

['bienst2_rhs_1_4_New_0', 'bienst2_rhs_1_4_Farkas_0', 'bienst2_rhs_1_4_All_0', 'bienst2_objective_1_4_New_0', 'bienst2_objective_1_4_Farkas_0', 'bienst2_objective_1_4_All_0', 'bienst2_objective_1_4_Disjunction_0', 'bienst2_objective_1_4_NoDisjunction_0', 'bienst2_matrix_-1_4_New_0', 'bienst2_matrix_-1_4_Disjunction_0', 'bienst2_matrix_-1_4_NoDisjunction_0', 'bienst2_objective_-1_4_New_0', 'bienst2_objective_-1_4_Farkas_0', 'bienst2_objective_-1_4_NoDisjunction_0', '10teams_objective_1_4_New_0', '10teams_matrix_1_64_New_0', '10teams_matrix_1_64_Farkas_0', '10teams_matrix_1_64_All_0', '10teams_objective_-1_4_New_0', 'gmu-35-40_objective_1_4_New_0', 'gmu-35-40_objective_1_4_Farkas_0', 'gmu-35-40_objective_1_4_All_0', 'gmu-35-40_objective_1_4_Disjunction_0', 'gmu-35-40_objective_1_4_NoDisjunction_0', 'gmu-35-40_objective_1_64_New_0', 'gmu-35-40_matrix_-1_4_New_0', 'gmu-35-40_matrix_-1_4_Farkas_0', 'gmu-35-40_matrix_-1_4_All_0', 'gmu-35-40_matrix_-1_4_Disjunction_0', 'gmu-35-40_matrix_-1_4_

0.05717552887364208

In [40]:
# errors unaccounted for
print(other)
len(other) / count_series

['neos-3083819-nubu_matrix_-1_4_New_0', 'neos-3083819-nubu_matrix_-1_64_New_0', 'eil33-2_objective_1_4_New_0']


0.0002144082332761578

In [41]:
# proportion of series that were improperly provisioned
(len(err["bad_alloc"] + err["out of memory"] + err["walltime"] + err["vmem"])) / count_series

0.025728987993138937

In [42]:
# todo handle this
print(err["dot product with obj differs from solver"])
len(err["dot product with obj differs from solver"]) / count_series

['neos-3592146-hawea_matrix_-1_4_New_0']


7.14694110920526e-05

In [43]:
# changed code to ignore this error
print(err["gurobi: error during callback: addCut"])
len(err["gurobi: error during callback: addCut"]) / count_series

[]


0.0

In [44]:
# largely not replicating - only issue I could find was aleks missing updated objective from CLP when resolving to check this
print(err["cglvpc::setupconstraints: objective at disjunctive term"])
len(err["cglvpc::setupconstraints: objective at disjunctive term"]) / count_series

[]


0.0

In [45]:
# not replicating - rerun
print(err["unable to read file"])
len(err["unable to read file"]) / count_series

[]


0.0

In [46]:
# not replicating - rerun
print(err["stats.id == stats_vec"])
len(err["stats.id == stats_vec"]) / count_series

[]


0.0

In [47]:
print(err["size of our disjunction is not what we expected it to be"])
len(err["size of our disjunction is not what we expected it to be"]) / count_series

[]


0.0

In [48]:
print(err["vpcgenerator must be"])
len(err["vpcgenerator must be"]) / count_series

[]


0.0

In [49]:
print(err["dimension must stay fixed"])
len(err["dimension must stay fixed"]) / count_series

[]


0.0

In [50]:
print(err["objective values must match"])
len(err["objective values must match"]) / count_series

['f2gap801600_objective_-1_64_New_0', 'neos-3610173-itata_matrix_-1_4_New_0', 'neos-3610051-istra_matrix_-1_64_New_0', 'f2gap401600_objective_1_64_New_0', 'traininstance6_objective_-1_4_New_0', 'traininstance6_objective_-1_64_New_0', 'mas74_matrix_1_4_New_0', 'mas74_matrix_1_64_New_0', 'mas74_matrix_-1_64_New_0', 'rentacar_objective_-1_4_New_0', 'neos-3421095-cinca_objective_1_64_New_0', 'irp_objective_-1_4_New_0', 'aligninq_matrix_1_4_New_0', 'neos-631517_matrix_-1_64_New_0', 'neos-3610040-iskar_matrix_-1_64_New_0', 'neos-3627168-kasai_matrix_-1_64_New_0', 'pg_rhs_1_64_New_0', 'neos-3611689-kaihu_matrix_-1_64_New_0', 'neos-3611689-kaihu_objective_-1_64_New_0', 'mas76_matrix_1_64_New_0', 'mas76_matrix_-1_64_New_0', 'neos-3754480-nidda_objective_1_4_New_0', 'neos-3754480-nidda_objective_1_64_New_0', 'neos-3754480-nidda_rhs_-1_4_New_0', 'neos-3754480-nidda_rhs_-1_64_New_0', 'neos-3754480-nidda_objective_-1_64_New_0', 'control30-3-2-3_matrix_-1_64_New_0', 'control30-3-2-3_objective_-1_64_

0.002501429388221841

In [51]:
print(err["objective at disjunctive term"])
len(err["objective at disjunctive term"]) / count_series

['neos-631517_matrix_1_4_New_0', 'neos-631517_matrix_1_64_New_0', 'gus-sch_matrix_1_4_New_0', 'gus-sch_matrix_1_64_New_0', 'neos-5182409-nasivi_matrix_-1_4_New_0', 'neos-5182409-nasivi_matrix_-1_64_New_0', 'roll3000_matrix_-1_4_New_0', 'roll3000_matrix_-1_64_New_0', 'control30-3-2-3_matrix_-1_4_New_0']


0.0006432246998284735

In [52]:
# get breakdown of errors
for code, exps in err.items():
    print(f"{code}: {len(exps) / count_series}")

print(f"other: {len(other) / count_series}")

print(f"warning: {len(warning) / count_series}")

print(f"no errors/warnings: {len(empty) / count_series}")

print(f"no go: {len(no_go) / count_series}")

walltime: 0.001286449399656947
bad_alloc: 0.02058319039451115
out of memory: 0.0011435105774728416
vmem: 0.002715837621497999
takeoffcuts: 0.0
solver is dual infeasible: 0.0
solver must be optimal: 0.0
segmentation fault: 0.0002144082332761578
no vpcs were made from a new disjunction: 0.324757004002287
must have primalbound >= root lp objective: 0.0001429388221841052
objective at parent nodes: 0.0
failed to optimize mip: 0.0
disjunction does not represent a full binary tree: 0.0
solver not proven optimal for nodes: 0.0
unable to open: 0.0
license: 0.0
dot product with obj differs from solver: 7.14694110920526e-05
gurobi: error during callback: addCut: 0.0
cglvpc::setupconstraints: objective at disjunctive term: 0.0
unable to read file: 0.0
stats.id == stats_vec: 0.0
size of our disjunction is not what we expected it to be: 0.0
dimension must stay fixed: 0.0
vpcgenerator must be: 0.0
objective values must match: 0.002501429388221841
objective at disjunctive term: 0.0006432246998284735
o

## Read in data

In [53]:
# map generator names to the corresponding data frames
df_map = {g: pd.DataFrame() for g in generators} 
gap_map = {g: pd.DataFrame() for g in generators}
regex = re.compile(r'([a-zA-Z0-9-]+(?:_o)?)_([a-z]+)_([0-9-]+)_([0-9]+)_([a-zA-Z ]+)')
solution_pattern = r"_(\d+)\.pb"

# declaring types as needed
column_types = {
    "lpBound": float,
    "lpBoundPostVpc": float,
    "disjunctiveDualBound": float,
    "primalBound": float,
    "rootDualBound": float,
    "dualBound": float
}

skipped_instances = set()
primal_bounds = {}
same_solution = {}

# iterate over all files in the folder
for file_name in os.listdir(results_fldr):
    
    file_pth = os.path.join(results_fldr, file_name)
    
    # if the file is not a nonempty csv, skip it
    if not file_name.endswith(".csv") or os.path.getsize(file_pth) == 0:
        continue
    
    # get the experimental set up
    match = regex.search(file_name)
    instance_name = names.get(file_name[:-4])
    if not instance_name:
        skipped_instances.add(file_name[:-4].split("_")[0])
        os.remove(file_pth)
        continue
    # instance_name = match.group(1)
    perturbation = match.group(2)
    assert perturbation in ["matrix", "rhs", "bound", "objective"], f"Unknown perturbation: {perturbation}"
    expo = int(match.group(3))
    assert expo in degrees, f"Unknown degree: {expo}"
    degree = 2**int(expo)
    terms = int(match.group(4))
    assert terms in term_list, f"Unknown number of terms: {terms}"
    generator = match.group(5)
    assert generator in generators, f"Unknown generator: {generator}"
    base_name = f"{instance_name}_0"
    
    # get the primal bounds for this experiment
    cur_instance_test_set_fldr = os.path.join(test_set_fldr, instance_name, f"{perturbation}_{expo}")
    for test_set_file in os.listdir(cur_instance_test_set_fldr):
        if test_set_file.endswith(".pb"):
            with open(os.path.join(cur_instance_test_set_fldr, test_set_file), "r") as f:
                primal_bounds[perturbation, expo, ".".join(test_set_file.split(".")[:-1])] = float(f.read())
                
    # see if solution changed
    for test_set_file in os.listdir(cur_instance_test_set_fldr):
        if test_set_file.endswith(".pb"):
            perturbation_name = ".".join(test_set_file.split(".")[:-1])
            same_solution[perturbation, expo, perturbation_name] = \
                primal_bounds[perturbation, expo, base_name] == primal_bounds[perturbation, expo, perturbation_name]
            
    # read the file
    df = pd.read_csv(file_pth, keep_default_na=False, dtype=column_types, index_col=0)
    
    for instance_idx in df.index:
        
        # fill in primal bounds if missing
        # df.loc[instance_idx, "primalBound"] = min(primal_bounds.get(stem_map.get(instance_idx), 1e100), df.loc[instance_idx, "primalBound"])
        df.loc[instance_idx, "primalBound"] = min(
            primal_bounds[perturbation, expo, f"{instance_name}_{instance_idx}"], df.loc[instance_idx, "primalBound"]
        )
        
        # same with root dual bound
        df.loc[instance_idx, "rootDualBound"] = df.loc[instance_idx, "rootDualBound"] if df.loc[instance_idx, "rootDualBound"] < 1e100 else df.loc[instance_idx, "lpBoundPostVpc"] 
    
    # get rid of the index so the rest of the notebook works
    df.reset_index(inplace=True)
    
    # add some identifying columns
    df["instance"] = instance_name
    df["perturbation"] = perturbation
    df["degree"] = degree
    df["terms"] = terms
    df["rows"] = rows[instance_name]
    df["cols"] = cols[instance_name]
    df["density"] = density[instance_name]
    
    # append to the appropriate data frame
    df_map[generator] = pd.concat([df_map[generator], df])
    
    # track recorded vs expected experiments
    number_instances[file_name[:-4]]["recorded"] = len(df)

In [54]:
# convert number_instances to dataframe
frame = pd.DataFrame(number_instances).T
frame.head()

Unnamed: 0,expected,recorded,generator,error
bienst2_rhs_1_4_None_0,6,6,,empty
bienst2_rhs_1_4_New_0,6,6,New,warning
bienst2_rhs_1_4_Farkas_0,6,6,Farkas,warning
bienst2_rhs_1_4_All_0,6,6,All,warning
bienst2_rhs_1_4_Disjunction_0,6,6,Disjunction,empty


In [55]:
# redo the runs that have incomplete data that we're not sure should be that way
redos = frame.loc[(frame["expected"] > frame["recorded"]) & (frame["error"] != "no vpcs were made from a new disjunction")].index.tolist()
redos = pd.DataFrame({"experiment": redos})
redos.to_csv("redos.csv", index=False)

In [56]:
if "miplib" in test_set or "quick" in test_set:
    # group frame by generator and sum remaining columns
    gb = frame.groupby(["generator", "error"]).sum().reset_index()
    gb["missing"] = gb["expected"] - gb["recorded"]
    total = gb.groupby("generator")[["expected", "missing"]].sum().reset_index()
    gb = pd.merge(gb, total, on="generator", suffixes=("", " total"))
    gb["ratio missing (by generator)"] = gb["missing"] / gb["missing total"]
    gb["ratio missing (by generator)"] = gb["ratio missing (by generator)"].apply(lambda x: round(x, 4))
    gb = gb.loc[:, ~gb.columns.str.contains("total")]  # get rid of the total columns
    gb.set_index(["generator", "error"], inplace=True)
    gb.to_csv(os.path.join(out_fldr, "missing_table.csv"), index=False, mode="w")
else:
    gb = None
gb

Unnamed: 0_level_0,Unnamed: 1_level_0,expected,recorded,missing,ratio missing (by generator)
generator,error,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All,bad_alloc,326,89,237,0.0432
All,empty,6347,5789,558,0.1018
All,no vpcs were made from a new disjunction,4580,0,4580,0.8355
All,out of memory,15,8,7,0.0013
All,segmentation fault,5,0,5,0.0009
All,vmem,39,12,27,0.0049
All,walltime,25,0,25,0.0046
All,warning,761,718,43,0.0078
Disjunction,bad_alloc,270,80,190,0.0347
Disjunction,empty,6486,5908,578,0.1056


In [57]:
for gen in generators:
    masks = {
        0: -1e20 > df_map[gen]["lpBound"],
        1: df_map[gen]["lpBound"] - 1e-3 > df_map[gen]["lpBoundPostVpc"],
        2: (df_map[gen]["lpBoundPostVpc"] - 1e-3 > df_map[gen]["disjunctiveDualBound"]) & ((gen == "None") | (gen == "New")),
        3: df_map[gen]["rootDualBound"] - 1e-3 > df_map[gen]["dualBound"],
        4: (df_map[gen]["dualBound"] - 1e-3 > df_map[gen]["primalBound"]) & (df_map[gen]["dualBound"] / df_map[gen]["primalBound"] > 1 + 1e-3),
        5: df_map[gen]["primalBound"] > 1e20,
        6: 0 > df_map[gen]["vpcGenerationTime"],
        7: df_map[gen]["vpcGenerationTime"] - 1e-3 > df_map[gen]["rootDualBoundTime"],
        8: df_map[gen]["rootDualBoundTime"] - 1e-3 > df_map[gen]["terminationTime"],
        9: df_map[gen]["vpcGenerationTime"] - 1e-3 > df_map[gen]["bestSolutionTime"],
        10: df_map[gen]["bestSolutionTime"] - 1e-3 > df_map[gen]["terminationTime"]
    }
    for i, mask in masks.items():
        print(f"{gen} {i}: {mask.sum() / len(df_map[gen])}")

None 0: 0.0
None 1: 0.0
None 2: 0.0
None 3: 0.0
None 4: 0.0006634599436059048
None 5: 0.0
None 6: 0.0
None 7: 0.0
None 8: 0.0
None 9: 0.0
None 10: 0.0
New 0: 0.0
New 1: 0.0
New 2: 0.0
New 3: 0.0
New 4: 0.0009084027252081756
New 5: 0.0
New 6: 0.0
New 7: 0.0
New 8: 0.0
New 9: 0.0
New 10: 0.0
Farkas 0: 0.0
Farkas 1: 0.0
Farkas 2: 0.0
Farkas 3: 0.0
Farkas 4: 0.0008265601322496211
Farkas 5: 0.0
Farkas 6: 0.0
Farkas 7: 0.0
Farkas 8: 0.0
Farkas 9: 0.0
Farkas 10: 0.0
All 0: 0.0
All 1: 0.0
All 2: 0.0
All 3: 0.0
All 4: 0.0007557436517533253
All 5: 0.0
All 6: 0.0
All 7: 0.0
All 8: 0.0
All 9: 0.0
All 10: 0.0
Disjunction 0: 0.0
Disjunction 1: 0.0
Disjunction 2: 0.0
Disjunction 3: 0.0
Disjunction 4: 0.0010570824524312897
Disjunction 5: 0.0
Disjunction 6: 0.0
Disjunction 7: 0.0
Disjunction 8: 0.0
Disjunction 9: 0.0
Disjunction 10: 0.0
NoDisjunction 0: 0.0
NoDisjunction 1: 0.0
NoDisjunction 2: 0.0
NoDisjunction 3: 0.0
NoDisjunction 4: 0.0009671179883945841
NoDisjunction 5: 0.0
NoDisjunction 6: 0.0
NoD

In [58]:
for gen, df in df_map.items():
    print(f"{gen}: {df.size}")

None: 458204
New: 250990
Farkas: 275842
All: 251408
Disjunction: 251636
NoDisjunction: 275044


In [59]:
# it shouldn't be possible that dual bound > primal bound. this only happens when we use the saved primal bound, which was used to set the dual bound
df_map["Farkas"][masks[0]]

  df_map["Farkas"][masks[0]]


Unnamed: 0,instanceIndex,seedIndex,vpcGenerator,terms,lpBound,disjunctiveDualBound,lpBoundPostVpc,rootDualBound,dualBound,primalBound,...,tighten_disjunction,tighten_matrix_perturbation,tighten_infeasible_to_feasible_term,tighten_feasible_to_infeasible_basis,instance,perturbation,degree,rows,cols,density


In [60]:
for gen in df_map:
    mask = (-1e20 > df_map[gen]["lpBound"]) | \
        (df_map[gen]["lpBound"] - 1e-3 > df_map[gen]["lpBoundPostVpc"]) | \
        ((df_map[gen]["lpBoundPostVpc"] - 1e-3 > df_map[gen]["disjunctiveDualBound"]) & (gen != "Farkas")) | \
        (df_map[gen]["rootDualBound"] - 1e-3 > df_map[gen]["dualBound"]) | \
        ((df_map[gen]["dualBound"] - 1e-3 > df_map[gen]["primalBound"]) & (df_map[gen]["dualBound"] / df_map[gen]["primalBound"] > 1 + 1e-3)) | \
        (df_map[gen]["primalBound"] > 1e20) | \
        (0 > df_map[gen]["vpcGenerationTime"]) | \
        (df_map[gen]["vpcGenerationTime"] - 1e-3 > df_map[gen]["rootDualBoundTime"]) | \
        (df_map[gen]["rootDualBoundTime"] - 1e-3 > df_map[gen]["terminationTime"]) | \
        (df_map[gen]["vpcGenerationTime"] - 1e-3 > df_map[gen]["bestSolutionTime"]) | \
        (df_map[gen]["bestSolutionTime"] - 1e-3 > df_map[gen]["terminationTime"])
    print(f"{gen}: {mask.sum() / len(df_map[gen])}")
    df_map[gen] = df_map[gen][~mask]

None: 0.0006634599436059048
New: 0.0009084027252081756
Farkas: 0.0008265601322496211
All: 0.0009068923821039903
Disjunction: 0.0012080942313500453
NoDisjunction: 0.0011052777010223818


In [61]:
# merge the different data frames into one
join_cols = ["instance", "perturbation", "degree", "terms", "instanceIndex", "seedIndex"]
df = df_map[generators[0]].merge(df_map[generators[1]], on=join_cols, suffixes=(f" {generators[0]}", None))
for g1, g2 in zip(generators[1:-1], generators[2:]):
    df = df.merge(df_map[g2], on=join_cols, suffixes=(f" {g1}", None if g2 != generators[-1] else f" {g2}"))
df.head()

Unnamed: 0,instanceIndex,seedIndex,vpcGenerator None,terms,lpBound None,disjunctiveDualBound None,lpBoundPostVpc None,rootDualBound None,dualBound None,primalBound None,...,termRemainsFeasibleBasisInfeasible NoDisjunction,cutsChangedCoefficients NoDisjunction,feasibleTermsPrunedByBound NoDisjunction,tighten_disjunction NoDisjunction,tighten_matrix_perturbation NoDisjunction,tighten_infeasible_to_feasible_term NoDisjunction,tighten_feasible_to_infeasible_basis NoDisjunction,rows NoDisjunction,cols NoDisjunction,density NoDisjunction
0,0,0,,64,-120.0,-120.0,-120.0,-120.0,-120.0,-120.0,...,0,0,0,0,0,0,0,8357,10735,0.000534
1,1,0,,64,-120.0,-120.0,-120.0,-120.0,-120.0,-120.0,...,0,0,0,0,1,1,1,8357,10735,0.000534
2,2,0,,64,-120.5,-120.5,-120.5,-120.5,-120.5,-120.5,...,0,0,0,0,1,1,1,8357,10735,0.000534
3,0,0,,64,-4632.298153,-4632.298153,-4632.298153,-4631.571278,-4607.140232,-4606.67961,...,0,0,0,0,0,0,0,46,29,0.976762
4,1,0,,64,-4628.667162,-4628.667162,-4628.667162,-4627.808946,-4604.833773,-4604.373375,...,0,0,0,0,1,1,1,46,29,0.976762


In [62]:
# get proportion of tests run to completion
len(generators) * len(df) / count_instances

0.48495619110596794

In [63]:
# assign nan's to experiments that didn't need to run - matrix support for RHS or any support for objective 
if filter_redundant:
    target_cols = [c for c in df.columns if any(s in c for s in [" NoDisjunction", " All"])
                   and any(metric in c for metric in ["Bound", "Time", "nodes", "iterations"])]
    df.loc[df["perturbation"] == "objective", target_cols] = np.nan

In [64]:
def gap_closed(df, col):
    gap = abs(df[col] - df["lpBound None"]) / abs(df['primalBound None'] - df["lpBound None"])
    gap[(gap > 1) | (gap == np.nan)] = 1  # get corner cases
    return gap

# Function to map values based on a dictionary
def check_same_solution(row):
    # Create a tuple of the key based on the key_columns
    return same_solution[row["perturbation"], int(math.log2(row["degree"])), f'{row["instance"]}_{row["instanceIndex"]}']

In [65]:
# find the optimality gap closed by each generator
df["Disjunction (New)"] = gap_closed(df, "disjunctiveDualBound New")
df["Disjunction (Old)"] = gap_closed(df, "disjunctiveDualBound Farkas")
for g in generators:
    if g != "None":
        df[f"VPCs ({g})"] = gap_closed(df, f"lpBoundPostVpc {g}")        
    df[f"Root Cuts ({g})"] = gap_closed(df, f"rootDualBound {g}")

df["Root Optimality Gap Improvement"] = df["Root Cuts (Farkas)"] - df["Root Cuts (None)"] 
# df = df.dropna()

In [66]:
# find times without vpc generation
df["terminationTimeSansVpc None"] = df["terminationTime None"]
df["rootDualBoundTimeSansVpc None"] = df["rootDualBoundTime None"]
for gen in generators:
    if gen != "None":
        df[f"terminationTimeSansVpc {gen}"] = df[f"terminationTime {gen}"] - df[f"vpcGenerationTime {gen}"]
        df[f"rootDualBoundTimeSansVpc {gen}"] = df[f"rootDualBoundTime {gen}"] - df[f"vpcGenerationTime {gen}"]
    df[f"postRootTime {gen}"] = df[f"terminationTime {gen}"] - df[f"rootDualBoundTime {gen}"]
    if gen not in ["None", "New"]:
        df[f"terminationTimeImprovement {gen}"] = (df["terminationTime None"] - df[f"terminationTime {gen}"]) / df["terminationTime None"]
        df[f"terminationTimeSansVpcImprovement {gen}"] = (df["terminationTimeSansVpc None"] - df[f"terminationTimeSansVpc {gen}"]) / df["terminationTimeSansVpc None"]
        df[f"nodesImprovement {gen}"] = (df["nodes None"] - df[f"nodes {gen}"]) / df["nodes None"] 
        df[f"iterationsImprovement {gen}"] = (df["iterations None"] - df[f"iterations {gen}"]) / df["iterations None"] 
        df[f"terminationTimeRatio {gen}"] = df[f"terminationTime {gen}"] / df["terminationTime None"]
        df[f"terminationTimeSansVpcRatio {gen}"] = df[f"terminationTimeSansVpc {gen}"] / df["terminationTimeSansVpc None"]
        df[f"nodesRatio {gen}"] = df[f"nodes {gen}"] / df["nodes None"] 
        df[f"iterationsRatio {gen}"] = df[f"iterations {gen}"] / df["iterations None"]
        df[f"nodesImproves {gen}"] = df["nodes None"] > df[f"nodes {gen}"]
        df[f"terminationTimeImproves {gen}"] = df["terminationTime None"] > df[f"terminationTime {gen}"]
        df[f"terminationTimeSansVpcImproves {gen}"] = df["terminationTimeSansVpc None"] > df[f"terminationTimeSansVpc {gen}"]
        df[f"iterationsImproves {gen}"] = df["iterations None"] > df[f"iterations {gen}"]
        
# df[f'{metric}Win{gen}'] = df[[f'{metric} {gen2}' for gen2 in compare_gens]].mean(axis=1) - 3 * df[[f'{metric} {gen2}' for gen2 in compare_gens]].std(axis=1) > df[f'{metric} {gen}']
for metric in ["nodes", "terminationTime", "terminationTimeSansVpc", "iterations"]:
    for gen in generators:
        compare_gens = [gen2 for gen2 in generators if gen2 != gen]
        df[f'{metric}Win{gen}'] = pd.concat([
            pd.Series(
                np.where(
                    df[f'{metric} {gen}'].isna(), False,
                    np.where(
                        df[f'{metric} {gen2}'].isna(), True,
                        df[f'{metric} {gen2}'] * (1 - win_threshold) > df[f'{metric} {gen}']
                    )
                ),
                index=df.index
            )
            for gen2 in compare_gens
        ], axis=1).all(axis=1)

    df[f'{metric}WinAny'] = pd.concat([
        pd.Series(
            np.where(
                df[f'{metric} {gen}'].isna(), False,
                df[f'{metric} None'] * (1 - win_threshold) > df[f'{metric} {gen}']
            ),
            index=df.index
        )
        for gen in generators if gen != "None"
    ], axis=1).any(axis=1)
    
    df[f'{metric}WinStrengthened'] = pd.concat([
        df[f"{metric}Win{gen}"] for gen in generators if gen not in ["None", "New", "Farkas"]
        ], axis=1).any(axis=1)

df["bracket"] = ["short" if t <= short else "medium" if t <= medium else "long" for t in df["terminationTime None"]]
df["sameSolution"] = df.apply(check_same_solution, axis=1)

  df[f"iterationsImprovement {gen}"] = (df["iterations None"] - df[f"iterations {gen}"]) / df["iterations None"]
  df[f"terminationTimeRatio {gen}"] = df[f"terminationTime {gen}"] / df["terminationTime None"]
  df[f"terminationTimeSansVpcRatio {gen}"] = df[f"terminationTimeSansVpc {gen}"] / df["terminationTimeSansVpc None"]
  df[f"nodesRatio {gen}"] = df[f"nodes {gen}"] / df["nodes None"]
  df[f"iterationsRatio {gen}"] = df[f"iterations {gen}"] / df["iterations None"]
  df[f"nodesImproves {gen}"] = df["nodes None"] > df[f"nodes {gen}"]
  df[f"terminationTimeImproves {gen}"] = df["terminationTime None"] > df[f"terminationTime {gen}"]
  df[f"terminationTimeSansVpcImproves {gen}"] = df["terminationTimeSansVpc None"] > df[f"terminationTimeSansVpc {gen}"]
  df[f"iterationsImproves {gen}"] = df["iterations None"] > df[f"iterations {gen}"]
  df[f"terminationTimeSansVpc {gen}"] = df[f"terminationTime {gen}"] - df[f"vpcGenerationTime {gen}"]
  df[f"rootDualBoundTimeSansVpc {gen}"] = df[f"rootDu

In [67]:
# get sensitivity stats as ratios
for gen_name in generators:
    if gen_name == "None":
        continue
    df[f"infeasibleTermsRatio {gen_name}"] = df[f"infeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
    df[f"infeasibleToFeasibleTermsRatio {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
    df[f"zeroInfeasibleToFeasibleTerms {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] == 0
    df[f"feasibleToInfeasibleTermsRatio {gen_name}"] = df[f"feasibleToInfeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]

  df[f"infeasibleTermsRatio {gen_name}"] = df[f"infeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"infeasibleToFeasibleTermsRatio {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"zeroInfeasibleToFeasibleTerms {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] == 0
  df[f"feasibleToInfeasibleTermsRatio {gen_name}"] = df[f"feasibleToInfeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"infeasibleTermsRatio {gen_name}"] = df[f"infeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"infeasibleToFeasibleTermsRatio {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"zeroInfeasibleToFeasibleTerms {gen_name}"] = df[f"infeasibleToFeasibleTerms {gen_name}"] == 0
  df[f"feasibleToInfeasibleTermsRatio {gen_name}"] = df[f"feasibleToInfeasibleTerms {gen_name}"] / df[f"actualTerms {gen_name}"]
  df[f"infeasibleTermsRatio {gen_name}"] = df[f"infeasibleTerms 

In [68]:
def optimality_gap(df, generator=None):
    if generator:
        return abs(df[f"primalBound {generator}"] - df[f"dualBound {generator}"]) / \
            abs(df[f"primalBound {generator}"])
    else:
        return abs(df[f"primalBound"] - df[f"dualBound"]) / abs(df[f"primalBound"])

In [69]:
# aleks filters
# df = df.loc[df["terms"] == df["actualTerms Farkas"]]
# df = df.loc[df["zeroInfeasibleToFeasibleTerms Farkas"]]

In [70]:
df.head()

Unnamed: 0,instanceIndex,seedIndex,vpcGenerator None,terms,lpBound None,disjunctiveDualBound None,lpBoundPostVpc None,rootDualBound None,dualBound None,primalBound None,...,zeroInfeasibleToFeasibleTerms All,feasibleToInfeasibleTermsRatio All,infeasibleTermsRatio Disjunction,infeasibleToFeasibleTermsRatio Disjunction,zeroInfeasibleToFeasibleTerms Disjunction,feasibleToInfeasibleTermsRatio Disjunction,infeasibleTermsRatio NoDisjunction,infeasibleToFeasibleTermsRatio NoDisjunction,zeroInfeasibleToFeasibleTerms NoDisjunction,feasibleToInfeasibleTermsRatio NoDisjunction
0,0,0,,64,-120.0,-120.0,-120.0,-120.0,-120.0,-120.0,...,True,0.0,0.886364,0.0,True,0.0,0.886364,0.0,True,0.0
1,1,0,,64,-120.0,-120.0,-120.0,-120.0,-120.0,-120.0,...,True,0.0,0.886364,0.0,True,0.0,0.886364,0.0,True,0.0
2,2,0,,64,-120.5,-120.5,-120.5,-120.5,-120.5,-120.5,...,True,0.0,0.886364,0.0,True,0.0,0.886364,0.0,True,0.0
3,0,0,,64,-4632.298153,-4632.298153,-4632.298153,-4631.571278,-4607.140232,-4606.67961,...,True,0.0,0.852273,0.0,True,0.0,0.852273,0.0,True,0.0
4,1,0,,64,-4628.667162,-4628.667162,-4628.667162,-4627.808946,-4604.833773,-4604.373375,...,True,0.0,0.852273,0.0,True,0.0,0.852273,0.0,True,0.0


In [71]:
# set aside core columns and filter for all subsequent dataframes
group_cols = ["instance", "perturbation", "bracket", "degree", "terms"]
id_cols = ["instanceIndex"]

# keep the instance, perturbation, instanceIndex triples that exist for all combinations of degree and terms
# where VPC did not find the optimal solution
full_df = df.loc[df["Disjunction (New)"] < .9999]
triples = (full_df.groupby(
        ["instance", "perturbation", "instanceIndex"]
    ).size().reset_index().rename(columns={0: "count"}))
triples.head()

Unnamed: 0,instance,perturbation,instanceIndex,count
0,10teams,matrix,0,2
1,10teams,matrix,1,2
2,10teams,matrix,2,2
3,10teams,matrix,3,1
4,10teams,matrix,4,2


In [72]:
# uncomment to filter for only the triples that exist for all combinations of degree and terms (and seed index)
triples = triples[triples["count"] == len(degrees) * len(term_list) * len(seed_idxs)]
# full_df = full_df.merge(triples, on=["instance", "perturbation", "instanceIndex"])
full_df.to_csv(os.path.join(out_fldr, "cleaned_combined_complete.csv"), index=False, mode="w")

## Check Root Node Stats

In [73]:
def interleave(list_of_lists):
    return [item for sublist in zip(*list_of_lists) for item in sublist]

In [74]:
# additional filtering for dataframe on bounds
fields = ["Disjunction (New)", "Disjunction (Old)"] + [f"VPCs ({gen_name})" for gen_name in generators if gen_name != "None"] + \
    interleave([[f"Root Cuts ({gen_name})", f"terminationTime {gen_name}", f"nodes {gen_name}",
                 f"iterations {gen_name}", f"terminationTimeSansVpc {gen_name}", f"vpcGenerationTime {gen_name}", 
                 f"rootDualBoundTime {gen_name}"]
                for gen_name in generators]) + \
    interleave([[f"infeasibleTermsRatio {gen_name}", f"infeasibleToFeasibleTermsRatio {gen_name}",
                 f"zeroInfeasibleToFeasibleTerms {gen_name}", f"feasibleToInfeasibleTermsRatio {gen_name}"]
                for gen_name in generators if gen_name != "None"])

# now reduce bound_df to just the perturbed instances - make > -1 to include base instance
bound_df = full_df.loc[full_df["instanceIndex"] > 0, group_cols + id_cols + fields]  #  & (full_df["Disjunction (Old)"] > .1)

In [75]:
def geometric_mean(series, offset=1e-6):
    adjusted_series = series + offset  # Add a small offset to avoid zeros
    return np.exp(np.log(adjusted_series).mean())

# paper currently uses mean, but we can switch to geometric mean if we want
aggregations = {f: "mean" if not "Time" in f else geometric_mean for f in fields}  # geometric_mean if f not in ["sameSolution"] else 
aggregations["instance"] = "nunique"
aggregations["instanceIndex"] = "count"

In [76]:
# get gap closed by degree and term
out = bound_df.groupby(["degree", "terms"]).agg(aggregations).reset_index()
out.to_csv(os.path.join(out_fldr, "bound_table.csv"), index=False, mode="w")
out

Unnamed: 0,degree,terms,Disjunction (New),Disjunction (Old),VPCs (New),VPCs (Farkas),VPCs (All),VPCs (Disjunction),VPCs (NoDisjunction),Root Cuts (None),...,zeroInfeasibleToFeasibleTerms All,zeroInfeasibleToFeasibleTerms Disjunction,zeroInfeasibleToFeasibleTerms NoDisjunction,feasibleToInfeasibleTermsRatio New,feasibleToInfeasibleTermsRatio Farkas,feasibleToInfeasibleTermsRatio All,feasibleToInfeasibleTermsRatio Disjunction,feasibleToInfeasibleTermsRatio NoDisjunction,instance,instanceIndex
0,0.5,4,0.081033,0.052704,0.042736,0.026206,0.020854,0.026188,0.020825,0.602701,...,0.99563,0.99563,0.99563,0.0,0.012245,0.012245,0.012245,0.012245,158,1373
1,0.5,64,0.145173,0.096323,0.073955,0.048169,0.040147,0.048086,0.040234,0.623902,...,0.93129,0.93129,0.929175,0.0,0.023385,0.023385,0.023385,0.023385,123,946
2,2.0,4,0.090557,0.052347,0.044357,0.024344,0.01742,0.024277,0.017453,0.613977,...,0.996835,0.996835,0.996835,0.0,0.021064,0.021064,0.021064,0.021064,166,1264
3,2.0,64,0.160376,0.091676,0.077244,0.039684,0.039001,0.040003,0.038966,0.63125,...,0.917234,0.917234,0.913832,0.0,0.034753,0.034753,0.034753,0.034753,133,882


In [77]:
# now break it down by type of perturbation
out = bound_df.groupby(["degree", "terms", "perturbation"]).agg(aggregations).reset_index()
out.to_csv(os.path.join(out_fldr, "bound_table_by_perturbation.csv"), index=False, mode="w")
out

Unnamed: 0,degree,terms,perturbation,Disjunction (New),Disjunction (Old),VPCs (New),VPCs (Farkas),VPCs (All),VPCs (Disjunction),VPCs (NoDisjunction),...,zeroInfeasibleToFeasibleTerms All,zeroInfeasibleToFeasibleTerms Disjunction,zeroInfeasibleToFeasibleTerms NoDisjunction,feasibleToInfeasibleTermsRatio New,feasibleToInfeasibleTermsRatio Farkas,feasibleToInfeasibleTermsRatio All,feasibleToInfeasibleTermsRatio Disjunction,feasibleToInfeasibleTermsRatio NoDisjunction,instance,instanceIndex
0,0.5,4,matrix,0.109022,0.037555,0.052804,0.020508,0.02006,0.020538,0.020017,...,0.992788,0.992788,0.992788,0.0,0.024189,0.024189,0.024189,0.024189,116,416
1,0.5,4,objective,0.060635,0.065881,0.036528,0.03045,,0.030401,,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140,747
2,0.5,4,rhs,0.098146,0.035842,0.044879,0.022395,0.022426,0.022395,0.022426,...,0.985714,0.985714,0.985714,0.0,0.032143,0.032143,0.032143,0.032143,73,210
3,0.5,64,matrix,0.180725,0.085662,0.078055,0.03448,0.035954,0.034589,0.035799,...,0.839161,0.839161,0.835664,0.0,0.033892,0.033892,0.033892,0.033892,87,286
4,0.5,64,objective,0.120346,0.108888,0.071307,0.05893,,0.058471,,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,107,469
5,0.5,64,rhs,0.152904,0.081434,0.074316,0.042245,0.046424,0.042794,0.046876,...,0.900524,0.900524,0.895288,0.0,0.065073,0.065073,0.065073,0.065073,69,191
6,2.0,4,matrix,0.121719,0.054756,0.051168,0.017159,0.017108,0.017104,0.017225,...,0.988372,0.988372,0.988372,0.0,0.042515,0.042515,0.042515,0.042515,101,344
7,2.0,4,objective,0.069345,0.055974,0.044809,0.029083,,0.029066,,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,149,746
8,2.0,4,rhs,0.119892,0.032035,0.028956,0.018228,0.018038,0.017928,0.017903,...,1.0,1.0,1.0,0.0,0.068966,0.068966,0.068966,0.068966,58,174
9,2.0,64,matrix,0.168024,0.087012,0.082915,0.025612,0.030909,0.025733,0.030948,...,0.753138,0.753138,0.740586,0.0,0.044558,0.044558,0.044558,0.044558,76,239


In [78]:
def make_pareto_frontier(bound_df, save_fig=True):
    # Identify relevant fields
    strength_fields = [f for f in fields if "Root Cuts" in f]
    time_fields = [f for f in fields if "vpcGenerationTime" in f]

    # Compute means
    strength_df = bound_df[strength_fields].mean().reset_index()
    strength_df.columns = ["key", "value"]
    strength_df['category'] = strength_df['key'].str.extract(r'\((.*?)\)')

    time_df = bound_df[time_fields].apply(geometric_mean).reset_index()
    time_df.columns = ["key", "value"]
    time_df['category'] = time_df['key'].str.extract(r'(None|Farkas|Old|New|All|NoDisjunction|Disjunction|Matrix|Term|Basis)')

    # Merge on category
    merged_df = pd.merge(strength_df, time_df, on='category', suffixes=('_strength', '_time'))
    merged_df['category'] = merged_df['category'].replace(cat_map)

    # Plotting
    plt.figure(figsize=(6, 5))
    categories = merged_df['category'].unique()
    cmap = plt.get_cmap('tab10')

    for i, category in enumerate(categories):
        sub_df = merged_df[merged_df['category'] == category]
        plt.scatter(
            sub_df['value_time'],
            sub_df['value_strength'],
            label=category,
            color=cmap(i % 10),
            s=25
        )

    plt.ylabel("Average Root Nodes\nOptimality Gap Closed")
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0, 1))
    plt.xlabel("Average Time (s) to Process VPCs")
    plt.title("Root Nodes Optimality Gap Closed vs. Processing Time")
    plt.grid(True)
    plt.legend(title="Generator", loc="best", fontsize=12, title_fontsize=14)
    plt.tight_layout()

    if save_fig:
        plt.savefig(os.path.join(out_fldr, "strength_vs_time.png"), dpi=1200)

    print(merged_df.sort_values("value_strength", ascending=True)[["key_strength", "value_strength", "value_time"]])
    plt.show()


In [79]:
# again nearly pareto optimal - time and strength both ordered in terms of doing more "work". Makes sense for matrix case compared to 
# make_pareto_frontier(bound_df)

## Root Stats

In [80]:
# example table for VPC strength
out[["degree", "terms", "perturbation"] + [c for c in out.columns if "VPCs" in c or "Disjunction (" in c]].round(4)

Unnamed: 0,degree,terms,perturbation,Disjunction (New),Disjunction (Old),VPCs (New),VPCs (Farkas),VPCs (All),VPCs (Disjunction),VPCs (NoDisjunction)
0,0.5,4,matrix,0.109,0.0376,0.0528,0.0205,0.0201,0.0205,0.02
1,0.5,4,objective,0.0606,0.0659,0.0365,0.0304,,0.0304,
2,0.5,4,rhs,0.0981,0.0358,0.0449,0.0224,0.0224,0.0224,0.0224
3,0.5,64,matrix,0.1807,0.0857,0.0781,0.0345,0.036,0.0346,0.0358
4,0.5,64,objective,0.1203,0.1089,0.0713,0.0589,,0.0585,
5,0.5,64,rhs,0.1529,0.0814,0.0743,0.0422,0.0464,0.0428,0.0469
6,2.0,4,matrix,0.1217,0.0548,0.0512,0.0172,0.0171,0.0171,0.0172
7,2.0,4,objective,0.0693,0.056,0.0448,0.0291,,0.0291,
8,2.0,4,rhs,0.1199,0.032,0.029,0.0182,0.018,0.0179,0.0179
9,2.0,64,matrix,0.168,0.087,0.0829,0.0256,0.0309,0.0257,0.0309


In [81]:
# example table for root cut strength
out[["degree", "terms", "perturbation"] + [c for c in out.columns if "Root Cuts" in c]].round(4)

Unnamed: 0,degree,terms,perturbation,Root Cuts (None),Root Cuts (New),Root Cuts (Farkas),Root Cuts (All),Root Cuts (Disjunction),Root Cuts (NoDisjunction)
0,0.5,4,matrix,0.6635,0.666,0.6638,0.6624,0.6645,0.6605
1,0.5,4,objective,0.5679,0.5766,0.5779,,0.5764,
2,0.5,4,rhs,0.606,0.6151,0.6152,0.6126,0.6096,0.6005
3,0.5,64,matrix,0.7131,0.7156,0.7142,0.7166,0.7194,0.7156
4,0.5,64,objective,0.5746,0.5929,0.5899,,0.5938,
5,0.5,64,rhs,0.6113,0.646,0.6415,0.6424,0.638,0.6429
6,2.0,4,matrix,0.6927,0.6878,0.6896,0.6869,0.6878,0.6896
7,2.0,4,objective,0.5813,0.5902,0.5942,,0.5918,
8,2.0,4,rhs,0.5985,0.5967,0.5969,0.5929,0.597,0.5958
9,2.0,64,matrix,0.6857,0.6935,0.6835,0.6876,0.6806,0.6839


In [82]:
# example table for root cut generation time
out[["degree", "terms", "perturbation"] + [c for c in out.columns if "rootDualBoundTime" in c]].round(3)

Unnamed: 0,degree,terms,perturbation,rootDualBoundTime None,rootDualBoundTime New,rootDualBoundTime Farkas,rootDualBoundTime All,rootDualBoundTime Disjunction,rootDualBoundTime NoDisjunction
0,0.5,4,matrix,0.557,5.002,0.783,0.986,0.742,1.032
1,0.5,4,objective,0.841,4.504,0.932,,0.956,
2,0.5,4,rhs,0.376,1.367,0.43,0.46,0.434,0.446
3,0.5,64,matrix,0.519,19.799,1.567,2.444,1.472,2.371
4,0.5,64,objective,0.629,24.919,1.526,,1.436,
5,0.5,64,rhs,0.367,10.142,0.772,0.941,0.751,0.935
6,2.0,4,matrix,0.444,4.711,0.604,0.781,0.637,0.785
7,2.0,4,objective,0.635,3.335,0.767,,0.803,
8,2.0,4,rhs,0.343,2.172,0.418,0.454,0.419,0.458
9,2.0,64,matrix,0.379,15.673,1.38,2.197,1.416,2.177


## Check Termination Stats

In [83]:
# only check perturbed instances that solve to optimality and VPC didn't find optimal solution
mask = (
    (df["Disjunction (New)"] < 0.9999) & (df["instanceIndex"] > 0)
    & np.logical_and.reduce([(optimality_gap(df, gen) <= 1e-4) | (pd.isnull(optimality_gap(df, gen))) for gen in generators])
    & (df["terminationTime None"] > min_termination_time)
)
# if remove_status_changes:
#     mask = mask & (df["infeasibleToFeasibleTermsRatio Farkas"] == 0) & (df["feasibleToInfeasibleTermsRatio Farkas"] == 0)

gap_df = df.loc[mask]

In [84]:
def plot_distributions(histogram_df, feature, bins=100, xlim=(-2, 1), ylim=(0, 1), perturbation=None, exclude_perturbation=False, title_x=.525, relative=True):
    """
    Generate a grid of cumulative distribution functions (CDFs) for a given feature,
    one for each combination of terms and degrees.
    """

    unique_degrees = histogram_df['degree'].sort_values(ascending=False).unique()
    unique_terms = histogram_df['terms'].sort_values().unique()

    fig, axes = plt.subplots(len(unique_degrees), len(unique_terms),
                             figsize=(4 * len(unique_terms), 4 * len(unique_degrees)))
    
    compare_gens = [gen for gen in generators if gen != "None" and gen != "New"]

    for i, degree in enumerate(unique_degrees):
        for j, terms in enumerate(unique_terms):
            ax = axes[i, j] if len(unique_degrees) > 1 and len(unique_terms) > 1 else axes[i] if len(unique_degrees) > 1 \
                else axes[j] if len(unique_terms) > 1 else axes

            subset_df = histogram_df[(histogram_df['degree'] == degree) & (histogram_df['terms'] == terms)]

            if perturbation is not None:
                subset_df = subset_df[subset_df["perturbation"] == perturbation] if not exclude_perturbation \
                    else subset_df[subset_df["perturbation"] != perturbation]

            # Compute relative improvements dynamically from generators
            relative_improvements = {gen: subset_df[f"{feature} {gen}"] for gen in compare_gens} if not relative else \
                {gen: -(subset_df[f"{feature} None"] - subset_df[f"{feature} {gen}"]) / subset_df[f"{feature} None"] for gen in compare_gens}
            
            # filter out nan's and drop groups that are emptied - e.g. any supporting for objective perturbations or matrix supporting for rhs
            relative_improvements = {gen: ri[ri.notna()] for gen, ri in relative_improvements.items() if ri.notna().sum() > 1}

            x = np.linspace(xlim[0], xlim[1], bins)

            # Compute CDFs
            cdfs = {
                gen: np.array([(ri <= val).sum() / len(ri) for val in x])
                for gen, ri in relative_improvements.items()
            }

            # Plot each generator's CDF with distinct color
            cmap = plt.get_cmap("tab10")
            for gen in cdfs:
                ax.plot(x, cdfs[gen], label=cat_map.get(gen, gen), color=cmap(generators.index(gen)))

            # Optionally fill region where first generator dominates all others
            primary = "Farkas"
            others = [g for g in cdfs if g != "Farkas"]
            fill_region = [max(cdfs[gen][k] for gen in others) > cdfs[primary][k] for k in range(len(x))]
            ax.fill_between(x, cdfs[primary], np.maximum.reduce([cdfs[gen] for gen in others]),
                            where=fill_region,
                            facecolor='yellow', alpha=0.7,
                            label=f'Improvement over\n{cat_map.get(primary, primary)}')

            ax.set_xlim(xlim)
            ax.set_ylim(ylim)
            ax.set_title(f"{degree} Degree{'s' if degree > 1 else ''}, {terms} Terms")
            if j == 0:
                ax.set_ylabel("Probability")
            if i == len(unique_degrees) - 1:
                ax.set_xlabel("Relative Change")

    # Shared legend
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(title_x, -0.1), ncol=3)

    fig.suptitle(f"CDF of {'Relative Change in ' if relative else ''}Solve {label[feature]}"
                 f"{' for ' + (perturbation.title() if perturbation != 'rhs' else perturbation.upper()) + ' Perturbations' if perturbation else ''}", x=title_x)
    plt.tight_layout()

    plt.savefig(os.path.join(out_fldr, f"cdf_{feature}{'_' + perturbation if perturbation else ''}{'_relative_improvements' if relative else ''}.png"), dpi=1200, bbox_inches='tight')
    plt.show()

In [85]:
target_cols = ["infeasibleToFeasibleTerms", "termRemainsFeasibleBasisInfeasible", "cutsChangedCoefficients",
               "feasibleTermsPrunedByBound", "tighten_disjunction", "tighten_matrix_perturbation",
               "tighten_infeasible_to_feasible_term", "tighten_feasible_to_infeasible_basis"]

gb = gap_df.groupby(["perturbation", "degree", "terms"]).agg(
    {f"{c} {gen}": "mean" for gen in generators for c in target_cols if gen in ["Disjunction", "NoDisjunction", "All"]}
)

In [86]:
# plot_distributions(gap_df, "terminationTime", xlim=(100, 3600), ylim=(.6, 1), bins=350, relative=False)
# plot_distributions(gap_df, "terminationTime", xlim=(-1, 0), ylim=(0, .5), bins=1000, relative=True)

In [87]:
# plot_distributions(gap_df, "terminationTime", xlim=(100, 3600), ylim=(.6, 1), bins=350, relative=False, perturbation="matrix")
# plot_distributions(gap_df, "terminationTime", xlim=(-1, 0), ylim=(0, .5), bins=1000, relative=True, perturbation="matrix")

In [88]:
gb.loc["matrix", [c for c in gb.columns if " All" in c]]

Unnamed: 0_level_0,Unnamed: 1_level_0,infeasibleToFeasibleTerms All,termRemainsFeasibleBasisInfeasible All,cutsChangedCoefficients All,feasibleTermsPrunedByBound All,tighten_disjunction All,tighten_matrix_perturbation All,tighten_infeasible_to_feasible_term All,tighten_feasible_to_infeasible_basis All
degree,terms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.5,4,0.010204,2.647959,34.770408,0.076531,1.0,1.0,1.0,1.0
0.5,64,1.384,30.088,40.2,0.112,1.0,1.0,1.0,1.0
2.0,4,0.02381,2.769841,42.047619,0.119048,1.0,1.0,1.0,1.0
2.0,64,1.949367,32.886076,41.556962,0.316456,1.0,1.0,1.0,1.0


In [89]:
# plot_distributions(gap_df, "terminationTime", xlim=(100, 3600), ylim=(.6, 1), bins=350, relative=False, perturbation="objective")
# plot_distributions(gap_df, "terminationTime", xlim=(-1, 0), ylim=(0, .5), bins=1000, relative=True, perturbation="objective")

In [90]:
gb.loc["objective", [c for c in gb.columns if " Disjunction" in c]]

Unnamed: 0_level_0,Unnamed: 1_level_0,infeasibleToFeasibleTerms Disjunction,termRemainsFeasibleBasisInfeasible Disjunction,cutsChangedCoefficients Disjunction,feasibleTermsPrunedByBound Disjunction,tighten_disjunction Disjunction,tighten_matrix_perturbation Disjunction,tighten_infeasible_to_feasible_term Disjunction,tighten_feasible_to_infeasible_basis Disjunction
degree,terms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.5,4,0.0,0.0,0.0,0.093583,1.0,0.0,0.0,0.0
0.5,64,0.0,0.021186,0.0,0.322034,1.0,0.0,0.0,0.0
2.0,4,0.0,0.0,0.0,0.085227,1.0,0.0,0.0,0.0
2.0,64,0.0,0.012821,0.0,0.200855,1.0,0.0,0.0,0.0


In [91]:
# plot_distributions(gap_df, "terminationTime", xlim=(100, 3600), ylim=(.6, 1), bins=350, relative=False, perturbation="rhs")
# plot_distributions(gap_df, "terminationTime", xlim=(-1, 0), ylim=(0, .5), bins=1000, relative=True, perturbation="rhs")

In [92]:
gb.loc["rhs", [c for c in gb.columns if " All" in c]]

Unnamed: 0_level_0,Unnamed: 1_level_0,infeasibleToFeasibleTerms All,termRemainsFeasibleBasisInfeasible All,cutsChangedCoefficients All,feasibleTermsPrunedByBound All,tighten_disjunction All,tighten_matrix_perturbation All,tighten_infeasible_to_feasible_term All,tighten_feasible_to_infeasible_basis All
degree,terms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.5,4,0.0,2.964706,0.0,0.035294,1.0,1.0,1.0,1.0
0.5,64,0.207317,31.353659,0.0,0.012195,1.0,1.0,1.0,1.0
2.0,4,0.0,2.961538,0.0,0.0,1.0,1.0,1.0,1.0
2.0,64,1.104167,37.583333,0.0,0.0,1.0,1.0,1.0,1.0


In [93]:
def rename_cols(cols, feature):
    new_cols = []
    for col in cols:
        if f"{feature}Win" in col:
            col = col.replace(f"{feature}Win", f"{label[feature]} Win % ")
        elif "instanceIndex" in col:
            col = col.replace("instanceIndex", "Test Instances")
        elif "instance" in col:
            col = col.replace("instance", "Base Instances")
        new_cols.append(col)
    return new_cols

In [94]:
def get_wins(feature, grouping):
    """
    
    :param feature: "nodes", "terminationTime", "terminationTimeSansVpc"
    :param grouping: "perturbation", "terms", "bracket", "degree" 
    :return: 
    """

    # get the win percentages for each feature on average and broken down by grouping type
    features = [feature]
    keys = []    
    wins = {}
    
    for feature in features:
        
        # define aggregating operations
        aggregations = {f"{feature}Win{gen}": "mean" for gen in generators + ["Strengthened", "Any"]}
        if feature == features[-1]:
            aggregations = aggregations | {"instance": "nunique", "instanceIndex": "count"}
        
        # find the average wins for the feature grouped by degree and terms
        feature_wins = gap_df[gap_df["perturbation"] != "bound"].groupby(keys + [grouping]).agg(aggregations)
        
        # clean up formatting
        feature_wins.columns = rename_cols(feature_wins.columns, feature)
        win_cols = [c for c in feature_wins.columns if "Win" in c]
        feature_wins[win_cols] = feature_wins[win_cols].applymap(lambda x: round(x * 100, 2))
        instance_cols = [c for c in feature_wins.columns if "instance" in c]
        feature_wins[instance_cols] = feature_wins[instance_cols].applymap(lambda x: int(x))
        
        # save the df
        wins[feature] = feature_wins
        
    # bring them all together now
    all_wins = pd.concat(wins.values(), axis=1).sort_values(keys + [grouping], ascending=[c != "bracket" for c in keys + [grouping]]).reset_index()
    all_wins.to_csv(os.path.join(out_fldr, f"branch_and_bound_wins_{feature}_{grouping}.csv"), index=False, mode="w")
    all_wins
    return all_wins

In [95]:
# more degree, more impact strengthening has
get_wins("nodes", "degree")

Unnamed: 0,degree,Nodes Processed Win % None,Nodes Processed Win % New,Nodes Processed Win % Farkas,Nodes Processed Win % All,Nodes Processed Win % Disjunction,Nodes Processed Win % NoDisjunction,Nodes Processed Win % Strengthened,Nodes Processed Win % Any,Base Instances,Test Instances
0,0.5,11.75,12.3,11.84,2.55,11.48,3.73,17.76,62.48,105,1098
1,2.0,11.45,13.3,10.8,2.73,12.76,3.93,19.41,63.14,105,917


In [96]:
get_wins("terminationTime", "degree")

Unnamed: 0,degree,Time Win % None,Time Win % New,Time Win % Farkas,Time Win % All,Time Win % Disjunction,Time Win % NoDisjunction,Time Win % Strengthened,Time Win % Any,Base Instances,Test Instances
0,0.5,18.31,7.38,14.85,3.19,18.21,4.92,26.32,65.57,105,1098
1,2.0,20.72,9.92,14.5,2.4,13.52,5.34,21.26,62.16,105,917


In [97]:
# more terms, more impact strengthening has
get_wins("nodes", "terms")

Unnamed: 0,terms,Nodes Processed Win % None,Nodes Processed Win % New,Nodes Processed Win % Farkas,Nodes Processed Win % All,Nodes Processed Win % Disjunction,Nodes Processed Win % NoDisjunction,Nodes Processed Win % Strengthened,Nodes Processed Win % Any,Base Instances,Test Instances
0,4,11.89,13.21,11.4,2.31,12.06,3.88,18.25,62.26,114,1211
1,64,11.19,12.06,11.32,3.11,12.06,3.73,18.91,63.56,88,804


In [98]:
get_wins("terminationTime", "terms")

Unnamed: 0,terms,Time Win % None,Time Win % New,Time Win % Farkas,Time Win % All,Time Win % Disjunction,Time Win % NoDisjunction,Time Win % Strengthened,Time Win % Any,Base Instances,Test Instances
0,4,16.35,10.82,15.28,2.39,15.85,5.2,23.45,67.3,114,1211
1,64,24.0,5.1,13.81,3.48,16.42,4.98,24.88,59.08,88,804


In [99]:
# more run time, more impact "disjunctive cuts" (not just stregnthening) have, but strengthening reinforces this pattern
get_wins("nodes", "bracket")

Unnamed: 0,bracket,Nodes Processed Win % None,Nodes Processed Win % New,Nodes Processed Win % Farkas,Nodes Processed Win % All,Nodes Processed Win % Disjunction,Nodes Processed Win % NoDisjunction,Nodes Processed Win % Strengthened,Nodes Processed Win % Any,Base Instances,Test Instances
0,short,13.15,11.69,12.48,2.26,10.89,4.25,17.4,60.29,93,753
1,medium,12.73,13.32,9.81,2.34,13.67,4.09,20.09,63.08,88,856
2,long,6.4,13.55,12.56,3.94,10.84,2.46,17.24,66.75,51,406


In [100]:
get_wins("terminationTime", "bracket")

Unnamed: 0,bracket,Time Win % None,Time Win % New,Time Win % Farkas,Time Win % All,Time Win % Disjunction,Time Win % NoDisjunction,Time Win % Strengthened,Time Win % Any,Base Instances,Test Instances
0,short,25.63,4.91,16.33,2.66,13.94,4.91,21.51,57.5,93,753
1,medium,17.64,8.88,12.73,2.69,18.57,5.37,26.64,65.77,88,856
2,long,11.58,14.53,15.76,3.45,14.78,4.93,23.15,72.41,51,406


In [101]:
# obj -> rhs -> matrix increasingly more opportunities for strengthening (most opportunities for disj with obj though) largely mirrored in time results
get_wins("nodes", "perturbation")

Unnamed: 0,perturbation,Nodes Processed Win % None,Nodes Processed Win % New,Nodes Processed Win % Farkas,Nodes Processed Win % All,Nodes Processed Win % Disjunction,Nodes Processed Win % NoDisjunction,Nodes Processed Win % Strengthened,Nodes Processed Win % Any,Base Instances,Test Instances
0,matrix,7.98,9.89,8.37,6.46,7.6,10.08,24.14,67.3,87,526
1,objective,13.55,15.38,13.21,0.0,15.13,0.0,15.13,59.78,89,1196
2,rhs,10.24,7.17,9.22,6.48,7.51,8.19,22.18,66.89,58,293


In [102]:
get_wins("terminationTime", "perturbation")

Unnamed: 0,perturbation,Time Win % None,Time Win % New,Time Win % Farkas,Time Win % All,Time Win % Disjunction,Time Win % NoDisjunction,Time Win % Strengthened,Time Win % Any,Base Instances,Test Instances
0,matrix,17.49,6.08,10.65,6.84,11.22,13.69,31.75,68.63,87,526
1,objective,21.74,10.2,17.31,0.0,19.48,0.0,19.48,60.45,89,1196
2,rhs,13.31,6.14,11.26,7.17,10.92,10.58,28.67,70.31,58,293


In [505]:
def aggregate_wins(gap_df, feature, grouping):

    def custom_key(col):
        # bump improvement columns second to win percentage columns
        offset = int("Improvement %" in col)
        # check groupings
        if 'matrix' in col or 'short' in col:
            return (2 + offset, col)
        elif 'objective' in col or 'medium' in col:
            return (4 + offset, col)
        elif 'rhs' in col or 'long' in col:
            return (6 + offset, col)    
        return (offset, col)
    
    # find the average wins for the feature grouped by degree, terms and grouping type
    win_aggregations = {f"{feature}Win{gen}": "mean" for gen in generators + ["Any"]}
    feature_wins = gap_df[gap_df["perturbation"] != "bound"].groupby(["degree", "terms", grouping]).agg(win_aggregations).reset_index().pivot(
        index=['degree', 'terms'], columns=grouping, values=[f"{feature}Win{gen}" for gen in generators + ["Any"]]
    )
    feature_wins.columns = rename_cols([' '.join(col).strip() for col in feature_wins.columns.values], feature)
    feature_wins = feature_wins[sorted(feature_wins.columns, key=custom_key)]
    feature_wins = feature_wins.applymap(lambda x: round(x * 100, 2))
    
    # get the counts for the feature grouped by degree, terms and grouping type
    count_aggregations = {"instance": "nunique", "instanceIndex": "count"}
    feature_counts = gap_df[gap_df["perturbation"] != "bound"].groupby(["degree", "terms", grouping]).agg(count_aggregations).reset_index().pivot(
        index=['degree', 'terms'], columns=grouping, values=["instance", "instanceIndex"]
    )
    feature_counts.columns = rename_cols([' '.join(col).strip() for col in feature_counts.columns.values], feature)
    feature_counts = feature_counts[sorted(feature_counts.columns, key=custom_key)]
    feature_counts = feature_counts.applymap(lambda x: int(x))
    
    # save wins, base instance counts, and test instance counts to csv
    feature_wins.reset_index().to_csv(os.path.join(out_fldr, f"branch_and_bound_wins_{feature}_{grouping}.csv"), index=False, mode="w")
    feature_counts.reset_index().to_csv(os.path.join(out_fldr, f"branch_and_bound_counts_{grouping}.csv"), index=False, mode="w")
    
    return feature_wins, feature_counts

In [506]:
# wins, counts = aggregate_wins(gap_df, "terminationTime", "bracket")
# wins

In [507]:
# wins, counts = aggregate_wins(gap_df, "terminationTime", "perturbation")
# wins

## High Performing Run Time Subset

In [508]:
# additional filtering for dataframe on run time
fields = [f"terminationTime {gen}" for gen in generators] + \
         [f"terminationTimeImprovement {gen}" for gen in generators if gen not in ["None", "New"]]

# create time dataframe
time_df = df.loc[mask, group_cols + id_cols + fields]

In [509]:
aggregations = {f"Average Time {gen}": (f"terminationTime {gen}", geometric_mean) for gen in generators} | \
    {f"Average Improvement {gen}": (f"terminationTimeImprovement {gen}", "mean") for gen in generators if gen not in ["None", "New"]} | \
    {"count": ("terminationTimeImprovement Farkas", "size")}

tmp = time_df.groupby(["instance", "perturbation", "degree", "terms"]).agg(**aggregations).reset_index()
tmp = tmp[(tmp["count"] > 1)]
tmp.to_csv(os.path.join(out_fldr, "high_perform_all.csv"), index=False, mode="w")
tmp.head()

Unnamed: 0,instance,perturbation,degree,terms,Average Time None,Average Time New,Average Time Farkas,Average Time All,Average Time Disjunction,Average Time NoDisjunction,Average Improvement Farkas,Average Improvement All,Average Improvement Disjunction,Average Improvement NoDisjunction,count
0,10teams,objective,0.5,4,54.341924,1171.370903,49.194143,,48.498847,,0.020962,,0.081746,,2
2,50v-10,objective,2.0,4,208.019719,216.426065,185.650493,,59.811219,,0.100349,,0.634732,,2
3,50v-10,objective,2.0,64,98.690415,178.578986,173.416464,,96.435675,,-0.909495,,0.018504,,2
4,a1c1s1,objective,0.5,4,702.257125,648.285896,718.781602,,780.561067,,-0.027907,,-0.142546,,4
5,a1c1s1,objective,0.5,64,245.301821,302.363039,334.793629,,242.967462,,-0.454197,,0.004678,,2


In [510]:
def make_improvement_table(tmp, generator):
    
    # columns we always choose
    key_cols = ["degree", "terms", "perturbation", "instance"]
    time_cols = [f"Average Time {g}" for g in ["None", "New", "Farkas"]]
    
    # subset the ones we want
    all_df = tmp[
        key_cols + time_cols + [f"Average Time {generator}", f"Average Improvement Farkas", f"Average Improvement {generator}", "count"]
    ].sort_values(f"Average Improvement {generator}", ascending=False)
    all_df = all_df[
        (all_df[f"Average Improvement {generator}"] > 0) & 
        (all_df["Average Time Farkas"] > 1.1 * all_df[f"Average Time {generator}"]) & 
        (all_df["Average Time New"] > 1.1 * all_df[f"Average Time {generator}"])
    ]
    best_df = all_df.loc[
        all_df.groupby(['perturbation', 'degree', 'terms'])[f'Average Improvement {generator}'].idxmax()
    ].sort_values(f"Average Improvement {generator}", ascending=False).round(2)
    
    # save all the winners
    all_df.to_csv(os.path.join(out_fldr, f"high_perform_{generator.lower()}.csv"), index=False, mode="w")
    
    # return just the best
    return all_df, best_df

In [511]:
generator = "Disjunction"
all_df, best_df = make_improvement_table(tmp, generator)
best_df[["degree", "terms", "perturbation", "instance", "Average Time None", "Average Time New", "Average Time Farkas", f"Average Time {generator}", "count"]].head(3)

Unnamed: 0,degree,terms,perturbation,instance,Average Time None,Average Time New,Average Time Farkas,Average Time Disjunction,count
63,2.0,64,objective,bppc8-09,262.89,114.43,66.83,57.33,7
476,2.0,4,objective,piperout-03,24.98,15.03,8.41,7.53,3
506,0.5,4,objective,ran12x21,57.64,45.68,35.53,15.18,4


In [512]:
generator = "NoDisjunction"
all_df, best_df = make_improvement_table(tmp, generator)
best_df[["degree", "terms", "perturbation", "instance", "Average Time None", "Average Time New", "Average Time Farkas", f"Average Time {generator}", "count"]].head(3)

Unnamed: 0,degree,terms,perturbation,instance,Average Time None,Average Time New,Average Time Farkas,Average Time NoDisjunction,count
397,0.5,4,rhs,neos-807639,17.19,6.74,5.87,4.98,2
98,0.5,4,matrix,danoint,984.36,1286.77,966.2,411.9,2
266,0.5,64,matrix,neos-1445738,332.75,51427.61,327.71,150.26,2


In [124]:
generator = "All"
all_df, best_df = make_improvement_table(tmp, generator)
best_df[["degree", "terms", "perturbation", "instance", "Average Time None", "Average Time New", "Average Time Farkas", f"Average Time {generator}", "count"]].head(3)

Unnamed: 0,degree,terms,perturbation,instance,Average Time None,Average Time New,Average Time Farkas,Average Time All,count
267,2.0,4,matrix,neos-1445738,515.4,19164.97,263.77,183.81,4
575,0.5,4,matrix,umts,1579.06,2410.74,2259.35,633.2,2
498,2.0,4,rhs,r50x360,56.31,76.22,70.56,39.69,4
