# Verify effect of Hidden Similarity Regularization on Models

In [1]:
import pandas as pd

hidden_reg_df = pd.read_csv("../results/spectralmpnn_experiment.csv")
hidden_reg_df

Unnamed: 0,algorithm,mpnn_f1_output,specformer_f1_output,spectralmpnn_f1_output
0,activity_selector,0.8897,0.6731,0.9326
1,articulation_points,0.6792,0.0607,0.7156
2,bellman_ford,0.9805,0.8916,0.9722
3,bfs,0.9961,0.9937,0.9932
4,binary_search,0.2813,0.0625,0.2813
5,bridges,0.673,0.3147,0.8036
6,bubble_sort,0.0791,0.4888,0.147
7,dag_shortest_paths,0.9888,0.7153,0.9854
8,dfs,0.2524,0.3167,0.3154
9,dijkstra,0.9849,0.9531,0.9292


# Load Datasets

In [2]:
import torch
import yaml
from algo_reasoning.src.sampler import CLRSDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_algorithm_args(args_file):
    with open(args_file, 'r') as f:
        args = yaml.safe_load(f)

    return args

# Comparing effect of Spectral Architectures to Algorithm Length

In [4]:
algorithm_args = load_algorithm_args("../algorithm_args/default.yaml")
hidden_reg_df["max_length"] = pd.Series([0] * len(hidden_reg_df))

for alg in hidden_reg_df.algorithm:
    print("Generating sample for: ", alg)
    algorithms = [alg]
    nb_nodes = 64
    ds = CLRSDataset(algorithms, nb_nodes, 1, 1000, seed=7, algorithms_args=algorithm_args)
    obj = next(iter(ds)).to(device=device)

    hidden_reg_df.loc[hidden_reg_df.algorithm == alg, ["max_length"]] =  obj.max_length.item()

Generating sample for:  activity_selector
Generating sample for:  articulation_points
Generating sample for:  bellman_ford
Generating sample for:  bfs
Generating sample for:  binary_search
Generating sample for:  bridges
Generating sample for:  bubble_sort
Generating sample for:  dag_shortest_paths
Generating sample for:  dfs
Generating sample for:  dijkstra
Generating sample for:  find_maximum_subarray_kadane
Generating sample for:  graham_scan
Generating sample for:  heapsort
Generating sample for:  insertion_sort
Generating sample for:  jarvis_march
Generating sample for:  kmp_matcher
Generating sample for:  lcs_length
Generating sample for:  minimum
Generating sample for:  mst_kruskal
Generating sample for:  mst_prim
Generating sample for:  naive_string_matcher
Generating sample for:  quickselect
Generating sample for:  quicksort


In [5]:
reg_1_effect = hidden_reg_df["specformer_f1_output"] - hidden_reg_df["mpnn_f1_output"]
reg_5_effect = hidden_reg_df["spectralmpnn_f1_output"] - hidden_reg_df["mpnn_f1_output"]

hidden_reg_df["specformer_effect"] = reg_1_effect
hidden_reg_df["spectralmpnn_effect"] = reg_5_effect

In [6]:
hidden_reg_df[["specformer_effect", "spectralmpnn_effect"]].mean()

specformer_effect     -0.056948
spectralmpnn_effect    0.045478
dtype: float64

In [7]:
_corr = hidden_reg_df[hidden_reg_df.columns.difference(['algorithm'])].corr()

_corr["max_length"]

max_length                1.000000
mpnn_f1_output           -0.059296
specformer_effect         0.107271
specformer_f1_output      0.006319
spectralmpnn_effect       0.062648
spectralmpnn_f1_output   -0.045470
Name: max_length, dtype: float64

In [15]:
hidden_reg_df

Unnamed: 0,algorithm,mpnn_f1_output,specformer_f1_output,spectralmpnn_f1_output,max_length,specformer_effect,spectralmpnn_effect,_type,_output_type
0,activity_selector,0.8897,0.6731,0.9326,65,-0.2166,0.0429,greedy,mask
1,articulation_points,0.6792,0.0607,0.7156,1041,-0.6185,0.0364,graphs,mask
2,bellman_ford,0.9805,0.8916,0.9722,5,-0.0889,-0.0083,graphs,pointer
3,bfs,0.9961,0.9937,0.9932,3,-0.0024,-0.0029,graphs,pointer
4,binary_search,0.2813,0.0625,0.2813,7,-0.2188,0.0,searching,mask_one
5,bridges,0.673,0.3147,0.8036,1041,-0.3583,0.1306,graphs,mask
6,bubble_sort,0.0791,0.4888,0.147,2017,0.4097,0.0679,sorting,permutation_pointer
7,dag_shortest_paths,0.9888,0.7153,0.9854,124,-0.2735,-0.0034,graphs,pointer
8,dfs,0.2524,0.3167,0.3154,192,0.0643,0.063,graphs,pointer
9,dijkstra,0.9849,0.9531,0.9292,65,-0.0318,-0.0557,graphs,pointer


# Effect by Algorithm Type

In [8]:
type_dict = {
    "divide_and_conquer": ["find_maximum_subarray_kadane"],
    "dynamic_programming": ["matrix_chain_order", "lcs_length", "optimal_bst"],
    "geometry": ["segments_intersect", "graham_scan", "jarvis_march"],
    "graphs": ["dfs", "bfs", "topological_sort", "articulation_points", "bridges", "strongly_connected_components", "mst_kruskal", "mst_prim", "bellman_ford", "dijkstra", "dag_shortest_paths", "floyd_warshall"],
    "greedy": ["activity_selector", "task_scheduling"], 
    "searching": ["minimum", "binary_search", "quickselect"],
    "sorting": ["insertion_sort", "bubble_sort", "heapsort", "quicksort"],
    "strings": ["naive_string_matcher", "kmp_matcher"]
}

def get_algo_type(algo):
    for _type in type_dict.keys():
        print
        if algo in type_dict[_type]:
            return _type
        
hidden_reg_df["_type"] = hidden_reg_df.algorithm.apply(get_algo_type)
agg_df = hidden_reg_df[hidden_reg_df.columns.difference(['algorithm'])].groupby(['_type']).mean()
agg_df

Unnamed: 0_level_0,max_length,mpnn_f1_output,specformer_effect,specformer_f1_output,spectralmpnn_effect,spectralmpnn_f1_output
_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
divide_and_conquer,64.0,0.1563,-0.0157,0.1406,0.0468,0.2031
dynamic_programming,41.0,0.3591,-0.0628,0.2963,-0.0225,0.3366
geometry,385.5,0.81265,-0.12595,0.6867,0.0079,0.82055
graphs,422.111111,0.812222,-0.156322,0.6559,0.016,0.828222
greedy,65.0,0.8897,-0.2166,0.6731,0.0429,0.9326
searching,59.0,0.385433,-0.008967,0.376467,0.0705,0.455933
sorting,720.5,0.1177,0.269325,0.387025,0.0375,0.1552
strings,13.0,0.2188,-0.20315,0.01565,0.22875,0.44755


In [9]:
agg_df[["mpnn_f1_output", "specformer_f1_output", "spectralmpnn_f1_output"]]

Unnamed: 0_level_0,mpnn_f1_output,specformer_f1_output,spectralmpnn_f1_output
_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
divide_and_conquer,0.1563,0.1406,0.2031
dynamic_programming,0.3591,0.2963,0.3366
geometry,0.81265,0.6867,0.82055
graphs,0.812222,0.6559,0.828222
greedy,0.8897,0.6731,0.9326
searching,0.385433,0.376467,0.455933
sorting,0.1177,0.387025,0.1552
strings,0.2188,0.01565,0.44755


In [10]:
agg_df[["specformer_effect", "spectralmpnn_effect"]]

Unnamed: 0_level_0,specformer_effect,spectralmpnn_effect
_type,Unnamed: 1_level_1,Unnamed: 2_level_1
divide_and_conquer,-0.0157,0.0468
dynamic_programming,-0.0628,-0.0225
geometry,-0.12595,0.0079
graphs,-0.156322,0.016
greedy,-0.2166,0.0429
searching,-0.008967,0.0705
sorting,0.269325,0.0375
strings,-0.20315,0.22875


In [11]:
agg_df.corr()

Unnamed: 0,max_length,mpnn_f1_output,specformer_effect,specformer_f1_output,spectralmpnn_effect,spectralmpnn_f1_output
max_length,1.0,0.032673,0.613501,0.420555,-0.332386,-0.049554
mpnn_f1_output,0.032673,1.0,-0.625962,0.876099,-0.36454,0.971696
specformer_effect,0.613501,-0.625962,1.0,-0.172414,-0.254972,-0.730819
specformer_f1_output,0.420555,0.876099,-0.172414,1.0,-0.618079,0.775523
spectralmpnn_effect,-0.332386,-0.36454,-0.254972,-0.618079,1.0,-0.134243
spectralmpnn_f1_output,-0.049554,0.971696,-0.730819,0.775523,-0.134243,1.0


# Effect by Output Type

In [12]:
from algo_reasoning.src.specs import SPECS, Stage

def get_output_type(algo):
    for k, v in SPECS[algo].items():
        stage, _, _type = v

        if stage == Stage.OUTPUT:
            return _type
        
hidden_reg_df["_output_type"] = hidden_reg_df.algorithm.apply(get_output_type)
hidden_reg_df
output_agg_df = hidden_reg_df[hidden_reg_df.columns.difference(['algorithm', "_type"])].groupby(['_output_type']).mean()
output_agg_df

Unnamed: 0_level_0,max_length,mpnn_f1_output,specformer_effect,specformer_f1_output,spectralmpnn_effect,spectralmpnn_f1_output
_output_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
categorical,41.0,0.3591,-0.0628,0.2963,-0.0225,0.3366
mask,696.833333,0.797567,-0.2506,0.546967,0.038083,0.83565
mask_one,44.5,0.2917,-0.074817,0.216883,0.1193,0.411
permutation_pointer,720.5,0.1177,0.269325,0.387025,0.0375,0.1552
pointer,75.666667,0.839933,-0.061967,0.777967,-0.0043,0.835633


In [13]:
output_agg_df[["mpnn_f1_output", "specformer_f1_output", "spectralmpnn_f1_output"]]

Unnamed: 0_level_0,mpnn_f1_output,specformer_f1_output,spectralmpnn_f1_output
_output_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
categorical,0.3591,0.2963,0.3366
mask,0.797567,0.546967,0.83565
mask_one,0.2917,0.216883,0.411
permutation_pointer,0.1177,0.387025,0.1552
pointer,0.839933,0.777967,0.835633


In [14]:
output_agg_df[["specformer_effect", "spectralmpnn_effect"]]

Unnamed: 0_level_0,specformer_effect,spectralmpnn_effect
_output_type,Unnamed: 1_level_1,Unnamed: 2_level_1
categorical,-0.0628,-0.0225
mask,-0.2506,0.038083
mask_one,-0.074817,0.1193
permutation_pointer,0.269325,0.0375
pointer,-0.061967,-0.0043


# SpectralMPNN x MPNN

In [9]:
import pandas as pd

comparison_df = pd.read_csv("../results/spectralmpnn2.csv", sep=";")
comparison_df

Unnamed: 0,algorithm,mpnn_0,mpnn_1,mpnn_2,mpnn_3,mpnn_4,spectralmpnn_0,spectralmpnn_1,spectralmpnn_2,spectralmpnn_3,spectralmpnn_4
0,activity_selector,0.914,0.8382,0.9242,0.9583,0.8444,0.7448,0.8457,0.7909,0.9217,0.919
1,articulation_points,0.9703,0.9833,0.9598,0.8481,0.8996,0.9703,0.9833,0.9598,0.8481,0.8996
2,bellman_ford,0.9868,0.981,0.9731,0.9785,0.9814,0.9541,0.9717,0.9731,0.9663,0.9746
3,bfs,0.998,0.9863,0.9932,0.9951,0.9961,0.9971,1.0,0.9917,0.998,0.998
4,binary_search,0.4063,0.4063,0.1875,0.125,0.25,0.375,0.1875,0.2188,0.3125,0.1875
5,bridges,0.8406,0.5379,0.9055,0.9989,0.7142,0.8537,0.9012,0.7029,0.9012,0.7919
6,bubble_sort,0.0659,0.4014,0.0889,0.3047,0.2852,0.4888,0.597,0.4267,0.3215,0.4888
7,dag_shortest_path,0.9829,0.9917,0.9927,0.9888,0.9912,0.8267,0.9849,0.8853,0.791,0.8057
8,dfs,0.1011,0.2881,0.1436,0.3438,0.0986,0.1987,0.2461,0.0591,0.1855,0.2139
9,dijkstra,0.9727,0.9761,0.9795,0.9858,0.9868,0.9775,0.979,0.9663,0.9761,0.9814


In [10]:
mpnn_results_list = comparison_df.apply(lambda x: [x[f"mpnn_{i}"] for i in range(5)], axis=1)
mpnn_results_list = mpnn_results_list.set_axis(comparison_df["algorithm"])

spectralmpnn_results_list = comparison_df.apply(lambda x: [x[f"spectralmpnn_{i}"] for i in range(5)], axis=1)
spectralmpnn_results_list = spectralmpnn_results_list.set_axis(comparison_df["algorithm"])

In [11]:
import numpy as np

comparison_df["mpnn_avg"] = mpnn_results_list.apply(lambda x: sum(x)/len(x)).values
comparison_df["spectralmpnn_avg"] = spectralmpnn_results_list.apply(lambda x: sum(x)/len(x)).values
comparison_df[["mpnn_avg", "spectralmpnn_avg"]]

Unnamed: 0,mpnn_avg,spectralmpnn_avg
0,0.89582,0.84442
1,0.93222,0.93222
2,0.98016,0.96796
3,0.99374,0.99696
4,0.27502,0.25626
5,0.79942,0.83018
6,0.22922,0.46456
7,0.98946,0.85872
8,0.19504,0.18066
9,0.98018,0.97606


In [12]:
type_dict = {
    "divide_and_conquer": ["find_maximum_subarray_kadane"],
    "dynamic_programming": ["matrix_chain_order", "lcs_length", "optimal_bst"],
    "geometry": ["segments_intersect", "graham_scan", "jarvis_march"],
    "graphs": ["dfs", "bfs", "topological_sort", "articulation_points", "bridges", "strongly_connected_components", "mst_kruskal", "mst_prim", "bellman_ford", "dijkstra", "dag_shortest_paths", "floyd_warshall"],
    "greedy": ["activity_selector", "task_scheduling"], 
    "searching": ["minimum", "binary_search", "quickselect"],
    "sorting": ["insertion_sort", "bubble_sort", "heapsort", "quicksort"],
    "strings": ["naive_string_matcher", "kmp_matcher"]
}

def get_algo_type(algo):
    for _type in type_dict.keys():
        print
        if algo in type_dict[_type]:
            return _type
        
comparison_df["_type"] = comparison_df.algorithm.apply(get_algo_type)
agg_df = comparison_df[comparison_df.columns.difference(['algorithm'])].groupby(['_type']).mean()
agg_df

Unnamed: 0_level_0,mpnn_0,mpnn_1,mpnn_2,mpnn_3,mpnn_4,mpnn_avg,spectralmpnn_0,spectralmpnn_1,spectralmpnn_2,spectralmpnn_3,spectralmpnn_4,spectralmpnn_avg
_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
divide_and_conquer,0.2656,0.1875,0.1094,0.0781,0.1094,0.15,0.2656,0.2031,0.2813,0.25,0.1563,0.23126
dynamic_programming,0.0846,0.135933,0.088267,0.166667,0.1339,0.121873,0.0837,0.066833,0.169233,0.079567,0.181533,0.116173
geometry,0.9709,0.9626,0.982,0.9793,0.9807,0.9751,0.9675,0.9675,0.9582,0.9633,0.9123,0.95376
graphs,0.763536,0.737255,0.755518,0.780018,0.733418,0.753949,0.721027,0.734655,0.691855,0.712318,0.681773,0.708325
greedy,0.86405,0.82755,0.8676,0.8832,0.82495,0.85347,0.80765,0.8512,0.8625,0.92485,0.8986,0.86896
searching,0.458367,0.447933,0.395833,0.354167,0.416667,0.414593,0.501467,0.4351,0.449833,0.4712,0.4336,0.45824
sorting,0.071175,0.2577,0.097575,0.2174,0.193975,0.167565,0.39135,0.373725,0.44275,0.334625,0.45115,0.39872
strings,0.2344,0.2188,0.3594,0.18755,0.40625,0.28128,0.50005,0.4905,0.18755,0.2969,0.32815,0.36063


In [None]:
from scipy.stats import mannwhitneyu

def apply_mannwhitneyu(ls1, ls2):
    avg1 = sum(ls1)/len(ls1)
    avg2 = sum(ls2)/len(ls2)

    x, y = (ls1, ls2) if avg1 > avg2 else (ls2, ls1)
    
    res = mannwhitneyu(x, y, alternative='greater')
    
    gt_avg = 1 if avg1 > avg2 else 2
    return gt_avg if res.pvalue < 0.05 else 0

mannwhitneyu_result = pd.Series(comparison_df["algorithm"].apply(lambda algo: apply_mannwhitneyu(mpnn_results_list[algo], spectralmpnn_results_list[algo])))
comparison_df["result"] = mannwhitneyu_result.apply(lambda x: "tie" if x == 0 else ("mpnn" if x == 1 else "spectralmpnn"))
comparison_df[["algorithm", "result"]]

0     0
1     0
2     1
3     0
4     0
5     0
6     2
7     1
8     0
9     0
10    0
11    0
12    1
13    2
14    2
15    0
16    0
17    2
18    0
19    0
20    0
21    0
22    0
23    2
24    0
25    0
26    2
27    1
Name: algorithm, dtype: int64

In [19]:

def apply_stdtest(ls1, ls2):
    avg1 = np.mean(ls1)
    avg2 = np.mean(ls2)

    x, y = (ls1, ls2) if avg1 > avg2 else (ls2, ls1)

    std = np.std(x)
    
    gt_avg = 1 if avg1 > avg2 else 2
    return gt_avg if np.mean(x) - std > np.mean(y) else 0

stdtest_result = pd.Series(comparison_df["algorithm"].apply(lambda algo: apply_stdtest(mpnn_results_list[algo], spectralmpnn_results_list[algo])))
comparison_df["result"] = stdtest_result.apply(lambda x: "tie" if x == 0 else ("mpnn" if x == 1 else "spectralmpnn"))
comparison_df[["algorithm", "result"]]

Unnamed: 0,algorithm,result
0,activity_selector,mpnn
1,articulation_points,tie
2,bellman_ford,mpnn
3,bfs,spectralmpnn
4,binary_search,tie
5,bridges,tie
6,bubble_sort,spectralmpnn
7,dag_shortest_path,mpnn
8,dfs,tie
9,dijkstra,tie
