In [2]:
# Copyright [2024] Stefan Dendorfer
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import pandas as pd
from scipy.stats import t

In [3]:
'''We use the results from LayerNAS paper https://arxiv.org/abs/2304.11517.
Table 3, Comparison on NATS-Bench topology search. Mean and deviation of 
test accuracy on 5 runs.'''

N = 5 # this is fixed from the original data reporting 5 runs
SF = 2 # significant figures is 2 from data

table3 = {
    "Cifar10": {
        "RS": {"mean": 92.39, "std": 0.06},
        "RE": {"mean": 94.13, "std": 0.18},
        "PPO": {"mean": 94.02, "std": 0.13},
        "KNAS": {"mean": 93.05, "std": 0.00},
        "TE-NAS": {"mean": 93.90, "std": 0.47},
        "EigenNas": {"mean": 93.46, "std": 0.02},
        "NASI": {"mean": 93.55, "std": 0.10},
        "FairNAS": {"mean": 93.23, "std": 0.18},
        "SGNAS": {"mean": 93.53, "std": 0.12},
        "LayerNAS": {"mean": 94.34, "std": 0.12}
    },
    "Cifar100": {
        "RS": {"mean": 63.54, "std": 0.24},
        "RE": {"mean": 71.40, "std": 0.50},
        "PPO": {"mean": 71.68, "std": 0.65},
        "KNAS": {"mean": 68.91, "std": 0.00},
        "TE-NAS": {"mean": 71.24, "std": 0.56},
        "EigenNas": {"mean": 71.42, "std": 0.63},
        "NASI": {"mean": 71.20, "std": 0.14},
        "FairNAS": {"mean": 71.00, "std": 1.46},
        "SGNAS": {"mean": 70.31, "std": 1.09},
        "LayerNAS": {"mean": 73.01, "std": 0.63}
    },
    "ImageNet16-120": {
        "RS": {"mean": 42.71, "std": 0.34},
        "RE": {"mean": 44.76, "std": 0.64},
        "PPO": {"mean": 44.95, "std": 0.52},
        "KNAS": {"mean": 34.11, "std": 0.00},
        "TE-NAS": {"mean": 42.38, "std": 0.46},
        "EigenNas": {"mean": 45.54, "std": 0.04},
        "NASI": {"mean": 44.84, "std": 1.41},
        "FairNAS": {"mean": 42.19, "std": 0.31},
        "SGNAS": {"mean": 44.98, "std": 2.10},
        "LayerNAS": {"mean": 46.58, "std": 0.59}
    }
}

In [4]:
''''We use two sample independent t-test.'''

def welch_ttest(mean1,std1,mean2,std2, n1, n2):
    '''This ttest is applied when the variances are not assumed similar.'''
    # Calculate the t-statistic
    t_value = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))

    # Calculate the degrees of freedom
    df_numerator = ((std1**2 / n1) + (std2**2 / n2))**2
    df_denominator = ((std1**2 / n1)**2 / (n1-1)) + ((std2**2 / n2)**2 / (n2-1))
    df = df_numerator / df_denominator

    # Calculate the p-value
    p_value = t.sf(abs(t_value), df) * 2  # two-tailed test
    return p_value

def ind_ttest(mean1,std1,mean2,std2, n1, n2):
    '''This ttest is applied when the variances are assumed similar.'''
    df = n1+n2 - 2
    pooled_std = ((n1-1)*(std1 ** 2) + (n2-1)*(std2 ** 2)) / (df)
    
    mean_diff = (mean1 - mean2)
    t_value = mean_diff / math.sqrt((1/n1)+(1/n2)) / (pooled_std ** 0.5)
    
    p_value = t.sf(abs(t_value), df) * 2  # two-tailed test
    return p_value


def get_pvalue(dataset, algorithm_1, algorithm_2):
    '''Applies the two sample t-test depending on similarity of variances.'''
    mean1 = table3[dataset][algorithm_1]["mean"]
    std1_biased = table3[dataset][algorithm_1]["std"]
    std1 = std1_biased * N / (N-1) # correcting for the population

    
    mean2 = table3[dataset][algorithm_2]["mean"]
    std2_biased = table3[dataset][algorithm_2]["std"]
    std2 = std2_biased * N / (N-1) # correcting for the population
    
    use_welch = std1 > 2*std2 or std2 > 2*std1
    if use_welch:
        # variances not similar
        p_value = welch_ttest(mean1,std1,mean2,std2,N,N)
    else:
        # variances similar
        p_value = ind_ttest(mean1,std1,mean2,std2,N,N)
    return p_value, use_welch

In [5]:
'''Here, we compare pairwise the algorithm with the the next best one'''

# choose which algorithms to compare
algorithms = ["RE", "PPO", "TE-NAS", "NASI", "LayerNAS"]

# choose which datasets to evaluate 
datasets = ["Cifar10", "Cifar100", "ImageNet16-120"]

# choose a threshold for statistical significance
# we apply bonferroni correction by dividing through the number of comparisons
p_threshold = 0.05 / (len(datasets)*(len(algorithms)-1))


for dataset in datasets:
    print(f"Dataset: {dataset}")
    algorithms.sort(key=lambda x: table3[dataset][x]["mean"], reverse=True)
    
    for i in range(len(algorithms)-1):
        best_alg = algorithms[i]
        second_best_alg = algorithms[i+1]
        
        print(f"   Comparing {best_alg} with {second_best_alg}", end="")
        
        p_value, use_welch = get_pvalue(dataset, best_alg, second_best_alg)
        if use_welch:
            print(" (using welch)", end="")
        print(":") 
        
        if p_value < p_threshold:
            print("\tStatistical significance"
                  f"(p < {p_threshold:.{SF}g}): p={p_value:.{SF}g}")
        else:
            print("\tNo statistical significance"
                  f"(p >= {p_threshold:.{SF}g}): p={p_value:.{SF}g}")
        

Dataset: Cifar10
   Comparing LayerNAS with RE:
	No statistical significance(p >= 0.0042): p=0.12
   Comparing RE with PPO:
	No statistical significance(p >= 0.0042): p=0.4
   Comparing PPO with TE-NAS (using welch):
	No statistical significance(p >= 0.0042): p=0.68
   Comparing TE-NAS with NASI (using welch):
	No statistical significance(p >= 0.0042): p=0.26
Dataset: Cifar100
   Comparing LayerNAS with PPO:
	No statistical significance(p >= 0.0042): p=0.03
   Comparing PPO with RE:
	No statistical significance(p >= 0.0042): p=0.56
   Comparing RE with TE-NAS:
	No statistical significance(p >= 0.0042): p=0.71
   Comparing TE-NAS with NASI (using welch):
	No statistical significance(p >= 0.0042): p=0.91
Dataset: ImageNet16-120
   Comparing LayerNAS with PPO:
	No statistical significance(p >= 0.0042): p=0.006
   Comparing PPO with NASI (using welch):
	No statistical significance(p >= 0.0042): p=0.9
   Comparing NASI with RE (using welch):
	No statistical significance(p >= 0.0042): p=0.93

In [6]:
'''Here, we compare algorithms after ordering by performance with each other.'''

# In the paper we only publish comparison on Cifar10 and part of the algorithms.
# Comparing all would lower threshold even more, making it even harder to have
# statistical significance.

datasets = ["Cifar10"]
algorithms = ["RE", "PPO", "TE-NAS", "NASI", "LayerNAS"]

p_threshold = 0.05 / (len(datasets)*len(algorithms)*(len(algorithms)-1)/2)
print(f"Bonferroni corrected threshold: {p_threshold:.{SF}g}")


tables = {}
for dataset in datasets:
    algorithms.sort(key=lambda x: table3[dataset][x]["mean"], reverse=True)
    table_data = []
    for i in range(len(algorithms)):
        row = []
        for j in range(len(algorithms)):
            if i != j:
                p_value, _ = get_pvalue(dataset, algorithms[i], algorithms[j])
                significant = "*" if p_value<p_threshold else ""
                entry = f"{p_value:.{SF}g}" + significant
                row.append(entry)
            else:
                row.append("-")
        table_data.append([algorithms[i]]+row)

    tables[dataset] = pd.DataFrame(table_data, columns=[""]+algorithms)

# Displaying the tables
for dataset, table in tables.items():
    print(f"Dataset: {dataset}")
    print(table)
    print()

# Latex print for paper
# print(tables["Cifar10"].to_latex(index=False))


Bonferroni corrected threshold: 0.005
Dataset: Cifar10
             LayerNAS      RE      PPO TE-NAS      NASI
0  LayerNAS         -    0.12    0.012   0.17  1.8e-05*
1        RE      0.12       -      0.4   0.45    0.001*
2       PPO     0.012     0.4        -   0.68   0.0009*
3    TE-NAS      0.17    0.45     0.68      -      0.26
4      NASI  1.8e-05*  0.001*  0.0009*   0.26         -

