In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Load the experiment results; the cells below use the 'method', 'dataset'
# and 'accuracy' columns of this frame.
results_df = pd.read_csv(filepath_or_buffer='data/all.csv')

In [3]:
# Display the column labels of the loaded frame to confirm the schema
# that the later cells rely on ('method', 'dataset', 'accuracy').
results_df.columns

Index(['method', 'dataset', 'accuracy'], dtype='object')

In [4]:
# Number of non-null accuracy values recorded for each (dataset, method) pair.
results_df.groupby(by=['dataset', 'method']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy
dataset,method,Unnamed: 2_level_1
actor,APPNP,100
actor,GATConv,100
actor,GCNConv,100
actor,SAGEConv,100
actor,SGConv,100
actor,baseline,100
citeseer,APPNP,100
citeseer,GATConv,2000
citeseer,GCNConv,2000
citeseer,SAGEConv,2000


In [5]:
# D'Agostino-Pearson normality test (scipy.stats.normaltest) for the accuracy
# values of every (dataset, method) pair.
# H0: the sample comes from a normal distribution; p < alpha rejects H0.
alpha = 0.05

# normaltest builds on the skewness test, which needs at least 8 observations.
# Smaller (or missing) groups are reported explicitly instead of crashing or
# being labelled 'Normal' because of a NaN p-value.
min_samples_for_normaltest = 8

# Split the frame once instead of re-masking it for every pair.
accuracy_by_group = {
    key: group.values
    for key, group in results_df.groupby(['dataset', 'method'])['accuracy']
}

for dataset in results_df.dataset.unique():
    for method in results_df.method.unique():
        # Pairs that never occur in the data yield an empty sample.
        accuracies = accuracy_by_group.get((dataset, method), ())
        if len(accuracies) < min_samples_for_normaltest:
            print(dataset, ' ', method, ' ', 'Too few samples')
            continue
        k2, p = stats.normaltest(accuracies)
        is_normal = 'Not normal' if p < alpha else 'Normal'
        print(dataset, ' ', method, ' ', is_normal)

cora   baseline   Not normal
cora   GCNConv   Not normal
cora   SAGEConv   Not normal
cora   GATConv   Not normal
cora   SGConv   Not normal
cora   APPNP   Normal
citeseer   baseline   Not normal
citeseer   GCNConv   Not normal
citeseer   SAGEConv   Not normal
citeseer   GATConv   Not normal
citeseer   SGConv   Normal
citeseer   APPNP   Normal
pubmed   baseline   Not normal
pubmed   GCNConv   Not normal
pubmed   SAGEConv   Not normal
pubmed   GATConv   Not normal
pubmed   SGConv   Not normal
pubmed   APPNP   Not normal
cora_full   baseline   Not normal
cora_full   GCNConv   Not normal
cora_full   SAGEConv   Not normal
cora_full   GATConv   Normal
cora_full   SGConv   Not normal
cora_full   APPNP   Normal
texas   baseline   Normal
texas   GCNConv   Not normal
texas   SAGEConv   Not normal
texas   GATConv   Normal
texas   SGConv   Not normal
texas   APPNP   Not normal
wisconsin   baseline   Normal
wisconsin   GCNConv   Not normal
wisconsin   SAGEConv   Normal
wisconsin   GATConv   Normal

In [6]:
# Mean accuracy of each method on each dataset.
results_df.groupby(by=['dataset', 'method']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy
dataset,method,Unnamed: 2_level_1
actor,APPNP,0.22079
actor,GATConv,0.23181
actor,GCNConv,0.222358
actor,SAGEConv,0.239004
actor,SGConv,0.226316
actor,baseline,0.242177
citeseer,APPNP,0.73626
citeseer,GATConv,0.733422
citeseer,GCNConv,0.72131
citeseer,SAGEConv,0.722808


In [7]:
# The accuracy samples are mostly not normally distributed and the runs are
# unpaired, so an unpaired nonparametric test (Mann-Whitney U test) is used.
# H0: both samples come from the same distribution.
# p >= corrected alpha --> fail to reject H0 --> difference is not statistically significant
# p <  corrected alpha --> reject H0 --> difference is statistically significant

alpha = 0.05
cora_methods = ['APPNP', 'GATConv', 'GCNConv', 'SAGEConv', 'SGConv']

# Bonferroni correction: one test is run per method compared with the baseline.
number_of_tests_per_dataset = len(cora_methods)
bonferroni_corrected_alpha = alpha/number_of_tests_per_dataset

# cora
cora_df = results_df[results_df.dataset == 'cora']
benchmark = cora_df[cora_df.method == 'baseline']['accuracy'].values

print('cora\n=================')
for method in cora_methods:
    approach = cora_df[cora_df.method == method]['accuracy'].values
    # The alternative is stated explicitly: older SciPy releases default to a
    # one-sided p-value (half the two-sided one), newer releases default to
    # 'two-sided'. The hypothesis above is two-sided, and spelling it out keeps
    # the result independent of the installed SciPy version.
    _, p = stats.mannwhitneyu(benchmark, approach, alternative='two-sided')
    print(p)
    if p < bonferroni_corrected_alpha:
        print('Cora baseline vs ' + method + ' performance is significantly different.')
    else:
        print('Cora baseline vs ' + method + ' performance is not significantly different')

cora
1.5801742262206438e-64
Cora baseline vs APPNP performance is significantly different.
0.0
Cora baseline vs GATConv performance is significantly different.
0.0
Cora baseline vs GCNConv performance is significantly different.
0.0
Cora baseline vs SAGEConv performance is significantly different.
1.5801747380092857e-64
Cora baseline vs SGConv performance is significantly different.


In [89]:
# For every dataset, compare the method with the highest mean accuracy (the
# "benchmark") against every other method.
# The accuracy samples are mostly not normally distributed and the runs are
# unpaired, so an unpaired nonparametric test (Mann-Whitney U test) is used.
# H0: both samples come from the same distribution.
# p >= corrected alpha --> fail to reject H0 --> difference is not statistically significant
# p <  corrected alpha --> reject H0 --> difference is statistically significant

results=[]
alpha = 0.05
all_methods = list(results_df.method.unique())

# Bonferroni correction: the benchmark is tested against every other method,
# i.e. at most (number of methods - 1) tests per dataset.
number_of_tests_per_dataset = max(len(all_methods) - 1, 1)
bonferroni_corrected_alpha = alpha/number_of_tests_per_dataset

# Loop-invariant aggregates, computed once instead of once per dataset.
accuracy_by_group = {
    key: group.values
    for key, group in results_df.groupby(['dataset', 'method'])['accuracy']
}
mean_accuracy = results_df.groupby(['dataset', 'method'])['accuracy'].mean()

for dataset in results_df.dataset.unique():
    # Label-based lookup of the best method; on ties the first method in
    # sorted order wins.
    benchmark = mean_accuracy.loc[dataset].idxmax()
    benchmark_values = accuracy_by_group[(dataset, benchmark)]

    for method in all_methods:
        if method == benchmark:
            continue
        approach_values = accuracy_by_group.get((dataset, method))
        if approach_values is None:
            # No runs recorded for this method on this dataset: nothing to test.
            continue
        # The alternative is stated explicitly: older SciPy releases default to
        # a one-sided p-value, newer ones to 'two-sided'. The hypothesis above
        # is two-sided.
        _, p = stats.mannwhitneyu(benchmark_values, approach_values, alternative='two-sided')
        results.append({
            'dataset':dataset,
            'benchmark':benchmark,
            'method':method,
            'p_value':p,
            'is_significantly_different':bool(p < bonferroni_corrected_alpha)
        })

In [91]:
# Persist the per-dataset test results (one row per benchmark/method comparison).
pd.DataFrame(data=results).to_csv(path_or_buf='data/statistical_test_results.csv')

In [92]:
# For every dataset, compare the 'baseline' method against every other method.
# The accuracy samples are mostly not normally distributed and the runs are
# unpaired, so an unpaired nonparametric test (Mann-Whitney U test) is used.
# H0: both samples come from the same distribution.
# p >= corrected alpha --> fail to reject H0 --> difference is not statistically significant
# p <  corrected alpha --> reject H0 --> difference is statistically significant

results=[]
alpha = 0.05
benchmark = 'baseline'
methods_to_evaluate_with = [
    method for method in results_df.method.unique() if method != benchmark
]

# Bonferroni correction: one test per method compared with the benchmark.
number_of_tests_per_dataset = max(len(methods_to_evaluate_with), 1)
bonferroni_corrected_alpha = alpha/number_of_tests_per_dataset

# Split the frame once instead of re-masking it for every pair.
accuracy_by_group = {
    key: group.values
    for key, group in results_df.groupby(['dataset', 'method'])['accuracy']
}

for dataset in results_df.dataset.unique():
    benchmark_values = accuracy_by_group.get((dataset, benchmark))
    if benchmark_values is None:
        # No benchmark runs for this dataset: there is nothing to compare against.
        continue

    for method in methods_to_evaluate_with:
        approach_values = accuracy_by_group.get((dataset, method))
        if approach_values is None:
            # No runs recorded for this method on this dataset: nothing to test.
            continue
        # The alternative is stated explicitly: older SciPy releases default to
        # a one-sided p-value, newer ones to 'two-sided'. The hypothesis above
        # is two-sided.
        _, p = stats.mannwhitneyu(benchmark_values, approach_values, alternative='two-sided')
        results.append({
            'dataset':dataset,
            'benchmark':benchmark,
            'method':method,
            'p_value':p,
            'is_significantly_different':bool(p < bonferroni_corrected_alpha)
        })

In [93]:
# Persist the baseline-vs-method test results (one row per comparison).
pd.DataFrame(data=results).to_csv(path_or_buf='data/statistical_test_results_vs_baseline.csv')