In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon

# Read the LOC results table; each column used below holds the LOC values of one group.
file_path = "LOC_StatisticalEvaluationResults.csv"
data = pd.read_csv(file_path)

# Column names, declared once: the HumanEval baseline and the six GPT* columns
# that are compared against it.
human_eval_column = "HumanEval_LOC"
models = ["GPT3.5S_LOC", "GPT3.5I", "GPT3.5E", "GPT4S", "GPT4I", "GPT4E"]

# One Series per group; the unpacking order follows the order of `models`.
human_eval = data[human_eval_column]
gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e = (data[column] for column in models)

# Omnibus comparison: Friedman test over all seven columns at once.
stat, p_value = friedmanchisquare(
    human_eval, gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e
)
print(f"Friedman's test statistic: {stat}, p-value: {p_value}")

# `combinations` is not used in this cell (only baseline-vs-model pairs are
# tested below, not every pair); the import is kept so the cell's namespace
# stays the same.
from itertools import combinations

# Follow-up comparisons: Wilcoxon test of the HumanEval column against each
# GPT* column in turn. `stat` is reused here, so after the loop it holds the
# last Wilcoxon statistic rather than the Friedman one.
# NOTE(review): the six p-values are printed uncorrected; consider a
# multiple-comparison adjustment (e.g. Holm or Bonferroni) when interpreting.
for model in models:
    stat, p = wilcoxon(human_eval, data[model])
    print(f"Wilcoxon test ({human_eval_column} vs {model}): stat={stat}, p-value={p}")

Friedman's test statistic: 308.88266953713685, p-value: 1.0211782217894384e-63
Wilcoxon test (HumanEval_LOC vs GPT3.5S_LOC): stat=502.0, p-value=1.420628067299295e-22
Wilcoxon test (HumanEval_LOC vs GPT3.5I): stat=4588.5, p-value=0.318894564695125
Wilcoxon test (HumanEval_LOC vs GPT3.5E): stat=3651.0, p-value=0.5751421274133739
Wilcoxon test (HumanEval_LOC vs GPT4S): stat=3352.5, p-value=8.216095195734727e-05
Wilcoxon test (HumanEval_LOC vs GPT4I): stat=3055.0, p-value=0.003502404544579744
Wilcoxon test (HumanEval_LOC vs GPT4E): stat=2047.5, p-value=7.138259819360276e-07


In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon

# Read the cyclomatic-complexity results table; each column used below holds
# the values of one group.
file_path = "CC_StatisticalEvaluationResults.csv"
data = pd.read_csv(file_path)

# Column names, declared once: the HumanEval baseline and the six GPT* columns
# that are compared against it.
human_eval_column = "HumanEval_Cyclomatic Complexity"
models = ["GPT3.5S_Cyclomatic Complexity", "GPT3.5I", "GPT3.5E", "GPT4S", "GPT4I", "GPT4E"]

# One Series of cyclomatic-complexity values per group; the unpacking order
# follows the order of `models`.
human_eval = data[human_eval_column]
gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e = (data[column] for column in models)

# Omnibus comparison: Friedman test over all seven columns at once.
stat, p_value = friedmanchisquare(
    human_eval, gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e
)
print(f"Friedman's test statistic: {stat}, p-value: {p_value}")

# `combinations` is not used in this cell; the import is kept so the cell's
# namespace stays the same.
from itertools import combinations

# Follow-up comparisons: Wilcoxon test of the HumanEval column against each
# GPT* column in turn. `stat` is reused here, so after the loop it holds the
# last Wilcoxon statistic rather than the Friedman one.
# NOTE(review): the six p-values are printed uncorrected; consider a
# multiple-comparison adjustment (e.g. Holm or Bonferroni) when interpreting.
for model in models:
    stat, p = wilcoxon(human_eval, data[model])
    print(f"Wilcoxon test ({human_eval_column} vs {model}): stat={stat}, p-value={p}")

Friedman's test statistic: 19.429813880675635, p-value: 0.003496090809900352
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT3.5S_Cyclomatic Complexity): stat=1852.0, p-value=0.2552544850155303
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT3.5I): stat=2121.0, p-value=0.8007871137740373
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT3.5E): stat=1148.0, p-value=0.01378374602863343
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT4S): stat=1343.0, p-value=0.12872155602923252
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT4I): stat=1305.0, p-value=0.01973004103675469
Wilcoxon test (HumanEval_Cyclomatic Complexity vs GPT4E): stat=1875.5, p-value=0.00473501481561283


In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon

# Read the maintainability-index results table; each column used below holds
# the values of one group.
file_path = "MI_StatisticalEvaluationResults.csv"
data = pd.read_csv(file_path)

# Column names, declared once: the HumanEval baseline and the six GPT* columns
# that are compared against it.
human_eval_column = "HumanEval_Maintainability Index"
models = ["GPT3.5S_Maintainability Index", "GPT3.5I", "GPT3.5E", "GPT4S", "GPT4I", "GPT4E"]

# One Series of maintainability-index values per group; the unpacking order
# follows the order of `models`.
human_eval = data[human_eval_column]
gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e = (data[column] for column in models)

# Omnibus comparison: Friedman test over all seven columns at once.
stat, p_value = friedmanchisquare(
    human_eval, gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e
)
print(f"Friedman's test statistic: {stat}, p-value: {p_value}")

# `combinations` is not used in this cell; the import is kept so the cell's
# namespace stays the same.
from itertools import combinations

# Follow-up comparisons: Wilcoxon test of the HumanEval column against each
# GPT* column in turn. `stat` is reused here, so after the loop it holds the
# last Wilcoxon statistic rather than the Friedman one.
# NOTE(review): the six p-values are printed uncorrected; consider a
# multiple-comparison adjustment (e.g. Holm or Bonferroni) when interpreting.
for model in models:
    stat, p = wilcoxon(human_eval, data[model])
    print(f"Wilcoxon test ({human_eval_column} vs {model}): stat={stat}, p-value={p}")

Friedman's test statistic: 170.59518599562415, p-value: 3.3631924772573337e-34
Wilcoxon test (HumanEval_Maintainability Index vs GPT3.5S_Maintainability Index): stat=467.0, p-value=1.8811418083423235e-22
Wilcoxon test (HumanEval_Maintainability Index vs GPT3.5I): stat=4566.0, p-value=0.05288720486079904
Wilcoxon test (HumanEval_Maintainability Index vs GPT3.5E): stat=2932.0, p-value=3.005958655961956e-07
Wilcoxon test (HumanEval_Maintainability Index vs GPT4S): stat=2272.0, p-value=1.8473281691106275e-08
Wilcoxon test (HumanEval_Maintainability Index vs GPT4I): stat=2507.5, p-value=4.791099099169721e-06
Wilcoxon test (HumanEval_Maintainability Index vs GPT4E): stat=2308.0, p-value=8.503161857581495e-10


In [1]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon

# Read the complexity results table; each column used below holds the values
# of one group.
# NOTE(review): presumably cognitive complexity, judging by the "CogC" file
# prefix -- confirm against the data source.
file_path = "CogC_StatisticalEvaluationResults.csv"
data = pd.read_csv(file_path)

# Column names, declared once: the HumanEval baseline and the six GPT* columns
# that are compared against it.
human_eval_column = "HumanEval_Complexity"
models = ["GPT3.5S", "GPT3.5I", "GPT3.5E", "GPT4S", "GPT4I", "GPT4E"]

# One Series of complexity values per group; the unpacking order follows the
# order of `models`.
human_eval = data[human_eval_column]
gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e = (data[column] for column in models)

# Omnibus comparison: Friedman test over all seven columns at once.
stat, p_value = friedmanchisquare(
    human_eval, gpt35s, gpt35i, gpt35e, gpt4s, gpt4i, gpt4e
)
print(f"Friedman's test statistic: {stat}, p-value: {p_value}")

# `combinations` is not used in this cell; the import is kept so the cell's
# namespace stays the same.
from itertools import combinations

# Follow-up comparisons: Wilcoxon test of the HumanEval column against each
# GPT* column in turn. `stat` is reused here, so after the loop it holds the
# last Wilcoxon statistic rather than the Friedman one.
# NOTE(review): the six p-values are printed uncorrected; consider a
# multiple-comparison adjustment (e.g. Holm or Bonferroni) when interpreting.
for model in models:
    stat, p = wilcoxon(human_eval, data[model])
    print(f"Wilcoxon test ({human_eval_column} vs {model}): stat={stat}, p-value={p}")

Friedman's test statistic: 108.95971074380202, p-value: 3.3657445768802907e-21
Wilcoxon test (HumanEval_Complexity vs GPT3.5S): stat=2247.0, p-value=0.3367755904310903
Wilcoxon test (HumanEval_Complexity vs GPT3.5I): stat=2255.5, p-value=0.5438883799048799
Wilcoxon test (HumanEval_Complexity vs GPT3.5E): stat=651.0, p-value=7.051189198915836e-08
Wilcoxon test (HumanEval_Complexity vs GPT4S): stat=1675.5, p-value=0.016327949435260492
Wilcoxon test (HumanEval_Complexity vs GPT4I): stat=1028.5, p-value=7.954093372341784e-06
Wilcoxon test (HumanEval_Complexity vs GPT4E): stat=961.0, p-value=1.0125017061754096e-09
