# Statistical Evaluation of the LLM 4 Model Completion Paper

## Experiment 4
The last experiment aims to answer the research question "What initial insights can be gained when comparing domain-specific fine-tuning and few-shot learning?"

In [None]:
# Input files for experiment 4, relative to the notebook's working directory.

# Results of the fine-tuned models.
path_to_finetuning_curie = (
    "../final_output_experiment_4/results_completion_curie_6D20E51p0_curie.csv"
)
path_to_finetuning_ada = (
    "../final_output_experiment_4/completion_eval_ada.csv"
)

# Results of the few-shot runs.
path_to_few_shot_ada = (
    "../final_output_experiment_4/final_experiment1_fewshot_ada_syntetic.csv"
)
path_to_few_shot_curie = (
    "../final_output_experiment_4/final_experiment1_fewshot_curie_syntetic.csv"
)

First we import all libraries. Make sure to have them installed (e.g., via pip).

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

Reading the data from CSV files:

In [None]:
# The fine-tuning result files are parsed with ';' as the column separator.
data_finetuning_ada = pd.read_csv(path_to_finetuning_ada, sep=";")
data_finetuning_curie = pd.read_csv(path_to_finetuning_curie, sep=";")

# The few-shot result files are parsed with the pandas default separator (',').
data_few_shot_ada = pd.read_csv(path_to_few_shot_ada)
data_few_shot_curie = pd.read_csv(path_to_few_shot_curie)

We want to understand how correct the completions are: 
Types to test are:
- at least one correct edge but some can be missing
- completely correct completion
- correct completion but additional edges (ground truth is a subgraph of the generated completion)
- incomplete but correct completion (generated completion is a subgraph of the ground truth)

In [None]:

# Names of the correctness categories under test. These are the same strings
# the test loop uses as column names when reading the few-shot data.
# NOTE(review): the visible test loop spells these column names out directly
# and does not reference this list.
types_to_test = [
    "type_isomorphic_completion",
    "type_subgraph_isomorphic_generated_in_gt",
    "type_subgraph_isomorphic_gt_in_generated",
    "at_least_one_correct_edge"
]

We define the significance level:

In [None]:
# Significance level: a test result is reported as significant when its
# p-value is strictly below this threshold.
ALPHA = 0.05

Datasets to compare (few-shot vs. fine-tuning results, one pair per model):

In [None]:
# One entry per model: (few-shot results, fine-tuning results, display name).
# The test loop unpacks each entry in exactly this order.
datasets = [
    (data_few_shot_ada, data_finetuning_ada, "Ada"),
    (data_few_shot_curie, data_finetuning_curie, "Curie"),
]


Performing Mann-Whitney U tests:
- first we check the conditions for each type in the fine-tuning dataset
- then we collect this data by category

In [None]:
# For every model, compare the few-shot results (data1) with the fine-tuning
# results (data2) using Mann-Whitney U tests, once per correctness category and
# once per alternative hypothesis (two-sided, greater, less).
for data1, data2, name in datasets:

    # Fine-tuning data: derive one boolean series per category from the result
    # label and the correct-edge count of the best-ranked completion.
    isomorphic_data2 = data2['completion_best_rank_result'] == 'ISOMORPHIC'
    generated_in_gt_data2 = (data2['completion_best_rank_result'] == 'TOO_SMALL') | (
                data2['completion_best_rank_result'] == 'ISOMORPHIC')
    gt_in_generated_data2 = (data2['completion_best_rank_result'] == 'TOO_LARGE') | (
                data2['completion_best_rank_result'] == 'ISOMORPHIC')
    correct_edge_data2 = data2['completion_best_rank_correct_edges'] >= 1

    # Few-shot data: the per-category values are read directly from columns.
    isomorphic_data1 = data1['type_isomorphic_completion']
    generated_in_gt_data1 = data1['type_subgraph_isomorphic_generated_in_gt']
    gt_in_generated_data1 = data1['type_subgraph_isomorphic_gt_in_generated']
    correct_edge_data1 = data1['at_least_one_correct_edge']

    # (few-shot sample, fine-tuning sample, label used in the report)
    what_to_compare = [
        (isomorphic_data1, isomorphic_data2, 'Isomorphic'),
        (generated_in_gt_data1, generated_in_gt_data2, 'Generated_in_gt'),
        (gt_in_generated_data1, gt_in_generated_data2, 'Gt_in_generated'),
        (correct_edge_data1, correct_edge_data2, 'At Least One Correct Edge')
    ]

    for data1_subset, data2_subset, test_name in what_to_compare:

        # Each alternative keeps its own U statistic so that every report line
        # shows the value returned by the very test it describes (previously a
        # single variable was overwritten and the last call's U was printed for
        # all three lines).
        #
        # The few-shot sample is passed first, so 'greater' / 'less' test
        # whether few-shot values tend to be larger / smaller than the
        # fine-tuning values.
        #
        # NOTE: a NaN p-value compares as False against ALPHA and therefore
        # ends up in the "not enough evidence" branch.

        # Two-sided
        u_two_sided, p_value_two_sided = mannwhitneyu(data1_subset, data2_subset, alternative='two-sided')
        conclusion_two_sided = "likely not from the same distribution" if p_value_two_sided < ALPHA else "not enough evidence to conclude they are from different distributions"

        # One-sided (greater)
        u_greater, p_value_greater = mannwhitneyu(data1_subset, data2_subset, alternative='greater')
        conclusion_greater = "likely greater" if p_value_greater < ALPHA else "not enough evidence to conclude greater"

        # One-sided (less)
        u_less, p_value_less = mannwhitneyu(data1_subset, data2_subset, alternative='less')
        conclusion_less = "likely less" if p_value_less < ALPHA else "not enough evidence to conclude less"

        print(
            f"{test_name}, {name}:\n"
            f" Two-sided: U={u_two_sided}, p-value={p_value_two_sided}. Conclusion: {conclusion_two_sided}\n"
            f" Greater: U={u_greater}, p-value={p_value_greater}. Conclusion: {conclusion_greater}\n"
            f" Less: U={u_less}, p-value={p_value_less}. Conclusion: {conclusion_less}")

