# Statistical Evaluation of the LLM 4 Model Completion Paper

## Experiment 1
The first experiment aims at answering the research question "To what extent can pre-trained language models and few-shot learning be used for the completion of software models?"

In [None]:
# Folder that contains all model-completion datasets (relative to this notebook).
_DATASET_ROOT = './../../model_completion_dataset'

# Alternative result files -- assign one of them to PATH_TO_RESULTS to evaluate another dataset:
#   f'{_DATASET_ROOT}/revision/results/results_iso_check.csv'
#   f'{_DATASET_ROOT}/SMO/results/few_shot_samples/stats_combined_new.csv'
PATH_TO_RESULTS = f'{_DATASET_ROOT}/Synthetic/results/results_iso_check.csv'
# Result file of the retrieval-comparison run (loaded in Experiment 1 - Part 4).
PATH_TO_RESULTS_RANDOM = f'{_DATASET_ROOT}/SMO/results/few_shot_samples/experiment_retrieval_comparison/stats.csv'

First we import all libraries. Make sure to have them installed (e.g., via pip).

In [None]:
import pandas as pd
from matplotlib import pyplot as plt # for plotting
from scipy import stats # for statistics

Next, we load the results file:

In [None]:
# Read the evaluation results selected via PATH_TO_RESULTS into a DataFrame.
results_df = pd.read_csv(filepath_or_buffer=PATH_TO_RESULTS)
# Preview the first rows; as the last expression this is rendered as the cell output.
results_df.head(n=5)

For some datasets, we have chosen a subset of samples for model completion and manual evaluation. We select only these:

In [None]:
# Restrict the evaluation to the selected subset, if the dataset marks one.
# The flag is matched both as boolean True and as the string 'yes': the comparison
# results loaded later in this notebook encode the same column as 'yes', and matching
# only one encoding would silently produce an empty frame for the other.
if 'in_sample' in results_df.columns:
    in_sample_flag = results_df['in_sample']
    results_df = results_df[(in_sample_flag == True) | (in_sample_flag == 'yes')]

### Experiment 1 - Part 1
As a first step, we want to understand how correct the completions are:

We now want to know how many completions are
 - correct according to the serialization format
 - structurally correct (the generated edge and the ground-truth edge have the same source and target node)
 - change type correct (structurally correct and the nodes and edges have the correct change type, i.e., preserved, removed, added, or change attribute)
 - type correct (i.e., change type correct and the correct reference type and correct classes of source and target nodes)
 - semantically correct (i.e., the meaning of the completion is the same as for the ground truth)
 
  

In [None]:
# Count, for every automatically checked correctness level, the rows whose flag is True.
total_count = results_df.shape[0]
correct_format = results_df.loc[results_df['correct_format'] == True].shape[0]
correct_structure = results_df.loc[results_df['structural_isomorphic_completion'] == True].shape[0]
correct_change_structure = results_df.loc[results_df['change_type_isomorphic_completion'] == True].shape[0]
correct_type = results_df.loc[results_df['type_isomorphic_completion'] == True].shape[0]

result_dict = dict(
    total_count=total_count,
    correct_format=correct_format,
    correct_structure=correct_structure,
    correct_change_structure=correct_change_structure,
    correct_type=correct_type,
)

# Semantic correctness is only available for some datasets (it requires a lot of manual
# effort and does not make sense for the synthetic dataset), so the column is optional.
# A score of at least 1.0 is counted as conceivable, a score of exactly 2.0 as correct.
if 'correctness' in results_df.columns:
    correctness_scores = results_df['correctness']
    conceivable_semantic = results_df.loc[correctness_scores >= 1.0].shape[0]
    correct_semantic = results_df.loc[correctness_scores == 2.0].shape[0]
    result_dict.update(
        conceivable_semantic=conceivable_semantic,
        correct_semantic=correct_semantic,
    )

# One row per property: absolute count and share of all evaluated completions.
correctness_eval_df = pd.DataFrame({
    'property': list(result_dict.keys()),
    'count': list(result_dict.values()),
})
correctness_eval_df['relative'] = correctness_eval_df['count'] / total_count

correctness_eval_df

### Experiment 1 - Part 2
As a next step, we want to understand the relationship between the number of examples provided and the correctness.

In [None]:
# Pearson correlation between the number of few-shot samples and the correctness score.
# The result is bound to a name and repeated as the last top-level expression, because
# an expression nested inside the `if` block is evaluated but never displayed by Jupyter.
few_shot_correlation = None
if 'correctness' in results_df.columns:
    few_shot_correlation = results_df[['few_shot_count', 'correctness']].corr(method='pearson')
few_shot_correlation

We can also check whether the number of few-shot samples for the correct completions is above average.

In [None]:
# Compare the few-shot counts of all completions with those of the correct completions.
original_distribution = results_df['few_shot_count']
if 'correctness' in results_df.columns:
    few_shot_count_correct = results_df[results_df['correctness'] == 2.0]['few_shot_count']
    # NOTE(review): with alternative="greater" the alternative hypothesis is that the
    # overall distribution is stochastically greater than the one of the correct
    # completions -- confirm that this direction matches the intended question.
    print(stats.mannwhitneyu(original_distribution, few_shot_count_correct, alternative="greater"))
few_shot_count_type_correct = results_df[results_df['type_isomorphic_completion'] == True]['few_shot_count']

# Draw both histograms on identical bins: the bin edges computed for the overall
# distribution are reused for the type-correct subset, so the bars are directly comparable.
overall_hist = plt.hist(original_distribution, bins=12, color='red', label='all completions')
bin_edges = overall_hist[1]
plt.hist(few_shot_count_type_correct, bins=bin_edges, color='green', label='type-correct completions')
plt.xlabel('number of few-shot samples')
plt.ylabel('number of completions')
plt.legend()
plt.show()

print(stats.mannwhitneyu(original_distribution, few_shot_count_type_correct, alternative="greater"))

For other SE tasks (e.g., code summarization; see Ahmed, Toufique, and Premkumar Devanbu. "Few-shot training LLMs for project-specific code-summarization." Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering. 2022), it has been found that 1-shot learning does not perform well. We therefore compare the 1-shot learning results to the overall results.

In [None]:
# Compare the completions generated with exactly one few-shot sample to the other completions.
total = len(results_df)
one_shot = results_df[results_df['few_shot_count'] == 1]
one_shot_total = len(one_shot)

if one_shot_total == 0:
    # Without 1-shot rows the shares below would divide by zero and
    # stats.binomtest would be called with n=0, so there is nothing to compare.
    print("No 1-shot samples in the results; skipping the 1-shot comparison.")
else:
    if 'correctness' in results_df.columns:
        correct_1_shot_count = len(one_shot[one_shot['correctness'] == 2])
        correct_all_count = len(results_df[results_df['correctness'] == 2])

        # Despite its name, this holds the scores of the completions with MORE than one
        # few-shot sample, so the test compares two disjoint groups.
        correctness_all = results_df[results_df['few_shot_count'] > 1]['correctness']

        print(f"Mean correctness for 1-shot: {correct_1_shot_count/one_shot_total}\n" +
              f"Mean correctness all: {correct_all_count/total}")

        # Alternative hypothesis: 1-shot scores are stochastically smaller than the others.
        print(stats.mannwhitneyu(one_shot['correctness'], correctness_all, alternative='less'))

    # Share of type-correct completions over all rows (named so the builtin `all` is not shadowed).
    type_correct_all = results_df['type_isomorphic_completion']
    p_correct_type = len(type_correct_all[type_correct_all == True]) / len(type_correct_all)

    # Share of type-correct completions among the 1-shot rows.
    correctness_1_shot_type = one_shot['type_isomorphic_completion']
    type_correct_1_shot_count = len(correctness_1_shot_type[correctness_1_shot_type == True])
    p_1_shot = type_correct_1_shot_count / one_shot_total

    print(f"Type correct all: {p_correct_type}. Type correct 1_shot: {p_1_shot}")

    # Is the 1-shot success rate below the overall type-correctness rate?
    print(stats.binomtest(type_correct_1_shot_count, one_shot_total, p=p_correct_type, alternative='less'))


### Experiment 1 - Part 3
Now, we want to understand how the correctness depends on the presence of a similar example. The presence of a similar example is recorded manually only.

In [None]:
# Both columns are only present for manually annotated datasets; the tests are skipped
# when they are missing instead of failing with a KeyError.
similar_example_test = None
if {'correctness', 'similar_few_shot'}.issubset(results_df.columns):
    has_similar_example = results_df['similar_few_shot'] == True

    all_correct = results_df[results_df['correctness'] == 2]
    similar_correct = all_correct[all_correct['similar_few_shot'] == True]

    all_correct_count = len(all_correct)
    all_count = len(results_df)
    similar_correct_count = len(similar_correct)
    all_similar_count = len(results_df[has_similar_example])

    if all_similar_count > 0:
        # Alternative hypothesis: the overall correctness scores are stochastically smaller
        # than the scores of the completions that had a similar few-shot example.
        print(stats.mannwhitneyu(results_df['correctness'],
                                 results_df[has_similar_example]['correctness'],
                                 alternative='less'))

        # Is the share of correct completions among those with a similar example
        # greater than the overall share of correct completions?
        similar_example_test = stats.binomtest(similar_correct_count, all_similar_count,
                                               all_correct_count/all_count, alternative='greater')
    else:
        print("No completion with a similar few-shot example; skipping the tests.")
else:
    print("Columns 'correctness' and/or 'similar_few_shot' are missing; skipping the tests.")

# Last top-level expression, so the binomial test result (if any) is shown as cell output.
similar_example_test


### Experiment 1 - Part 4
Last but not least, we compare the results of random retrieval vs. semantic retrieval.

In [None]:
# Load the results of the retrieval-comparison run.
results_df_random = pd.read_csv(PATH_TO_RESULTS_RANDOM)
# Keep only the selected subset, if the file marks one. The flag is matched as the
# string 'yes' and as boolean True, because the main results in this notebook are
# filtered on True; matching only one encoding would silently yield an empty frame.
if 'in_sample' in results_df_random.columns:
    in_sample_flag_random = results_df_random['in_sample']
    results_df_random = results_df_random[(in_sample_flag_random == 'yes') | (in_sample_flag_random == True)]
results_df_random.head()

In [None]:
def _count_true(flags):
    """Return how many entries of the Series ``flags`` compare equal to True."""
    return len(flags[flags == True])


# Reference shares taken from the main results (results_df).
correct_format = results_df['correct_format']
p_correct_format = _count_true(correct_format)/len(correct_format)

correct_structure = results_df['structural_isomorphic_completion']
correct_change_structure = results_df['change_type_isomorphic_completion']
correct_type = results_df['type_isomorphic_completion']
p_correct_type = _count_true(correct_type)/len(correct_type)

# Flags of the comparison run (results_df_random). The structure and change-structure
# series are not used by the tests below; they are kept as they were defined here.
correct_format_random = results_df_random['correct_format']
correct_structure_random = results_df_random['structural_isomorphic_completion']
correct_change_structure_random = results_df_random['change_type_isomorphic_completion']
correct_type_random = results_df_random['type_isomorphic_completion']

# The manual 'correctness' score only exists for some datasets; without the guard
# this cell fails with a KeyError for datasets that were not annotated.
if 'correctness' in results_df.columns:
    p_correct = len(results_df[results_df['correctness'] == 2])/len(results_df)
    # NOTE(review): this compares the number of TYPE-correct completions of the comparison
    # run against the share of SEMANTICALLY correct completions -- confirm this is intended.
    print(stats.binomtest(_count_true(correct_type_random), len(correct_type_random), p=p_correct, alternative='less'))

# Are type correctness and format correctness of the comparison run below the reference shares?
print(stats.binomtest(_count_true(correct_type_random), len(correct_type_random), p=p_correct_type, alternative='less'))
print(stats.binomtest(_count_true(correct_format_random), len(correct_format_random), p=p_correct_format, alternative='less'))