# WebNLG Evaluation

#### Imports

In [None]:
!pip3 install pandas
!pip3 install bert-score

!pip3 install nltk
import nltk
# nltk.download('wordnet')

# !pip3 install -r ../Evaluation/rouge/requirements.txt
!pip3 install rouge-score
!pip3 install requests

In [None]:
import json
import pandas as pd
from Import import create_dataframe
from Export import *
from Evaluation import *
from Models import *

## Evaluation of Separate Predictions

In [None]:
# Base name (without extension) of the predictions CSV that is evaluated below.
file_name = "llama-7b_zero-shot-chat_2023-06-17_19-0"
# WebNLG test-set JSON; in the visible cells it is only used by the optional lexicalisation cell.
webnlg_file_path = "./webnlg-dataset/release_v3.0/en/json/test/v3.0_test_set_with_refs.json"
# Size handed to create_dataframe in the optional lexicalisation cell.
data_set_size = 1779
# Model whose prediction / execution-time columns are evaluated.
model = ModelType.LLAMA
# NOTE(review): not referenced by any visible cell — presumably kept as run metadata; confirm.
prompt_type = "zero-shot-chat"

# Load the stored predictions of the selected run.
predictions_df = pd.read_csv(f"Results/csv/Predictions/Test/{file_name}.csv")

The next cell is only needed if the lexicalisations are not already stored in `predictions_df`, which can happen if the predictions were generated from the test set without references.  
Otherwise, skip the cell.

In [None]:
# Optional: uncomment to load the WebNLG file and copy its lexicalisations into predictions_df.
# predictions_size = data_set_size
# webnlg_df = create_dataframe(webnlg_file_path, data_set_size, predictions_size, False)
# predictions_df["lexicalisations"] = webnlg_df["lexicalisations"]
# predictions_df.head()

In [None]:
def create_evaluation_df(predictions_df: pd.DataFrame, model: ModelType, clean: bool) -> pd.DataFrame:
    ''' Builds a new evaluation dataframe from the given predictions_df.

    Copies the identifying columns, derives the references from the stored
    lexicalisations, optionally cleans the predictions and scores every
    prediction against its references with BLEU and METEOR.

    Args:
        predictions_df: Dataframe with the columns 'id', 'category', 'triplesetsize',
            'modifiedtripleset', 'lexicalisations', 'prediction_<model.value>' and
            'execution_time_<model.value>'.
        model: Model whose predictions are evaluated. Its value is used as the
            suffix of every model-specific column name.
        clean: If True, each prediction is passed through clean_response before
            it is stored and scored; otherwise predictions are used as-is.

    Returns:
        A new dataframe with the columns 'id', 'category', 'tripleset_size',
        'tripleset', 'lexicalisations', 'prediction_<model.value>',
        'bleu_nltk_<model.value>', 'meteor_<model.value>' and
        'execution_time_<model.value>'.

    Raises:
        KeyError: If predictions_df lacks one of the required columns.
    '''
    # Resolve the column suffix once and use it everywhere. Formatting the enum
    # member itself (f'{model}') can render as 'ModelType.X' instead of the value,
    # depending on how ModelType is declared and on the Python version, which made
    # the score lookups miss the 'prediction_<value>' column created below.
    model_name = model.value
    prediction_column = f'prediction_{model_name}'

    evaluation_df = pd.DataFrame()
    # Add id to the evaluation dataframe
    evaluation_df['id'] = predictions_df['id']

    # Add category to the evaluation dataframe
    evaluation_df['category'] = predictions_df['category']

    # Add triple set size to the evaluation dataframe
    evaluation_df['tripleset_size'] = predictions_df['triplesetsize']

    # Add modified tripleset to the evaluation dataframe
    evaluation_df['tripleset'] = predictions_df['modifiedtripleset']

    # Add lexicalisations to the evaluation dataframe
    # NOTE(review): the meaning of the trailing False flag is defined by the helper — confirm in Evaluation.
    evaluation_df['lexicalisations'] = [
        get_lexicalisation_of_references(lexicalisation, False)
        for lexicalisation in predictions_df['lexicalisations']
    ]

    # Add predictions to the evaluation dataframe
    if clean:
        evaluation_df[prediction_column] = [
            clean_response(prediction) for prediction in predictions_df[prediction_column]
        ]
    else:
        evaluation_df[prediction_column] = predictions_df[prediction_column]

    # Add BLEU scores to the evaluation dataframe
    evaluation_df[f'bleu_nltk_{model_name}'] = [
        get_bleu_score_for_prediction(prediction, references, False)
        for prediction, references in zip(evaluation_df[prediction_column], evaluation_df['lexicalisations'])
    ]

    # Add METEOR scores to the evaluation dataframe
    evaluation_df[f'meteor_{model_name}'] = [
        get_meteor_score_for_prediction(prediction, references, False)
        for prediction, references in zip(evaluation_df[prediction_column], evaluation_df['lexicalisations'])
    ]

    # Add Bert scores to the evaluation dataframe
    # all_bert_scores = [get_all_bert_scores_for_prediction(x[0], x[1], False) for x in zip(evaluation_df[prediction_column], evaluation_df['lexicalisations'])]
    # evaluation_df[f'bert_precision_{model_name}'] = [x[0] for x in all_bert_scores]
    # evaluation_df[f'bert_recall_{model_name}'] = [x[1] for x in all_bert_scores]
    # evaluation_df[f'bert_f1_{model_name}'] = [x[2] for x in all_bert_scores]

    # Add execution time to the evaluation dataframe
    evaluation_df[f'execution_time_{model_name}'] = predictions_df[f'execution_time_{model_name}']

    return evaluation_df

# Evaluate the loaded predictions; each prediction is cleaned before scoring.
evaluation_df = create_evaluation_df(predictions_df, model, clean=True)

In [None]:
# Preview the first rows of the evaluation results.
evaluation_df.head()

#### Export

In [None]:
# Export the evaluation results; the "_cleaned" suffix matches clean=True in the evaluation call above.
# NOTE(review): the exact output path/extension is decided by export_dataframe_to_csv — confirm in Export.
export_dataframe_to_csv(evaluation_df, "Results/csv/Evaluations/Test", f"{file_name}_cleaned")

## Evaluation of Overall Predictions

### Setup for Official Evaluation Script

#### Transform Hypothesis to .txt

In [None]:
# Select the run whose predictions are exported as hypothesis text.
# Note: this rebinds file_name and predictions_df from the per-sample evaluation section.
file_name = "copy-baseline_2023-06-18_14-47"
# Column of predictions_df that holds the generated texts of this run.
prediction_column = "prediction_copy-baseline"
predictions_df = pd.read_csv(f"Results/csv/Predictions/Test/{file_name}.csv")

In [None]:
# Sanity check: show what clean_response returns for a raw output that runs on into a follow-up "USER:" prompt.
print(clean_response("Output text: Mermaid (Train song) was followed by Imagine (John Lennon song) which was produced by Espionage (production team) and written by Pat Monahan. USER: Input triples: [{'object': 'Mike_Pence', 'property': '"))

In [None]:
# Pass every stored prediction through clean_response and write the results back to the same column.
predictions_df[prediction_column] = list(map(clean_response, predictions_df[prediction_column]))

In [None]:
# Hand the prediction column to the exporter, using the run's file name as the output name.
export_predictions_to_file(
    predictions_df,
    "Results/txt/Predictions/Test",
    file_name,
    prediction_column,
)

To run the overall evaluation, use the [official WebNLG evaluation script](https://github.com/WebNLG/GenerationEval) on the exported hypothesis `.txt` file.