In [1]:
from datasets import load_from_disk
from openai import OpenAI
from datasets import DatasetDict, Dataset

import sys
sys.path.append('..')

from src.data_processing.construct_messages import ALPACA_INTROMESSAGE_INPUT

  from .autonotebook import tqdm as notebook_tqdm


# Evaluation Scripts

| Task      | Metric |
|-----------|----------------------|
| Classification | Accuracy |
| Classification | F1 Score |
| Classification | Missing Ratio |
| Classification | Matthews Correlation Coefficient (MCC) |
| Sequential Labeling | F1 score |
| Sequential Labeling | Label F1 score |
| Relation Extraction | Precision |
| Relation Extraction | Recall |
| Relation Extraction | F1 score |
| Extractive and Abstractive Summarization | Rouge-N |
| Extractive and Abstractive Summarization | Rouge-L |
| Question Answering | EmAcc (exact-match accuracy) |

| Data      | Task |
|-------------|-----------------|
| FPB | sentiment analysis |
| FiQA-SA | sentiment analysis |
| Headline | news headline classification |
| NER | named entity recognition |
| FinQA | question answering |
| ConvFinQA | question answering |

In [2]:
import os

# Read the API key from the environment rather than embedding it in the
# notebook. A key that has been saved in a notebook must be treated as leaked
# and revoked. A missing variable raises KeyError here, before any request.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_prediction(prompt, model="gpt-3.5-turbo", temperature=0, max_tokens=2048):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text used as the ``content`` of one ``"user"`` message.
        model: Model name forwarded to the chat completions call. Defaults to
            the value that used to be hard-coded.
        temperature: Sampling temperature forwarded to the call.
        max_tokens: Completion token limit forwarded to the call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def get_prediction_on_message(message, model="gpt-3.5-turbo", temperature=0, max_tokens=2048):
    """Send an already-built message sequence and return the reply text.

    Args:
        message: Value forwarded unchanged as the ``messages`` argument of the
            chat completions call.
        model: Model name forwarded to the call. Defaults to the value that
            used to be hard-coded.
        temperature: Sampling temperature forwarded to the call.
        max_tokens: Completion token limit forwarded to the call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        messages=message,
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def get_prompt(instruction, input):
    """Fill the ``{instruction}`` and ``{input}`` slots of the Alpaca template.

    Both placeholders are substituted in a single pass over
    ``ALPACA_INTROMESSAGE_INPUT``. The previous chained ``str.replace`` calls
    ran the ``{input}`` replacement over the already-inserted instruction
    text, so a literal ``{input}`` inside ``instruction`` was rewritten too.

    Args:
        instruction: Text inserted wherever the template has ``{instruction}``.
        input: Text inserted wherever the template has ``{input}``.

    Returns:
        The template with every placeholder occurrence replaced.
    """
    import re

    slot_values = {'{instruction}': instruction, '{input}': input}
    # A callable replacement inserts the value verbatim, with no backslash or
    # group-reference processing applied to user-supplied text.
    return re.sub(
        r'\{instruction\}|\{input\}',
        lambda match: slot_values[match.group(0)],
        ALPACA_INTROMESSAGE_INPUT,
    )

## Final ConvFinQA

In [3]:
from src.evaluation.evaluator_convfinqa import ConvFinQaEvaluator
from src.data_processing.construct_messages import add_prediction_messages
from datasets import DatasetDict, Dataset

In [6]:
## add additional information to the message

def get_predictions(path, task=None, n_samples=30):
    """Load a saved dataset and attach model predictions to its test split.

    Args:
        path: Location passed to ``load_from_disk``; the loaded object must
            have a ``'test'`` split.
        task: Optional text appended directly (no separator is inserted) to
            each row's ``instruction`` before messages are built.
        n_samples: Number of leading rows to send to the model. Defaults to
            30, the value that used to be hard-coded. ``None`` keeps every row.

    Returns:
        A ``DatasetDict`` whose ``'test'`` entry holds the selected rows plus
        a ``prediction`` column.
    """
    test_split = load_from_disk(path)['test']
    if task:
        test_split = test_split.map(
            lambda row: {'instruction': row['instruction'] + task}
        )

    with_messages = test_split.map(add_prediction_messages)

    sample_df = with_messages.to_pandas()
    if n_samples is not None:
        sample_df = sample_df.head(n_samples)

    # One model call per row. ``assign`` returns a new frame rather than
    # writing into the result of ``head``.
    sample_df = sample_df.assign(
        prediction=sample_df.messages.apply(get_prediction_on_message)
    )

    predictions = DatasetDict()
    predictions['test'] = Dataset.from_pandas(sample_df)
    return predictions


In [6]:
# Dutch answer-format hint appended to every ConvFinQA instruction.
# NOTE(review): get_predictions concatenates this straight onto the
# instruction, and unlike the NER hint this string has no leading space, so
# it is glued to the last character of the instruction - confirm intended.
task =  '- antwoord op de vraag door juist te antwoorden met het getal, niets anders'
 
# Build predictions for the ConvFinQA test split with the hint applied.
eval_convfinqa = get_predictions('../data/final_filtered/fingpt-convfinqa', task)

Map: 100%|██████████| 1453/1453 [00:00<00:00, 1480.09 examples/s]


In [7]:
# Score the ConvFinQA predictions produced above.
# NOTE(review): the predictions come from get_prediction_on_message, which
# queries "gpt-3.5-turbo", while the evaluator is built with
# 'CohereForAI/aya-101' - confirm what this argument is used for.
# NOTE(review): _evaluate is underscore-prefixed; check whether the evaluator
# offers a public entry point.

eval_convfinqa_score = ConvFinQaEvaluator('CohereForAI/aya-101')._evaluate(eval_convfinqa['test'])

In [9]:
# Show the ConvFinQA metrics computed on the predicted rows.
eval_convfinqa_score.metrics

[Metric(name='Accuracy', value=0.26666666666666666)]

## Final Sentiment

In [10]:
from src.evaluation.evaluator_sentiment import SentimentEvaluator

In [11]:
# Build predictions for the sentiment test split. No task hint is passed, so
# the stored instructions are used unchanged.
eval_sentiment = get_predictions('../data/final_filtered/fingpt-sentiment')

Map: 100%|██████████| 5788/5788 [00:00<00:00, 21870.78 examples/s]


In [13]:
# Score the sentiment predictions produced above.
# NOTE(review): the evaluator is built with 'CohereForAI/aya-101' although the
# predictions were requested from "gpt-3.5-turbo" - confirm the argument's role.

eval_sentiment_score = SentimentEvaluator('CohereForAI/aya-101')._evaluate(eval_sentiment['test'])

In [17]:
# Show the sentiment metrics computed on the predicted rows.
eval_sentiment_score.metrics

[Metric(name='acc', value=0.5333333333333333),
 Metric(name='f1_macro', value=0.5438095238095239),
 Metric(name='f1_micro', value=0.5333333333333333),
 Metric(name='f1_weighted', value=0.5251428571428571)]

## Final Headline

In [18]:
from src.evaluation.evaluator_headline import HeadlineEvaluator

In [19]:
# Build predictions for the headline test split. No task hint is passed, so
# the stored instructions are used unchanged.
eval_headline = get_predictions('../data/final_filtered/fingpt-headline')

Map: 100%|██████████| 9094/9094 [00:00<00:00, 23099.05 examples/s]


In [20]:
# Score the headline predictions produced above.
# NOTE(review): the evaluator is built with 'CohereForAI/aya-101' although the
# predictions were requested from "gpt-3.5-turbo" - confirm the argument's role.
eval_headline_score = HeadlineEvaluator('CohereForAI/aya-101')._evaluate(eval_headline['test'])

In [22]:
# Show the headline metrics computed on the predicted rows.
eval_headline_score.metrics

[Metric(name='Acc', value=0.5),
 Metric(name='F1 binary', value=0.4827586206896552)]

## Final NER

In [7]:
from src.evaluation.evaluator_ner import NEREvaluator

In [19]:
# Dutch output-format hint appended to every NER instruction; it asks for
# answers shaped like "[] is een [], [] is een [], ...".
# NOTE(review): "anwoorden" looks like a typo for "antwoorden". It is left
# untouched because it is part of the prompt sent to the model.
task = ' - antwoord op de vraag door te anwoorden in het volgende formaat: [] is een [], [] is een [], ...'

# Build predictions for the NER test split with the hint applied.
eval_ner = get_predictions('../data/final_filtered/fingpt-ner', task)

In [22]:
# more tweaking needed
def _strip_ner_markup(example):
    """Return the row's prediction with bracket, bullet and newline text removed.

    The fragments are removed one after another in the listed order, then
    surrounding whitespace is trimmed.
    """
    cleaned = example['prediction']
    for fragment in ('[', ']', '- ', '\n'):
        cleaned = cleaned.replace(fragment, '')
    return {'prediction': cleaned.strip()}


# Overwrite the test split with the cleaned predictions.
eval_ner['test'] = eval_ner['test'].map(_strip_ner_markup)

Map: 100%|██████████| 30/30 [00:00<00:00, 6018.80 examples/s]


In [24]:
# Score the cleaned NER predictions.
# NOTE(review): the evaluator is built with 'CohereForAI/aya-101' although the
# predictions were requested from "gpt-3.5-turbo" - confirm the argument's role.
eval_ner_score = NEREvaluator('CohereForAI/aya-101')._evaluate(eval_ner['test'])

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Show the NER metrics computed on the predicted rows.
eval_ner_score.metrics

[Metric(name='Accuracy', value=0.9228843264897346),
 Metric(name='F1', value=0.18987341772151897),
 Metric(name='Classification Report', value='              precision    recall  f1-score   support\n\n         LOC       0.20      0.20      0.20        10\n           O       0.00      0.00      0.00         0\n         ORG       0.00      0.00      0.00         5\n         PER       0.59      0.20      0.30        66\n\n   micro avg       0.19      0.19      0.19        81\n   macro avg       0.20      0.10      0.12        81\nweighted avg       0.51      0.19      0.27        81\n')]