In [205]:
from openai import OpenAI
from datasets import DatasetDict, Dataset, load_from_disk
import pandas as pd 
from tqdm import tqdm
import sys
sys.path.append('..')

from src.evaluation.answer_extractor import extracted_answers, extracted_answers_ner, extracted_answers_finred, extracted_answers_convfinqa

# Evaluation

In [206]:
# models
# Folder where the per-task extraction CSVs are written and read back.
data_folder = '/home/sandernoels/fingeit/data/final/responses/final/'

# Never hardcode credentials in a notebook: with no `api_key` argument the
# OpenAI client picks the key up from the OPENAI_API_KEY environment variable.
client = OpenAI()

# Short model name -> text file holding that model's raw responses.
models = {
    'fingeit' : '/home/sandernoels/fingeit/data/final/responses/nl/FinGEITje-sft_responses_b72dde7c-ead1-4741-93ee-e089b49809d4.txt',
    'geitje-ultra' : '/home/sandernoels/fingeit/data/final/responses/nl/GEITje-7B-ultra_responses_3a614c9f-de6f-44d8-b4a8-5debf1ea61c6.txt',
    # NOTE(review): 'geitje' points at exactly the same GEITje-7B-ultra file as
    # 'geitje-ultra' -- this looks like a copy-paste slip (the saved result
    # tables report different scores for the two). Confirm the intended file.
    'geitje' : '/home/sandernoels/fingeit/data/final/responses/nl/GEITje-7B-ultra_responses_3a614c9f-de6f-44d8-b4a8-5debf1ea61c6.txt',
    'fingpt-llama' : '/home/sandernoels/fingeit/data/final/responses/nl/fingpt_llama2_responses_897d1bc3-9a03-4a94-abb9-235f9ad150ec.txt',
    'pixiu' : '/home/sandernoels/fingeit/data/final/responses/nl/pixiu_responses_1ed952c6-112e-47b1-8491-e2cc7913d21a.txt',
}

# Test split of the sampled evaluation set, as a pandas DataFrame.
eval_df = load_from_disk('/home/sandernoels/fingeit/data/final/sampled_eval_df_ext')['test'].to_pandas()

In [207]:
def read_lines(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Each line
        is stripped of leading/trailing whitespace (including the newline), so
        blank lines are returned as empty strings.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

In [208]:
# Attach each model's raw responses as a `prediction_<model>_raw` column.
# pandas rejects the assignment unless the file has one line per row of `eval_df`.
for model_name, responses_path in models.items():
    raw_column = f'prediction_{model_name}_raw'
    eval_df[raw_column] = read_lines(responses_path)

## Sentiment

### Extraction

In [209]:
task = 'fingpt-sentiment'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
sentiment_base = eval_df[eval_df['task'] == task].copy()

In [210]:
# For every model, expose its raw responses under the column name 'prediction',
# run the answer extractor on that view, and store the result as `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = sentiment_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    sentiment_base[f'prediction_{model}'] = extracted_answers(raw_as_prediction, client)

In [112]:
# Persist the extraction results as a ';'-separated CSV (row index omitted);
# the evaluation section reloads this file.
extraction_csv = f'{data_folder}{task}.csv'
sentiment_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [211]:
from src.evaluation.evaluator_sentiment import SentimentEvaluator

In [212]:
# Reload the saved extraction results for the current task (';'-separated CSV).
results_csv = f'{data_folder}{task}.csv'
sentiment = pd.read_csv(results_csv, sep=';')

In [213]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [el for el in sentiment.columns if el.startswith('prediction')]

# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Present the column under test as 'prediction' (presumably the column the
    # evaluator reads -- confirm in SentimentEvaluator). `rename` returns a new
    # frame, so `sentiment` itself is left untouched between iterations.
    df = sentiment.rename(columns={col : 'prediction'})
    test_ds = Dataset.from_pandas(df)

    evals[col] = SentimentEvaluator()._evaluate(test_ds).metrics


In [214]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'acc' first.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit,0.79,0.783196,0.79,0.790308
prediction_fingeit_raw,0.79,0.783196,0.79,0.790308
prediction_gpt-3.5-turbo,0.752,0.724415,0.752,0.740101
prediction_gpt-3.5-turbo_raw,0.742,0.713072,0.742,0.729132
prediction_geitje-ultra,0.674,0.638646,0.674,0.661714
prediction_pixiu,0.632,0.640086,0.632,0.644571
prediction_geitje-ultra_raw,0.564,0.529673,0.564,0.555212
prediction_geitje,0.54,0.497929,0.54,0.520434
prediction_geitje_raw,0.454,0.448226,0.454,0.466206
prediction_fingpt-llama,0.35,0.32094,0.35,0.276341


## Headline

### Extraction

In [215]:
task = 'fingpt-headline'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
headline_base = eval_df[eval_df['task'] == task].copy()

In [216]:
# For every model, expose its raw responses under the column name 'prediction',
# run the answer extractor on that view, and store the result as `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = headline_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    headline_base[f'prediction_{model}'] = extracted_answers(raw_as_prediction, client)

In [None]:
# Persist the extraction results as a ';'-separated CSV (row index omitted);
# the evaluation section reloads this file.
extraction_csv = f'{data_folder}{task}.csv'
headline_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [217]:
from src.evaluation.evaluator_headline import HeadlineEvaluator

In [218]:
# Reload the saved extraction results for the current task (';'-separated CSV).
results_csv = f'{data_folder}{task}.csv'
headline = pd.read_csv(results_csv, sep=';')

In [220]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in headline.columns if name.startswith('prediction')]
# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Missing values become empty strings, then the column under test is
    # presented to the evaluator under the name 'prediction'.
    scored_df = headline.fillna('').rename(columns={col : 'prediction'})
    test_split = Dataset.from_pandas(scored_df)

    evals[col] = HeadlineEvaluator()._evaluate(test_split).metrics


In [221]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'Acc' first.
df = pd.DataFrame(metrics_dict).transpose()

df.sort_values('Acc', ascending=False)

Unnamed: 0,Acc,F1 binary
prediction_fingeit,0.92,0.836066
prediction_fingeit_raw,0.92,0.836066
prediction_fingpt-llama,0.696,0.0
prediction_pixiu,0.67,0.459016
prediction_gpt-3.5-turbo,0.64,0.485714
prediction_gpt-3.5-turbo_raw,0.606,0.466125
prediction_geitje_raw,0.314,0.215103
prediction_geitje,0.298,0.166271
prediction_geitje-ultra_raw,0.082,0.068966
prediction_geitje-ultra,0.064,0.025


## NER

### Extraction

In [222]:
task = 'fingpt-ner'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
ner_base = eval_df[eval_df['task'] == task].copy()

In [223]:
# For every model, expose its raw responses under the column name 'prediction',
# run the NER answer extractor on that view, and store the result as
# `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = ner_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    ner_base[f'prediction_{model}'] = extracted_answers_ner(raw_as_prediction, client)

In [185]:
# Persist the extraction results as a ';'-separated CSV (row index omitted);
# the evaluation section reloads this file.
extraction_csv = f'{data_folder}{task}.csv'
ner_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [224]:
from src.evaluation.evaluator_ner import NEREvaluator

In [225]:
# Reload the saved extraction results for the current task (';'-separated CSV).
results_csv = f'{data_folder}{task}.csv'
ner = pd.read_csv(results_csv, sep=';')

In [226]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [el for el in ner.columns if el.startswith('prediction')]

# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Work on a copy because the column under test is rewritten below.
    df = ner.copy()
    # Fill missing predictions *before* casting to str: once the column has been
    # cast, a trailing fillna no longer has anything to fill. Missing values end
    # up as the literal string 'nan' either way.
    df[col] = df[col].fillna('nan').astype(str)
    df = df.rename(columns={col : 'prediction'})

    test_ds = Dataset.from_pandas(df)

    evals[col] = NEREvaluator()._evaluate(test_ds).metrics


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [227]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'F1' first.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('F1', ascending=False)

Unnamed: 0,F1,Classification Report
prediction_fingeit_raw,0.432836,precision recall f1-score ...
prediction_fingeit,0.417266,precision recall f1-score ...
prediction_gpt-3.5-turbo,0.315217,precision recall f1-score ...
prediction_pixiu,0.253846,precision recall f1-score ...
prediction_geitje,0.154472,precision recall f1-score ...
prediction_geitje-ultra,0.099585,precision recall f1-score ...
prediction_fingpt-llama,0.010695,precision recall f1-score ...
prediction_geitje-ultra_raw,0.0,precision recall f1-score ...
prediction_geitje_raw,0.0,precision recall f1-score ...
prediction_fingpt-llama_raw,0.0,precision recall f1-score ...


## NER (CLS)

### Extraction

In [228]:
task = 'fingpt-ner-cls'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
ner_cls_base = eval_df[eval_df['task'] == task].copy()

In [None]:
# For every model, expose its raw responses under the column name 'prediction',
# run the answer extractor with the three entity labels, and store the result
# as `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = ner_cls_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    ner_cls_base[f'prediction_{model}'] = extracted_answers(
        raw_as_prediction,
        client,
        ['organisatie', 'locatie', 'persoon'],
    )

In [None]:
# Persist the extraction results as a ';'-separated CSV (row index omitted);
# the evaluation section reloads this file.
extraction_csv = f'{data_folder}{task}.csv'
ner_cls_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [229]:
from src.evaluation.evaluator_ner_cls import NERCLSEvaluator

In [230]:
# Reload the saved extraction results for the current task (';'-separated CSV).
results_csv = f'{data_folder}{task}.csv'
ner_cls = pd.read_csv(results_csv, sep=';')

In [231]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in ner_cls.columns if name.startswith('prediction')]

# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Present the column under test to the evaluator under the name 'prediction';
    # `rename` returns a new frame, so `ner_cls` is left untouched.
    scored_df = ner_cls.rename(columns={col : 'prediction'})
    test_split = Dataset.from_pandas(scored_df)

    evals[col] = NERCLSEvaluator()._evaluate(test_split).metrics


In [232]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'acc' first.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit,0.912,0.890392,0.912,0.915893
prediction_fingeit_raw,0.84,0.814938,0.84,0.850624
prediction_pixiu,0.776,0.745935,0.776,0.791732
prediction_gpt-3.5-turbo,0.69,0.675122,0.69,0.711552
prediction_geitje,0.67,0.655376,0.67,0.692576
prediction_geitje-ultra,0.662,0.649341,0.662,0.684253
prediction_fingpt-llama,0.386,0.382242,0.386,0.356993
prediction_geitje-ultra_raw,0.238,0.192246,0.238,0.091509
prediction_gpt-3.5-turbo_raw,0.238,0.192246,0.238,0.091509
prediction_geitje_raw,0.238,0.192246,0.238,0.091509


## FinRED

### Extraction

In [233]:
task = 'fingpt-finred'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
finred_base = eval_df[eval_df['task'] == task].copy()

In [None]:
# For every model, expose its raw responses under the column name 'prediction',
# run the FinRED answer extractor on that view, and store the result as
# `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = finred_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    finred_base[f'prediction_{model}'] = extracted_answers_finred(raw_as_prediction, client)

In [196]:
# Persist the extraction results as a ';'-separated CSV (row index omitted).
# Note the '-cls' suffix in the file name; the evaluation section reloads the
# same path.
extraction_csv = f'{data_folder}{task}-cls.csv'
finred_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [234]:
from src.evaluation.evaluator_finred import FinRedEvaluator

In [235]:
# Reload the saved extraction results for the current task (';'-separated CSV,
# stored with a '-cls' suffix in the file name).
results_csv = f'{data_folder}{task}-cls.csv'
finred = pd.read_csv(results_csv, sep=';')

In [236]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in finred.columns if name.startswith('prediction')]

# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Present the column under test to the evaluator under the name 'prediction';
    # `rename` returns a new frame, so `finred` is left untouched.
    scored_df = finred.rename(columns={col : 'prediction'})
    test_split = Dataset.from_pandas(scored_df)

    evals[col] = FinRedEvaluator()._evaluate(test_split).metrics


In [237]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'acc' first.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit_raw,0.569277,0.468741,0.569277,0.559711
prediction_fingeit,0.569277,0.479854,0.569277,0.557941
prediction_fingpt-llama,0.259036,0.179006,0.259036,0.232336
prediction_geitje,0.123494,0.104194,0.123494,0.147762
prediction_geitje-ultra,0.111446,0.078931,0.111446,0.12122
prediction_pixiu,0.045181,0.040753,0.045181,0.056767
prediction_fingpt-llama_raw,0.021084,0.0289,0.021084,0.014581
prediction_pixiu_raw,0.003012,0.001949,0.003012,0.00539
prediction_geitje-ultra_raw,0.0,0.0,0.0,0.0
prediction_geitje_raw,0.0,0.0,0.0,0.0


## ConvFinQA

### Extraction

In [238]:
task = 'fingpt-convfinqa'

# Take an explicit copy: the extraction step adds columns to this frame, and
# assigning into a boolean-mask slice of `eval_df` triggers pandas'
# SettingWithCopyWarning.
convfinqa_base = eval_df[eval_df['task'] == task].copy()

In [None]:
# For every model, expose its raw responses under the column name 'prediction',
# run the ConvFinQA answer extractor on that view, and store the result as
# `prediction_<model>`.
for model in tqdm(models.keys()):
    raw_as_prediction = convfinqa_base.rename(columns={f'prediction_{model}_raw' : 'prediction'})
    convfinqa_base[f'prediction_{model}'] = extracted_answers_convfinqa(raw_as_prediction, client)

In [None]:
# Persist the extraction results as a ';'-separated CSV (row index omitted);
# the evaluation section reloads this file.
extraction_csv = f'{data_folder}{task}.csv'
convfinqa_base.to_csv(extraction_csv, sep=';', index=False)

### Evaluation

In [239]:
from src.evaluation.evaluator_convfinqa import ConvFinQaEvaluator

In [240]:
# Reload the saved extraction results for the current task (';'-separated CSV).
results_csv = f'{data_folder}{task}.csv'
convfinqa = pd.read_csv(results_csv, sep=';')

In [241]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in convfinqa.columns if name.startswith('prediction')]

# Maps each prediction column name to the metrics the evaluator returned for it.
evals = {}

# eval
for col in prediction_cols:
    # Present the column under test to the evaluator under the name 'prediction';
    # `rename` returns a new frame, so `convfinqa` is left untouched.
    scored_df = convfinqa.rename(columns={col : 'prediction'})
    test_split = Dataset.from_pandas(scored_df)

    evals[col] = ConvFinQaEvaluator()._evaluate(test_split).metrics


In [242]:
# Flatten each evaluator result into {metric name: metric value}, keyed by
# prediction column.
metrics_dict = {
    pred_col: {m.name: m.value for m in metric_list}
    for pred_col, metric_list in evals.items()
}
# One row per prediction column, highest 'Accuracy' first.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('Accuracy', ascending=False)

Unnamed: 0,Accuracy
prediction_fingeit_raw,0.324
prediction_fingeit,0.324
prediction_pixiu_raw,0.294
prediction_pixiu,0.286
prediction_gpt-3.5-turbo_raw,0.196
prediction_geitje,0.056
prediction_geitje_raw,0.056
prediction_geitje-ultra,0.036
prediction_geitje-ultra_raw,0.016
prediction_fingpt-llama_raw,0.010554
