In [126]:
from openai import OpenAI
from datasets import DatasetDict, Dataset, load_from_disk
import pandas as pd 
from tqdm import tqdm 
import sys
sys.path.append('..')

from src.evaluation.answer_extractor import extracted_answers, extracted_answers_convfinqa
from src.evaluation.answer_extractor_en import extracted_answers_ner, extracted_answers_finred

# Evaluation (EN)

In [127]:
# models
# Filename prefix (not a directory): task CSVs are written to e.g. f'{data_folder}{task}.csv'.
data_folder = '/home/sandernoels/fingeit/data/final/responses/final/en-'
# Do not embed credentials in the notebook: with no api_key argument the OpenAI
# client reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

# Model name -> text file holding that model's responses (read line by line below).
models = {
    'fingeit' : '/home/sandernoels/fingeit/data/final/responses/en/en_FinGEITje-sft_responses_e0f909d4-e04d-4441-86a3-5e529f007d53.txt',
    'fingpt-llama' : '/home/sandernoels/fingeit/data/final/responses/en/en_fingpt_llama2_responses_c53d172a-dc48-4ea9-86c5-0063e9ff10a1.txt',
    'pixiu' : '/home/sandernoels/fingeit/data/final/responses/en/en_pixiu_responses_711bf9bc-13e3-4273-9ad7-568749105fcd.txt',
}

# 'test' split of the dataset stored at this path, converted to a pandas DataFrame.
eval_df = load_from_disk('/home/sandernoels/fingeit/data/final/en_sampled_eval_df_ext')['test'].to_pandas()

In [128]:
def read_lines(path):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, in file order; blank lines are kept as ''.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the machine's
    # locale default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

In [129]:
# Attach each model's response file to eval_df as a 'prediction_<model>_raw'
# column (one stripped line of the file per row).
for model in models:
    eval_df[f'prediction_{model}_raw'] = read_lines(models[model])

## Sentiment

### Extraction

In [130]:
task = 'fingpt-sentiment'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
sentiment_base = eval_df[eval_df['task'] == task].copy()

In [131]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame and the OpenAI client to extracted_answers, and keep the
# returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = sentiment_base.rename(columns={raw_column: 'prediction'})
    sentiment_base[f'prediction_{model}'] = extracted_answers(renamed, client)

In [42]:
# Persist the sentiment frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index.
sentiment_base.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [132]:
from src.evaluation.evaluator_sentiment import SentimentEvaluator

In [134]:
# Reload the ';'-separated CSV for the current task from the data_folder prefix.
sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [135]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [el for el in sentiment.columns if el.startswith('prediction')]

# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

# eval
for col in prediction_cols:
    # Work on a copy so `sentiment` keeps its original column names between iterations.
    df = sentiment.copy()
    # Cast to str before evaluation, then expose the column as 'prediction'.
    df[col] = df[col].astype(str)
    df = df.rename(columns={col : 'prediction'})

    # The dataset is passed straight to the evaluator; the former DatasetDict wrapper
    # and the bare `eval_sentiment_score.metrics` expression had no effect and were dropped.
    eval_sentiment_score = SentimentEvaluator(language = 'EN')._evaluate(Dataset.from_pandas(df))

    evals[col] = eval_sentiment_score.metrics


In [136]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'acc' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_pixiu_raw,0.728,0.731204,0.728,0.728553
prediction_pixiu,0.728,0.731204,0.728,0.728553
prediction_en-fingeit,0.692,0.697012,0.692,0.69234
prediction_en-fingeit_raw,0.684,0.686843,0.684,0.682951
prediction_en-fingpt-llama_raw,0.402,0.210271,0.402,0.247337
prediction_en-fingpt-llama,0.402,0.210545,0.402,0.247659


## Headline

### Extraction

In [137]:
task = 'fingpt-headline'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
headline_base = eval_df[eval_df['task'] == task].copy()

In [138]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame and the OpenAI client to extracted_answers, and keep the
# returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = headline_base.rename(columns={raw_column: 'prediction'})
    headline_base[f'prediction_{model}'] = extracted_answers(renamed, client)

In [47]:
# Persist the headline frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index.
headline_base.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [139]:
from src.evaluation.evaluator_headline import HeadlineEvaluator

In [140]:
# Reload the ';'-separated CSV for the current task from the data_folder prefix.
headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [141]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in headline.columns if name.startswith('prediction')]
# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

for col in prediction_cols:
    # rename() returns a new frame, so `headline` itself is never modified.
    scored = headline.rename(columns={col: 'prediction'})
    # Cast the column under evaluation to str.
    scored['prediction'] = scored['prediction'].astype(str)

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(scored)})
    headline_result = HeadlineEvaluator(language='EN')._evaluate(new_test_ds['test'])
    evals[col] = headline_result.metrics


In [142]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'Acc' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='Acc', ascending=False)

Unnamed: 0,Acc,F1 binary
prediction_pixiu,0.884,0.797203
prediction_pixiu_raw,0.884,0.797203
prediction_fingpt-llama,0.798,0.204724
prediction_fingpt-llama_raw,0.798,0.204724
prediction_fingeit,0.638,0.290196
prediction_fingeit_raw,0.366,0.189258


## NER

### Extraction

In [143]:
task = 'fingpt-ner'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
ner_base = eval_df[eval_df['task'] == task].copy()

In [144]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame and the OpenAI client to extracted_answers_ner, and keep the
# returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = ner_base.rename(columns={raw_column: 'prediction'})
    ner_base[f'prediction_{model}'] = extracted_answers_ner(renamed, client)

In [50]:
# Persist the NER frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index.
ner_base.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [145]:
from src.evaluation.evaluator_ner import NEREvaluator

In [146]:
# Reload the ';'-separated CSV for the current task from the data_folder prefix.
ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [147]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in ner.columns if name.startswith('prediction')]

# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

for col in prediction_cols:
    # rename() returns a new frame, so `ner` itself is never modified.
    scored = ner.rename(columns={col: 'prediction'})
    # Cast to str, then replace any value still missing with the literal 'nan'.
    scored['prediction'] = scored['prediction'].astype(str).fillna('nan')

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(scored)})
    ner_result = NEREvaluator(language='EN')._evaluate(new_test_ds['test'])
    evals[col] = ner_result.metrics


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [148]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'F1' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='F1', ascending=False)

Unnamed: 0,F1,Classification Report
prediction_pixiu,0.675768,precision recall f1-score ...
prediction_en-fingeit,0.483986,precision recall f1-score ...
prediction_en-fingpt-llama,0.221198,precision recall f1-score ...
prediction_en-fingpt-llama_raw,0.172249,precision recall f1-score ...
prediction_en-fingeit_raw,0.021164,precision recall f1-score ...
prediction_pixiu_raw,0.0,precision recall f1-score ...


## NER (CLS)

### Extraction

In [149]:
task = 'fingpt-ner-cls'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
ner_cls_base = eval_df[eval_df['task'] == task].copy()

In [150]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame, the OpenAI client and the three entity labels to
# extracted_answers, and keep the returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = ner_cls_base.rename(columns={raw_column: 'prediction'})
    ner_cls_base[f'prediction_{model}'] = extracted_answers(
        renamed,
        client,
        ['person', 'organization', 'location'],
    )

In [53]:
# Persist the NER-CLS frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index.
ner_cls_base.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [151]:
from src.evaluation.evaluator_ner_cls import NERCLSEvaluator

In [152]:
# Reload the ';'-separated CSV for the current task from the data_folder prefix.
ner_cls = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [153]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in ner_cls.columns if name.startswith('prediction')]

# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

for col in prediction_cols:
    # rename() returns a new frame, so `ner_cls` itself is never modified.
    scored = ner_cls.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(scored)})
    ner_cls_result = NERCLSEvaluator('EN')._evaluate(new_test_ds['test'])
    evals[col] = ner_cls_result.metrics


In [154]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'acc' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit,0.88,0.874789,0.88,0.882861
prediction_fingeit_raw,0.806,0.548698,0.806,0.847469
prediction_fingpt-llama,0.492,0.484067,0.492,0.463851
prediction_pixiu,0.484,0.469738,0.484,0.442258
prediction_fingpt-llama_raw,0.432,0.40527,0.432,0.365428
prediction_pixiu_raw,0.402,0.360759,0.402,0.309451


## FinRED

### Extraction

In [155]:
task = 'fingpt-finred'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
finred_base = eval_df[eval_df['task'] == task].copy()

In [156]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame and the OpenAI client to extracted_answers_finred, and keep the
# returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = finred_base.rename(columns={raw_column: 'prediction'})
    finred_base[f'prediction_{model}'] = extracted_answers_finred(renamed, client)

In [56]:
# Persist the FinRED frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index. Note the '-cls' suffix in this file name.
finred_base.to_csv(
    f'{data_folder}{task}-cls.csv',
    sep=';',
    index=False,
)

### Evaluation

In [157]:
from src.evaluation.evaluator_finred import FinRedEvaluator

In [158]:
# Reload the ';'-separated '<task>-cls.csv' file for the current task from the data_folder prefix.
finred = pd.read_csv(
    f'{data_folder}{task}-cls.csv',
    sep=';',
)

In [159]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in finred.columns if name.startswith('prediction')]

# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

for col in prediction_cols:
    # rename() returns a new frame, so `finred` itself is never modified.
    scored = finred.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(scored)})
    finred_result = FinRedEvaluator(language='EN')._evaluate(new_test_ds['test'])
    evals[col] = finred_result.metrics


In [160]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'acc' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingpt-llama,0.614035,0.478076,0.614035,0.629823
prediction_fingpt-llama_raw,0.589474,0.464294,0.589474,0.612903
prediction_en-fingeit,0.308772,0.197892,0.308772,0.332644
prediction_en-fingeit_raw,0.105263,0.096145,0.105263,0.115203
prediction_pixiu,0.087719,0.044343,0.087719,0.084766
prediction_pixiu_raw,0.003509,0.002083,0.003509,0.006798


## ConvFinQA

### Extraction

In [161]:
task = 'fingpt-convfinqa'

# Take an explicit copy: prediction columns are assigned onto this frame afterwards,
# and assigning into a boolean-mask selection of eval_df raises SettingWithCopyWarning.
convfinqa_base = eval_df[eval_df['task'] == task].copy()

In [162]:
# For each model: expose its raw responses under the column name 'prediction',
# hand that frame and the OpenAI client to extracted_answers_convfinqa, and keep
# the returned values in a 'prediction_<model>' column.
for model in tqdm(models.keys()):
    raw_column = f'prediction_{model}_raw'
    renamed = convfinqa_base.rename(columns={raw_column: 'prediction'})
    convfinqa_base[f'prediction_{model}'] = extracted_answers_convfinqa(renamed, client)

In [59]:
# Persist the ConvFinQA frame (raw + extracted predictions) as a ';'-separated
# CSV, without writing the index.
convfinqa_base.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [163]:
from src.evaluation.evaluator_convfinqa import ConvFinQaEvaluator

In [164]:
# Reload the ';'-separated CSV for the current task from the data_folder prefix.
convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [165]:
# Every column whose name starts with 'prediction' is scored.
prediction_cols = [name for name in convfinqa.columns if name.startswith('prediction')]

# Maps prediction column name -> metrics returned by the evaluator.
evals = {}

for col in prediction_cols:
    # rename() returns a new frame, so `convfinqa` itself is never modified.
    scored = convfinqa.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(scored)})
    convfinqa_result = ConvFinQaEvaluator()._evaluate(new_test_ds['test'])
    evals[col] = convfinqa_result.metrics


In [166]:
# Flatten the evaluator output into {prediction column: {metric name: metric value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# Transpose so each prediction column becomes a row, then rank by 'Accuracy' (best first).
df = pd.DataFrame(metrics_dict).T
df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy
prediction_pixiu_raw,0.494781
prediction_pixiu,0.466
prediction_fingeit_raw,0.42
prediction_fingeit,0.408
prediction_fingpt-llama_raw,0.004292
prediction_fingpt-llama,0.004
