In [1]:
import os
import sys

import pandas as pd 
from datasets import DatasetDict, Dataset
from openai import OpenAI

sys.path.append('..')

from src.evaluation.llm_extractor import extracted_answers, extracted_answers_ner, extracted_answers_finred, extracted_answers_convfinqa

# Evaluation

In [2]:
# Configuration: data location, OpenAI client and the response file of every evaluated model.

# Root data folder (must end with '/', later cells build paths by plain concatenation).
# Can be overridden with the FINGEIT_DATA_FOLDER environment variable.
data_folder = os.environ.get('FINGEIT_DATA_FOLDER', '/home/sandernoels/fingeit/data/')

# SECURITY: never hard-code the API key in the notebook. It is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError right here
# instead of failing later on the first request. The key that used to be embedded in
# this cell must be considered leaked and should be revoked.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Model alias -> file holding that model's responses.
# Insertion order matters: later cells select a model by position,
# e.g. list(models.keys())[4].
responses_folder = f'{data_folder}responses/'
models = {
    'fingeit': f'{responses_folder}fingeitje_responses_1e8277ee-acdf-48bf-9f3e-ba538c2d22d3.txt',
    'geitje-ultra': f'{responses_folder}GEITje-7B-ultra_responses_6787f010-26bd-4757-97db-106a67e67411.txt',
    'geitje': f'{responses_folder}GEITje-7B-chat-v2_responses_0d329199-13e8-4620-b0a4-eb6a9cf20913.txt',
    'fingpt-llama': f'{responses_folder}fingpt_llama2_responses_897d1bc3-9a03-4a94-abb9-235f9ad150ec.txt',
    'pixiu': f'{responses_folder}pixiu_responses_1ed952c6-112e-47b1-8491-e2cc7913d21a.txt',
}

## Sentiment

### Extraction

In [3]:
# Sentiment task: pick the model at position 4 of `models` and load the task CSV.
task = 'fingpt-sentiment'

model_name, path = list(models.items())[4]

sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [4]:
# LLM-based extraction: pass the selected model's raw predictions to the extractor
# under the column name 'prediction' and keep the result in its own column.
extracted_answer = extracted_answers(
    sentiment.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
sentiment[f'prediction_{model_name}'] = extracted_answer

KeyboardInterrupt: 

In [102]:
# Write the sentiment frame to the task CSV (';'-separated, without the index).
sentiment.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [96]:
from src.evaluation.evaluator_sentiment import SentimentEvaluator

In [8]:
# Reload the task CSV so the evaluation below works on what is stored on disk.
sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [16]:
# Score every column whose name starts with 'prediction' with SentimentEvaluator.
prediction_cols = [el for el in sentiment.columns if el.startswith('prediction')]

# column name -> metrics produced by the evaluator for that column
evals = {}

# eval
for col in prediction_cols:
    # Present the column under test to the evaluator as 'prediction'.
    # rename() returns a new frame, so `sentiment` itself is left untouched.
    df = sentiment.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict()
    new_test_ds['test'] = Dataset.from_pandas(df)

    eval_sentiment_score = SentimentEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_sentiment_score.metrics


In [17]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit,0.79,0.783196,0.79,0.790308
prediction_fingeit_raw,0.79,0.783196,0.79,0.790308
prediction_geitje-ultra,0.674,0.638646,0.674,0.661714
prediction_geitje-ultra_raw,0.564,0.529673,0.564,0.555212
prediction_geitje,0.54,0.497929,0.54,0.520434
prediction_geitje_raw,0.454,0.448226,0.454,0.466206
prediction_fingpt-llama,0.35,0.32094,0.35,0.276341
prediction_fingpt-llama_raw,0.268,0.140904,0.268,0.113287


## Headline

### Extraction

In [13]:
# Headline task: pick the model at position 3 of `models` and load the task CSV.
task = 'fingpt-headline'

model_name, path = list(models.items())[3]

headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [15]:
# LLM-based extraction: the raw predictions of the selected model are handed to the
# extractor as column 'prediction'; the extracted answers get their own column.
extracted_answer = extracted_answers(
    headline.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
headline[f'prediction_{model_name}'] = extracted_answer

In [129]:
# Write the headline frame to the task CSV (';'-separated, without the index).
headline.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [133]:
from src.evaluation.evaluator_headline import HeadlineEvaluator

In [20]:
# Reload the headline task CSV from disk before evaluating.
headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [146]:
# Score every column whose name starts with 'prediction' with HeadlineEvaluator.
prediction_cols = [name for name in headline.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = headline.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_headline_score = HeadlineEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_headline_score.metrics


In [147]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='Acc', ascending=False)

Unnamed: 0,Acc,F1 binary
prediction_fingeit,0.92,0.836066
prediction_fingeit_raw,0.92,0.836066
prediction_fingpt-llama,0.696,0.0
prediction_geitje_raw,0.314,0.215103
prediction_geitje,0.298,0.166271
prediction_geitje-ultra_raw,0.082,0.068966
prediction_geitje-ultra,0.064,0.025
prediction_fingpt-llama_raw,0.0,0.0


## NER

### Extraction

In [4]:
# NER task: pick the model at position 0 of `models` and load the task CSV.
task = 'fingpt-ner'

model_name, path = list(models.items())[0]

ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [5]:
# LLM-based extraction (NER variant): the selected model's raw predictions are passed
# in as column 'prediction'; the extracted answers are stored in a separate column.
extracted_answer = extracted_answers_ner(
    ner.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
ner[f'prediction_{model_name}'] = extracted_answer

In [15]:
# Write the NER frame to the task CSV (';'-separated, without the index).
ner.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [16]:
from src.evaluation.evaluator_ner import NEREvaluator

In [5]:
# Reload the NER task CSV from disk before evaluating.
ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [17]:
# Score every column whose name starts with 'prediction' with NEREvaluator.
prediction_cols = [name for name in ner.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    df = ner.copy()
    # Force the column under test to strings, with missing values as the text 'nan'.
    # NOTE(review): after astype(str) missing values are usually already 'nan', so the
    # fillna may be redundant -- confirm against the pandas version in use.
    df[col] = df[col].astype(str).fillna('nan')
    df = df.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_ner_score = NEREvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_ner_score.metrics


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest F1 first
df.sort_values(by='F1', ascending=False)

Unnamed: 0,F1,Classification Report
prediction_fingeit_raw,0.432836,precision recall f1-score ...
prediction_fingeit,0.417266,precision recall f1-score ...
prediction_geitje,0.154472,precision recall f1-score ...
prediction_geitje-ultra,0.099585,precision recall f1-score ...
prediction_fingpt-llama,0.010695,precision recall f1-score ...
prediction_geitje-ultra_raw,0.0,precision recall f1-score ...
prediction_geitje_raw,0.0,precision recall f1-score ...
prediction_fingpt-llama_raw,0.0,precision recall f1-score ...


## FinRED

### Extraction

In [3]:
# FinRED task: pick the model at position 0 of `models` and load the classification CSV.
task = 'fingpt-finred'

model_name, path = list(models.items())[0]

finred = pd.read_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
)

In [4]:
# LLM-based extraction (FinRED variant): the selected model's raw predictions are passed
# in as column 'prediction'; the extracted answers are stored in a separate column.
extracted_answer = extracted_answers_finred(
    finred.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
finred[f'prediction_{model_name}'] = extracted_answer

In [75]:
# Write the FinRED frame to the classification CSV (';'-separated, without the index).
finred.to_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
    index=False,
)

### Evaluation

In [165]:
from src.evaluation.evaluator_finred import FinRedEvaluator

In [6]:
# Reload the FinRED classification CSV from disk before evaluating.
finred = pd.read_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
)

In [7]:
# Score every column whose name starts with 'prediction' with FinRedEvaluator.
prediction_cols = [name for name in finred.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = finred.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_finred_classification_score = FinRedEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_finred_classification_score.metrics


In [74]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit_raw,0.569277,0.468741,0.569277,0.559711
prediction_fingeit,0.569277,0.479854,0.569277,0.557941
prediction_fingpt-llama,0.259036,0.179006,0.259036,0.232336
prediction_geitje,0.123494,0.104194,0.123494,0.147762
prediction_geitje-ultra,0.111446,0.078931,0.111446,0.12122
prediction_fingpt-llama_raw,0.021084,0.0289,0.021084,0.014581
prediction_geitje_raw,0.0,0.0,0.0,0.0
prediction_geitje-ultra_raw,0.0,0.0,0.0,0.0


## ConvFinQA

### Extraction

In [3]:
# ConvFinQA task: pick the model at position 0 of `models` and load the task CSV.
task = 'fingpt-convfinqa'

model_name, path = list(models.items())[0]

convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [5]:
# LLM-based extraction (ConvFinQA variant): the selected model's raw predictions are
# passed in as column 'prediction'; the extracted answers are stored in a separate column.
extracted_answer = extracted_answers_convfinqa(
    convfinqa.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
convfinqa[f'prediction_{model_name}'] = extracted_answer

In [57]:
# Write the ConvFinQA frame to the task CSV (';'-separated, without the index).
convfinqa.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [173]:
from src.evaluation.evaluator_convfinqa import ConvFinQaEvaluator

In [9]:
# Reload the ConvFinQA task CSV from disk before evaluating.
convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [10]:
# Score every column whose name starts with 'prediction' with ConvFinQaEvaluator.
prediction_cols = [name for name in convfinqa.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = convfinqa.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_convfinqa_score = ConvFinQaEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_convfinqa_score.metrics


In [61]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy
prediction_fingeit_raw,0.324
prediction_fingeit,0.324
prediction_geitje,0.056
prediction_geitje_raw,0.056
prediction_geitje-ultra,0.036
prediction_geitje-ultra_raw,0.016
prediction_fingpt-llama_raw,0.010554
prediction_fingpt-llama,0.008


# GPT-3.5

In [None]:
def create_message(instruction, input):
    """Build the two-message chat prompt (system + user) for one task example.

    Args:
        instruction: Task instruction, inserted under the '### Instructie:' heading.
        input: Context for the task, inserted under the '### Input:' heading.

    Returns:
        A list of two dicts with the keys 'content' and 'role': first the fixed
        system message, then the user message containing instruction and input.
    """
    system_content = 'Je bent een behulpzame financiële assistent. help met zorg, respect en waarheid. Reageer met de grootste nuttigheid maar wel veilig. Vermijd schadelijke, onethische, bevooroordeelde of negatieve inhoud. Zorg ervoor dat antwoorden eerlijkheid en positiviteit promoten.'
    user_content = f'Hieronder staat een instructie die een taak beschrijft, samen met een input die context voorziet\nSchrijf een reactie die op een passende manier voldoet aan de vraag.\n\n\n### Instructie:\n{instruction}\n\n### Input:\n{input}\n\n### Reactie:\n'

    system_message = {'content': system_content, 'role': 'system'}
    user_message = {'content': user_content, 'role': 'user'}
    return [system_message, user_message]

def get_prediction_on_message(message, client, model="gpt-3.5-turbo", temperature=0, max_tokens=4096):
    """Request one chat completion for `message` and return the reply text.

    Args:
        message: List of chat messages, passed as `messages` to the API call.
        client: Object exposing `chat.completions.create(...)`.
        model: Model identifier sent with the request. Defaults to the previously
            hard-coded "gpt-3.5-turbo".
        temperature: Sampling temperature sent with the request (default 0).
        max_tokens: Value of the `max_tokens` request parameter (default 4096).

    Returns:
        The `content` of the message of the first choice in the response.
    """
    response = client.chat.completions.create(
        messages=message,
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

In [None]:
# Name used for the prediction columns of the GPT-3.5 runs below
# (f'prediction_{model_name}_raw' and f'prediction_{model_name}').
model_name = "gpt-3.5-turbo"

## Sentiment

### Prediction

In [104]:
# Sentiment task: load the task CSV for the GPT-3.5 run.
task = 'fingpt-sentiment'

sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [87]:
# Prediction: build one chat prompt per row, then request one completion per prompt.
sentiment['messages'] = sentiment.apply(
    lambda row: create_message(row['instruction'], row['input']),
    axis=1,
)
sentiment[f'prediction_{model_name}_raw'] = sentiment['messages'].apply(
    lambda chat: get_prediction_on_message(chat, client)
)

In [116]:
# LLM-based extraction: the raw GPT predictions are passed to the extractor as column
# 'prediction'; the extracted answers are stored in a separate column.
extracted_answer = extracted_answers(
    sentiment.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
sentiment[f'prediction_{model_name}'] = extracted_answer

In [120]:
# Write the sentiment frame to the task CSV (';'-separated, without the index).
sentiment.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [118]:
# Score every column whose name starts with 'prediction' with SentimentEvaluator.
prediction_cols = [name for name in sentiment.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = sentiment.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_sentiment_score = SentimentEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_sentiment_score.metrics


In [119]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit_raw,0.79,0.783196,0.79,0.790308
prediction_fingeit,0.79,0.783196,0.79,0.790308
prediction_gpt-3.5-turbo,0.752,0.724415,0.752,0.740101
prediction_gpt-3.5-turbo_raw,0.742,0.713072,0.742,0.729132
prediction_geitje-ultra,0.674,0.638646,0.674,0.661714
prediction_geitje-ultra_raw,0.564,0.529673,0.564,0.555212
prediction_geitje,0.54,0.497929,0.54,0.520434
prediction_geitje_raw,0.454,0.448226,0.454,0.466206
prediction_fingpt-llama,0.35,0.32094,0.35,0.276341
prediction_fingpt-llama_raw,0.268,0.140904,0.268,0.113287


## Headline

### Prediction

In [122]:
# Headline task: load the task CSV for the GPT-3.5 run.
task = 'fingpt-headline'

headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [123]:
# Prediction: build one chat prompt per row, then request one completion per prompt.
headline['messages'] = headline.apply(
    lambda row: create_message(row['instruction'], row['input']),
    axis=1,
)
headline[f'prediction_{model_name}_raw'] = headline['messages'].apply(
    lambda chat: get_prediction_on_message(chat, client)
)

In [125]:
# LLM-based extraction: the raw GPT predictions are passed to the extractor as column
# 'prediction'; the extracted answers are stored in a separate column.
extracted_answer = extracted_answers(
    headline.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
headline[f'prediction_{model_name}'] = extracted_answer

In [132]:
# Write the headline frame to the task CSV (';'-separated, without the index).
headline.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [136]:
# Score every column whose name starts with 'prediction' with HeadlineEvaluator.
prediction_cols = [name for name in headline.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = headline.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_headline_score = HeadlineEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_headline_score.metrics


In [138]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='Acc', ascending=False)

Unnamed: 0,Acc,F1 binary
prediction_fingeit,0.92,0.836066
prediction_fingeit_raw,0.92,0.836066
prediction_fingpt-llama,0.696,0.0
prediction_gpt-3.5-turbo,0.64,0.485714
prediction_gpt-3.5-turbo_raw,0.606,0.466125
prediction_geitje_raw,0.314,0.215103
prediction_geitje,0.298,0.166271
prediction_geitje-ultra_raw,0.082,0.068966
prediction_geitje-ultra,0.064,0.025
prediction_fingpt-llama_raw,0.0,0.0


## NER

### Prediction

In [148]:
# NER task: load the task CSV for the GPT-3.5 run.
task = 'fingpt-ner'

ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [151]:
# Prediction: build one chat prompt per row, then request one completion per prompt.
ner['messages'] = ner.apply(
    lambda row: create_message(row['instruction'], row['input']),
    axis=1,
)
ner[f'prediction_{model_name}_raw'] = ner['messages'].apply(
    lambda chat: get_prediction_on_message(chat, client)
)

In [152]:
# LLM-based extraction (NER variant): the raw GPT predictions are passed to the
# extractor as column 'prediction'; the extracted answers get their own column.
extracted_answer = extracted_answers_ner(
    ner.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
ner[f'prediction_{model_name}'] = extracted_answer

In [159]:
# Write the NER frame to the task CSV (';'-separated, without the index).
ner.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [156]:
# Score every column whose name starts with 'prediction' with NEREvaluator.
prediction_cols = [name for name in ner.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    df = ner.copy()
    # Force the column under test to strings, with missing values as the text 'nan'.
    # NOTE(review): after astype(str) missing values are usually already 'nan', so the
    # fillna may be redundant -- confirm against the pandas version in use.
    df[col] = df[col].astype(str).fillna('nan')
    df = df.rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_ner_score = NEREvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_ner_score.metrics

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [157]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest F1 first
df.sort_values(by='F1', ascending=False)

Unnamed: 0,F1,Classification Report
prediction_fingeit_raw,0.432836,precision recall f1-score ...
prediction_fingeit,0.417266,precision recall f1-score ...
prediction_gpt-3.5-turbo,0.315217,precision recall f1-score ...
prediction_geitje,0.154472,precision recall f1-score ...
prediction_geitje-ultra,0.099585,precision recall f1-score ...
prediction_fingpt-llama,0.010695,precision recall f1-score ...
prediction_geitje-ultra_raw,0.0,precision recall f1-score ...
prediction_geitje_raw,0.0,precision recall f1-score ...
prediction_fingpt-llama_raw,0.0,precision recall f1-score ...
prediction_gpt-3.5-turbo_raw,0.0,precision recall f1-score ...


## FinRED

### Prediction

In [162]:
# FinRED task: load the classification CSV for the GPT-3.5 run.
task = 'fingpt-finred'

finred = pd.read_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
)

In [163]:
# Prediction: build one chat prompt per row, then request one completion per prompt.
finred['messages'] = finred.apply(
    lambda row: create_message(row['instruction'], row['input']),
    axis=1,
)
finred[f'prediction_{model_name}_raw'] = finred['messages'].apply(
    lambda chat: get_prediction_on_message(chat, client)
)

In [164]:
# LLM-based extraction (FinRED variant): the raw GPT predictions are passed to the
# extractor as column 'prediction'; the extracted answers get their own column.
extracted_answer = extracted_answers_finred(
    finred.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
finred[f'prediction_{model_name}'] = extracted_answer

In [None]:
# Write the FinRED frame to the classification CSV (';'-separated, without the index).
finred.to_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
    index=False,
)

### Evaluation

In [168]:
# Score every column whose name starts with 'prediction' with FinRedEvaluator.
prediction_cols = [name for name in finred.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = finred.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_finred_classification_score = FinRedEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_finred_classification_score.metrics


In [169]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_fingeit_raw,0.569277,0.468741,0.569277,0.559711
prediction_fingeit,0.569277,0.479854,0.569277,0.557941
prediction_fingpt-llama,0.259036,0.179006,0.259036,0.232336
prediction_gpt-3.5-turbo,0.156627,0.136517,0.156627,0.155826
prediction_geitje,0.123494,0.104194,0.123494,0.147762
prediction_geitje-ultra,0.111446,0.078931,0.111446,0.12122
prediction_fingpt-llama_raw,0.021084,0.0289,0.021084,0.014581
prediction_geitje-ultra_raw,0.0,0.0,0.0,0.0
prediction_geitje_raw,0.0,0.0,0.0,0.0
prediction_gpt-3.5-turbo_raw,0.0,0.0,0.0,0.0


In [None]:
# Show only the first rows; displaying the whole frame (prompts and raw model
# responses included) would flood the notebook output.
finred.head()

## ConvFinQA 

### Prediction

In [170]:
# ConvFinQA task: load the task CSV for the GPT-3.5 run.
task = 'fingpt-convfinqa'

convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [171]:
# Prediction: build one chat prompt per row, then request one completion per prompt.
convfinqa['messages'] = convfinqa.apply(
    lambda row: create_message(row['instruction'], row['input']),
    axis=1,
)
convfinqa[f'prediction_{model_name}_raw'] = convfinqa['messages'].apply(
    lambda chat: get_prediction_on_message(chat, client)
)

In [172]:
# LLM-based extraction (ConvFinQA variant): the raw GPT predictions are passed to the
# extractor as column 'prediction'; the extracted answers get their own column.
extracted_answer = extracted_answers_convfinqa(
    convfinqa.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
convfinqa[f'prediction_{model_name}'] = extracted_answer

In [None]:
# convfinqa.to_csv(f'{data_folder}{task}.csv', index=False, sep=';')

### Evaluation

In [175]:
# Score every column whose name starts with 'prediction' with ConvFinQaEvaluator.
prediction_cols = [name for name in convfinqa.columns if name.startswith('prediction')]
evals = {}

# eval
for col in prediction_cols:
    # The column under test is exposed to the evaluator as 'prediction'
    df = convfinqa.copy().rename(columns={col: 'prediction'})

    new_test_ds = DatasetDict({'test': Dataset.from_pandas(df)})

    eval_convfinqa_score = ConvFinQaEvaluator()._evaluate(new_test_ds['test'])

    evals[col] = eval_convfinqa_score.metrics


In [176]:
# One table row per prediction column, one table column per metric name.
metrics_dict = {
    col_name: {m.name: m.value for m in col_metrics}
    for col_name, col_metrics in evals.items()
}
df = pd.DataFrame(metrics_dict).transpose()

# Highest accuracy first
df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy
prediction_fingeit_raw,0.324
prediction_fingeit,0.324
prediction_gpt-3.5-turbo,0.244
prediction_gpt-3.5-turbo_raw,0.196
prediction_geitje,0.056
prediction_geitje_raw,0.056
prediction_geitje-ultra,0.036
prediction_geitje-ultra_raw,0.016
prediction_fingpt-llama_raw,0.010554
prediction_fingpt-llama,0.008
