In [1]:
from openai import OpenAI
from datasets import DatasetDict, Dataset
import pandas as pd 

import sys
sys.path.append('..')

from src.evaluation.llm_extractor import extracted_answers, extracted_answers_convfinqa
from src.evaluation.llm_extractor_en import extracted_answers_ner, extracted_answers_finred

# Evaluation (EN)

In [2]:
# Shared configuration used by every section of this notebook.

# Prefix of the per-task CSV files; cells append the task name and a '.csv' suffix.
data_folder = '/home/sandernoels/fingeit/data/en-'

# Never hardcode the API key in the notebook: with no `api_key` argument the
# client reads it from the OPENAI_API_KEY environment variable and raises if
# that variable is not set.
# NOTE(review): the key that used to be embedded on this line has been exposed
# and should be revoked.
client = OpenAI()

# Model name -> path of that model's responses file.
models = {
    'en-fingeit' : '/home/sandernoels/fingeit/data/responses/en_fingeitje_responses_35cd1d84-e9bc-48a2-8e18-07db134acf32.txt',
    'en-fingpt-llama' : '/home/sandernoels/fingeit/data/responses/en_fingpt_llama2_responses_c53d172a-dc48-4ea9-86c5-0063e9ff10a1.txt',
}

## Sentiment

### Extraction

In [5]:
# Task whose CSV is processed in this section.
task = 'fingpt-sentiment'

# Pick the second entry of `models`: its name and its responses path together.
model_name, path = list(models.items())[1]

# The task CSV is ';'-separated.
sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [23]:
# LLM-based extraction: the selected model's raw column is handed over as 'prediction'.
extracted_answer = extracted_answers(
    sentiment.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
# Store the extracted answers in the model's own (non-raw) prediction column.
sentiment[f'prediction_{model_name}'] = extracted_answer

In [25]:
# Write the frame back to the task's ';'-separated CSV, without the index.
sentiment.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [6]:
from src.evaluation.evaluator_sentiment import SentimentEvaluator

In [7]:
# Re-read the task CSV so the evaluation works from what is stored on disk.
sentiment = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [9]:
# Every column whose name starts with 'prediction' is scored on its own.
prediction_cols = [el for el in sentiment.columns if el.startswith('prediction')]

# Column name -> metrics produced by the evaluator for that column.
evals = {}

# eval
for col in prediction_cols:
    # Work on a copy so the loaded frame is left untouched between iterations.
    df = sentiment.copy()
    df[col] = df[col].astype(str)
    # The column under evaluation is exposed to the evaluator as 'prediction'.
    df = df.rename(columns={col : 'prediction'})

    new_test_ds = DatasetDict()
    new_test_ds['test'] = Dataset.from_pandas(df)

    eval_sentiment_score = SentimentEvaluator(language = 'EN')._evaluate(new_test_ds['test'])

    evals[col] = eval_sentiment_score.metrics


In [22]:
# Flatten each column's metric objects into {column: {metric name: value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# One row per prediction column, one column per metric.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_en-fingeit,0.692,0.697012,0.692,0.69234
prediction_en-fingeit_raw,0.684,0.686843,0.684,0.682951
prediction_en-fingpt-llama_raw,0.402,0.210271,0.402,0.247337
prediction_en-fingpt-llama,0.402,0.210545,0.402,0.247659


## Headline

### Extraction

In [3]:
# Task whose CSV is processed in this section.
task = 'fingpt-headline'

# Pick the second entry of `models`: its name and its responses path together.
model_name, path = list(models.items())[1]

# The task CSV is ';'-separated.
headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [33]:
# LLM-based extraction: the selected model's raw column is handed over as 'prediction'.
extracted_answer = extracted_answers(
    headline.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
# Store the extracted answers in the model's own (non-raw) prediction column.
headline[f'prediction_{model_name}'] = extracted_answer

In [35]:
# Write the frame back to the task's ';'-separated CSV, without the index.
headline.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [4]:
from src.evaluation.evaluator_headline import HeadlineEvaluator

In [5]:
# Re-read the task CSV so the evaluation works from what is stored on disk.
headline = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [6]:
# Every column whose name starts with 'prediction' is scored on its own.
prediction_cols = [name for name in headline.columns if name.startswith('prediction')]
# Column name -> metrics produced by the evaluator for that column.
evals = {}

# eval
for col in prediction_cols:
    # Fresh frame per column: values cast to str, column exposed as 'prediction'.
    eval_frame = (
        headline
        .assign(**{col: headline[col].astype(str)})
        .rename(columns={col: 'prediction'})
    )
    new_test_ds = DatasetDict({'test': Dataset.from_pandas(eval_frame)})

    eval_headline_score = HeadlineEvaluator(language='EN')._evaluate(new_test_ds['test'])
    evals[col] = eval_headline_score.metrics


In [28]:
# Flatten each column's metric objects into {column: {metric name: value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# One row per prediction column, one column per metric.
df = pd.DataFrame(metrics_dict).transpose()

df.sort_values('Acc', ascending=False)

Unnamed: 0,Acc,F1 binary
prediction_en-fingpt-llama_raw,0.798,0.204724
prediction_en-fingpt-llama,0.798,0.204724
prediction_en-fingeit,0.688,0.446809
prediction_en-fingeit_raw,0.582,0.376119


## NER

### Extraction

In [8]:
# Task whose CSV is processed in this section.
task = 'fingpt-ner'

# Pick the second entry of `models`: its name and its responses path together.
model_name, path = list(models.items())[1]

# The task CSV is ';'-separated.
ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [53]:
# LLM-based extraction: the selected model's raw column is handed over as 'prediction'.
extracted_answer = extracted_answers_ner(
    ner.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
# Store the extracted answers in the model's own (non-raw) prediction column.
ner[f'prediction_{model_name}'] = extracted_answer

In [56]:
# Write the frame back to the task's ';'-separated CSV, without the index.
ner.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

In [None]:
# Intentionally empty: the NER metrics table is built in the Evaluation section
# below, after `evals` has been filled by the NER evaluation loop. Building it
# here would read the previous task's `evals`, which has no 'F1' metric to sort by.

### Evaluation

In [9]:
from src.evaluation.evaluator_ner import NEREvaluator

In [10]:
# Re-read the task CSV so the evaluation works from what is stored on disk.
ner = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [12]:
# Every column whose name starts with 'prediction' is scored on its own.
prediction_cols = [name for name in ner.columns if name.startswith('prediction')]

# Column name -> metrics produced by the evaluator for that column.
evals = {}

# eval
for col in prediction_cols:
    # Fresh frame per column: values cast to str (any remaining missing value
    # becomes 'nan'), column exposed as 'prediction'.
    eval_frame = (
        ner
        .assign(**{col: ner[col].astype(str).fillna('nan')})
        .rename(columns={col: 'prediction'})
    )
    new_test_ds = DatasetDict({'test': Dataset.from_pandas(eval_frame)})

    eval_ner_score = NEREvaluator(language='EN')._evaluate(new_test_ds['test'])
    evals[col] = eval_ner_score.metrics


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Flatten each column's metric objects into {column: {metric name: value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# One row per prediction column, one column per metric.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('F1', ascending=False)

Unnamed: 0,F1,Classification Report
prediction_en-fingeit,0.483986,precision recall f1-score ...
prediction_en-fingpt-llama,0.221198,precision recall f1-score ...
prediction_en-fingpt-llama_raw,0.172249,precision recall f1-score ...
prediction_en-fingeit_raw,0.021164,precision recall f1-score ...


## FinRED

### Extraction

In [14]:
# Task whose CSV is processed in this section.
task = 'fingpt-finred'

# Pick the first entry of `models`: its name and its responses path together.
model_name, path = list(models.items())[0]

# The classification CSV of the task is ';'-separated.
finred = pd.read_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
)

In [12]:
# LLM-based extraction: the selected model's raw column is handed over as 'prediction'.
extracted_answer = extracted_answers_finred(
    finred.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
# Store the extracted answers in the model's own (non-raw) prediction column.
finred[f'prediction_{model_name}'] = extracted_answer

In [16]:
# Write the frame back to the task's ';'-separated classification CSV, without the index.
finred.to_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
    index=False,
)

### Evaluation

In [15]:
from src.evaluation.evaluator_finred import FinRedEvaluator

In [16]:
# Re-read the classification CSV so the evaluation works from what is stored on disk.
finred = pd.read_csv(
    f'{data_folder}{task}-classification.csv',
    sep=';',
)

In [17]:
# Every column whose name starts with 'prediction' is scored on its own.
prediction_cols = [name for name in finred.columns if name.startswith('prediction')]

# Column name -> metrics produced by the evaluator for that column.
evals = {}

# eval
for col in prediction_cols:
    # `rename` returns a new frame, so the loaded data is never modified.
    eval_frame = finred.rename(columns={col: 'prediction'})
    new_test_ds = DatasetDict({'test': Dataset.from_pandas(eval_frame)})

    eval_finred_classification_score = FinRedEvaluator(language='EN')._evaluate(
        new_test_ds['test']
    )
    evals[col] = eval_finred_classification_score.metrics


In [25]:
# Flatten each column's metric objects into {column: {metric name: value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# One row per prediction column, one column per metric.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('acc', ascending=False)

Unnamed: 0,acc,f1_macro,f1_micro,f1_weighted
prediction_en-fingeit,0.308772,0.197892,0.308772,0.332644
prediction_en-fingeit_raw,0.105263,0.096145,0.105263,0.115203
prediction_en-fingpt-llama_raw,0.02807,0.014104,0.02807,0.037184
prediction_en-fingpt-llama,0.02807,0.013885,0.02807,0.036603


## ConvFinQA

### Extraction

In [28]:
# Task whose CSV is processed in this section.
task = 'fingpt-convfinqa'

# Pick the second entry of `models`: its name and its responses path together.
model_name, path = list(models.items())[1]

# The task CSV is ';'-separated.
convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [11]:
# LLM-based extraction: the selected model's raw column is handed over as 'prediction'.
extracted_answer = extracted_answers_convfinqa(
    convfinqa.rename(columns={f'prediction_{model_name}_raw': 'prediction'}),
    client,
)
# Store the extracted answers in the model's own (non-raw) prediction column.
convfinqa[f'prediction_{model_name}'] = extracted_answer

In [13]:
# Write the frame back to the task's ';'-separated CSV, without the index.
convfinqa.to_csv(
    f'{data_folder}{task}.csv',
    sep=';',
    index=False,
)

### Evaluation

In [29]:
from src.evaluation.evaluator_convfinqa import ConvFinQaEvaluator

In [30]:
# Re-read the task CSV so the evaluation works from what is stored on disk.
convfinqa = pd.read_csv(
    f'{data_folder}{task}.csv',
    sep=';',
)

In [31]:
# Every column whose name starts with 'prediction' is scored on its own.
prediction_cols = [name for name in convfinqa.columns if name.startswith('prediction')]

# Column name -> metrics produced by the evaluator for that column.
evals = {}

# eval
for col in prediction_cols:
    # `rename` returns a new frame, so the loaded data is never modified.
    eval_frame = convfinqa.rename(columns={col: 'prediction'})
    new_test_ds = DatasetDict({'test': Dataset.from_pandas(eval_frame)})

    eval_convfinqa_score = ConvFinQaEvaluator()._evaluate(new_test_ds['test'])
    evals[col] = eval_convfinqa_score.metrics


In [32]:
# Flatten each column's metric objects into {column: {metric name: value}}.
metrics_dict = {
    pred_col: {m.name: m.value for m in col_metrics}
    for pred_col, col_metrics in evals.items()
}
# One row per prediction column, one column per metric.
df = pd.DataFrame(metrics_dict).transpose()
df.sort_values('Accuracy', ascending=False)

Unnamed: 0,Accuracy
prediction_en-fingeit_raw,0.42
prediction_en-fingeit,0.416
prediction_en-fingpt-llama_raw,0.004292
prediction_en-fingpt-llama,0.004
