In [1]:
import time
from collections import defaultdict
from pathlib import Path
from typing import List

import evaluate
import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo.database import Database
from tqdm import tqdm



In [2]:
def get_env_path() -> Path:
    return Path('..\\.env')


load_dotenv(get_env_path())

CONNECTION_STRING = f'mongodb://localhost:27017'
CLIENT = MongoClient(CONNECTION_STRING)
DB: Database = CLIENT['thesis']
# noinspection SpellCheckingInspection
TABLE_MITI: Collection = DB['miti']

In [3]:
_df_docs_poor = pd.DataFrame.from_records(list(TABLE_MITI.find(filter={'prompt.therapist_level': 'poor'})))
_df_docs_mediocre = pd.DataFrame.from_records(list(TABLE_MITI.find(filter={'prompt.therapist_level': 'average'})))
_df_docs_expert = pd.DataFrame.from_records(list(TABLE_MITI.find(filter={'prompt.therapist_level': 'expert'})))

In [4]:
_df_docs_poor.head()

Unnamed: 0,_id,_response_ms,_retrieve_params,api_version,api_type,organization,api_base_override,engine,_previous,prompt,timestamp
0,658da31c864df60a8413a918,2978,{},,,stavnlp,,,{'id': 'chatcmpl-8anazJeqWBWib6l3EndfC2bW1jbZe...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
1,658da325864df60a8413a919,7516,{},,,stavnlp,,,{'id': 'chatcmpl-8anb3ZkRHa43gWZonDx0ML2GXALMo...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
2,658da328864df60a8413a91a,3711,{},,,stavnlp,,,{'id': 'chatcmpl-8anbBf6I7cwMtLZ5vzFePmJxN8FW7...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
3,658da32c864df60a8413a91b,3586,{},,,stavnlp,,,{'id': 'chatcmpl-8anbFJwK65fHMzbOYIRljGDsG2s6j...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
4,658da32f864df60a8413a91c,2921,{},,,stavnlp,,,{'id': 'chatcmpl-8anbJs0EcEMBV0ffWwfBxvh3env6Q...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0


In [5]:
_df_docs_mediocre.head()

Unnamed: 0,_id,_response_ms,_retrieve_params,api_version,api_type,organization,api_base_override,engine,_previous,prompt,timestamp
0,658da363864df60a8413a92a,2712,{},,,stavnlp,,,{'id': 'chatcmpl-8anc998YZB1XSK3a5kZUVOKu8gAgD...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
1,658da367864df60a8413a92b,4039,{},,,stavnlp,,,{'id': 'chatcmpl-8ancCQhREC92zfrNnWfr4yTfuIQGU...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
2,658da36e864df60a8413a92c,6111,{},,,stavnlp,,,{'id': 'chatcmpl-8ancG8w4QFbGHbR7ZmrcaygwinZVJ...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
3,658da373864df60a8413a92d,5365,{},,,stavnlp,,,{'id': 'chatcmpl-8ancMFsUv752NBJankIjYpzhTGAZV...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0
4,658da377864df60a8413a92e,3325,{},,,stavnlp,,,{'id': 'chatcmpl-8ancSUOXMEylIYSX5521HEtGZofsR...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0


In [6]:
_df_docs_expert.head()

Unnamed: 0,_id,_response_ms,_retrieve_params,api_version,api_type,organization,api_base_override,engine,_previous,prompt,timestamp,llama2
0,658da3ab864df60a8413a93c,2766,{},,,stavnlp,,,{'id': 'chatcmpl-8andJ6r0SqsHQLGIlhlb0AlNxLPq0...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0,{'finetuning/miti_alexander_street/meta-llama-...
1,658da3b2864df60a8413a93d,6350,{},,,stavnlp,,,{'id': 'chatcmpl-8andMWEhZ8ieM3u2IaANc7W6x1BA6...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0,{'finetuning/miti_alexander_street/meta-llama-...
2,658da3b6864df60a8413a93e,3665,{},,,stavnlp,,,{'id': 'chatcmpl-8andS21da6im3DIRkjOt6z0GNNYzY...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0,{'finetuning/miti_alexander_street/meta-llama-...
3,658da3ba864df60a8413a93f,3767,{},,,stavnlp,,,{'id': 'chatcmpl-8andWhV0CsIv9JmJNqqTvvGQisAVa...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0,{'finetuning/miti_alexander_street/meta-llama-...
4,658da3bf864df60a8413a940,5490,{},,,stavnlp,,,{'id': 'chatcmpl-8anda5QkpfV5Yxx4e7eFgFawZF2C8...,"{'model': 'gpt-3.5-turbo-1106', 'request_timeo...",1703781000.0,{'finetuning/miti_alexander_street/meta-llama-...


In [9]:
for _df_docs in [_df_docs_poor, _df_docs_mediocre, _df_docs_expert]:
    _df_docs['filename'] = _df_docs['prompt'].apply(lambda _x: _x['filename'])
    _df_docs['count_messages'] = _df_docs['prompt'].apply(lambda _x: len(_x['messages']))
    _df_docs['therapist_level'] = _df_docs['prompt'].apply(lambda _x: _x['therapist_level'])
    _df_docs['predicted'] = _df_docs['_previous'].apply(lambda _x: _x['choices'][0]['message']['content'])
    _df_docs['reference'] = _df_docs['prompt'].apply(lambda _x: _x['true_response'])
    _df_docs['predicted_words_count'] = _df_docs['predicted'].apply(lambda _x: len(_x.split()))
    _df_docs['reference_words_count'] = _df_docs['reference'].apply(lambda _x: len(_x.split()))

In [18]:
_cols = ['therapist_level', 'predicted', 'reference']
pd.concat([_df_docs_poor[_cols], _df_docs_mediocre[_cols], _df_docs_expert[_cols]]).to_csv('data to eval mauve.csv',
                                                                                           index=False)

In [9]:
_bleu = evaluate.load("bleu")
_rouge = evaluate.load("rouge")
_meteor = evaluate.load("meteor")
_bertscore = evaluate.load("bertscore", device='cuda:0')
_frugalscore = evaluate.load("frugalscore", "moussaKam/frugalscore_medium_bert-base_mover-score")
_google_bleu = evaluate.load("google_bleu")
_bleurt = evaluate.load("bleurt", module_type="metric")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stav3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stav3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\stav3\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  return self.fget.__get__(instance, owner)()
Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').



INFO:tensorflow:Reading checkpoint C:\Users\stav3\.cache\huggingface\metrics\bleurt\default\downloads\extracted\1dfb731fe2846298242021b3971e53cd2b22233cb0c4fdd87721b208630c396f\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [10]:
_dfs = {
    'poor': _df_docs_poor,
    'mediocre': _df_docs_mediocre,
    'expert': _df_docs_expert
}

_eval = defaultdict(defaultdict)

for _k, _df in tqdm(_dfs.items(), desc='Evaluating'):
    _predictions = list(_df['predicted'])
    _references = list(_df['reference'])

    _eval[_k]['predicted_utterances_count'] = len(_df)
    _eval[_k]['predicted_words_count_mean'] = np.mean(_df['predicted_words_count'])
    _eval[_k]['predicted_words_count_std'] = np.std(_df['predicted_words_count'])
    _eval[_k]['reference_words_count_mean'] = np.mean(_df['reference_words_count'])
    _eval[_k]['reference_words_count_std'] = np.std(_df['reference_words_count'])

    _eval[_k]['bleu'] = _bleu.compute(predictions=_predictions, references=_references)
    _eval[_k]['rouge'] = _rouge.compute(predictions=_predictions, references=_references)
    _eval[_k]['meteor'] = _meteor.compute(predictions=_predictions, references=_references)
    _eval[_k]['bertscore'] = _bertscore.compute(predictions=_predictions, references=_references, lang="en")
    _eval[_k]['frugalscore'] = _frugalscore.compute(predictions=_predictions, references=[_x[0] for _x in _references],
                                                    batch_size=16, max_length=64, device="gpu")
    _eval[_k]['google_bleu'] = _google_bleu.compute(predictions=_predictions, references=_references)
    _eval[_k]['bleurt'] = _bleurt.compute(predictions=_predictions, references=[_x[0] for _x in _references])

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6652 [00:00<?, ? examples/s]



Map:   0%|          | 0/6652 [00:00<?, ? examples/s]

Evaluating:  67%|██████▋   | 2/3 [07:32<03:46, 226.53s/it]

Map:   0%|          | 0/6652 [00:00<?, ? examples/s]

Evaluating: 100%|██████████| 3/3 [11:09<00:00, 223.29s/it]


In [16]:
_df_result = pd.DataFrame.from_dict(_eval, orient='index')
_df_result

Unnamed: 0,predicted_utterances_count,predicted_words_count_mean,predicted_words_count_std,reference_words_count_mean,reference_words_count_std,bleu,rouge,meteor,bertscore,frugalscore,google_bleu,bleurt
poor,6652,32.564943,25.818824,25.904991,29.444664,"{'bleu': 0.014572493807103633, 'precisions': [...","{'rouge1': 0.17350544135109638, 'rouge2': 0.02...",{'meteor': 0.16170356837938385},"{'precision': [0.8835488557815552, 0.842295408...","{'scores': [-0.32910156, -0.2919922, -0.329101...",{'google_bleu': 0.043398185133269394},"{'scores': [-2.2996315956115723, -2.1110601425..."
mediocre,6652,30.733464,23.318381,25.904991,29.444664,"{'bleu': 0.015430180836158321, 'precisions': [...","{'rouge1': 0.17817508589510853, 'rouge2': 0.02...",{'meteor': 0.16146440462682232},"{'precision': [0.8938807249069214, 0.875382423...","{'scores': [-0.3552246, -0.2841797, -0.3276367...",{'google_bleu': 0.0446229537624029},"{'scores': [-2.424213171005249, -2.41297531127..."
expert,6652,32.985869,23.53478,25.904991,29.444664,"{'bleu': 0.015577436781898855, 'precisions': [...","{'rouge1': 0.17897434545731436, 'rouge2': 0.03...",{'meteor': 0.16763884330223833},"{'precision': [0.9014953970909119, 0.857551753...","{'scores': [-0.33984375, -0.2944336, -0.329101...",{'google_bleu': 0.044768307157906424},"{'scores': [-1.892256498336792, -1.99957525730..."


In [17]:
_list = _df_result.columns.tolist()

for _i in range(0, len(_list), 3):
    _list[_i], _list[_i + 1], _list[_i + 2] = _list[_i + 2], _list[_i], _list[_i + 1]

_df_result = _df_result[_list]
_df_result

Unnamed: 0,predicted_words_count_std,predicted_utterances_count,predicted_words_count_mean,bleu,reference_words_count_mean,reference_words_count_std,bertscore,rouge,meteor,bleurt,frugalscore,google_bleu
poor,25.818824,6652,32.564943,"{'bleu': 0.014572493807103633, 'precisions': [...",25.904991,29.444664,"{'precision': [0.8835488557815552, 0.842295408...","{'rouge1': 0.17350544135109638, 'rouge2': 0.02...",{'meteor': 0.16170356837938385},"{'scores': [-2.2996315956115723, -2.1110601425...","{'scores': [-0.32910156, -0.2919922, -0.329101...",{'google_bleu': 0.043398185133269394}
mediocre,23.318381,6652,30.733464,"{'bleu': 0.015430180836158321, 'precisions': [...",25.904991,29.444664,"{'precision': [0.8938807249069214, 0.875382423...","{'rouge1': 0.17817508589510853, 'rouge2': 0.02...",{'meteor': 0.16146440462682232},"{'scores': [-2.424213171005249, -2.41297531127...","{'scores': [-0.3552246, -0.2841797, -0.3276367...",{'google_bleu': 0.0446229537624029}
expert,23.53478,6652,32.985869,"{'bleu': 0.015577436781898855, 'precisions': [...",25.904991,29.444664,"{'precision': [0.9014953970909119, 0.857551753...","{'rouge1': 0.17897434545731436, 'rouge2': 0.03...",{'meteor': 0.16763884330223833},"{'scores': [-1.892256498336792, -1.99957525730...","{'scores': [-0.33984375, -0.2944336, -0.329101...",{'google_bleu': 0.044768307157906424}


In [18]:
for _k in ['precision', 'recall', 'f1']:
    _df_result[f'bertscore_{_k}_mean'] = _df_result['bertscore'].apply(lambda _x: np.mean(_x[_k]))
_df_result[f'bertscore_{_k}_std'] = _df_result['bertscore'].apply(lambda _x: np.std(_x[_k]))

_df_result[f'frugalscore_{_k}_mean'] = _df_result['frugalscore'].apply(lambda _x: np.mean(_x['scores']))
_df_result[f'frugalscore_{_k}_std'] = _df_result['frugalscore'].apply(lambda _x: np.std(_x['scores']))

_df_result[f'bleurt_{_k}_mean'] = _df_result['bleurt'].apply(lambda _x: np.mean(_x['scores']))
_df_result[f'bleurt_{_k}_std'] = _df_result['bleurt'].apply(lambda _x: np.std(_x['scores']))
_df_result = _df_result[sorted(_df_result.columns.tolist())]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df_result[f'bertscore_{_k}_mean'] = _df_result['bertscore'].apply(lambda _x: np.mean(_x[_k]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df_result[f'bertscore_{_k}_mean'] = _df_result['bertscore'].apply(lambda _x: np.mean(_x[_k]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df_result[f'be

In [19]:
_df_result

Unnamed: 0,bertscore,bertscore_f1_mean,bertscore_f1_std,bertscore_precision_mean,bertscore_recall_mean,bleu,bleurt,bleurt_f1_mean,bleurt_f1_std,frugalscore,frugalscore_f1_mean,frugalscore_f1_std,google_bleu,meteor,predicted_utterances_count,predicted_words_count_mean,predicted_words_count_std,reference_words_count_mean,reference_words_count_std,rouge
poor,"{'precision': [0.8835488557815552, 0.842295408...",0.849103,0.026098,0.848257,0.850691,"{'bleu': 0.014572493807103633, 'precisions': [...","{'scores': [-2.2996315956115723, -2.1110601425...",-1.938766,0.442929,"{'scores': [-0.32910156, -0.2919922, -0.329101...",-0.297933,0.056519,{'google_bleu': 0.043398185133269394},{'meteor': 0.16170356837938385},6652,32.564943,25.818824,25.904991,29.444664,"{'rouge1': 0.17350544135109638, 'rouge2': 0.02..."
mediocre,"{'precision': [0.8938807249069214, 0.875382423...",0.849891,0.025526,0.849094,0.851425,"{'bleu': 0.015430180836158321, 'precisions': [...","{'scores': [-2.424213171005249, -2.41297531127...",-1.979516,0.420758,"{'scores': [-0.3552246, -0.2841797, -0.3276367...",-0.299029,0.054311,{'google_bleu': 0.0446229537624029},{'meteor': 0.16146440462682232},6652,30.733464,23.318381,25.904991,29.444664,"{'rouge1': 0.17817508589510853, 'rouge2': 0.02..."
expert,"{'precision': [0.9014953970909119, 0.857551753...",0.849881,0.025005,0.848371,0.85207,"{'bleu': 0.015577436781898855, 'precisions': [...","{'scores': [-1.892256498336792, -1.99957525730...",-1.997368,0.367337,"{'scores': [-0.33984375, -0.2944336, -0.329101...",-0.302099,0.048178,{'google_bleu': 0.044768307157906424},{'meteor': 0.16763884330223833},6652,32.985869,23.53478,25.904991,29.444664,"{'rouge1': 0.17897434545731436, 'rouge2': 0.03..."
