## Overview
This folder aims to provide a single script that computes all the required metrics for both IFT and EFT data, given the input file and the name of its target column.

### IFT Metrics
1. ROUGE-1-R
2. ROUGE-L-R
3. ROUGE-L-F
4. BLEURT
5. BERTScore-F
6. BERTScore-R
7. UMLS-F
8. LLM-as-a-Judge rating - with GPT-4o-mini
9. Win rate (likely to be more reliable when judged with GPT-4o)

All metrics run in a single environment, `eval_metrics`, with the following packages added:
- bert_score==0.3.13
- bleurt==0.0.2
- tensorflow==2.16.1
- rouge-score==0.1.2

In [None]:
import pandas as pd
import torch

# load rouge score module
from rouge_score import rouge_scorer

# BERTScore: build the shared scorer once; IFTEvalMetrics._bertscore reads this global.
from bert_score import BERTScorer

bert_scorer = BERTScorer(
    model_type="microsoft/deberta-xlarge-mnli",
    lang="en",
    rescale_with_baseline=True,
)

# # load bleurt-score module
# from bleurt import score
# checkpoint = "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Evaluation_Metrics/ClinicalBLEURT/BLEURT-20"
# # checkpoint = "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Evaluation_Metrics/ClinicalBLEURT/ClinicalBLEURT"
# bleurt_scorer = score.BleurtScorer(checkpoint)
# print("Loaded BLEURT-20 checkpoint!!!!")

KeyboardInterrupt: 

: 

In [20]:
# from umls import AutomaticFactEval
# umls_scorer = AutomaticFactEval()

from gpt4o_mini_as_a_judge import llm_as_a_judge_prompt, get_openai_response, get_final_score

In [18]:
class IFTEvalMetrics:
    """Compute IFT evaluation metrics for one CSV of references and predictions.

    The CSV is loaded with ``pandas.read_csv``. Reference texts are read from
    column ``target_name`` and model outputs from column ``prediction_name``;
    missing values in either column are replaced by empty strings. Every
    metric method writes its result into the ``self.res`` dictionary.

    The scorers are module-level globals, not attributes of this class:
    ``rouge_scorer`` (``_rouge``), ``bleurt_scorer`` (``_bleurt``),
    ``bert_scorer`` (``_bertscore``), ``umls_scorer`` (``_umls``) and the
    ``gpt4o_mini_as_a_judge`` helpers (``_llm_as_a_judge``). Calling a metric
    whose scorer has not been defined fails with ``NameError``.
    """

    def __init__(self, datapath, target_name: str, prediction_name: str):
        """Load the data and cache the reference / prediction texts.

        Args:
            datapath: Anything ``pandas.read_csv`` accepts (path or buffer).
            target_name: Column holding the reference text.
            prediction_name: Column holding the generated text.
        """
        self.data = pd.read_csv(datapath)
        self.target_name = target_name
        self.prediction_name = prediction_name
        self.target = self.data[target_name].fillna("").tolist()
        self.generated = self.data[prediction_name].fillna("").tolist()
        self.res = {}

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or NaN when it is empty."""
        if len(values) == 0:
            return float('nan')
        return sum(values) / len(values)

    def _rouge(self) -> None:
        """Store corpus-averaged ROUGE-1/2/L as (precision, recall, f) tuples.

        Results go to ``self.res['rouge-1']``, ``['rouge-2']`` and
        ``['rouge-l']``. Each component is NaN when there are no rows.
        """
        # result key in self.res -> metric name understood by RougeScorer
        rouge_names = {'rouge-1': 'rouge1', 'rouge-2': 'rouge2', 'rouge-l': 'rougeL'}
        rouge_instance = rouge_scorer.RougeScorer(list(rouge_names.values()), use_stemmer=True)

        row_scores = [
            rouge_instance.score(reference, prediction)
            for reference, prediction in zip(self.target, self.generated)
        ]

        for result_key, rouge_name in rouge_names.items():
            # Positions 0, 1, 2 of each score are precision, recall, f-measure.
            self.res[result_key] = tuple(
                self._mean([scores[rouge_name][position] for scores in row_scores])
                for position in range(3)
            )

    def _bleurt(self) -> None:
        """Store the mean BLEURT score in ``self.res['bleurt']`` (NaN if no rows)."""
        bleurt_scores = bleurt_scorer.score(references=self.target, candidates=self.generated)
        torch.cuda.empty_cache()

        self.res['bleurt'] = self._mean(bleurt_scores)

    def _bertscore(self) -> None:
        """Store mean BERTScore precision / recall / F in ``self.res``.

        ``BERTScorer.score`` expects candidates first and references second.
        The arguments are passed by keyword so that the generated text is the
        candidate; passing the references first (as this method previously
        did) swaps the reported precision and recall.
        """
        bert_P, bert_R, bert_F = bert_scorer.score(cands=self.generated, refs=self.target)
        self.res['bertscore_p'] = bert_P.mean().item()
        self.res['bertscore_r'] = bert_R.mean().item()
        self.res['bertscore_f'] = bert_F.mean().item()
        torch.cuda.empty_cache()

    def _umls(self) -> None:
        """Store the ``'UMLS_cuis_f'`` value of the UMLS scorer in ``self.res['umls_f']``."""
        umls_scores = umls_scorer.run_source_concept_faithfulness(ref_sums=self.target, gen_sums=self.generated)
        self.res['umls_f'] = umls_scores['UMLS_cuis_f']

    def _llm_as_a_judge(self, judge_model: str = 'gpt-4o-mini') -> None:
        """Rate every prediction with an LLM judge and store the mean score.

        Requires a ``dialogue`` column; without it a notice is printed and
        nothing is stored. Otherwise the columns ``judge_prompt``,
        ``gpt_4o_mini_responses`` and ``gpt_4o_mini_scores`` are added to
        ``self.data`` (the column names do not depend on ``judge_model``) and
        the mean score is written to ``self.res['llm_as_a_judge']``. Scores
        that are not numeric are ignored in the mean; the mean is NaN when
        there are no rows.

        Args:
            judge_model: Model name forwarded to ``get_openai_response``.
        """
        if 'dialogue' not in self.data.columns:
            print("LLM as a judge is not applicable")
            return

        # Debug output; the first-row preview is skipped for an empty frame,
        # where ``iloc[0]`` would raise IndexError.
        if len(self.data) > 0:
            print(self.data.iloc[0]["dialogue"])
        print(self.data.columns)

        # Built from the raw prediction column, not the NaN-filled self.generated.
        self.data['judge_prompt'] = [
            llm_as_a_judge_prompt(dialogue, prediction)
            for dialogue, prediction in zip(self.data['dialogue'], self.data[self.prediction_name])
        ]

        all_responses = []
        all_scores = []
        for judge_prompt in self.data['judge_prompt']:
            try:
                response = get_openai_response(prompt=judge_prompt, model=judge_model)
            except Exception as exc:
                # Best effort: keep going with an empty response, but show the
                # cause. ``Exception`` lets KeyboardInterrupt stop the loop.
                print(f"########## Issue with OpenAI API: {exc!r}")
                response = ""

            all_responses.append(response)
            all_scores.append(get_final_score(response))

        self.data['gpt_4o_mini_responses'] = all_responses
        self.data['gpt_4o_mini_scores'] = all_scores

        # Coerce before averaging so an object-dtype column cannot break the mean.
        numeric_scores = pd.to_numeric(self.data['gpt_4o_mini_scores'], errors='coerce')
        self.res['llm_as_a_judge'] = numeric_scores.mean()

    def _winrate(self) -> None:
        """Placeholder: win rate is not implemented."""
        pass

    def run(self, judge_model: str = 'gpt-4o-mini', metrics=('llm_as_a_judge',)) -> dict:
        """Compute the requested metrics and return the results dictionary.

        Args:
            judge_model: Model name used by the LLM-as-a-Judge metric.
            metrics: Names of the metrics to compute, in order. Allowed names
                are ``'rouge'``, ``'bleurt'``, ``'bertscore'``, ``'umls'`` and
                ``'llm_as_a_judge'``. A single name may be given as a string.
                The default computes only the LLM-as-a-Judge score.

        Returns:
            ``self.res``, holding every metric computed so far.

        Raises:
            ValueError: If ``metrics`` contains an unknown name.
        """
        if isinstance(metrics, str):
            metrics = (metrics,)
        else:
            metrics = tuple(metrics)

        # metric name -> (method, message printed once it has finished)
        steps = {
            'rouge': (self._rouge, "Rouge Computed"),
            'bleurt': (self._bleurt, "BLEURT Computed"),
            'bertscore': (self._bertscore, "BERTScore Computed"),
            'umls': (self._umls, "UMLS Computed"),
        }

        unknown = [name for name in metrics if name != 'llm_as_a_judge' and name not in steps]
        if unknown:
            raise ValueError(f"Unknown metric(s): {unknown}")

        for name in metrics:
            if name == 'llm_as_a_judge':
                print("LLM-as-a-Judge computing.....")
                self._llm_as_a_judge(judge_model=judge_model)
                print("LLM-as-a-Judge Computed")
            else:
                compute, done_message = steps[name]
                compute()
                print(done_message)

        return self.res
        

In [16]:
def get_final_results(results_dict):
    """Print the LLM-as-a-Judge score found in ``results_dict``.

    This variant reports only the judge score; the ROUGE, BLEURT, BERTScore
    and UMLS lines are not printed here.

    Args:
        results_dict: Mapping of metric name to value, as returned by
            ``IFTEvalMetrics.run``. The judge score is read from the key
            ``'llm_as_a_judge'``.

    The key is absent when the judge metric was skipped (``run`` returns an
    empty dict for data without a ``dialogue`` column), so a placeholder is
    printed in that case instead of raising ``KeyError``.
    """
    if 'llm_as_a_judge' in results_dict:
        print("LLM-as-a-Judge: ", results_dict['llm_as_a_judge'])
    else:
        print("LLM-as-a-Judge: ", "not computed")

In [6]:
# One entry per evaluated model: (label, prediction column, path of the predictions CSV).
all_ift_models = [
    (
        "gpt4", "gpt4_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/GPT4_Test_Predictions/GPT4_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023_correct_prompt.csv",
    ),
    (
        "opus", "claude_opus_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Opus_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "sonet", "claude_sonet_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Sonet_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "gpt-4o-mini", "gpt_4o_mini_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT-4o-mini/GPT-4o-mini_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "gpt3", "gpt3_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT3/GPT3_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "haiku", "claude_haiku_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Haiku_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "R1-M0", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m0_mistral/ift_performance/m0_mistral.csv",
    ),
    (
        "R1-M1", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m1_mistral/ift_performance/M1_low_eft_data-0.4_ep-2_rank-32.csv",
    ),
    (
        "R1-M2", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m2_mistral/ift_performance/M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2.csv",
    ),
    (
        "R1-M3", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m3_mistral/ift_performance/M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M4", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m4_mistral/ift_performance/M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M5", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m5_mistral/ift_performance/M5_model_M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-64.csv",
    ),
]

In [19]:
import pandas as pd

# Position in all_ift_models of the model to evaluate (3 = gpt-4o-mini, 5 = haiku).
idx = 5
eval_metric = IFTEvalMetrics(
    datapath=all_ift_models[idx][2],
    target_name="section_text",
    prediction_name=all_ift_models[idx][1],
)

# This call computes only the LLM-as-a-Judge score, then prints it.
results = eval_metric.run(judge_model="gpt-4o-mini")
get_final_results(results)

LLM-as-a-Judge computing.....
Doctor: Hello, how are you today?
Patient: Not good. 
Doctor: What happened? 
Patient: I have a lot of congestion. I also am coughing a lot. It feels like I am choking on something.
Index(['ID', 'section_header', 'section_text', 'dialogue',
       'gpt4_few_shot_prompt', 'max_tokens', 'claude_haiku_predictions'],
      dtype='object')
LLM-as-a-Judge Computed
LLM-as-a-Judge:  -1.0


In [None]:
# Notebook display of the metrics dict assigned to `results` by an earlier cell.
results

In [None]:
import pandas as pd

idx = 3  # not used below: this cell evaluates the explicit CSV path in curr_model

curr_model = "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/mix_models/experiments/m2_mistral_tuning/ift_performance/M2_model_base-M1_low_eft_data-0.4_ep-2_rank-32_gen-M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_beta-0.1.csv"
eval_metric = IFTEvalMetrics(
    datapath=curr_model,
    target_name="section_text",
    prediction_name="biomistral_responses",
)

# Score the predictions in that CSV and print the outcome.
results = eval_metric.run(judge_model="gpt-4o-mini")
get_final_results(results)

In [None]:
# gpt-4o-mini - 'bertscore_r': 0.3961123526096344, 'bertscore_f': 0.3974961042404175

In [None]:
# datapath="/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/GPT4_Test_Predictions/GPT4_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023_correct_prompt.csv"
# target_name='section_text'
# prediction_name='gpt4_predictions'
# print(f"sbatch eval.sh {datapath} {target_name} {prediction_name} no_judge")

In [None]:
def get_final_results(results_dict):
    """Print the headline metrics found in ``results_dict``.

    Args:
        results_dict: Mapping of metric name to value, as returned by
            ``IFTEvalMetrics.run``. ``'rouge-1'`` and ``'rouge-l'`` are
            indexed as sequences (position 1 is printed as "R", position 2
            as "F"); the other values are printed as they are.

    Each metric is looked up on its own, so a missing metric is skipped
    without hiding the ones listed after it. The LLM-as-a-Judge line is
    always printed, with a placeholder when the score is absent.
    """
    # (printed label, key in results_dict, position inside the value or None)
    report_lines = (
        ("Rouge-1-R: ", 'rouge-1', 1),
        ("Rouge-L-R: ", 'rouge-l', 1),
        ("Rouge-L-F: ", 'rouge-l', 2),
        ("BLEURT: ", 'bleurt', None),
        ("BERTScore-F: ", 'bertscore_f', None),
        ("UMLS-F: ", 'umls_f', None),
    )
    for label, key, position in report_lines:
        if key not in results_dict:
            continue
        value = results_dict[key]
        print(label, value if position is None else value[position])

    if 'llm_as_a_judge' in results_dict:
        print("LLM-as-a-Judge: ", results_dict['llm_as_a_judge'])
    else:
        print("LLM-as-a-Judge: ", "not computed")

In [None]:
# NOTE(review): gpt4_results is only assigned in a later cell of this notebook; run that cell first.
get_final_results(gpt4_results)

### All IFT Models

In [None]:
# One entry per evaluated model: (label, prediction column, path of the predictions CSV).
all_ift_models = [
    (
        "gpt4", "gpt4_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/GPT4_Test_Predictions/GPT4_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023_correct_prompt.csv",
    ),
    (
        "opus", "claude_opus_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Opus_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "sonet", "claude_sonet_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Sonet_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "gpt-4o-mini", "gpt_4o_mini_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT-4o-mini/GPT-4o-mini_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "gpt3", "gpt3_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT3/GPT3_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "haiku", "claude_haiku_predictions",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/Claude_Haiku_MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv",
    ),
    (
        "R1-M0", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m0_mistral/ift_performance/m0_mistral.csv",
    ),
    (
        "R1-M1", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m1_mistral/ift_performance/M1_low_eft_data-0.4_ep-2_rank-32.csv",
    ),
    (
        "R1-M2", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m2_mistral/ift_performance/M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2.csv",
    ),
    (
        "R1-M3", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m3_mistral/ift_performance/M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M4", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m4_mistral/ift_performance/M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M5", "biomistral_responses",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m5_mistral/ift_performance/M5_model_M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-64.csv",
    ),
]

In [None]:
import pandas as pd

# all_ift_models[5] is the "haiku" entry, so despite its name `gpt4_results`
# receives the results for that model's predictions.
eval_metric = IFTEvalMetrics(
    datapath=all_ift_models[5][2],
    target_name="section_text",
    prediction_name=all_ift_models[5][1],
)

gpt4_results = eval_metric.run(judge_model="gpt-4o-mini")

In [None]:
# Print the metrics stored in gpt4_results by the preceding cell.
get_final_results(gpt4_results)

### EFT Metrics
1. Pearson
2. Spearman Rank
3. Kendall's Tau
4. Micro F1
5. Macro F1

In [None]:
# One entry per evaluated model: (label, score column, path of the scores CSV).
all_eft_models = [
    (
        "opus", "claude_opus_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/val_data_rank_responses_with_gpt4_scores_and_claude_opus_scores.csv",
    ),
    (
        "sonet", "claude_sonet_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/val_data_rank_responses_with_gpt4_scores_and_claude_sonet_scores.csv",
    ),
    (
        "gpt-4o-mini", "gpt_4o_mini_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT-4o-mini/val_data_rank_responses_with_gpt4_scores_and_gpt_4o_mini_scores.csv",
    ),
    (
        "gpt3", "gpt3_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT3/val_data_rank_responses_with_gpt4_scores_and_gpt3_scores.csv",
    ),
    (
        "haiku", "claude_haiku_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/Claude/val_data_rank_responses_with_gpt4_scores_and_claude_haiku_scores.csv",
    ),
    (
        "R1-M0", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m0_mistral/eft_performance/m0_mistral.csv",
    ),
    (
        "R1-M1", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m1_mistral/eft_performance/M1_low_eft_data-0.4_ep-2_rank-32.csv",
    ),
    (
        "R1-M2", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m2_mistral/eft_performance/M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2.csv",
    ),
    (
        "R1-M3", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m3_mistral/eft_performance/M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M4", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m4_mistral/eft_performance/M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16.csv",
    ),
    (
        "R1-M5", "biomistral_scores",
        "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/summer_2024/low_eft_data/experiments/m5_mistral/eft_performance/M5_model_M4_model_M3_model_M2_model_M1_low_eft_data-0.4_ep-2_rank-32_temp-1.0_lr-5e-05_ep-2_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-16_temp-1.0_lr-5e-05_ep-_rank-64.csv",
    ),
]

In [None]:
# Print the eft_eval.py command line for entry 2 of all_eft_models ("gpt-4o-mini"): CSV path first, then score column.
print(f"python eft_eval.py {all_eft_models[2][2]} {all_eft_models[2][1]}")

In [None]:
# balanced_eft_data = pd.read_csv("/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Self-Rewarding-LM/analysis/GPT3_vs_Sonet/eft_test_data_balanced_with_gpt3_scores.csv")
# # balanced_eft_data[['ID', 'section_header', 'dialogue', 'section_text', "llm_as_a_judge_prompt", "gpt4_scores"]].to_csv("balanced_eft_test_data.csv", index=False)