In [3]:
import pandas as pd
import numpy as np
import json
import re
import os
from openai import OpenAI
from anthropic import Anthropic

#### Initialize Stuff

In [4]:
# Build the three API clients used by the evaluator functions in this notebook.
# Keys are read from environment variables so that real credentials never have
# to be typed into (and saved with) the notebook. The literal "API_KEY"
# placeholder is kept as the fallback so the cell still runs when a variable
# is not set.
together = OpenAI(
    api_key=os.environ.get("TOGETHER_API_KEY", "API_KEY"),
    base_url="https://api.together.xyz/v1",
)
client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", "API_KEY"))
gpt = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "API_KEY"))

In [5]:
# Directory holding one "<model>_results.json" predictions file per model
directory = "../final"

# Keep only the result files. Every entry of file_list is later passed to
# pd.read_json and has its trailing "_results.json" sliced off to recover the
# model name, so stray entries (checkpoint folders, OS metadata files, ...)
# must not end up in the list.
# The os.listdir order is left untouched because `able` is matched to this
# list by position.
file_list = [name for name in os.listdir(directory) if name.endswith("_results.json")]

In [43]:
# Short per-model tags, looked up by position alongside file_list in the
# reason-ablation loop's progress message (order must match file_list).
able = [
    '_ll70B.json',
    '_mix.json',
    '_gpt4.json',
    '_mis.json',
    '_gpt3.json',
    '_ll7B.json',
    '_ll13B.json',
]

In [7]:
# Display the entries found in `directory` (these are the files the evaluation loops iterate over)
file_list

['Llama-2-70b-chat-hf_results.json',
 'Mixtral-8x7B-Instruct-v0.1_results.json',
 'gpt-4-0125-preview_results.json',
 'Mistral-7B-Instruct-v0.2_results.json',
 'gpt-3.5-turbo-0125_results.json',
 'Llama-2-7b-chat-hf_results.json',
 'Llama-2-13b-chat-hf_results.json']

#### Claude Eval

In [65]:
def get_eval(context,question,prediction,answer) -> tuple:
    """Ask Claude whether `prediction` means the same thing as `answer`.

    The context and question are combined into one question block, and the
    model is asked to explain its reasoning inside <reasoning> tags and then
    give a YES/NO verdict inside <result> tags.

    Args:
        context: Passage the question refers to.
        question: Question that was posed to the evaluated model.
        prediction: Output of the evaluated model.
        answer: Ground-truth answer.

    Returns:
        A ``(result, reason)`` tuple. ``result`` is 1.0 for YES, 0.0 for NO and
        -1.0 when no usable verdict could be extracted. ``reason`` is the text
        found inside the <reasoning> tags, or " " when the tags are missing.
    """
    QUESTION = f"""{context}\n Given the above context, Answer the question that follows:\n {question}"""
    MODEL_OUTPUT = prediction
    GROUND_TRUTH = answer

    instruction = f'''<Instructions> You will be evaluating whether a language model's output to a question means essentially the same thing as the provided ground truth answer, even if not worded identically.

The question is: <question> {QUESTION} </question>

The language model's output is: <model_output> {MODEL_OUTPUT} </model_output>

The ground truth answer is: <ground_truth> {GROUND_TRUTH} </ground_truth>

To determine if the model output means the same thing as the ground truth:

Carefully read and understand the meaning of the model output and ground truth. Ignore minor wording or formatting differences.

Consider whether the key facts, ideas, and opinions expressed in the model output align with those in the ground truth.

The model output doesn't need to cover every single detail in the ground truth. As long as it captures the main points without saying anything contradictory, that is sufficient.

First, explain your reasoning: <reasoning> </reasoning>

Then, output your final result, which should be either: <result>YES</result> if the model output captures the key meaning of the ground truth, or <result>NO</result> if the model output fails to capture the essential meaning, or contradicts the ground truth in some way. </Instructions>'''

    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        temperature=0.2,
        messages=[
            {"role": "user", "content": instruction}
        ]
    )

    text = message.content[0].text

    match_reason = re.search(r'<reasoning>\s*(.*?)\s*</reasoning>', text, re.DOTALL)
    match_result = re.search(r'<result>\s*(.*?)\s*</result>', text, re.DOTALL)

    reason = match_reason.group(1) if match_reason else " "

    # Start from the "could not parse" score so that a <result> tag holding
    # something other than YES/NO no longer leaves `result` unassigned.
    result = -1.0
    if match_result:
        # Compare case-insensitively: "Yes"/"no" are still clear verdicts.
        verdict = match_result.group(1).strip().upper()
        if verdict == "YES":
            result = 1.0
        elif verdict == "NO":
            result = 0.0

    return result,reason

#### Open Source LLM eval

This function runs the evaluation with an open-source LLM served through the Together API.

In [None]:
# Model identifier passed to the Together chat-completions call in get_eval_op
MODEL = "NousResearch/Nous-Hermes-2-Yi-34B"

In [48]:
def get_eval_op(context,question,prediction,answer) -> tuple:
    """Judge `prediction` against `answer` with the model named by `MODEL` via the `together` client.

    The model is asked to explain its reasoning inside <reasoning> tags and
    then give a YES/NO verdict inside <result> tags. The raw reply is printed.

    Args:
        context: Passage the question refers to.
        question: Question that was posed to the evaluated model.
        prediction: Output of the evaluated model.
        answer: Ground-truth answer.

    Returns:
        A ``(result, reason)`` tuple. ``result`` is 1.0 for YES, 0.0 for NO and
        -1.0 when no usable verdict could be extracted. ``reason`` is the text
        found inside the <reasoning> tags, or " " when the tags are missing.
    """
    QUESTION = f"""{context}\n Given the above context, Answer the question that follows:\n {question}"""
    MODEL_OUTPUT = prediction
    GROUND_TRUTH = answer

    instruction = f'''<Instructions> You will be evaluating whether a language model's output to a question means essentially the same thing as the provided ground truth answer, even if not worded identically.

The question is: <question> {QUESTION} </question>

The language model's output is: <model_output> {MODEL_OUTPUT} </model_output>

The ground truth answer is: <ground_truth> {GROUND_TRUTH} </ground_truth>

To determine if the model output means the same thing as the ground truth:

Carefully read and understand the meaning of the model output and ground truth. Ignore minor wording or formatting differences.

Consider whether the key facts, ideas, and opinions expressed in the model output align with those in the ground truth.

The model output doesn't need to cover every single detail in the ground truth. As long as it captures the main points without saying anything contradictory, that is sufficient.

First, explain your reasoning within the tags <reasoning> </reasoning>

Then, output your final result, which should be either: <result>YES</result> if the model output captures the key meaning of the ground truth, or <result>NO</result> if the model output fails to capture the essential meaning, or contradicts the ground truth in some way. </Instructions>'''
    response = together.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "user", "content": instruction}],
        temperature=0.0
    )

    text = response.choices[0].message.content
    print(text)

    match_reason = re.search(r'<reasoning>\s*(.*?)\s*</reasoning>', text, re.DOTALL)
    match_result = re.search(r'<result>\s*(.*?)\s*</result>', text, re.DOTALL)

    reason = match_reason.group(1) if match_reason else " "

    # Start from the "could not parse" score so that a <result> tag holding
    # something other than YES/NO no longer leaves `result` unassigned.
    result = -1.0
    if match_result:
        # Compare case-insensitively: "Yes"/"no" are still clear verdicts.
        verdict = match_result.group(1).strip().upper()
        if verdict == "YES":
            result = 1.0
        elif verdict == "NO":
            result = 0.0

    return result,reason


#### GPT_eval

The function that uses GPT-4 to perform evaluation

In [None]:
# NOTE(review): this definition reuses the name of the Together-based
# evaluator defined earlier in the notebook, so once this cell has run the
# name `get_eval_op` refers to this GPT-4 version. Consider giving the two
# evaluators distinct names.
def get_eval_op(context,question,prediction,answer) -> tuple:
    """Judge `prediction` against `answer` with 'gpt-4-0125-preview' via the `gpt` client.

    The model is asked to explain its reasoning inside <reasoning> tags and
    then give a YES/NO verdict inside <result> tags. The raw reply is printed.

    Args:
        context: Passage the question refers to.
        question: Question that was posed to the evaluated model.
        prediction: Output of the evaluated model.
        answer: Ground-truth answer.

    Returns:
        A ``(result, reason)`` tuple. ``result`` is 1.0 for YES, 0.0 for NO and
        -1.0 when no usable verdict could be extracted. ``reason`` is the text
        found inside the <reasoning> tags, or " " when the tags are missing.
    """
    QUESTION = f"""{context}\n Given the above context, Answer the question that follows:\n {question}"""
    MODEL_OUTPUT = prediction
    GROUND_TRUTH = answer

    instruction = f'''<Instructions> You will be evaluating whether a language model's output to a question means essentially the same thing as the provided ground truth answer, even if not worded identically.

The question is: <question> {QUESTION} </question>

The language model's output is: <model_output> {MODEL_OUTPUT} </model_output>

The ground truth answer is: <ground_truth> {GROUND_TRUTH} </ground_truth>

To determine if the model output means the same thing as the ground truth:

Carefully read and understand the meaning of the model output and ground truth. Ignore minor wording or formatting differences.

Consider whether the key facts, ideas, and opinions expressed in the model output align with those in the ground truth.

The model output doesn't need to cover every single detail in the ground truth. As long as it captures the main points without saying anything contradictory, that is sufficient.

First, explain your reasoning within the tags <reasoning> </reasoning>

Then, output your final result, which should be either: <result>YES</result> if the model output captures the key meaning of the ground truth, or <result>NO</result> if the model output fails to capture the essential meaning, or contradicts the ground truth in some way. </Instructions>'''
    response = gpt.chat.completions.create(
        model='gpt-4-0125-preview',
        messages=[
            {"role": "user", "content": instruction}],
        temperature=0.0
    )

    text = response.choices[0].message.content
    print(text)

    match_reason = re.search(r'<reasoning>\s*(.*?)\s*</reasoning>', text, re.DOTALL)
    match_result = re.search(r'<result>\s*(.*?)\s*</result>', text, re.DOTALL)

    reason = match_reason.group(1) if match_reason else " "

    # Start from the "could not parse" score so that a <result> tag holding
    # something other than YES/NO no longer leaves `result` unassigned.
    result = -1.0
    if match_result:
        # Compare case-insensitively: "Yes"/"no" are still clear verdicts.
        verdict = match_result.group(1).strip().upper()
        if verdict == "YES":
            result = 1.0
        elif verdict == "NO":
            result = 0.0

    return result,reason


#### Reason Ablation  

In [69]:
def get_eval_able(context,question,prediction,answer) -> float:
    """Reason-ablation judge: ask for a YES/NO verdict without a reasoning step.

    Sends the prompt to 'NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO' through
    the `together` client, prints the raw reply and converts it to a score.

    Args:
        context: Passage the question refers to.
        question: Question that was posed to the evaluated model.
        prediction: Output of the evaluated model.
        answer: Ground-truth answer.

    Returns:
        1.0 for YES, 0.0 for NO, -1.0 when neither verdict is found.
    """
    QUESTION = f"""{context}\n Given the above context, Answer the question that follows:\n {question}"""
    MODEL_OUTPUT = prediction
    GROUND_TRUTH = answer

    instruction = f'''<Instructions> You will be evaluating whether a language model's output to a question means essentially the same thing as the provided ground truth answer, even if not worded identically.

The question is <question> {QUESTION} </question>    
The language model's output is: <model_output> {MODEL_OUTPUT} </model_output>

The ground truth answer is: <ground_truth> {GROUND_TRUTH} </ground_truth>

To determine if the model output means the same thing as the ground truth:

Carefully read and understand the meaning of the model output and ground truth. Ignore minor wording or formatting differences.

Consider whether the key facts, ideas, and opinions expressed in the model output align with those in the ground truth.

The model output doesn't need to cover every single detail in the ground truth. As long as it captures the main points without saying anything contradictory, that is sufficient.

Output your final result, which should be either: <result>YES</result> if the model output captures the key meaning of the ground truth or <result>NO</result> if the model output fails to capture the essential meaning, or contradicts the ground truth in some way. </Instructions>'''
    response = together.chat.completions.create(
        model='NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
        messages=[
            {"role": "user", "content": instruction}],
        temperature=0.0
    )

    text = response.choices[0].message.content
    print(text)

    # Prefer the explicit <result> tag the prompt asks for; only when the model
    # ignored the tag format fall back to scanning the whole reply.
    tagged = re.search(r'<result>\s*(.*?)\s*</result>', text, re.DOTALL)
    verdict_source = tagged.group(1) if tagged else text

    # Whole-word matching: a plain substring search would read "cannot",
    # "not" or "know" as NO and "eyes" as YES.
    if re.search(r'(?i)\bYES\b', verdict_source):
        result = 1.0
    elif re.search(r'(?i)\bNO\b', verdict_source):
        result = 0.0
    else:
        result = -1.0

    return result

#### Traditional Metrics

In [None]:
# Load the BERTScore and BLEURT metric objects once; the BERTscore() and
# BLEURT() helpers below reuse these module-level objects on every call.
# NOTE(review): `load` is not imported in any cell visible here — presumably
# `from evaluate import load` (Hugging Face evaluate); confirm and add it to
# the import cell so the notebook survives Restart & Run All.
bertscore = load("bertscore")
bleurt = load("bleurt",module_type="metric")

In [None]:
def exact_match(predictions,answer):
    """Return the exact-match score of one prediction against one reference.

    Args:
        predictions: A single predicted string (wrapped into a one-item list).
        answer: The reference string.

    Returns:
        The value stored under "exact_match" in the metric's result.
    """
    # This helper is applied row by row, so load the metric on the first call
    # only and keep it on the function object instead of reloading every time.
    em = getattr(exact_match, "_metric", None)
    if em is None:
        em = exact_match._metric = load("exact_match")
    result = em.compute(predictions=[predictions], references=[answer])
    return result["exact_match"]

    
def BLEU(predictions,answer):
    """Return the BLEU score of one prediction against one reference.

    Args:
        predictions: A single predicted string (wrapped into a one-item list).
        answer: The reference string.

    Returns:
        The value stored under "bleu" in the metric's result.
    """
    # This helper is applied row by row, so load the metric on the first call
    # only and keep it on the function object instead of reloading every time.
    bleu = getattr(BLEU, "_metric", None)
    if bleu is None:
        bleu = BLEU._metric = load("bleu")
    result = bleu.compute(predictions=[predictions], references=[answer])
    return result["bleu"]
   

def BLEURT(predictions,answer):
    """Score one prediction against one reference with the module-level `bleurt` metric.

    Returns the first (and only) entry of the metric's 'scores' list.
    """
    scores = bleurt.compute(predictions=[predictions], references=[answer])['scores']
    return scores[0]


def BERTscore(predictions,answer):
    """Return the BERTScore F1 of one prediction against one reference.

    Precision and recall come from the module-level `bertscore` metric
    (computed with lang='en'); F1 is their harmonic mean.

    Args:
        predictions: A single predicted string (wrapped into a one-item list).
        answer: The reference string.

    Returns:
        The F1 value for the pair, or 0.0 when precision + recall is 0.
    """
    result = bertscore.compute(predictions=[predictions], references=[answer], lang= 'en')
    p = np.asarray(result['precision'], dtype=float)
    r = np.asarray(result['recall'], dtype=float)
    denom = p + r
    # Guard the 0/0 case, which would otherwise produce NaN (and a runtime
    # warning) and propagate into the results column.
    score = np.divide(2 * p * r, denom, out=np.zeros_like(denom), where=denom != 0)
    return score[0]

#### Run on predictions

In [44]:
# Base records (JSON Lines, one object per line); later cells read its
# 'llm_context', 'question', 'answer' and 'human_eval' columns.
df = pd.read_json(
    '../results/original.json',
    orient='records',
    lines=True,
)

In [None]:
# Encode the human judgements numerically: 'Yes' -> 1, 'No' -> 0
df['human_eval'] = df['human_eval'].replace(
    to_replace={'Yes': 1, 'No': 0}
)

In [71]:
def _load_predictions(file_name):
    """Read one model's result file and expose its answers as a 'prediction' column.

    Files store the model output either under 'prediction' or under
    '<model>_pred', where <model> is the file name without '_results.json'.
    """
    predictions = pd.read_json(os.path.join(directory, file_name), orient='records', lines=True)
    # Explicit column check instead of a bare `except:`, which also hid
    # unrelated errors (and KeyboardInterrupt).
    if 'prediction' not in predictions.columns:
        model_column = file_name[:-len('_results.json')] + '_pred'
        predictions = predictions.rename(columns={model_column: 'prediction'})
    return predictions


# Scored frames are kept per input file; previously each iteration overwrote
# `predictions`, so everything except the last file's scores was lost.
default_results = {}
ablation_results = {}

# Default
for i, file_name in enumerate(file_list):
    predictions = _load_predictions(file_name)
    df['prediction'] = predictions['prediction']

    # NOTE(review): this "Default" pass calls get_eval_able, the same evaluator
    # as the reason-ablation pass below — confirm that this is intended.
    results = df.apply(lambda row: get_eval_able(row['llm_context'], row['question'],row['prediction'],row['answer']), axis=1)
    predictions['mix_eval'] = results

    # Traditional Metrics
    # The model output lives in the 'prediction' column (see _load_predictions);
    # the former row['predictions'] lookup does not match that column name.
    predictions['exact_match'] = predictions.apply(lambda row: exact_match(row['prediction'],row['answer']), axis=1)
    print('EM Done')
    predictions['BLEU'] = predictions.apply(lambda row: BLEU(row['prediction'],row['answer']), axis=1)
    print('BLEU Done')
    predictions['BERTScore'] = predictions.apply(lambda row: BERTscore(row['prediction'],row['answer']), axis=1)
    print('BERTscore Done')
    predictions['BLEURT'] = predictions.apply(lambda row: BLEURT(row['prediction'],row['answer']), axis=1)
    # Min-max scale BLEURT to [0, 1] within this file
    predictions['BLEURT_norm'] = (predictions['BLEURT'] - predictions['BLEURT'].min()) / (predictions['BLEURT'].max() - predictions['BLEURT'].min())
    print('BLEURT Done')

    default_results[file_name] = predictions
    print(f'[+] {i} Done...')


# Reason Ablations
for i, file_name in enumerate(file_list):
    predictions = _load_predictions(file_name)
    df['prediction'] = predictions['prediction']

    results = df.apply(lambda row: get_eval_able(row['llm_context'], row['question'],row['prediction'],row['answer']), axis=1)
    predictions['mix_eval'] = results

    ablation_results[file_name] = predictions
    # `able` is matched to file_list by position
    print(f'[+] reason{able[i]} Done...')

<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>NO</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
<result>YES</result>
