# Example of how to use Azure OpenAI to compare two responses

In [1]:
import openai
import os

# Connection details for the Azure OpenAI resource, read from the environment.
# Any variable that is not set comes back as None.
AZURE_OPENAI_RESOURCE = os.getenv("AZURE_OPENAI_RESOURCE")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")

# Sampling settings; kept as raw strings here and converted to float/int at
# call time inside get_completion.
AZURE_OPENAI_TEMPERATURE = os.getenv("AZURE_OPENAI_TEMPERATURE")
AZURE_OPENAI_MAX_TOKENS = os.getenv("AZURE_OPENAI_MAX_TOKENS")

# Model name and deployment ("engine") name passed on every completion call.
AZURE_OPENAI_CHAT_COMPLETION_MODEL = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_MODEL")
AZURE_OPENAI_CHAT_COMPLETION_ENGINE = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_ENGINE")

# Point the openai module at the Azure endpoint built from the resource name.
openai.api_type = "azure"
openai.api_version = "2023-08-01-preview"
openai.api_base = f"https://{AZURE_OPENAI_RESOURCE}.openai.azure.com/"
openai.api_key = AZURE_OPENAI_KEY


def get_completion(input_prompt, printResult=True, returnWholeObject=False):
    """Send a prompt to the configured completion deployment.

    The request uses the module-level AZURE_OPENAI_* settings; temperature and
    max tokens are converted from their environment strings on every call, so
    an unset variable surfaces here as a TypeError from float()/int().

    Args:
        input_prompt: Full prompt text, including any <|im_start|>/<|im_end|>
            markers.
        printResult: When true, print the complete response object.
        returnWholeObject: When true, return the complete response object
            instead of only the generated text.

    Returns:
        The response object, or the ``text`` of its first choice.
    """
    request_args = {
        "prompt": input_prompt,
        "model": AZURE_OPENAI_CHAT_COMPLETION_MODEL,
        "engine": AZURE_OPENAI_CHAT_COMPLETION_ENGINE,
        "temperature": float(AZURE_OPENAI_TEMPERATURE),
        "max_tokens": int(AZURE_OPENAI_MAX_TOKENS),
        # Stop as soon as the model starts or ends another chat turn.
        "stop": ["<|im_end|>", "<|im_start|>"],
    }
    response = openai.Completion.create(**request_args)

    if printResult:
        print(response)

    return response if returnWholeObject else response["choices"][0]["text"]


In [None]:
# User turn of the evaluation prompt. {robot_response} and {human_response}
# are filled in with str.format below; the model is asked to answer in a
# "Question-N--Label--YES or NO--Reason" layout so the reply can be split on
# "Question-" and "--" afterwards.
qa_guidelines = """<|im_start|>user
Compare the robot response versus human response. The robot response is delimited by ###ROBOT### and 
the human response is delimited by ###HUMAN###.
Answer the next questions:
Question-1 : Is the robot's response correct?
Question-2 : Is the robot's response similar to the human answer?
For your responses follow the next format:
Question-1--Correctness--YES or NO--Summarize your arguments in 10 words
Question-2--Similarity--YES or NO--Summarize your arguments in 10 words

###ROBOT###
{robot_response}
###ROBOT###

###HUMAN###
{human_response}
###HUMAN###
<|im_end|>"""
# Opening of the assistant turn; the completion continues from here.
assistant_prompt = """<|im_start|>assistant
"""

# Sample answer produced by the bot under evaluation.
robot_answer = """Estimado Silvina Pliego, según la documentación consultada, 
es posible cambiar el rango de horario una vez que se ha optado por uno en el programa ANTI TRÁFICO (LATAM) 1 . 
Sin embargo, se espera que una vez que se haya optado por un rango, se mantenga estable por al menos un mes 
para una fluida organización. Por favor, si necesitas más información o si esta respuesta no responde 
completamente a tu pregunta, no dudes en proporcionar más detalles para que pueda ayudarte mejor."""

# Reference answer written by a person.
human_answer = """Sí, pero se requiere que el rango de horario laboral se mantenga por un mes"""

# Final prompt: filled-in user turn followed by the open assistant turn.
prompt = qa_guidelines.format(robot_response=robot_answer, human_response=human_answer) + assistant_prompt

print(prompt)

In [None]:
# Ask the model to evaluate the prompt; printResult=False suppresses the dump
# of the full response object, so only the generated text is printed below.
result = get_completion(input_prompt=prompt, printResult=False)

print(result)

In [None]:
# Bare expression: in a notebook cell this echoes the value of `result`
# (presumably to inspect the raw text, including newline characters).
result

In [None]:
# Drop line breaks, then cut the text at every "Question-" marker. The element
# before the first marker is whatever text preceded it (possibly empty).
parts = result.replace("\n", "").replace("\r", "").split("Question-")

In [None]:
# Parse the "Question-N--Label--YES or NO--Reason" reply into a list of dicts.
# The per-question verdict is stored in `passed` rather than `result`, so the
# completion text held in `result` is not overwritten and this cell can be
# re-run without calling the model again.
parts = result.replace("\n", "").replace("\r", "").split("Question-")
data = []
for i, part in enumerate(parts, start=1):
    part = part.strip()
    # maxsplit=3 keeps any "--" inside the reason text intact.
    info = [field.strip() for field in part.split("--", 3)]
    if len(info) < 4:
        # Empty leading element or text that does not follow the format.
        continue
    print(f"i: {i} part: {part} info: {info}")
    test_id, test, verdict, reason = info
    passed = 1 if verdict.upper() == "YES" else 0
    item = {
        "test_id": test_id,
        "test": test,
        "result": passed,
        "reason": reason
    }
    print(f"Item to add : {item}")
    data.append(item)
    


In [None]:
import json

def transform_open_ai_compare_to_json(input_string, debug=False):
    """Convert a ``Question-N--Label--Verdict--Reason`` reply into a JSON string.

    Line breaks are removed and the text is cut at every ``Question-`` marker.
    Each piece is then split on ``--`` into four fields. Fields are stripped of
    surrounding whitespace, because replies that echo an indented prompt carry
    leading spaces on every line. Pieces with fewer than four fields (an empty
    leading piece, or free text before the first marker) are skipped.

    Args:
        input_string: Text returned by the model. ``None`` or an empty string
            yields an empty JSON list.
        debug: When true, print intermediate values while parsing.

    Returns:
        A JSON array (``indent=2``) of objects with the keys ``test_id``,
        ``test``, ``result`` and ``reason``. ``result`` is 1 when the verdict
        field is ``YES`` (case-insensitive) and 0 for anything else, which
        includes non-boolean verdicts such as ``A``/``B`` or numeric scores.
    """
    data = []
    if debug:
        print(f" input string to process {input_string}")
    text = input_string or ""
    parts = text.replace("\n", "").replace("\r", "").split("Question-")
    if debug:
        print(f"parts : {parts}")
    for i, part in enumerate(parts, start=1):
        part = part.strip()
        # maxsplit=3 keeps any "--" inside the reason text intact.
        info = [field.strip() for field in part.split("--", 3)]
        if len(info) < 4:
            if debug and part:
                print(f"skipping malformed part: {part}")
            continue
        if debug:
            print(f"i: {i} part: {part} info: {info}")
        test_id, test, verdict, reason = info
        item = {
            "test_id": test_id,
            "test": test,
            "result": 1 if verdict.upper() == "YES" else 0,
            "reason": reason
        }
        if debug:
            print(f"Item to add : {item}")
        data.append(item)
    if debug:
        print(f"data to json : {data}")
    return json.dumps(data, indent=2)

In [None]:
# End-to-end check: request a completion for the current prompt, show the raw
# text, then convert it to JSON with debug output enabled.
result_to_test = get_completion(input_prompt=prompt, printResult=False)

print(result_to_test)
output_json = transform_open_ai_compare_to_json(result_to_test,debug=True)
print(output_json)

In [None]:
# Same steps as the previous cell: a new completion request followed by the
# JSON conversion, so the parser is exercised against another model reply.
result_to_test = get_completion(input_prompt=prompt, printResult=False)

print(result_to_test)
output_json = transform_open_ai_compare_to_json(result_to_test,debug=True)
print(output_json)

In [2]:
# Prompt template for judging two responses to the same query. It holds a user
# turn and the opening of the assistant turn, delimited by <|im_start|> and
# <|im_end|> markers. The placeholders {query}, {response_a} and {response_b}
# are filled with str.format. The four-space indentation is inside the literal
# and is therefore sent to the model as part of the prompt.
EVAL_QUERY_BETTER_RESPONSE = """<|im_start|>user
    Evaluate the response A against the query.
    Evaluate the response B against the query.
    Compare response A versus response B.
    The query is delimited by ###QUERY###.
    The response A is delimited by ###RESPONSE_A###. 
    The response B is delimited by ###RESPONSE_B###.
    Answer the next questions:
    Question-1 : Is the response A similar to response B?
    Question-2 : Is the response A correct?
    Question-3 : Is the response B correct?
    Question-4 : Which response is better?
    For the next questions, on a scale of 1 to 10, where 1 is the lowest score and 10 the highest, score simplicity 
    and relevant information.
    Question-5 : What would be the score for response A?
    Question-6 : What would be the score for response B?
    For your responses follow the next format:
    Question-1--Similarity--YES or NO--Summarize your arguments in 10 words
    Question-2--Response A Correctness--YES or NO--Summarize your arguments in 10 words
    Question-3--Response B Correctness--YES or NO--Summarize your arguments in 10 words
    Question-4--Better response--A or B--Summarize your arguments in 10 words
    Question-5--Score A --Number between 1 and 10--Summarize your arguments in 10 words
    Question-6--Score B --Number between 1 and 10--Summarize your arguments in 10 words

    ###QUERY###
    {query}
    ###QUERY###

    ###RESPONSE_A###
    {response_a}
    ###RESPONSE_A###
    
    ###RESPONSE_B###
    {response_b}
    ###RESPONSE_B###
    <|im_end|><|im_start|>assistant"""


In [3]:
# The user query both responses are answering.
question_1 = """¿Se puede cambiar el rango de horario una vez que elegí uno?"""

# Response A: the longer answer.
response_a = """Estimado Silvina Pliego, según la documentación consultada, 
es posible cambiar el rango de horario una vez que se ha optado por uno en el programa ANTI TRÁFICO (LATAM) 1 . 
Sin embargo, se espera que una vez que se haya optado por un rango, se mantenga estable por al menos un mes 
para una fluida organización. Por favor, si necesitas más información o si esta respuesta no responde 
completamente a tu pregunta, no dudes en proporcionar más detalles para que pueda ayudarte mejor."""

# Response B: the short answer.
response_b = """Sí, pero se requiere que el rango de horario laboral se mantenga por un mes"""

# Fill the three placeholders of the evaluation template.
prompt = EVAL_QUERY_BETTER_RESPONSE.format(
    query=question_1,
    response_a=response_a,
    response_b=response_b,
)

print(prompt)

<|im_start|>user
    Evaluate the response A against the query.
    Evaluate the response B against the query.
    Compare response A versus response B.
    The query is delimited by ###QUERY###.
    The response A is delimited by ###RESPONSE_A###. 
    The response B is delimited by ###RESPONSE_B###.
    Answer the next questions:
    Question-1 : Is the response A similar to response B?
    Question-2 : Is the response A correct?
    Question-3 : Is the response B correct?
    Question-4 : Which response is better?
    For the next questions, on a scale of 1 to 10, where 1 is the lowest score and 10 the highest, score simplicity 
    and relevant information.
    Question-5 : What would be the score for response A?
    Question-6 : What would be the score for response B?
    For your responses follow the next format:
    Question-1--Similarity--YES or NO--Summarize your arguments in 10 words
    Question-2--Response A Correctness--YES or NO--Summarize your arguments in 10 words
    Q

In [4]:
# Run the A/B evaluation prompt; only the generated text is returned and
# printed because printResult=False and returnWholeObject keeps its default.
result = get_completion(input_prompt=prompt, printResult=False)

print(result)


Question-1--Similarity--YES--Both responses mention the possibility of changing the schedule and the requirement of maintaining it for at least a month.
    Question-2--Response A Correctness--YES--The response provides a detailed explanation and offers further assistance.
    Question-3--Response B Correctness--YES--The response is concise and accurate.
    Question-4--Better response--A--Provides more information and offers further assistance.
    Question-5--Score A--8--The response is detailed and informative, but could be more concise.
    Question-6--Score B--7--The response is accurate and concise, but lacks additional information.
