#  04 Evaluation

### Amazon Bedrock의 fine tuning된 모델 가져오기

In [None]:
import boto3
import sagemaker
import json

# The SageMaker session is used here to look up the AWS region name.
sess = sagemaker.Session()
region = sess.boto_region_name
# Bedrock runtime client that get_sql_query uses to invoke the fine-tuned model.
client = boto3.client("bedrock-runtime", region_name=region)
# Replace this placeholder with the ARN of your own fine-tuned model.
model_id = "<ENTER_YOUR_MODEL_ARN_HERE>"

# Fail fast while the placeholder above has not been replaced.
assert model_id != "<ENTER_YOUR_MODEL_ARN_HERE>", "ERROR: Please enter your model id"

def get_sql_query(system_prompt, user_question):
    """
    Ask the fine-tuned model (Llama 3 8B, per the original notebook) for a SQL query.

    The prompt layout must stay identical to the template used during
    fine tuning, so do not edit the f-string below independently of it.

    Args:
        system_prompt: Text placed inside the <<SYS>> section of the prompt.
        user_question: Natural-language question the query should answer.

    Returns:
        The "text" field of the first entry in the response's "outputs" list.
    """
    prompt = f"<s>[INST] <<SYS>>{system_prompt}<</SYS>>\n\n[INST]Human: Return the SQL query that answers the following question: {user_question}[/INST]\n\nAssistant:"

    # Serialise the request once; key order matches the original payload.
    payload = json.dumps({
        "prompt": prompt,
        "max_tokens": 100,
        "top_p": 0.9,
        "temperature": 0.1,
    })

    raw_response = client.invoke_model(modelId=model_id, body=payload)
    parsed_body = json.loads(raw_response.get('body').read())
    return parsed_body["outputs"][0]["text"]

### 테스트 데이터셋 가져오기

In [None]:
import pandas as pd

test_df = pd.read_json("../datasets/ko_test_dataset.json", lines=True)["messages"]

def extract_content(dicts, role):
    """Return the 'content' of the first message whose 'role' equals *role*.

    Args:
        dicts: Iterable of message dicts, each with 'role' and 'content' keys.
        role: Role name to look for.

    Returns:
        The matching message's content, or None when no message has that role.
    """
    matching_contents = (
        message['content'] for message in dicts if message['role'] == role
    )
    return next(matching_contents, None)

# Build one column per chat role; each cell holds that role's message text
# for the corresponding test record (None when the role is absent).
df = pd.DataFrame({
    role: test_df.apply(lambda x: extract_content(x, role))
    for role in ['system', 'user', 'assistant']
})
del test_df

# Keep only the first 100 records for evaluation.
df = df.head(100)

In [None]:
def _generate_sql_for_row(row):
    """Run the fine-tuned model on one test row's system prompt and question."""
    return get_sql_query(row['system'], row['user'])


# Store the model's answer for every row next to the reference answer.
df['llama'] = df.apply(_generate_sql_for_row, axis=1)
df.head()

- pandas로 데이터셋 미리보기

### Claude 3.5 Sonnet을 이용해 정확도 측정하기

In [None]:
# Helper function because Claude requires the Messages API

#for connecting with Bedrock, use Boto3
import boto3, time, json
from botocore.config import Config

# Allow up to three minutes (180 s) both to connect and to read a response.
my_config = Config(
    connect_timeout=180,
    read_timeout=180,
)
bedrock = boto3.client('bedrock-runtime', config=my_config)
bedrock_service = boto3.client('bedrock', config=my_config)

# Total number of calls ask_claude makes before it gives up.
MAX_ATTEMPTS = 3

def ask_claude(messages,system="", model_version="haiku"):
    '''
    Send a prompt to Claude on Bedrock via the Messages API and return the reply.

    Args:
        messages: Either a plain string (wrapped into a single user message)
            or an already-built list of Messages API message dicts.
        system: System prompt sent alongside the messages.
        model_version: Accepted for compatibility with existing callers but
            never read; the model id below is always used.

    Returns:
        A two-element list ``[raw_prompt_text, results]``. ``raw_prompt_text``
        is ``str(messages)`` as passed in. ``results`` is the text of the first
        content item of the reply, or, if every attempt failed, the message of
        the last exception (no exception is raised to the caller).
    '''
    # Capture the prompt before it is normalised into message dicts.
    raw_prompt_text = str(messages)

    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]

    request_body = json.dumps({
        "system": system,
        "messages": messages,
        "max_tokens": 3000,
        "temperature": 0.7,
        # Bedrock documents this exact value as required for Anthropic models.
        "anthropic_version": "bedrock-2023-05-31",
        "top_k": 250,
        "top_p": 0.7,
        "stop_sequences": ["\n\nHuman:"],
    })

    modelId = 'anthropic.claude-3-5-sonnet-20240620-v1:0'

    results = ""
    # Always try at least once, even if MAX_ATTEMPTS is misconfigured.
    total_attempts = max(1, MAX_ATTEMPTS)
    for attempt in range(1, total_attempts + 1):
        try:
            response = bedrock.invoke_model(body=request_body, modelId=modelId, accept='application/json', contentType='application/json')
            response_body = json.loads(response.get('body').read())
            results = response_body.get("content")[0].get("text")
            break
        except Exception as e:
            # Best effort: report the failure and hand the error text back
            # to the caller if no attempt succeeds.
            print("Error with calling Bedrock: "+str(e))
            results = str(e)
            if attempt >= total_attempts:
                print("Max attempts reached!")
            else:
                time.sleep(2)  # brief pause before retrying
    return [raw_prompt_text,results]

In [None]:
import re

def get_score(system, user, assistant, llama):
    """
    Ask Claude to grade the model's SQL answer against the reference answer.

    Args:
        system: System prompt of the test record (generic instructions + schema).
        user: User prompt of the test record (generic instructions + question).
        assistant: Reference (correct) answer.
        llama: Answer produced by the fine-tuned model.

    Returns:
        The text found between the <SCORE> tags of Claude's reply, with
        surrounding whitespace removed (a string; callers convert it).

    Raises:
        ValueError: If the reply contains no <SCORE>...</SCORE> element, e.g.
            because ask_claude returned an error message instead of an answer.
    """
    # NOTE(review): the offsets 139 and 58 assume fixed-length instruction
    # prefixes in the dataset prompts -- confirm they match the dataset in use.
    db_schema = system[139:] # Remove generic instructions
    question = user[58:] # Remove generic instructions
    correct_answer = assistant
    test_answer = llama
    formatted_prompt = f"""You are a data science teacher that is introducing students to SQL. Consider the following question and schema:
<question>{question}</question>
<schema>{db_schema}</schema>
    
Here is the correct answer:
<correct_answer>{correct_answer}</correct_answer>
    
Here is the student's answer:
<student_answer>{test_answer}</student_answer>

Please provide a numeric score from 0 to 100 on how well the student's answer matches the correct answer for this question.
The score should be high if the answers say essentially the same thing.
The score should be lower if some parts are missing, or if extra unnecessary parts have been included.
The score should be 0 for an entirely wrong answer. Put the score in <SCORE> XML tags.
Do not consider your own answer to the question, but instead score based only on the correct answer above.
"""
    _, result = ask_claude(formatted_prompt, model_version="sonnet")

    # DOTALL lets the score be matched even if the tags sit on separate lines.
    match = re.search(r'<SCORE>(.*?)</SCORE>', result, re.DOTALL)
    if match is None:
        raise ValueError(f"No <SCORE> tag found in Claude's reply: {result!r}")

    return match.group(1).strip()

### Score 보기

In [None]:
# Grade every row. Iterating the columns positionally keeps this correct even
# if the DataFrame index is not the default 0..n-1 range.
scores = [
    float(get_score(system, user, assistant, llama))
    for system, user, assistant, llama in zip(
        df["system"], df["user"], df["assistant"], df["llama"]
    )
]
print("Assigned scores: ", scores)
# Avoid ZeroDivisionError when there are no rows to grade.
average_score = sum(scores) / len(scores) if scores else float("nan")
print("Average score:", average_score)

- 참고: 테스트 결과, 아래와 같이 평균 점수 90.45점(100점 만점)을 기록했습니다.
    ```
    Assigned scores:  [80.0, 90.0, 90.0, 95.0, 50.0, 40.0, 100.0, 90.0, 100.0, 75.0, 80.0, 100.0, 100.0, 80.0, 80.0, 100.0, 60.0, 100.0, 90.0, 100.0, 100.0, 100.0, 80.0, 60.0, 100.0, 100.0, 100.0, 95.0, 75.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 95.0, 75.0, 60.0, 95.0, 100.0, 100.0, 100.0, 100.0, 90.0, 40.0, 100.0, 0.0, 90.0, 100.0, 100.0, 100.0, 100.0, 75.0, 80.0, 100.0, 100.0, 100.0, 100.0, 60.0, 95.0, 100.0, 60.0, 100.0, 100.0, 100.0, 100.0, 100.0, 50.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 90.0, 100.0, 100.0, 100.0, 100.0, 100.0, 90.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 50.0, 100.0, 60.0, 100.0, 100.0, 100.0]
    Average score: 90.45

    ```