# Code to evaluate the performance of an LLM for the purpose of a Smart Home Assistant

In [1]:
# Install LangChain into the environment of the running kernel.
# NOTE(review): the version is unpinned. The import paths used in the next cell
# (langchain.llms, top-level PromptTemplate) may not exist in newer releases --
# TODO confirm and pin the version that produced the recorded results.
%pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [2]:
import langchain, json, os
from IPython.display import display, Markdown
from langchain.llms import HuggingFaceTextGenInference
from langchain import PromptTemplate
from langchain.chains import LLMChain


In [1]:
# Base URL of the inference server passed to HuggingFaceTextGenInference.
# Set the INFERENCE_SERVER_URL environment variable to target a different
# server (e.g. 'http://localhost:8080') without editing this cell.
# Relies on `os` from the imports cell above.
INFERENCE_SERVER_URL = os.environ.get(
    "INFERENCE_SERVER_URL",
    "https://7u2objd8gw5y1t-80.proxy.runpod.net",  # falcon-40b-instruct
)
# Directory whose files are loaded as evaluation prompt sets (one category per file).
EVAL_PROMPT_DIR = "./prompts/"
# Number of times each test input is sent to the model; every attempt is scored.
RETRYS = 5


In [3]:
def get_all_file_names(directory_path):
    """Return the names of the regular files directly inside ``directory_path``.

    Sub-directories are skipped and the listing is not recursive. Names are
    returned in whatever order ``os.listdir`` yields them.
    """
    file_names = []
    for entry in os.listdir(directory_path):
        full_path = os.path.join(directory_path, entry)
        if os.path.isfile(full_path):
            file_names.append(entry)
    return file_names

def clean(in_string):
    """Remove a wrapping ``<pre><code>`` / ``</code></pre>`` pair from ``in_string``.

    The opening tag is removed only if the string starts with it, and the
    closing tag only if the string ends with it; everything in between is
    returned untouched.

    ``str.strip(chars)`` is intentionally not used: it treats its argument as
    a *set of characters* and would also eat payload characters such as
    ``c``, ``o``, ``d``, ``e``, ``p`` or ``r`` at either end of the content.
    """
    prefix = "<pre><code>"
    suffix = "</code></pre>"
    out_string = in_string
    if out_string.startswith(prefix):
        out_string = out_string[len(prefix):]
    if out_string.endswith(suffix):
        out_string = out_string[:-len(suffix)]
    return out_string

def evaluate_single_model(llm, instruction_token = "", user_token = "", response_token = ""):
    """Run every prompt file in EVAL_PROMPT_DIR against ``llm`` and count exact JSON matches.

    Each file in EVAL_PROMPT_DIR is parsed as JSON and treated as one category,
    keyed by its file name. Every entry of a file supplies a ``"prompt"``
    template and a list of ``"in_out_pairs"``; each pair has an ``"input"`` and
    an ``"expected_answer"`` (a JSON string). Every pair is sent to the model
    RETRYS times and *each* attempt is scored: it passes when the model output
    parses as JSON and equals the parsed expected answer, and fails otherwise.
    Failing attempts are printed for inspection.

    Args:
        llm: language model handed to ``LLMChain``.
        instruction_token: text placed in front of the prompt template.
        user_token: value substituted for the template's
            ``user_request_token`` variable.
        response_token: text appended after the prompt template.

    Returns:
        A dict ``{"pass": int, "fail": int, "category_results": {...}}`` where
        ``category_results`` maps each file name to its own
        ``{"pass": int, "fail": int}`` counters.

    Raises:
        ValueError: if a prompt file or an ``expected_answer`` is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    separator = "------------------------------------------------------------------------------------"

    # Load all files in EVAL_PROMPT_DIR; the file name doubles as the category name.
    eval_prompts = {}
    for file_name in get_all_file_names(EVAL_PROMPT_DIR):
        with open(os.path.join(EVAL_PROMPT_DIR, file_name), encoding="utf-8") as json_file:
            eval_prompts[file_name] = json.load(json_file)

    results = {"pass": 0, "fail": 0, "category_results": {}}

    for category, category_values in eval_prompts.items():
        category_results = {"pass": 0, "fail": 0}

        for single_prompt_case in category_values:
            # Create the actual prompt template: instruction + file prompt + response marker.
            prompt = PromptTemplate(
                input_variables = ["input", "user_request_token"],
                template = "".join([instruction_token, single_prompt_case["prompt"], response_token]))
            # The chain depends only on the llm and the prompt, so build it once per prompt.
            chain = LLMChain(llm=llm, prompt=prompt)

            for case in single_prompt_case["in_out_pairs"]:
                llm_input = case["input"]
                # Parse the expected answer once; it is the same for every attempt.
                llm_expected_output_json = json.loads(case["expected_answer"])

                for _ in range(RETRYS):
                    # NOTE: the raw model output is parsed as-is; clean() is not applied.
                    llm_output = chain.run({"input": llm_input, "user_request_token": user_token})
                    try:
                        llm_output_json = json.loads(llm_output)
                    except (ValueError, TypeError):
                        # Output is not parseable JSON: count the attempt as failed.
                        # Only parse errors are caught so Ctrl-C still interrupts the run.
                        results["fail"] += 1
                        category_results["fail"] += 1

                        print(llm_input)
                        print(llm_output)
                        print(llm_expected_output_json)
                        print(separator)
                        continue

                    if llm_expected_output_json == llm_output_json:
                        results["pass"] += 1
                        category_results["pass"] += 1
                    else:
                        results["fail"] += 1
                        category_results["fail"] += 1

                        print(llm_input)
                        print(llm_output_json)
                        print(llm_expected_output_json)
                        print(llm_output_json == llm_expected_output_json)
                        print(separator)
        results["category_results"][category] = category_results

    return results

def eval():
    """Evaluate the model served at INFERENCE_SERVER_URL and print the result counters.

    Builds a HuggingFaceTextGenInference client with fixed generation
    settings, runs ``evaluate_single_model`` with the instruction/response
    markers below, and prints the returned results dict.

    NOTE: defining this function shadows Python's built-in ``eval`` in the
    notebook namespace.
    """
    generation_settings = {
        "max_new_tokens": 100,
        "stop_sequences": ["USER REQUEST", "USER", "\n\n"],
        "top_k": 10,
        "top_p": 0.95,
        "typical_p": 0.95,
        "temperature": 0.1,
        "repetition_penalty": 1.1,
    }
    model = HuggingFaceTextGenInference(
        inference_server_url=INFERENCE_SERVER_URL,
        **generation_settings,
    )
    langchain.debug = False

    # NOTE(review): "Insturction" is misspelled, but it is part of the prompt
    # sent to the model; presumably the recorded results used this exact text,
    # so it is left unchanged -- confirm before correcting.
    summary = evaluate_single_model(
        model,
        instruction_token="",
        user_token="### Insturction: How would the JSON look for this USER REQUEST: ",
        response_token="\n\n### Response:\n",
    )
    print(summary)
    

# Run the evaluation; failing attempts and the final pass/fail counters are printed below.
eval()

  from .autonotebook import tqdm as notebook_tqdm


Kill the Study Light.
{'device': 'Study Light', 'action': 'kill'}
{'device': 'Study Light', 'action': 'turn off'}
False
------------------------------------------------------------------------------------
Kill the Study Light.
{'device': 'Study Light', 'action': 'kill'}
{'device': 'Study Light', 'action': 'turn off'}
False
------------------------------------------------------------------------------------
Kill the Study Light.
{'device': 'Study Light', 'action': 'kill'}
{'device': 'Study Light', 'action': 'turn off'}
False
------------------------------------------------------------------------------------
Kill the Study Light.
{'device': 'Study Light', 'action': 'kill'}
{'device': 'Study Light', 'action': 'turn off'}
False
------------------------------------------------------------------------------------
Kill the Study Light.
{'device': 'Study Light', 'action': 'kill'}
{'device': 'Study Light', 'action': 'turn off'}
False
------------------------------------------------------------

# Results

**TODO:** Rerun all medium_wo_examples evaluations.

format: model_name: Pass - Fail

## easy.json

EleutherAI/gpt-j-6b: 55 - 45

pythia-12b-deduped: 63 - 37

Platypus2-7b: 81 - 19 

Platypus2-13b: 96 - 4

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 92 - 8

Falcon-40b: 100 - 0

## easy_wo_examples.json

EleutherAI/gpt-j-6b: 5 - 95

pythia-12b-deduped: 19 - 81

Platypus2-7b: 38 - 64

Platypus2-13b: 50 - 50

Platypus2-70b-instruct: 50 - 50

Falcon-40b-instruct: 48 - 52

Falcon-40b: 50 - 50

## medium.json

EleutherAI/gpt-j-6b: 73 - 27

pythia-12b-deduped: 74 - 26

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 100 - 0

Falcon-40b: 100 - 0

## medium_wo_examples.json

EleutherAI/gpt-j-6b: 75 - 25

pythia-12b-deduped: 2 - 98 (updated)

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 100 - 0

Falcon-40b: 100 - 0

## medium_w_mistakes.json

pythia-12b-deduped: 79 - 21

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

## long.json

pythia-12b-deduped: 73 - 27

Platypus2-7b: 95 - 5

Platypus2-13b: 95 - 5


## long_wo_examples.json

pythia-12b-deduped: 15 - 85

Platypus2-7b: 49 - 51

Platypus2-13b: 85 - 15


# Inference Hardware

Platypus2-7b: 1x RTX 4090

Platypus2-13b: 2x RTX 3090

Falcon-40b(-instruct): 4x RTX 3090

Platypus2-70b-instruct: 2x A100 80GB