# Code to evaluate the performance of an LLM for the purpose of a Smart Home Assistant

In [1]:
%pip install langchain rwkv tokenizer openai

Note: you may need to restart the kernel to use updated packages.


In [1]:
import langchain, json, os, importlib
from IPython.display import display, Markdown
from langchain.llms import HuggingFaceTextGenInference, OpenAIChat
from langchain import PromptTemplate
from langchain.chains import LLMChain 


In [2]:
# Base URL of the inference server that eval() connects to.
INFERENCE_SERVER_URL = 'http://localhost:8080'
#INFERENCE_SERVER_URL = 'http://localhost:8000' # RWKV-Runner
#INFERENCE_SERVER_URL = 'https://6q7c8186vl9z3s-80.proxy.runpod.net' # runpod
# Directory whose files are loaded as JSON evaluation prompts by evaluate_single_model().
EVAL_PROMPT_DIR = "./prompts/"
# Number of times each test case is sent to the model; every attempt is tallied as a pass or a fail.
RETRYS = 5


In [8]:
def get_all_file_names(directory_path):
    """Return the names of the regular files located directly in ``directory_path``.

    Sub-directories are skipped and the listing is not recursive. The names are
    returned without the directory prefix, in the order ``os.listdir`` yields them.
    """
    file_names = []
    for entry in os.listdir(directory_path):
        full_path = os.path.join(directory_path, entry)
        if os.path.isfile(full_path):
            file_names.append(entry)
    return file_names

def fake_json_for_UL2(in_string):
    """Add curly braces to brace-less model output so it can be parsed as JSON.

    Every "[" becomes "[{" and every "]" becomes "}]"; the whole string is then
    enclosed in one outer "{" ... "}" pair. The function performs only this
    textual substitution and does not validate the result.
    """
    with_inner_braces = in_string.replace("[", "[{")
    with_inner_braces = with_inner_braces.replace("]", "}]")
    return "".join(("{", with_inner_braces, "}"))
 
def clean(in_string):
    """Strip surrounding chatter from model output, keeping the outermost ``{...}`` span.

    Args:
        in_string: Raw text returned by the model.

    Returns:
        * The text from the first "{" up to and including the last "}" that
          follows it, when both exist.
        * The text from the first "{" to the end when no closing "}" follows,
          so that the malformed output stays visible in the failure log instead
          of being reduced to an empty string.
        * ``in_string`` unchanged when it contains no "{" at all.
    """
    first_position = in_string.find("{")

    if first_position < 0:
        # No opening brace, so there is nothing to extract.
        out_string = in_string
    else:
        # Only a closing brace to the right of the opening one can end the span.
        last_position = in_string.rfind("}", first_position)

        if last_position < 0:
            # Unterminated object: drop the leading text but keep the rest as-is.
            out_string = in_string[first_position:]
        else:
            # Keep everything from the first "{" up to and including the last "}".
            out_string = in_string[first_position:last_position + 1]

    #out_string = fake_json_for_UL2(out_string)
    return out_string

def evaluate_single_model(llm, instruction_token = "", user_token = "", response_token = ""):
    """Run every prompt file in EVAL_PROMPT_DIR against ``llm`` and tally exact-match results.

    Each file is expected to hold a JSON list of prompt cases. A prompt case is an
    object with a "prompt" template string and an "in_out_pairs" list; each pair
    has an "input" string and an "expected_answer" string containing JSON.

    Every pair is sent to the model RETRYS times. Each attempt is counted on its
    own as a pass or a fail (this is repeated sampling, not retry-until-success).
    An attempt passes when the cleaned model output parses as JSON and equals the
    parsed expected answer. Failed attempts are printed for inspection.

    Args:
        llm: Language model object handed to ``LLMChain``.
        instruction_token: Text placed in front of the prompt template.
        user_token: Value substituted for the ``user_request_token`` template variable.
        response_token: Text appended after the prompt template.

    Returns:
        ``{"pass": int, "fail": int, "category_results": {file_name: {"pass": int, "fail": int}}}``

    Raises:
        Whatever file loading, template construction or the chain call raises.
        Only JSON parse errors of the model output are treated as a failed attempt.
    """
    # Load every file in EVAL_PROMPT_DIR and parse it as JSON.
    eval_prompts = {}
    for file_name in get_all_file_names(EVAL_PROMPT_DIR):
        with open(os.path.join(EVAL_PROMPT_DIR, file_name), encoding="utf-8") as json_file:
            eval_prompts[file_name] = json.load(json_file)

    results = {"pass": 0, "fail": 0, "category_results": {}}

    for category, category_values in eval_prompts.items():
        category_results = {"pass": 0, "fail": 0}

        for single_prompt_case in category_values:
            # Create the actual prompt template: instruction prefix + prompt + response suffix.
            # Blank lines are collapsed to single newlines
            # (presumably because "\n\n" is used as a stop sequence - TODO confirm).
            prompt = PromptTemplate(
                input_variables = ["input", "user_request_token"],
                template = "".join([instruction_token, single_prompt_case["prompt"].replace("\n\n", "\n"), response_token]))
            # The chain depends only on the model and the template, so one instance serves all pairs.
            chain = LLMChain(llm=llm, prompt=prompt)

            for case in single_prompt_case["in_out_pairs"]:
                llm_input = case["input"]
                # The expected answer never changes between attempts, so parse it once.
                llm_expected_output_json = json.loads(case["expected_answer"])

                for _ in range(RETRYS):
                    llm_output = chain.run({"input": llm_input,"user_request_token": user_token})
                    llm_output = clean(llm_output)
                    try:
                        llm_output_json = json.loads(llm_output)
                    except ValueError:
                        # json.JSONDecodeError is a ValueError: unparseable output is a failed attempt.
                        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
                        results["fail"] += 1
                        category_results["fail"] += 1

                        print(llm_input)
                        print(llm_output)
                        print(llm_expected_output_json)
                        print("------------------------------------------------------------------------------------")
                        continue

                    if llm_expected_output_json == llm_output_json:
                        results["pass"] += 1
                        category_results["pass"] += 1

                    else:
                        results["fail"] += 1
                        category_results["fail"] += 1

                        print(llm_input)
                        print(llm_output_json)
                        print(llm_expected_output_json)
                        print(llm_output_json == llm_expected_output_json)
                        print("------------------------------------------------------------------------------------")
        results["category_results"][category] = category_results


    return results

def eval():
    """Evaluate the model served at INFERENCE_SERVER_URL and print the result tallies.

    Builds a ``HuggingFaceTextGenInference`` client with fixed sampling settings,
    runs ``evaluate_single_model`` with the currently selected prompt tokens and
    prints the returned dictionary. Returns nothing.

    NOTE(review): the name shadows Python's built-in ``eval``; it is kept because
    the notebook invokes ``eval()``.
    """
    # get llm
    llm = HuggingFaceTextGenInference(
        inference_server_url=INFERENCE_SERVER_URL,
        max_new_tokens=100,
        stop_sequences=["USER REQUEST", "USER", "\n\n", "Q:"],
        top_k=10,
        top_p=0.95,
        typical_p=0.95,
        temperature=0.1,
        repetition_penalty=1.1,
    )

    # Alternative client for an OpenAI-compatible server. It was constructed on every call
    # without being used, so it is kept only as a reference; pass it instead of ``llm`` if needed.
    #llm_openai = OpenAIChat(openai_api_base=INFERENCE_SERVER_URL, openai_api_key="sk-", model="rwkv")

    langchain.debug = False
    # Prompt token variants; exactly one evaluate_single_model call should be active.
    #results = evaluate_single_model(llm, "### Instruction: \n","USER REQUEST: ", "\n### Response:\n")
    #results = evaluate_single_model(llm, "", "### Instruction:\n How would the JSON look for this USER REQUEST: ", "\n### Response:\n") # used for mpt
    #results = evaluate_single_model(llm, "Instruction: \n", "Input: How would the JSON look for this USER REQUEST: ", "\nResponse: \n") # used for RWKV and OpenLLaMA and bloom
    results = evaluate_single_model(llm, "Instruction: \n", "Q: How would the JSON look for this USER REQUEST: ", "\nA:") # used for Pythia and mpt(long and long wo examples)
    print(results)
    

# Run the evaluation; failed attempts and the final tallies are printed as cell output.
eval()

Turn on the Study Room Light.
 The response will be similar to that of turning on any other lights with some additional fields like studyRoomLight which can have values such as true or false
{'type': 'command', 'commands': [{'device': 'light.study_room', 'action': 'power', 'state': 'on'}]}
------------------------------------------------------------------------------------
Switch off the Bedroom Light.
 The response will be empty because there's no such command
{'type': 'command', 'commands': [{'device': 'light.bedroom', 'action': 'power', 'state': 'off'}]}
------------------------------------------------------------------------------------
Change the color of the Garage Light to red.
 The response will be similar to below with one difference that instead of state we use action which has value 'color'
{'type': 'command', 'commands': [{'device': 'light.garage', 'action': 'color', 'state': 'red'}]}
------------------------------------------------------------------------------------
Chang

# Results

format: model_name: Pass - Fail




## easy.json

EleutherAI/gpt-j-6b: 55 - 45

pythia-12b-deduped: 64 - 36

Platypus2-7b: 81 - 19 

Platypus2-13b: 96 - 4

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 100 - 0

Falcon-40b: 100 - 0

mpt-30b-instruct: 95 - 5

stablelm-base-alpha-7b-v2: 49 - 51

Cerebras-GPT-13B: 40 - 60

Bloom-7b1: 27 - 73

Bloom-7b1 (custom prompt): 76 - 24

open_llama_13b: 95 - 5

flan-ul2: 0 - 100

flan-ul2 (fake json): 100 - 0

## easy_wo_examples.json

EleutherAI/gpt-j-6b: 5 - 95

pythia-12b-deduped: 19 - 81

Platypus2-7b: 38 - 64

Platypus2-13b: 50 - 50

Platypus2-70b-instruct: 50 - 50

Falcon-40b-instruct: 49 - 51

Falcon-40b: 50 - 50

mpt-30b-instruct: 51 - 49

stablelm-base-alpha-7b-v2: 5 - 95

Cerebras-GPT-13B: 15 - 85

Bloom-7b1: 26 - 74

Bloom-7b1 (custom prompt): 30 - 61

open_llama_13b: 47 - 53

flan-ul2: 0 - 100

flan-ul2 (fake json): 52 - 48

## medium.json

EleutherAI/gpt-j-6b: 73 - 27

pythia-12b-deduped: 74 - 26

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 97 - 3

Falcon-40b: 100 - 0

mpt-30b-instruct: 57 - 43

mpt-30b-instruct (custom prompt): 100 - 0

stablelm-base-alpha-7b-v2: 54 - 46

Cerebras-GPT-13B: 0 - 100

Bloom-7b1: 17 - 83

Bloom-7b1 (custom prompt): 63 - 37

open_llama_13b: 100 - 0

flan-ul2: 0 - 100

flan-ul2 (fake json): 95 - 0

## medium_wo_examples.json

EleutherAI/gpt-j-6b: 75 - 25

pythia-12b-deduped: 3 - 97 (updated)

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

Platypus2-70b-instruct: 92 - 8 (updated)

Falcon-40b-instruct: 92 - 8 (updated)

Falcon-40b: 82 - 18 (updated)

mpt-30b-instruct: 0 - 100 (updated)

mpt-30b-instruct (custom prompt): 40 - 60

stablelm-base-alpha-7b-v2: 0 - 100

Cerebras-GPT-13B: 13 - 87

Bloom-7b1: 0 - 100

Bloom-7b1 (custom prompt): 0 - 100

open_llama_13b: 47 - 53

flan-ul2: 0 - 100

flan-ul2 (fake json): 0 - 100

## medium_w_mistakes.json

pythia-12b-deduped: 79 - 21

Platypus2-7b: 100 - 0

Platypus2-13b: 100 - 0

Platypus2-70b-instruct: 100 - 0

Falcon-40b-instruct: 100 - 0

Falcon-40b: 100 - 0

mpt-30b-instruct: 96 - 4

stablelm-base-alpha-7b-v2: 39 - 61

Cerebras-GPT-13B: 0 - 100

Bloom-7b1: 13 - 87

Bloom-7b1 (custom prompt): 64 - 36

open_llama_13b: 100 - 0

flan-ul2: 0 - 100

flan-ul2 (fake json): 95 - 5

## long.json

pythia-12b-deduped: 95 - 5

Platypus2-7b: 95 - 5

Platypus2-13b: 95 - 5

Platypus2-70b-instruct: 95 - 5

Falcon-40b-instruct: 95 - 5

Falcon-40b: 100 - 0

mpt-30b-instruct: 46 - 54

stablelm-base-alpha-7b-v2: 21 - 79

Cerebras-GPT-13B: 0 - 100

Bloom-7b1: 0 - 100

Bloom-7b1 (custom prompt): 60 - 40

open_llama_13b: 95 - 5

flan-ul2: 0 - 100

flan-ul2 (fake json): 97 - 3

## long_wo_examples.json

pythia-12b-deduped: 15 - 85

Platypus2-7b: 49 - 51

Platypus2-13b: 85 - 15

Platypus2-70b-instruct: 90 - 10

Falcon-40b-instruct: 79 - 21

Falcon-40b: 66 - 34

mpt-30b-instruct: 0 - 100

stablelm-base-alpha-7b-v2: 0 - 100

Cerebras-GPT-13B: 0 - 100

Bloom-7b1: 0 - 100

Bloom-7b1 (custom prompt): 8 - 92

open_llama_13b: 66 - 34

flan-ul2: 0 - 100

flan-ul2 (fake json): 36 - 64

# Inference Hardware

gpt-j-6b: 1x RTX 4090

Platypus2-7b: 1x RTX 4090

Platypus2-13b: 2x RTX 3090

Platypus2-70b-instruct: 2x A100 80GB

pythia-12b-deduped: 2x RTX 3090

Falcon-40b(-instruct): 4x RTX 3090



# Notes

## mpt-30b-instruct

custom input prompt: "### Instruction\n How would the response look for this USER REQUEST: "
This improves the results, but they are still poor. Main problem: the model adds comments to its JSON output, so the output cannot be parsed.
Example:

input: Switch on the Bedroom Light.
output: { 
  "device":"light.bedroom",   // Device id of bedroom light
  "action":"turn on"         // Action requested by user - turn on
}
expected: {'device': 'light.bedroom', 'action': 'turn on'}