# Evaluation
Evaluate LLM responses when there is a `single right answer`.

In [16]:
import os
import json
import openai
import sys
sys.path.append('../')
import scripts.utils as utils

# GUI
# Initialise panel's notebook extension so panel objects can render in this notebook.
import panel as pn
pn.extension()

# Getting credentials
# Credentials are read from a separate Config module instead of being hard-coded here.
# NOTE(review): presumably Config is a local, untracked file -- confirm it is never committed.
from Config import openaiConfig
openai.organization = openaiConfig.OPENAI_ORGANISATION
openai.api_key = openaiConfig.OPENAI_API_KEY

In [2]:
def getCompletionfromMessages(messages, 
                              model="gpt-3.5-turbo",
                              temperature=0,
                              max_tokens=500):
    """Run one chat completion and return the text of the first choice.

    Args:
        messages: Conversation to send, as a list of {'role', 'content'} dicts.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on generated tokens forwarded to the API.

    Returns:
        The "content" field of the first returned choice's message.
    """
    requestArgs = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = openai.ChatCompletion.create(**requestArgs)
    firstChoice = completion.choices[0]
    return firstChoice.message["content"]

### Get the relevant products and categories

In [3]:
# Load the product catalogue via scripts/utils.py; the value displayed below is a
# dict mapping each category name to its list of product names.
productnCategory = utils.getProductnCategory()
productnCategory

{'Computers and Laptops': ['TechPro Ultrabook',
  'BlueWave Gaming Laptop',
  'PowerLite Convertible',
  'TechPro Desktop',
  'BlueWave Chromebook'],
 'Smartphones and Accessories': ['SmartX ProPhone',
  'MobiTech PowerCase',
  'SmartX MiniPhone',
  'MobiTech Wireless Charger',
  'SmartX EarBuds'],
 'Televisions and Home Theater Systems': ['CineView 4K TV',
  'SoundMax Home Theater',
  'CineView 8K TV',
  'SoundMax Soundbar',
  'CineView OLED TV'],
 'Gaming Consoles and Accessories': ['GameSphere X',
  'ProGamer Controller',
  'GameSphere Y',
  'ProGamer Racing Wheel',
  'GameSphere VR Headset'],
 'Audio Equipment': ['AudioPhonic Noise-Canceling Headphones',
  'WaveSound Bluetooth Speaker',
  'AudioPhonic True Wireless Earbuds',
  'WaveSound Soundbar',
  'AudioPhonic Turntable'],
 'Cameras and Camcorders': ['FotoSnap DSLR Camera',
  'ActionCam 4K',
  'FotoSnap Mirrorless Camera',
  'ZoomMaster Camcorder',
  'FotoSnap Instant Camera']}

## Find relevant product and category names(v1)

In [8]:
# scripts/utils.py has the first version of the code 
# i.e. findCategoryProductsOnly(userInput, productsByCategory)
# Modification is adding 'FEW SHOT LEARNING'
def findCategoryProductsOnlyV1(userInput, productsByCategory):
    """Ask the model which categories/products a customer query refers to.

    Builds a system prompt that lists the allowed categories and products,
    adds one few-shot example (user question + expected assistant answer),
    appends the delimited customer query, and sends the conversation through
    getCompletionfromMessages.

    Args:
        userInput: The raw customer service query.
        productsByCategory: The allowed products, interpolated verbatim into
            the system prompt (in this notebook: dict of category -> products).

    Returns:
        The raw model reply as returned by getCompletionfromMessages; per the
        prompt it should be a Python-style list of
        {'category': ..., 'products': [...]} objects.
    """
    delimiter = "####"
    # The category names below must match the keys of the product catalogue
    # exactly ("Televisions ...", plural), otherwise the prompt contradicts
    # the allowed-products list it embeds.
    systemMessage = f"""
    You will be provided with customer service queries. \
    The customer service query will be delimited with {delimiter} characters. \
    Output a python list of objects, where each object has the following format:
    'category': <one of Computers and Laptops, Smartphones and Accessories, Televisions and Home Theater Systems, \
    Gaming Consoles and Accessories, Audio Equipment, Cameras and Camcorders>,
    OR
    'products': <a list of products that must be found in the allowed products below>
    
    Where categories and products must be found in the customer service query.
    If a product is mentioned, it must be associated with the correct category in the allowed products list below.
    If no products or categories are found, output an empty list.
    
    Allowed products: {productsByCategory}
    
    Only output the list of objects, nothing else.
    """
    
    # Adding FEW SHOT LEARNING: one worked example showing the expected output format
    fewShotUser = """I want the most expensive computer."""
    fewShotAssistant =  """ 
    [{'category': 'Computers and Laptops', \
'products': ['TechPro Ultrabook', 'BlueWave Gaming Laptop', 'PowerLite Convertible', 'TechPro Desktop', 'BlueWave Chromebook']}]
    """
    
    messages = [
        {'role':'system', 'content':systemMessage},
        {'role':'user','content':f"{delimiter}{fewShotUser}{delimiter}"},
        {'role':'assistant', 'content':fewShotAssistant},
        {'role':'user', 'content':f"{delimiter}{userInput}{delimiter}"},
         ]
    return getCompletionfromMessages(messages)

### Evaluate on some queries

In [9]:
# Simple single-category query (plain literal: there is nothing to interpolate)
customerMsg0 = "Which TV can I buy if I'm on a budget?"
productsbyCategory0 = findCategoryProductsOnlyV1(customerMsg0, productnCategory)  # API call
productsbyCategory0

"    [{'category': 'Televisions and Home Theater Systems', 'products': ['CineView 4K TV', 'SoundMax Home Theater', 'CineView 8K TV', 'SoundMax Soundbar', 'CineView OLED TV']}]"

In [10]:
# Query mixing two specific products with one whole category
customerMsg1 = (
    "\n"
    "tell me about the smartx pro phone and the fotosnap camera, the dslr one.\n"
    "Also, what TVs do you have?"
)
productsbyCategory1 = findCategoryProductsOnlyV1(customerMsg1, productnCategory)  # API call
productsbyCategory1

"    [{'category': 'Smartphones and Accessories', 'products': ['SmartX ProPhone']},\n     {'category': 'Cameras and Camcorders', 'products': ['FotoSnap DSLR Camera']},\n     {'category': 'Televisions and Home Theater Systems', 'products': ['CineView 4K TV', 'SoundMax Home Theater', 'CineView 8K TV', 'SoundMax Soundbar', 'CineView OLED TV']}]\n"

In [11]:
# Harder test case: loosely worded product names plus a budget constraint
customerMsg2 = (
    "\n"
    "tell me about the CineView TV, the 8K one, Gamesphere console, the X one.\n"
    "I'm on a budget, what computers do you have?"
)
productsbyCategory2 = findCategoryProductsOnlyV1(customerMsg2, productnCategory)  # API call
productsbyCategory2

"    [{'category': 'Televisions and Home Theater Systems', 'products': ['CineView 8K TV']},\n     {'category': 'Gaming Consoles and Accessories', 'products': ['GameSphere X']},\n     {'category': 'Computers and Laptops', 'products': ['BlueWave Chromebook', 'TechPro Desktop']}]\n"

Note : 
1. This returns the expected output. 
2. **If the model fails to output the expected results, add a few more examples to the prompt.**
3. After changing the code, perform **regression testing**, i.e. re-run the examples that the previous code handled correctly (sanity check).

## Automated testing

In [12]:
# Test set: each entry pairs a customer message with the ideal extraction,
# expressed as {category name: set of product names}. The last entry uses an
# empty list to mean "nothing should be found".
msgIdealPairsSet = [
    # eg 0
    {
        'customer_msg': "Which TV can I buy if I'm on a budget?",
        'ideal_answer': {
            'Televisions and Home Theater Systems': {
                'CineView 4K TV',
                'SoundMax Home Theater',
                'CineView 8K TV',
                'SoundMax Soundbar',
                'CineView OLED TV',
            },
        },
    },
    # eg 1
    {
        'customer_msg': "I need a charger for my smartphone",
        'ideal_answer': {
            'Smartphones and Accessories': {
                'MobiTech PowerCase',
                'MobiTech Wireless Charger',
                'SmartX EarBuds',
            },
        },
    },
    # eg 2
    {
        'customer_msg': "What computers do you have?",
        'ideal_answer': {
            'Computers and Laptops': {
                'TechPro Ultrabook',
                'BlueWave Gaming Laptop',
                'PowerLite Convertible',
                'TechPro Desktop',
                'BlueWave Chromebook',
            },
        },
    },
    # eg 3 (the embedded runs of spaces are part of the message)
    {
        'customer_msg': (
            "tell me about the smartx pro phone and "
            "    the fotosnap camera, the dslr one."
            "    Also, what TVs do you have?"
        ),
        'ideal_answer': {
            'Smartphones and Accessories': {'SmartX ProPhone'},
            'Cameras and Camcorders': {'FotoSnap DSLR Camera'},
            'Televisions and Home Theater Systems': {
                'CineView 4K TV',
                'SoundMax Home Theater',
                'CineView 8K TV',
                'SoundMax Soundbar',
                'CineView OLED TV',
            },
        },
    },
    # eg 4
    {
        'customer_msg': (
            "tell me about the CineView TV, the 8K one, Gamesphere console, the X one.\n"
            "I'm on a budget, what computers do you have?"
        ),
        'ideal_answer': {
            'Televisions and Home Theater Systems': {'CineView 8K TV'},
            'Gaming Consoles and Accessories': {'GameSphere X'},
            'Computers and Laptops': {
                'TechPro Ultrabook',
                'BlueWave Gaming Laptop',
                'PowerLite Convertible',
                'TechPro Desktop',
                'BlueWave Chromebook',
            },
        },
    },
    # eg 5
    {
        'customer_msg': "What smartphones do you have?",
        'ideal_answer': {
            'Smartphones and Accessories': {
                'SmartX ProPhone',
                'MobiTech PowerCase',
                'SmartX MiniPhone',
                'MobiTech Wireless Charger',
                'SmartX EarBuds',
            },
        },
    },
    # eg 6
    {
        'customer_msg': "I'm on a budget.  Can you recommend some smartphones to me?",
        'ideal_answer': {
            'Smartphones and Accessories': {
                'SmartX EarBuds',
                'SmartX MiniPhone',
                'MobiTech PowerCase',
                'SmartX ProPhone',
                'MobiTech Wireless Charger',
            },
        },
    },
    # eg 7 # this will output a subset of the ideal answer
    {
        'customer_msg': "What Gaming consoles would be good for my friend who is into racing games?",
        'ideal_answer': {
            'Gaming Consoles and Accessories': {
                'GameSphere X',
                'ProGamer Controller',
                'GameSphere Y',
                'ProGamer Racing Wheel',
                'GameSphere VR Headset',
            },
        },
    },
    # eg 8
    {
        'customer_msg': "What could be a good present for my videographer friend?",
        'ideal_answer': {
            'Cameras and Camcorders': {
                'FotoSnap DSLR Camera',
                'ActionCam 4K',
                'FotoSnap Mirrorless Camera',
                'ZoomMaster Camcorder',
                'FotoSnap Instant Camera',
            },
        },
    },
    # eg 9
    {
        'customer_msg': "I would like a hot tub time machine.",
        'ideal_answer': [],
    },
]

### Evaluate test cases by comparing to the ideal answers

In [21]:
def evalResponseWithIdeal(response, ideal, debug=False):
    """Score one model response against the ideal answer for a test case.

    The response is parsed into a list of {'category': ..., 'products': [...]}
    dicts. An entry counts as correct only when its category is present in
    `ideal` and its product set equals the ideal product set exactly.

    Args:
        response: Raw model output, expected to be a Python/JSON style list of
            dicts (surrounding whitespace is ignored).
        ideal: Dict mapping category name -> collection of product names, or
            an empty container when nothing should be found.
        debug: When True, print the parsed response and per-category details.

    Returns:
        A float in [0, 1]: 1.0 when both response and ideal are empty, 0.0 when
        only one of them is empty, otherwise the number of exactly matching
        categories divided by the larger of (response entries, ideal
        categories), so that omitted ideal categories lower the score.

    Raises:
        json.JSONDecodeError: if the response cannot be parsed at all.
    """
    import ast  # local import: not part of the notebook's import cell

    if debug: print(f"Response: {response}")

    text = response.strip()
    try:
        # The prompt asks for a Python list, so parse it as a Python literal.
        # This keeps apostrophes inside double-quoted product names intact.
        listofDict = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Fall back to JSON (e.g. true/false/null); json expects double
        # quotes not single quotes
        listofDict = json.loads(text.replace("'", '"'))

    # Tolerate a single object returned without the enclosing list
    if isinstance(listofDict, dict):
        listofDict = [listofDict]

    # special case when response and/or ideal answer is empty
    if not listofDict and not ideal:
        return 1.0
    if not listofDict or not ideal:
        return 0.0

    correct = 0
    if debug: print(f"listofDict: {listofDict}")
    for d in listofDict:
        if not isinstance(d, dict):
            # malformed entry: stays in the denominator, can never be correct
            continue
        cat = d.get('category')
        prodList = d.get('products')
        if not (cat and prodList):
            continue
        # convert list of products to a set for order-insensitive comparison
        prodSet = set(prodList)
        # retrieve ideal set
        idealCat = ideal.get(cat)
        if not idealCat:
            if debug: print(f'Did not find category {cat} in ideal {ideal}')
            continue
        idealProd = set(idealCat)
        if debug: print(f'Model prods:\n{prodSet}\n, Idea prods:\n{idealProd}\n')
        if prodSet == idealProd:
            if debug: print('Correct')
            correct += 1
        else:
            # mismatches are always reported, even without debug
            print('Incorrect')
            print(f'Model prods:\n{prodSet}\n, Idea prods:\n{idealProd}\n')

    # Divide by the larger count so that ideal categories missing from the
    # response are penalised, as are extra categories in the response.
    pcCorrect = correct / max(len(listofDict), len(ideal))
    return pcCorrect

In [14]:
# Inspect test case 7 (the racing-games query) before scoring it.
print(f'Customer message: {msgIdealPairsSet[7]["customer_msg"]}')
print(f'Ideal answer: {msgIdealPairsSet[7]["ideal_answer"]}')

Customer message: What Gaming consoles would be good for my friend who is into racing games?
Ideal answer: {'Gaming Consoles and Accessories': {'ProGamer Racing Wheel', 'GameSphere X', 'ProGamer Controller', 'GameSphere VR Headset', 'GameSphere Y'}}


In [17]:
# Query the model for test case 7 and score its reply against the ideal answer.
response = findCategoryProductsOnlyV1(msgIdealPairsSet[7]["customer_msg"],
                                      productnCategory)
print(f'Resonse: {response}')

# Last expression of the cell: the score is shown as the cell output.
evalResponseWithIdeal(response, msgIdealPairsSet[7]["ideal_answer"])

Resonse:     [{'category': 'Gaming Consoles and Accessories', 'products': ['ProGamer Controller', 'ProGamer Racing Wheel', 'GameSphere VR Headset']}]
Incorrect


0.0

The response is only a subset of the ideal answer, so the exact-match comparison marks it incorrect and the score is 0.

In [22]:
# Score every test case in turn and report the average score.
# Note: an API call that times out will abort the run part-way through.
scoreAccum = 0
for i, pair in enumerate(msgIdealPairsSet):
    print(f"example {i}")

    customerMsg, ideal = pair['customer_msg'], pair['ideal_answer']
    response = findCategoryProductsOnlyV1(customerMsg, productnCategory)

    score = evalResponseWithIdeal(response, ideal, debug=False)
    print(f"{i}: {score}")
    scoreAccum = scoreAccum + score

n_examples = len(msgIdealPairsSet)
fraction_correct = scoreAccum / n_examples
print(f"Fraction correct out of {n_examples}: {fraction_correct}")

example 0
0: 1.0
example 1
1: 1.0
example 2
2: 1.0
example 3
Incorrect
Model prods:
{'CineView 8K TV', 'CineView OLED TV', 'CineView 4K TV'}
, Idea prods:
{'CineView OLED TV', 'CineView 4K TV', 'SoundMax Home Theater', 'CineView 8K TV', 'SoundMax Soundbar'}

3: 0.6666666666666666
example 4
Incorrect
Model prods:
{'BlueWave Chromebook'}
, Idea prods:
{'BlueWave Chromebook', 'PowerLite Convertible', 'BlueWave Gaming Laptop', 'TechPro Desktop', 'TechPro Ultrabook'}

4: 0.6666666666666666
example 5
5: 1.0
example 6
Incorrect
Model prods:
{'SmartX ProPhone', 'SmartX EarBuds', 'SmartX MiniPhone'}
, Idea prods:
{'SmartX ProPhone', 'MobiTech Wireless Charger', 'SmartX MiniPhone', 'MobiTech PowerCase', 'SmartX EarBuds'}

6: 0.0
example 7
Incorrect
Model prods:
{'GameSphere VR Headset', 'ProGamer Controller', 'ProGamer Racing Wheel'}
, Idea prods:
{'GameSphere VR Headset', 'GameSphere Y', 'ProGamer Racing Wheel', 'GameSphere X', 'ProGamer Controller'}

7: 0.0
example 8
8: 1.0
example 9
9: 1
Frac