In [1]:
from tqdm import tqdm
import pandas as pd
import requests
import json
import time
import os
import re

### Endpoint Configuration 

In [2]:
# Azure OpenAI chat-completions endpoint and its key.
# The key is looked up in the environment so it is never stored in the notebook;
# a missing variable raises KeyError here.
azure_api_url = (
    'https://openai-resource-chabot.openai.azure.com'
    '/openai/deployments/gpt-4/chat/completions'
    '?api-version=2023-12-01-preview'
)
azure_api_key = os.environ['openai-resource-chabot-key']

# Prediction endpoint of the bot being evaluated.
bot_url = 'http://52.185.31.224:3000/api/v1/prediction/2728b5f4-4749-470f-ac25-dd249df879bf'


### Load test data

In [3]:

# Read the question / reference-answer pairs from the 'TEALS Questions' sheet,
# keeping only the two columns the evaluation uses.
qa_columns = ['Question', 'Ground_Truth']
test_data = pd.read_excel('TestData.xlsx', sheet_name='TEALS Questions')[qa_columns]

# Preview the first ten rows.
test_data.head(10)

Unnamed: 0,Question,Ground_Truth
0,Is TEALS a Microsoft program?,Yes. The TEALS (Technology Education and Learn...
1,Who can volunteer for TEALS?,Anyone who resides in the US with a computer s...
2,Do I need a teaching certification or to take ...,No. We provide teaching support through our re...
3,Do I have to work for Microsoft to volunteer?,No. TEALS is designed for everyone. If you hav...
4,How will Covid-19 affect my volunteer experience?,Schools are still being affected by the Covid-...
5,Will I get reimbursed for expenses related to ...,The TEALS Program does not reimburse volunteer...
6,How does TEALS work?,TEALS (Technology Education and Learning Suppo...
7,What is the process for becoming a TEALS School?,Watch a pre-recorded TEALS info session (from ...
8,Where does TEALS operate?,TEALS partners with schools across the US. Vis...
9,What kind of schools does TEALS partner with?,TEALS works with many different types of high ...


### Generate bot responses for test questions

In [4]:
# Ask the bot every test question and collect its answer together with the
# text of the source passages it returned. The loop stops at the first
# failure; whatever was collected up to that point is still saved.
bot_request_timeout = 120  # seconds before a hung request is abandoned

result_columns = ['Question', 'Ground_Truth', 'Response', 'Source_Documents']
records = []        # one dict per successfully answered question
record_index = []   # matching test_data index labels, so rows keep the same labels

for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
    response = None
    try:
        # Make the POST request; fail loudly on HTTP errors instead of on a
        # confusing KeyError while parsing an error payload.
        response = requests.post(
            bot_url,
            json={'question': row['Question']},
            timeout=bot_request_timeout,
        )
        response.raise_for_status()

        # Parse the response once
        payload = response.json()
        answer = payload['text']
        sources = [item['pageContent'] for item in payload['sourceDocuments']]
    except Exception as e:
        # Report which row failed and, when a reply was received, its raw body.
        # (index is not a string, so it must be formatted rather than concatenated.)
        print(e)
        print(f"Row {index}: {row['Question']}")
        if response is not None:
            print(response.text)
        break

    records.append({
        'Question': row['Question'],
        'Ground_Truth': row['Ground_Truth'],
        'Response': answer,
        'Source_Documents': sources,
    })
    record_index.append(index)

# Build the frame in one go rather than growing it row by row
results_data = pd.DataFrame(records, index=record_index, columns=result_columns)

# Make sure the output directory exists before writing
os.makedirs('Results', exist_ok=True)
results_data.to_csv('Results/responses.csv', index=False)


  0%|          | 0/52 [00:00<?, ?it/s]

100%|██████████| 52/52 [03:00<00:00,  3.46s/it]


### Calculate ratings for all metrics

In [26]:
# Metrics to score; each one needs a prompt file at metrics/<name>.txt
metrics = ["Relevance", "Groundedness", "Retrieval", "RAI"]

# Retry policy for the rating requests
timeout_duration = 15   # seconds to pause after an HTTP 429 before retrying
max_retries = 5         # attempts per row before the rating is recorded as missing
request_timeout = 120   # seconds before a hung HTTP request is abandoned


def _build_request_data(metric_name, row):
    """Return the text describing one result row, tailored to the metric.

    Relevance compares the generated answer with the ground truth, RAI looks
    at the generated answer alone, Retrieval looks at the source documents
    alone, and any other metric gets the answer plus the source documents.
    """
    question = f"Question: '{row['Question']}'"
    generated = f"Generated Answer: '{row['Response']}'"
    sources = f"Source Documents: {row['Source_Documents']}"

    if metric_name == "Relevance":
        parts = [question, f"Ground Truth Answer: '{row['Ground_Truth']}'", generated]
    elif metric_name == "RAI":
        # The closing quote after the answer was missing here before.
        parts = [question, generated]
    elif metric_name == "Retrieval":
        parts = [question, sources]
    else:
        parts = [question, generated, sources]
    return "\n\n".join(parts)


def _request_rating(metric_name, metric_prompt, request_data, row_label):
    """Request a rating for one row and return it as an int.

    Retries up to ``max_retries`` times when the service answers HTTP 429 or
    when the reply contains no digit from 1 to 5. Returns None when every
    attempt is used up. Raises RuntimeError when a reply does not have the
    expected chat-completions structure; network errors from ``requests``
    propagate unchanged.
    """
    body = {
        "messages": [
            {"role": "user", "content": metric_prompt},
            {"role": "user", "content": request_data},
            {
                "role": "user",
                "content": f"Rate the {metric_name} using the instructions previously provided. Remember that the rating produced must only be 1 or 2 or 3 or 4 or 5."
            },
        ],
        "temperature": 0,
        "top_p": 1,
        "max_tokens": 800,
        "stop": None,
        "stream": False
    }
    headers = {
        "API-Key": azure_api_key,
        "Content-Type": "application/json"
    }

    for attempt in range(1, max_retries + 1):
        response = requests.post(
            azure_api_url,
            data=json.dumps(body),
            headers=headers,
            timeout=request_timeout,
        )

        if response.status_code == 429:
            # Rate limited: wait, then try again
            print('Rate Limit Error. Waiting', timeout_duration, 'seconds. Retry attempt', attempt, '/', max_retries, 'for row', row_label)
            time.sleep(timeout_duration)
            continue

        try:
            content = response.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            # Carry the raw reply in the error so the caller can show it
            raise RuntimeError(
                f"Unexpected response (HTTP {response.status_code}): {response.text}"
            ) from exc

        # re.search returns None when nothing matches, so test the match
        # object before calling .group(). The first digit 1-5 found anywhere
        # in the reply is taken as the rating.
        match = re.search('[1-5]', content or '')
        if match:
            return int(match.group(0))
        print('Error: Could not find rating. Retry attempt', attempt, '/', max_retries, 'for row', row_label)

    return None


# Make sure the output directory exists before the first save
os.makedirs('Results', exist_ok=True)

for metric_name in metrics:
    # Read the prompt for the metric
    with open(f'metrics/{metric_name}.txt', 'r') as file:
        metric_prompt = file.read()

    # Rate every row; a row that cannot be rated gets None for this metric
    for index, row in tqdm(results_data.iterrows(), total=results_data.shape[0]):
        try:
            request_data = _build_request_data(metric_name, row)
            rating = _request_rating(metric_name, metric_prompt, request_data, index)
            if rating is None:
                display(f"No rating for row {index} after {max_retries} attempts")
            results_data.loc[index, metric_name] = rating
        except Exception as e:
            display(e)
            display(f"Error with row {index}")
            results_data.loc[index, metric_name] = None

        # Save after every row so partial progress survives an interruption
        results_data.to_csv('Results/results.csv', mode='w', header=True, index=False)

 23%|██▎       | 12/52 [00:51<02:44,  4.11s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 12


 46%|████▌     | 24/52 [01:57<02:00,  4.32s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 24


 67%|██████▋   | 35/52 [02:53<00:59,  3.48s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 35


 69%|██████▉   | 36/52 [03:12<02:09,  8.11s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 36


 92%|█████████▏| 48/52 [04:17<00:14,  3.68s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 48


 98%|█████████▊| 51/52 [04:44<00:06,  6.12s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 51


100%|██████████| 52/52 [05:04<00:00,  5.86s/it]
 15%|█▌        | 8/52 [00:13<01:15,  1.72s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 8


 17%|█▋        | 9/52 [00:31<04:46,  6.67s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 9


 21%|██        | 11/52 [00:50<05:06,  7.47s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 11


 38%|███▊      | 20/52 [01:21<01:12,  2.27s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 20


 40%|████      | 21/52 [01:39<03:32,  6.86s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 21


 44%|████▍     | 23/52 [01:59<03:44,  7.75s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 23


 62%|██████▏   | 32/52 [02:35<00:49,  2.46s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 32


 67%|██████▋   | 35/52 [03:02<01:28,  5.18s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 35


 77%|███████▋  | 40/52 [03:28<00:44,  3.72s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 40


 90%|█████████ | 47/52 [04:06<00:17,  3.42s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 47


100%|██████████| 52/52 [04:35<00:00,  5.30s/it]
  0%|          | 0/52 [00:00<?, ?it/s]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 0


  6%|▌         | 3/52 [00:20<04:12,  5.15s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 3


 13%|█▎        | 7/52 [00:43<02:59,  3.99s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 7


 23%|██▎       | 12/52 [01:07<02:07,  3.19s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 12


 29%|██▉       | 15/52 [01:28<02:49,  4.57s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 15


 37%|███▋      | 19/52 [01:51<02:13,  4.05s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 19


 46%|████▌     | 24/52 [02:15<01:29,  3.20s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 24


 52%|█████▏    | 27/52 [02:37<01:57,  4.70s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 27


 60%|█████▉    | 31/52 [03:00<01:25,  4.05s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 31


 69%|██████▉   | 36/52 [03:25<00:53,  3.33s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 36


 75%|███████▌  | 39/52 [03:46<01:00,  4.64s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 39


 83%|████████▎ | 43/52 [04:09<00:37,  4.13s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 43


 92%|█████████▏| 48/52 [04:35<00:13,  3.46s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 48


 98%|█████████▊| 51/52 [04:56<00:04,  4.80s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 51


100%|██████████| 52/52 [05:14<00:00,  6.05s/it]
 21%|██        | 11/52 [00:52<02:58,  4.35s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 11


 44%|████▍     | 23/52 [02:03<02:41,  5.56s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 23


 67%|██████▋   | 35/52 [03:18<01:09,  4.11s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 35


100%|██████████| 52/52 [05:12<00:00,  6.01s/it]


In [27]:
# Reload the saved ratings from disk and report the mean score of each metric.
final_data = pd.read_csv('Results/results.csv')

for metric in metrics:
    average = final_data[metric].mean()
    print(f"Average {metric}: {average}")

Average Relevance: 4.134615384615385
Average Groundedness: 4.75
Average Retrieval: 4.538461538461538
Average RAI: 4.346153846153846
