In [2]:
from tqdm import tqdm
import pandas as pd
import requests
import json
import time
import os
import re

### Endpoint Configuration 

In [None]:
# Azure OpenAI chat-completions endpoint (used later for rating) and its key, read from the environment
azure_api_url = (
    'https://openai-resource-chabot.openai.azure.com'
    '/openai/deployments/gpt-4/chat/completions'
    '?api-version=2023-12-01-preview'
)
azure_api_key = os.environ['openai-resource-chabot-key']
# Prediction endpoint of the bot under evaluation
bot_url = 'http://52.185.31.224:3000/api/v1/prediction/2728b5f4-4749-470f-ac25-dd249df879bf'


### Load test data

In [3]:

# Read the question sheet from the workbook, then keep only the two columns the evaluation uses
teals_sheet = pd.read_excel('TestData.xlsx', sheet_name='TEALS Questions')
test_data = teals_sheet[['Question', 'Ground_Truth']]
# Preview the first ten rows
test_data.head(10)

Unnamed: 0,Question,Ground_Truth
0,Is TEALS a Microsoft program?,Yes. The TEALS (Technology Education and Learn...
1,Who can volunteer for TEALS?,Anyone who resides in the US with a computer s...
2,Do I need a teaching certification or to take ...,No. We provide teaching support through our re...
3,Do I have to work for Microsoft to volunteer?,No. TEALS is designed for everyone. If you hav...
4,How will Covid-19 affect my volunteer experience?,Schools are still being affected by the Covid-...
5,Will I get reimbursed for expenses related to ...,The TEALS Program does not reimburse volunteer...
6,How does TEALS work?,TEALS (Technology Education and Learning Suppo...
7,What is the process for becoming a TEALS School?,Watch a pre-recorded TEALS info session (from ...
8,Where does TEALS operate?,TEALS partners with schools across the US. Vis...
9,What kind of schools does TEALS partner with?,TEALS works with many different types of high ...


### Generate bot responses for test questions

In [56]:
# Ask the bot every test question and record its answer together with the
# page content of each source document it returned.
results_data = pd.DataFrame(columns=['Question','Ground_Truth','Response', 'Source_Documents'])

# Upper bound (seconds) on a single bot call; requests waits indefinitely when no timeout is given
request_timeout = 120

for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
    # Reset per row so the error handler never reports a payload left over from an earlier question
    payload = None
    try:
        # Make the POST request and fail fast on HTTP error statuses
        response = requests.post(bot_url, json={'question': row['Question']}, timeout=request_timeout)
        response.raise_for_status()

        # Parse the body once, then pull out the answer and the source passages
        payload = response.json()
        answer = payload['text']
        sources = [item['pageContent'] for item in payload['sourceDocuments']]

        # Add items to results dataframe
        results_data.loc[index] = [row['Question'], row['Ground_Truth'], answer, sources]
    except Exception as e:
        # Report which row failed and what (if anything) the bot returned, then stop the run.
        # An f-string is used because `index` is not a str and cannot be concatenated with '+'.
        print(e)
        print(f"Row {index}: {row['Question']}")
        print(payload)
        break

# Create the output folder if needed so the save below cannot fail on a fresh checkout
os.makedirs('Results', exist_ok=True)
results_data.to_csv('Results/responses.csv', index=False)


100%|██████████| 21/21 [01:12<00:00,  3.46s/it]


### Calculate ratings for all metrics

In [62]:
# Rate every bot response on each metric by sending the metric prompt and the
# row's data to the Azure OpenAI endpoint, then store the 1-5 rating in a
# column named after the metric. Results are checkpointed to CSV after each row.

# list of metrics to use (shorten this list to re-run a single metric)
metrics = ["Relevance", "Groundedness", "Retrieval"]

# Request settings that are identical for every row
headers = {
    "API-Key": azure_api_key,
    "Content-Type": "application/json"
}
timeout_duration = 15  # seconds to wait after a rate-limit (HTTP 429) response
max_retries = 5  # Retry up to 5 times
request_timeout = 120  # seconds before a single HTTP call is abandoned

# Make sure the checkpoint folder exists before the first save
os.makedirs('Results', exist_ok=True)

for metric_name in metrics:
    # Read the prompt for the metric
    with open(f'metrics/{metric_name}.txt', 'r') as file:
        metric_prompt = file.read()

    # Iterate over each row of the DataFrame
    for index, row in tqdm(results_data.iterrows(), total=results_data.shape[0]):
        # Start each row with no rating and no response so nothing leaks over from the previous row
        rating = None
        response = None

        # Try to rate each row, leave the rating as None if anything goes wrong
        try:
            # Relevance compares against the ground truth; the other metrics look at the source documents
            if metric_name == "Relevance":
                request_data = f"Question: '{row['Question']}'\n\nGround Truth Answer: '{row['Ground_Truth']}'\n\nGenerated Answer: '{row['Response']}'"
            else:
                request_data = f"Question: '{row['Question']}'\n\nGenerated Answer: '{row['Response']}'\n\nSource Documents: {row['Source_Documents']}"

            # Prepare the body of the request
            body = {
                "messages": [
                    {
                        "role": "user",
                        "content": f"{metric_prompt}"
                    },
                    {
                        "role": "user",
                        "content":  f"{request_data}"
                    },
                    {
                        "role": "user",
                        "content":  f"Rate the {metric_name} using the instructions previously provided. Remember that the rating produced must only be 1 or 2 or 3 or 4 or 5."
                    }
                ],
                "temperature": 0,
                "top_p": 1,
                "max_tokens": 800,
                "stop": None,
                "stream": False
            }

            retry_count = 0
            while retry_count < max_retries:
                # Send POST request to the OpenAI API
                response = requests.post(azure_api_url, data=json.dumps(body), headers=headers, timeout=request_timeout)

                if response.status_code == 429:
                    # Rate limited: wait, then try the same row again
                    retry_count += 1
                    print('Rate Limit Error. Waiting', timeout_duration, 'seconds. Retry attempt', retry_count,'/', max_retries, 'for row', index)
                    time.sleep(timeout_duration)
                    continue

                # Take the first digit 1-5 in the reply as the rating.
                # re.search returns None when nothing matches, so test the match before calling .group().
                match = re.search('[1-5]', response.json()["choices"][0]["message"]["content"])
                if match:
                    rating = int(match.group(0))
                    break

                retry_count += 1
                print('Error: Could not find rating. Retry attempt', retry_count,'/', max_retries, 'for row', index)
        except Exception as e:
            display(e)
            display(f"Error with row {index}")
            # Show the raw body only when this row actually got a response; .text cannot raise the way .json() can
            if response is not None:
                display(response.text)

        # Record the rating; it stays None after an error or once all retries are used up
        results_data.loc[index, metric_name] = rating

        # save results
        results_data.to_csv('Results/results.csv', mode='w', header=True, index=False)

  0%|          | 0/21 [00:00<?, ?it/s]

 57%|█████▋    | 12/21 [00:53<00:46,  5.20s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 12


100%|██████████| 21/21 [01:56<00:00,  5.55s/it]
 14%|█▍        | 3/21 [00:04<00:30,  1.69s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 3


 29%|██▊       | 6/21 [00:27<01:09,  4.61s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 6


 52%|█████▏    | 11/21 [00:52<00:35,  3.52s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 11


 71%|███████▏  | 15/21 [01:14<00:22,  3.76s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 15


 86%|████████▌ | 18/21 [01:37<00:15,  5.01s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 18


100%|██████████| 21/21 [01:59<00:00,  5.71s/it]
 29%|██▊       | 6/21 [00:30<01:34,  6.27s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 6


 43%|████▎     | 9/21 [00:53<01:11,  5.93s/it]

Rate Limit Error. Waiting 15 seconds. Retry attempt 1 / 5 for row 9


100%|██████████| 21/21 [02:19<00:00,  6.65s/it]


In [63]:
# Re-read the saved ratings from disk
final_data = pd.read_csv('Results/results.csv')
# Report the mean rating of each metric column
for metric in metrics:
    metric_average = final_data[metric].mean()
    print(f"Average {metric}: {metric_average}")

Average Relevance: 3.761904761904762
Average Groundedness: 4.285714285714286
Average Retrieval: 3.238095238095238
