In [1]:
import re
import json
from collections import defaultdict

import pandas as pd
import openai
from vertexai.language_models import TextGenerationModel

import constants

In [17]:
with open('cred.json', 'r') as f:
    cred_json = json.load(f)

openai.api_key = cred_json['api_key']
openai.organization_id = cred_json['organization_id']
# list_models = openai.Model.list()
# [x.id for x in list_models['data'] if 'gpt' in x.id]

In [2]:
# extract conversations and gt
example_sentences, example_sentences_idx, gt_idx = [], [], []
with open('example_conv_sents.txt', 'r') as f:
    file_lines = f.read().splitlines()
    for x in file_lines:
        x = x.strip()

        if not x:
            continue
        if x.startswith("gt"):
            idx = int(x[3:].split(":")[0])
            sent = ":".join(x[3:].strip().split(":")[1:]).strip()
            gt_idx.append(idx)
        else:
            idx = int(x.split(":")[0])
            sent = ":".join(x.split(":")[1:]).strip()

        example_sentences.append(sent)
        example_sentences_idx.append(idx)

example_sentences[:3], example_sentences_idx[:3], gt_idx


(['Good morning, Ms. Smith!',
  'How are you feeling today?',
  'Morning, Doctor.'],
 [0, 1, 2],
 [12, 13, 14, 17, 18, 30])

In [13]:
def cluster_numbers(numbers, threshold):
    """
    Clusters numbers that are within `threshold` of each other
    """
    clusters = []
    current_cluster = []
    # Sort the numbers in ascending order
    numbers.sort()
    # cluster number within threshold
    for i in range(len(numbers)):
        if not current_cluster:
            current_cluster.append(numbers[i])
        else:
            if numbers[i] - current_cluster[-1] <= threshold:
                current_cluster.append(numbers[i])
            else:
                clusters.append(current_cluster.copy())
                current_cluster = [numbers[i]]
    # consider the last cluster that was not appended
    if current_cluster:
        clusters.append(current_cluster)
    return clusters

def isoverlap(range1, range2):
    if range1[0] <= range2[1] and range1[1] >= range2[0]:
        return True
    return False

def calculate_precision_recall(gt_ranges_expanded, pred_ranges):
    true_positives, false_positives, false_negatives = 0, 0, 0
    tp_clusters_in_gt_expanded, tp_clusters_in_pred = [], []
    fn_clusters_in_gt_expanded, fn_clusters_in_pred = [], []
    # compare gt and pred clusters to find true positives and false negatives
    for gt_range in gt_ranges_expanded:
        overlap_found = False
        for pred_range in pred_ranges:
            if isoverlap(pred_range, gt_range):
                overlap_found = True
                break
        if overlap_found:
            true_positives += 1
            tp_clusters_in_gt_expanded.append(gt_range)
            tp_clusters_in_pred.append(pred_range)
        else:
            false_negatives += 1
            fn_clusters_in_gt_expanded.append(gt_range)
            fn_clusters_in_pred.append(pred_range)

    # compare gt and pred clusters to find false positives
    fp_clusters_in_pred = []    
    for pred_range in pred_ranges:
        overlap_found = False
        for gt_range in gt_ranges_expanded:
            if isoverlap(pred_range, gt_range):
                overlap_found = True
                break
        if not overlap_found:
            false_positives += 1
            fp_clusters_in_pred.append(pred_range)
    
    # calculate precision and recall
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    cluster_types = {
        'tp_clusters_in_gt_expanded': tp_clusters_in_gt_expanded,
        'tp_clusters_in_pred': tp_clusters_in_pred,
        'fn_clusters_in_pred': fn_clusters_in_pred,
        'fn_clusters_in_gt_expanded': fn_clusters_in_gt_expanded,
        'fp_clusters_in_pred': fp_clusters_in_pred}
    agg_info = {
        'precision': precision, 'recall': recall,
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
    }
    return agg_info, cluster_types

def get_metrics(y_idx, pred_idx):
    # cluster the indices
    y_clusters = cluster_numbers(y_idx, constants.PROIMITY_THRESHOLD)
    pred_clusters = cluster_numbers(pred_idx, constants.PROIMITY_THRESHOLD)
    # expand the gt by offset
    y_idx_expanded = sorted(set([
        x 
        for y_idx in y_clusters
        for y in y_idx
        for x in range(y-constants.OFFSET, y+constants.OFFSET+1)
    ]))
    y_clusters_expanded = cluster_numbers(y_idx_expanded, 1)

    # convert the clusters to ranges
    y_range_expanded = [[x[0], x[-1]] for x in y_clusters_expanded]
    pred_ranges = [[x[0], x[-1]] for x in pred_clusters]
    
    # find precision and recall by finding overlapping ranges
    metrics, clusters = calculate_precision_recall(y_range_expanded, pred_ranges)
    return metrics, clusters

def parse_and_get_utt_ids_v3(input_string):
    input_string = input_string.lower()
    matches = re.findall(r'<(.*?)>', input_string)
    matches = [
        x.replace('sentence ids', '').replace('sentence id', '').replace('sentence', '')
        for x in matches]
    matches = [
        " ".join(x.replace(',', ' ').replace(':', ' ').replace('-', ' ').replace('/', ' ').strip().split())
        for x in matches]

    # print(matches)
    sentence_ids = []

    for match in matches:
        assert bool(re.search(r'[^0-9. ]', match)) == False, f"Invalid output format {match}"

        ranges = re.findall(r'\d+\.\d+|\d+', match)
        if not ranges: # this can happen if the sentence ids are empty
            continue
        
        # Check if it's a range or a single ID
        if len(ranges) == 2:
            start, end = int(float(ranges[0])), int(float(ranges[1]))
            for i in range(start, end + 1):
                sentence_ids.append(i)
        else:
            sentence_ids.append(int(float(ranges[0])))
    return sentence_ids


In [10]:
def get_google_prompt(messages):
    prompt = []
    for m in messages:
        if m['role'] == 'system':
            content = [m['content'], '']
        elif m['role'] == 'user':
            content = ['Input:', m['content']]
        else:
            content = ['Output:', m['content'], '']
        prompt.extend(content)
        
    prompt = "\n".join(prompt)
    return prompt

def format_conv(utterances, utterances_ids):
    formatted_conv = []
    for utterance, utterance_id in zip(utterances, utterances_ids):
        formatted_conv.append(f'{utterance_id} - {utterance}')
    return "\n".join(formatted_conv)

def get_messages(utterances, utterances_ids, use_few_shot=False,
                 example_inputs=None, example_outputs=None,
                 use_explanation=True):
    formatted_conv = format_conv(utterances, utterances_ids)
    messages = [{"role": "system", "content": constants.system_prompt}]

    if use_explanation:
        option_prompt = constants.option_prompt_original
    else:
        option_prompt = constants.option_prompt_no_explanation
        
    if use_few_shot:
        for _, (example_input, example_output)  in enumerate(zip(example_inputs, example_outputs)):
            messages.append(
                {"role": "user", "content": f'Conversation:\n{example_input}\nInstruction:{option_prompt}'},
            )
            messages.append(
                {"role": "assistant", "content": example_output}
            )
    messages.append(
        {"role": "user", "content": f'Conversation:\n{formatted_conv}\nInstruction:{option_prompt}'}
    )
    return messages


## Generate and save outputs

In [14]:
fileid = "example"
response_all = defaultdict(list)
eval_options = [
    ('gpt-3.5-turbo-0301', True, True),
    ('gpt-3.5-turbo-0301', False, True),
    ('gpt-3.5-turbo-0301', False, False),
    ('gpt-3.5-turbo-0301', True, False),
    # ('text-bison@001', True, True),
 ]
tag = f'_1'

for model, use_few_shot, use_explanation in eval_options:
    print('running', model)

    idx = list(range(0, len(example_sentences), constants.BATCH_SIZE))
    example_sentences_idx = list(range(len(example_sentences)))

    for batchid, (st, end) in enumerate(zip(idx[:], idx[1:] + [len(example_sentences)])):
        utterances = example_sentences[st:end]
        utterances_ids = example_sentences_idx[st:end]
        
        # Create a list of message objects
        start_offset = list(utterances_ids)[0] if constants.USE_ZERO_ALL_BATCHES else 0
        messages = get_messages(utterances, [x-start_offset for x in utterances_ids],
                                use_few_shot=use_few_shot,
                                example_inputs=constants.example_inputs,
                                example_outputs=constants.example_outputs, use_explanation=use_explanation)
        
        if 'bison' in model:
            google_prompt = get_google_prompt(messages)
            parameters = {
                "max_output_tokens": 256,
                "temperature": 0.0,
                "top_p": 1,
                "top_k": 40
            }
            google_model = TextGenerationModel.from_pretrained("text-bison@001")
            response = google_model.predict(
                google_prompt,
                **parameters
            )

            response_all[f"{fileid}_{tag}_{model}_fewshot_{use_few_shot}_useexp_{use_explanation}"].append({
                'vendor': 'google', 'utterances_ids': utterances_ids, 'response': response,
                "messages": messages, "google_prompt": google_prompt, "model": model, 'start_idx': start_offset})
        else:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0.0,
                top_p=1,
            )
        response_all[f"{fileid}_{tag}_{model}_fewshot_{use_few_shot}_useexp_{use_explanation}"].append({
            'utterances_ids': utterances_ids, 'response': response, "messages": messages, 'start_offset': start_offset})


running gpt-3.5-turbo-0301
running gpt-3.5-turbo-0301
running gpt-3.5-turbo-0301
running gpt-3.5-turbo-0301


In [15]:
display_metric = []
for model, use_few_shot, use_explanation in eval_options:
    print(model, use_few_shot, use_explanation)
    metrics_all = defaultdict(list)

    llm_out = response_all[f"{fileid}_{tag}_{model}_fewshot_{use_few_shot}_useexp_{use_explanation}"]
    
    pred_sentence_ids_list_all = []
    for llm_response in llm_out:
        if 'bison' in model:
            out = llm_response['response'].text.strip()
        else:
            out = llm_response['response'].choices[0].message["content"].strip()

        pred_sentence_ids_list = parse_and_get_utt_ids_v3(out)
        start_offset = llm_response.get('start_offset', 0)

        pred_sentence_ids_list = [int(start_offset + x) for x in pred_sentence_ids_list]
        
        input_utterances_ids = [int(x) for x in set(llm_response['utterances_ids'])]
        not_found = []
        for sentence_id in pred_sentence_ids_list:
            if sentence_id not in input_utterances_ids:
                print("Sentence ID {} not found in utterances".format(sentence_id))
                not_found.append(sentence_id)
        for id in not_found:
            pred_sentence_ids_list.remove(id)
        pred_sentence_ids_list_all.extend(pred_sentence_ids_list)

    pred_array = [1 if x in pred_sentence_ids_list_all else 0 for x in example_sentences_idx]
    pred_sentence_ids_list_all = list(set(pred_sentence_ids_list_all))
    metrics, clusters = get_metrics(gt_idx, pred_sentence_ids_list_all)
    print('pred_sentence_ids_list_all', pred_sentence_ids_list_all, 'gt_idx', gt_idx)
    print('clusters', clusters)
    print('metrics', metrics)
    print()
    
    for k, v in metrics.items():
        metrics_all[k].append(v)

    precisiondenominator = max(1, sum(metrics_all['true_positives']) + sum(metrics_all['false_positives']))
    recalldenominator = max(1, sum(metrics_all['true_positives']) + sum(metrics_all['false_negatives']))
    precision = sum(metrics_all['true_positives'])/precisiondenominator
    recall = sum(metrics_all['true_positives'])/recalldenominator
    f1 = 2*precision*recall/max(precision+recall, 1)

    display_metric.append({
        'tag': tag,
        'model':model, 'use_few_shot': use_few_shot,
        'precision': precision, 'recall': recall, 'f1': f1,
        'num_file': 1, 'use_explanation': use_explanation
    })

gpt-3.5-turbo-0301 True True
pred_sentence_ids_list_all [10, 11, 12, 14, 16, 17, 18, 21, 22, 24, 25, 26] gt_idx [12, 13, 14, 17, 18, 30]
clusters {'tp_clusters_in_gt_expanded': [[11, 19]], 'tp_clusters_in_pred': [[10, 18]], 'fn_clusters_in_pred': [[21, 26]], 'fn_clusters_in_gt_expanded': [[29, 31]], 'fp_clusters_in_pred': [[21, 26]]}
metrics {'precision': 0.5, 'recall': 0.5, 'true_positives': 1, 'false_positives': 1, 'false_negatives': 1}

gpt-3.5-turbo-0301 False True
pred_sentence_ids_list_all [10, 14, 15, 17, 18, 21, 22, 24, 25, 26] gt_idx [12, 13, 14, 17, 18, 30]
clusters {'tp_clusters_in_gt_expanded': [[11, 19]], 'tp_clusters_in_pred': [[14, 18]], 'fn_clusters_in_pred': [[21, 26]], 'fn_clusters_in_gt_expanded': [[29, 31]], 'fp_clusters_in_pred': [[10, 10], [21, 26]]}
metrics {'precision': 0.3333333333333333, 'recall': 0.5, 'true_positives': 1, 'false_positives': 2, 'false_negatives': 1}

gpt-3.5-turbo-0301 False False
pred_sentence_ids_list_all [10, 14, 15, 17, 18, 21, 22, 24, 25,

In [16]:
metrics_df = pd.DataFrame(display_metric)
metrics_df

Unnamed: 0,tag,model,use_few_shot,precision,recall,f1,num_file,use_explanation
0,_1,gpt-3.5-turbo-0301,True,0.5,0.5,0.5,1,True
1,_1,gpt-3.5-turbo-0301,False,0.333333,0.5,0.333333,1,True
2,_1,gpt-3.5-turbo-0301,False,0.333333,0.5,0.333333,1,False
3,_1,gpt-3.5-turbo-0301,True,0.5,0.5,0.5,1,False
