In [None]:
import itertools
import json
import os

import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
from transformers import AutoTokenizer

# Base directory; the candidate, data and results paths are all built
# relative to it.
root_dir = "."

# Dataset names. Each one selects a `data/<task>/` folder and is part of the
# candidate-file name.
tasks = [
    "atis",
    "snips",
    "clinic150",
    "massive",
]

# Encoder and configuration tags; both are embedded in the candidate-file
# name together with the task and the top-k value.
encoder = "bge-large-en-v1.5"
config = "e+p+om"

# Top-k settings to evaluate; one candidate file is read per (top_k, task).
# NOTE(review): presumably the number of candidate intents kept per
# utterance -- confirm against the code that produced the candidate files.
top_ks = [
    3,
    16,
    151,
]

# Names of the models whose tokenizers are used to measure prompt length.
models = [
    "Mistral-7B-Instruct-v0.3",
    "llama/Meta-Llama-3.1-8B-Instruct",
    "gemma-2-9b-it",
    "Phi-3-medium-4k-instruct",
]

In [5]:
from typing import List, Dict, Tuple

def format_prompt(candidates: List[str], descriptions: Dict[str, str], utterance: str):
    """ Build the intent-selection prompt for a single utterance.

    The prompt quotes the utterance, lists every candidate intent together
    with its description (in the order given), and ends with an instruction
    to answer with the intent name only.

    Args:
        candidates (List[str]): intent names to offer, in the order they
                                should appear in the prompt
        descriptions (Dict[str, str]): maps an intent name to its textual
                                        description; must contain every
                                        candidate (a missing one raises
                                        KeyError)
        utterance (str): the user utterance the model is asked to classify

    Returns:
        str: the prompt text, its pieces joined with newlines
    """
    # Opening section: the quoted utterance and the task statement, followed
    # by an empty entry that becomes a blank separator line after joining.
    sections = [
        f"Given the user said \"{utterance}\"\nPlease give the 'intent' that best reflect what the user is saying/asking for, based on which of the following intents has a description best matching the user's utterance:",
        "",
    ]

    # One entry per candidate; the trailing newline inside each entry leaves
    # a blank line between consecutive candidates.
    for intent in candidates:
        sections.append(f"intent: {intent}\ndescription: {descriptions[intent]}\n")

    # Closing section: separator, answer-format instruction and answer stub.
    sections.append("")
    sections.append("Please give the intent name only, do not provide reasoning.")
    sections.append("The intent is: ")

    return '\n'.join(sections)


In [None]:
def _load_json(path):
    """Parse the JSON document stored at `path` and return the decoded object.

    The file is opened in a `with` block so its handle is closed as soon as
    parsing finishes rather than being left open for the rest of the run.
    """
    with open(path) as handle:
        return json.load(handle)


# One record per (model, top_k, task) holding the mean prompt length in tokens.
outputs = []

for model in models:
    # NOTE(review): `model_path` is an empty string and does not depend on
    # `model`, so every iteration asks for the same (empty) location.
    # Presumably the real directory / hub prefix was stripped from this
    # notebook -- restore it before running.
    model_path = f""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    combinations = list(itertools.product(top_ks, tasks))
    for (top_k, task) in tqdm(combinations):
        output_tokens_count = []
        # Despite the `.jsonl` extension, the candidate file is parsed as one
        # JSON document.
        all_candidates = _load_json(f"{root_dir}/candidates/{task}-{encoder}-{config}-{top_k}-cands.jsonl")
        descriptions = _load_json(f"{root_dir}/data/{task}/descriptions.json")
        data = _load_json(f"{root_dir}/data/{task}/data-full-shuffled.json")['data']
        # Entries and candidate lists are paired by position; `zip` stops at
        # the shorter of the two, so a length mismatch is silently truncated.
        # NOTE(review): assumes both files are aligned one-to-one -- confirm.
        for (entry, candidates) in zip(data, all_candidates):
            prompt = format_prompt(candidates, descriptions, entry['text'])
            tokens = tokenizer(prompt).input_ids
            output_tokens_count.append(len(tokens))
        output_tokens_count = np.array(output_tokens_count)
        outputs.append({
            'model': model,
            'top_k': top_k,
            'task': task,
            'mean_tok': output_tokens_count.mean()
        })

100%|██████████| 12/12 [01:41<00:00,  8.46s/it]
100%|██████████| 12/12 [01:40<00:00,  8.39s/it]
100%|██████████| 12/12 [01:54<00:00,  9.51s/it]
100%|██████████| 12/12 [01:48<00:00,  9.06s/it]


In [16]:
# Long format: one row per (model, top_k, task) with its mean token count.
output_df = pd.DataFrame(outputs)

# Wide format: one row per (model, top_k) and one column per task.
wide_rows = []
for model, top_k in itertools.product(models, top_ks):
    # Rows belonging to this model / top_k pair, whatever the task.
    pair_mask = (output_df['model'] == model) & (output_df['top_k'] == top_k)
    row = {'model': model, 'top_k': top_k}
    for task in tasks:
        task_mask = pair_mask & (output_df['task'] == task)
        # `.item()` requires exactly one matching row and raises otherwise,
        # so a missing or duplicated combination is reported, not hidden.
        row[task] = output_df.loc[task_mask, 'mean_tok'].item()
    wide_rows.append(row)

output_df2 = pd.DataFrame(wide_rows)

In [18]:
# Export the wide token-count table to an Excel workbook. The target
# directory is created first (a no-op when it already exists) so the export
# does not fail on a checkout where `results/analysis` is missing.
analysis_dir = f"{root_dir}/results/analysis"
os.makedirs(analysis_dir, exist_ok=True)
output_df2.to_excel(f"{analysis_dir}/input_token_count.xlsx")