## Large Language Models Inference with naive Hugging Face Pipelines and vLLM Inference Servers

In [1]:
import torch
import random
import pandas as pd

import numpy as np
from evaluate import load

from vllm import LLM, SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
test = pd.read_csv("rogers_data/rogers_test_df.csv")

In [3]:
def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        local_files_only=False,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()
    return tokenizer, model


def format_prompt(customer, agent):
    system_prompt = """You are an autocomplete assistant that would help Agents to write responses to customers more quickly and efficiently.
    Given the conversation between agent and customer, you should generate suitable and relevant responses based on the provided chat context."""
    prompt = "<s>"
    prompt += f"[INST] {system_prompt} {customer} [/INST]"
    prompt += f"{agent}"
    return prompt


def naive_hf_inference(sample):
    # ground_truth = sample["target_utterance"]
    # history = sample["history"] + "<|assistant|>\n"
    # partial_target = " ".join(ground_truth.split()[:5])
    # prompt = history + partial_target
    gen_op = ""
    tokenized_text = tokenizer(sample, return_tensors="pt", add_special_tokens=False).to("cuda:0")
    generated_ids = model.generate(input_ids=tokenized_text["input_ids"], 
                                   attention_mask=tokenized_text["attention_mask"],
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id,
                                   max_new_tokens=20,
                                   temperature=0.5,
                                   num_return_sequences=1,
                                   top_p=0.92,
                                   top_k=30,
                                   repetition_penalty=1.5,
                                   early_stopping=False,
                                   use_cache=False,
                                   do_sample=True)
    for sequence in generated_ids:
        gen_op += tokenizer.decode(sequence, skip_special_tokens=True)

    clean_idx = gen_op.rfind("<|assistant|>\n") + len("<|assistant|>\n")
    gen_op = gen_op[clean_idx:]
    return gen_op

In [4]:
# Load the finetuned model from Hugging face or local disk
model_name = "skshreyas714/autocomplete-rogers-zephyr"
tokenizer, model = load_model(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
ids = [i for i in range(len(test))]
idx = random.choice(ids)
print(idx)

ground_truth = test.iloc[idx]["targets"]
partial_target = " ".join(ground_truth.split()[:5])
history = test.iloc[idx]["text"] + partial_target
outs = naive_hf_inference(history)

print(f"Chat History: {history}\n")
print(f"Generated Suggestion: {outs}\n")
print(f"Actual Output: {ground_truth}")

1068
Chat History: <|system|>
You are a helpful assistant. Your task is to help an assistant to write responses quickly for a customer-agent conversation based on the coversation history</s>
<|user|>
Hello I'm trying to figure out if I could sign my husband and I up for a family plan 0$ on the phone? But how do I check and see if I'm even able to proceed</s>
<|user|>
Ok so I'd would like my credit checked and to transfer my current number over</s>
<|user|>
But we need two 0$ down phone in a family plan I'm down with pay as I go</s>
<|user|>
My number is <PHONE_NUMBER> if you could have some one to call tomorrow</s>
<|assistant|>Hi <PERSON>! Thanks for reaching

Generated Suggestion: u are a helpful assistant. Your task is to help an assistant to write responses quickly for a customer-agent conversation based on the coversation history 
<|user|>
Hello I'm trying to figure out if I could sign my husband and I up for a family plan 0$ on the phone? But how do I check and see if I'm even ab

## vLLM Inference Server Engine for increased inference throughput and latency

In [3]:
# Mention the Hugging face model tag
model_name = "skshreyas714/autocomplete-rogers-zephyr"

# Set tensor parallel size = num GPUs available. Uses Continuous batching to utilize full GPU Memory for higher throughput
llm = LLM(model=model_name, tensor_parallel_size=4, seed=42)

2023-11-15 09:40:03,276	INFO worker.py:1673 -- Started a local Ray instance.


INFO 11-15 09:40:17 llm_engine.py:72] Initializing an LLM engine with config: model='skshreyas714/autocomplete-rogers-zephyr', tokenizer='skshreyas714/autocomplete-rogers-zephyr', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=4, quantization=None, seed=42)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 11-15 09:40:42 llm_engine.py:207] # GPU blocks: 30073, # CPU blocks: 8192


In [4]:
# Prepare the test set prompts according to LLM Chat template

def prepare_test_prompts(test_df, target_length=5):
    test_history = test_df.text.tolist()

    ground_truth = test_df.targets.tolist()
    partial_target = [" ".join(gt.split()[:target_length]) for gt in ground_truth]
    test_prompts = []
    for i, j in zip(test_history, partial_target):
        test_prompts.append(i + j)
    return test_prompts, partial_target

In [14]:
test_all, partial_target = prepare_test_prompts(test, target_length=5)

In [6]:
# Load test set dataframe and pick random rows each time to see the responses
ids = [i for i in range(len(test))]
idx = random.choice(ids)
print(f"Random ID picked is - {idx}")

ground_truth = test.iloc[idx]["targets"]
partial_target = " ".join(ground_truth.split()[:5])
history = test.iloc[idx]["text"] # + partial_target
print(f"Actual Output: {ground_truth}")

Random ID picked is - 3007
Actual Output: I am glad to hear that it has been restored. You're very welcome! Have a great night! <PERSON></s>



In [7]:
# Modify the hyper parameters below for checking response variablity and modify accordingly
sampling_params = SamplingParams(temperature=0.5, max_tokens=20,
                                 presence_penalty=1.5,
                                 frequency_penalty=1.5, top_p=0.9, top_k=50,
                                 skip_special_tokens=True,
                                 use_beam_search=False,
                                 best_of=10,
                                 early_stopping=False)

# LLM Response text
outputs = llm.generate(test_all[:200], sampling_params)

predicted = []
# Print the outputs.
for output in outputs:
    generated_text = output.outputs[0].text
    predicted.append(generated_text)
    # print(f"Generated text: {generated_text!r}")

Processed prompts: 100%|██████████| 200/200 [01:27<00:00,  2.28it/s]


## Evaluation on Test Set for Autocomplete

In [8]:
def exact_match_at_n(prediction, true_value, generated_tokens, input_len):
    score = 0
    exact_match = load("exact_match")
    for pred, target in zip(prediction, true_value):
        pred = pred.split()
        target = target.split()
        min_len = min(len(pred), len(target), input_len + generated_tokens)
        pred_list = pred[input_len:min_len]
        target_list = target[input_len:min_len]
        if min_len > input_len:
            results = exact_match.compute(
                references=target_list,
                predictions=pred_list,
                ignore_case=True,
                ignore_punctuation=True
            )
            score += results["exact_match"]
        else:
            score += 1.0
    exact_match_score = np.round(score / len(prediction) * 100, 2)
    return exact_match_score

In [15]:
preds = []
for target, prediction in zip(partial_target, predicted):
    preds.append(target + prediction)

In [16]:
outputs = preds
targets = test["targets"]
length = 5

print("ExactMatch@1: {}%".format(exact_match_at_n(prediction=outputs, true_value=targets, generated_tokens=1, input_len=length)))
print("ExactMatch@2: {}%".format(exact_match_at_n(prediction=outputs, true_value=targets, generated_tokens=2, input_len=length)))
print("ExactMatch@3: {}%".format(exact_match_at_n(prediction=outputs, true_value=targets, generated_tokens=3, input_len=length)))
print("ExactMatch@5: {}%".format(exact_match_at_n(prediction=outputs, true_value=targets, generated_tokens=5, input_len=length)))

ExactMatch@1: 50.5%
ExactMatch@2: 43.0%
ExactMatch@3: 38.33%
ExactMatch@5: 31.72%
