## Import vLLM and prompt-template dependencies, and define helper functions

In [1]:
from transformers import AutoTokenizer
import json
from tqdm import tqdm
from vllm import LLM, SamplingParams
from datasets import load_dataset
from prompt_templates import comedy_template, instruction_template, knowledge_template # You can add your custom template under code/prompt_templates.py

def request_input_format(user_prompt, tokenizer, system_prompt="You are a helpful assistant."):
    """
    Format a user prompt as a chat conversation using the tokenizer's chat template.

    Args:
        user_prompt: Text placed in the "user" turn of the conversation.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)``.
        system_prompt: Text placed in the "system" turn. Defaults to the value
            that was previously hard-coded, so existing two-argument calls
            behave exactly as before.

    Returns:
        The result of ``tokenizer.apply_chat_template`` for the two-message
        conversation, requested with ``tokenize=False`` and
        ``add_generation_prompt=True``.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def truncate_text(text, max_length):
    """
    Cap `text` at `max_length` characters.

    Text that already fits is returned unchanged; otherwise the slice
    `text[:max_length]` is returned. Used to shorten prompts as a guard
    against OOM issues. The limit is counted in characters, not tokens.
    """
    if len(text) <= max_length:
        return text
    return text[:max_length]

  from .autonotebook import tqdm as notebook_tqdm




2024-11-07 11:39:12,914	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Load the reference dataset and prompt template, then generate synthetic data

In [2]:
# Which prompt template to use. main() accepts "instruction", "knowledge" or "comedy";
# a custom template added to code/prompt_templates.py must also be imported and wired into main().
CHOICE_TEMPLATE = "comedy"
# Number of personas to process. Set to 0 to use the full version of 200k personas.
SAMPLE_SIZE = 50
# Output file name is derived from the two settings above.
OUT_PATH = f"{CHOICE_TEMPLATE}_{SAMPLE_SIZE}_synthesis_output.jsonl"
# Use an instruction-tuned checkpoint: prompts are built with the tokenizer's chat template and
# generation stops on "<|eot_id|>". This is the checkpoint shown in the recorded run below
# (about 15 GB of weights on a single device). Feel free to replace it with any other
# open-sourced chat/instruct LLM supported by vllm.
MODEL_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"


import torch
import intel_extension_for_pytorch as ipex #Include XPU namespace

# Runs at cell level, before main() loads the model: if an XPU (Intel GPU) device
# is reported as available, release any memory held in the XPU cache.
if torch.xpu.is_available():
    torch.xpu.empty_cache() # Query for XPU(Intel GPU) and empty the cache.

def main():
    """
    Generate synthetic text for PersonaHub personas with vLLM and write it to OUT_PATH.

    Steps:
      1. Pick the prompt template named by CHOICE_TEMPLATE.
      2. Load the persona dataset, keep the first SAMPLE_SIZE personas
         (all of them when SAMPLE_SIZE is 0), then truncate and strip each one.
      3. Build one chat-formatted prompt per persona and run generation.
      4. Write one JSON object per line with the keys 'prompt', 'input persona',
         'finish_reason' and 'synthesized text'.

    Raises:
        ValueError: if CHOICE_TEMPLATE is not a supported template name, or if
            no personas are available to process.
    """
    # Map each supported CHOICE_TEMPLATE value to its prompt template.
    templates = {
        "instruction": instruction_template,
        "knowledge": knowledge_template,
        "comedy": comedy_template,
    }
    if CHOICE_TEMPLATE not in templates:
        raise ValueError("Invalid template type. Choose from 'instruction', 'knowledge' or 'comedy', or create a custom template and add to the imports.")
    template = templates[CHOICE_TEMPLATE]

    # Load the dataset
    persona_dataset = load_dataset("proj-persona/PersonaHub", data_files="persona.jsonl")['train']

    # Sample first, so that only the personas actually used get truncated,
    # and read the column once instead of once per output row.
    if SAMPLE_SIZE > 0:
        raw_personas = persona_dataset[:SAMPLE_SIZE]['persona']
    else:
        raw_personas = persona_dataset['persona']

    max_char_length = 1024 # Setting a max length to data input, to avoid OOM issues.
    personas = [truncate_text(text, max_char_length).strip() for text in raw_personas]
    print(f"Total number of input personas: {len(personas)}")

    # Fail early, before the model is loaded, rather than with an IndexError on prompts[0] below.
    if not personas:
        raise ValueError("No personas were loaded; nothing to generate.")

    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    llm = LLM(model=MODEL_PATH) # please set tensor_parallel_size based on the GPUs you are using

    prompts = [
        request_input_format(template.format(persona=persona), tokenizer)
        for persona in personas
    ]

    print(f"Loaded {len(prompts)} entries to process...\n\n")
    print(f"Sample 0: {prompts[0]}")

    max_len = 2048 # Upper bound on generated tokens per prompt.
    sampling_params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=max_len, stop=["<|eot_id|>"])
    outputs = llm.generate(prompts, sampling_params)

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open(OUT_PATH, 'w', encoding='utf-8') as out:
        # Outputs are paired with personas by position, as the index-based lookup did before.
        for persona, output in zip(personas, outputs):
            completion = output.outputs[0]
            data = {
                'prompt': output.prompt,
                "input persona": persona,
                "finish_reason": completion.finish_reason,
                'synthesized text': completion.text,
            }
            out.write(json.dumps(data, ensure_ascii=False) + '\n')

    print(f"Outputted the results to: {OUT_PATH}")

if __name__ == "__main__":
    main()



Total number of input personas: 50
INFO 11-07 11:39:18 llm_engine.py:226] Initializing an LLM engine (v0.6.3.dev0+g7193774b.d20241009) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=xpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct, use_v2

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:03,  1.09s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:05<00:05,  2.83s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:09<00:03,  3.35s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:12<00:00,  3.55s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:12<00:00,  3.24s/it]


INFO 11-07 11:39:33 xpu_model_runner.py:406] Loading model weights took 14.9595 GB





INFO 11-07 11:39:36 gpu_executor.py:122] # GPU blocks: 14459, # CPU blocks: 2048
Loaded 50 entries to process...


Sample 0: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

A Political Analyst specialized in El Salvador's political landscape.
 
Assume you are the persona described above and I want you to act as a stand-up comedian. Write content that reflects your unique voice, expertise, and humor, tailored to your specific field.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|██████████| 50/50 [00:41<00:00,  1.21it/s, est. speed input: 92.39 toks/s, output: 643.91 toks/s]

Outputted the results to: comedy_50_synthesis_output.jsonl





## Load Generated Synthetic Dataset

In [3]:
# Read the JSONL file written by main() back in as a dataset; the "json" loader
# places the records under the 'train' split.
dataset = load_dataset("json", data_files=OUT_PATH)['train']
# To inspect a different, previously generated file instead, pass its path as data_files, e.g.:
# dataset = load_dataset("json", data_files="comedy_synthesis_15000.jsonl")["train"]
# Show the dataset summary, then the first record's prompt and generated text.
print(dataset)
print(f"Input Prompt: \n\n{dataset[0]['prompt']}")
print(f"Synthesized Text: \n\n{dataset[0]['synthesized text']}")

Generating train split: 50 examples [00:00, 9131.15 examples/s]

Dataset({
    features: ['prompt', 'input persona', 'finish_reason', 'synthesized text'],
    num_rows: 50
})
Input Prompt: 

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

A Political Analyst specialized in El Salvador's political landscape.
 
Assume you are the persona described above and I want you to act as a stand-up comedian. Write content that reflects your unique voice, expertise, and humor, tailored to your specific field.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Synthesized Text: 

(clears throat) Ah, ¡Hola, amigos! So, I know what you're thinking: "A political analyst as a stand-up comedian? ¡Eso es una locura!" But trust me, I've got the credentials – I've spent years studying the intricacies of El Salvador's political landscape, and I've got jokes for days! (winks)

So, have you heard about the latest developments in the FMLN party? It's like a soap opera! (chuckle


