# 📒 Overview

- Before running the notebook, make sure you select the ```vllm-xpu``` environment as the kernel. If no such kernel is present, please follow the setup instructions [here](../vllm-setup/).

- The notebook helps you create Synthetic data for **Comedy dialogue generation** 😅.

- Among the various prompt templates in ```prompt_templates.py```, this notebook uses ```comedy_template```. In the same way, you can write your own custom template and import it accordingly.

## ⌛Load necessary Imports

* Import vLLM and required packages.
* Import templates from the ```prompt_templates``` file. You can make any necessary changes to that file and import your custom templates.

In [None]:
import json
from tqdm import tqdm
import torch
import intel_extension_for_pytorch as ipex #Include XPU namespace
from transformers import AutoTokenizer
from datasets import load_dataset
from vllm import LLM, SamplingParams
from prompt_templates import comedy_template, instruction_template, knowledge_template # You can add your custom template under code/prompt_templates.py

* Below are a few helper functions to format the user prompt and to truncate the data to avoid out-of-memory (OOM) errors.

In [None]:
def request_input_format(user_prompt, tokenizer):
    """
        Wrap a user prompt in a system + user conversation and render it
        with the tokenizer's chat template.
        {user_prompt}: Text placed in the "user" turn of the conversation.
        {tokenizer}: Tokenizer of the model; must provide apply_chat_template.
        return: The result of tokenizer.apply_chat_template, requested
                untokenized and with the generation prompt appended.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

def truncate_text(text, max_length):
    """
    Shorten a prompt so that overly long inputs do not cause OOM issues.
    {text}: Input prompt
    {max_length}: Length (in characters) at which the prompt is cut.
    return: The prompt unchanged when it is not longer than max_length,
            otherwise its leading slice text[:max_length].
    """
    # Inputs that are already short enough are handed back untouched.
    if len(text) <= max_length:
        return text
    return text[:max_length]

## 🧪 Choose the template and Run generation

* Select a template from the ```prompt_templates.py```

* Define the sample size, which describes how many samples of synthetic data are generated from the whole dataset.

* Select the model, based on your hardware capacity and VRAM.

In [None]:
# Notebook-level settings read by main().
CHOICE_TEMPLATE="comedy"   # main() accepts "instruction", "knowledge" or "comedy"; any other value raises ValueError. You can also add your customized data synthesis prompt in code/prompt_templates.py
SAMPLE_SIZE=10  # Number of personas to process. Set SAMPLE_SIZE=0 if you want to use the full version of 200k personas.
OUT_PATH=f"{CHOICE_TEMPLATE}_{SAMPLE_SIZE}_synthesis_output.jsonl"  # Output file name, derived from the template choice and the sample size.
MODEL_PATH="meta-llama/Meta-Llama-3-70B" # feel free to replace it with any other open-sourced LLMs supported by vllm, Ex: "meta-llama/Meta-Llama-3-8B-Instruct".

# Only touch the XPU cache when an XPU (Intel GPU) is reported as available.
if torch.xpu.is_available():
    torch.xpu.empty_cache() # Query for XPU(Intel GPU) and empty the cache.

def main():
    """
    Generate synthetic text for PersonaHub personas and save it as JSON Lines.

    Reads the module-level settings CHOICE_TEMPLATE, SAMPLE_SIZE, MODEL_PATH
    and OUT_PATH. Each selected persona is truncated, inserted into the chosen
    template, wrapped in the chat format and sent to vLLM. One JSON object per
    persona is then written to OUT_PATH with the keys 'prompt',
    'input persona', 'finish_reason' and 'synthesized text'.

    raises: ValueError if CHOICE_TEMPLATE does not name a supported template.
    """
    # Supported template choices; extend this mapping when importing a custom template.
    templates = {
        "instruction": instruction_template,
        "knowledge": knowledge_template,
        "comedy": comedy_template,
    }
    if CHOICE_TEMPLATE not in templates:
        raise ValueError("Invalid template type. Choose from 'instruction', 'knowledge' or 'comedy', or create a custom template and add to the imports.")
    template = templates[CHOICE_TEMPLATE]

    # Load the dataset
    persona_dataset = load_dataset("proj-persona/PersonaHub", data_files="persona.jsonl")['train']

    # Pick the sample first so that only the personas actually used get processed.
    if SAMPLE_SIZE > 0:
        raw_personas = persona_dataset[:SAMPLE_SIZE]['persona']
    else:
        raw_personas = persona_dataset['persona']

    max_char_length = 1024  # Setting a max length to data input, to avoid OOM issues.
    # Materialize the cleaned personas once; the same list feeds both the
    # prompts and the output records, which keeps them aligned.
    personas = [truncate_text(raw, max_char_length).strip() for raw in raw_personas]
    print(f"Total number of input personas: {len(personas)}")

    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    llm = LLM(model=MODEL_PATH) # please set tensor_parallel_size based on the GPUs you are using

    max_len = 2048  # Upper bound on generated tokens per prompt.
    prompts = [
        request_input_format(template.format(persona=persona), tokenizer)
        for persona in personas
    ]

    print(f"Loaded {len(prompts)} entries to process...\n\n")
    if prompts:  # Nothing to preview when the input is empty.
        print(f"Sample 0: {prompts[0]}")

    # "<|eot_id|>" is the end-of-turn marker of the Llama 3 chat format;
    # adjust the stop list when switching to a different model family.
    sampling_params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=max_len, stop=["<|eot_id|>"])
    outputs = llm.generate(prompts, sampling_params)

    # ensure_ascii=False writes raw non-ASCII characters, so fix the encoding
    # explicitly instead of relying on the platform default.
    with open(OUT_PATH, 'w', encoding='utf-8') as out:
        for persona, output in zip(personas, outputs):
            completion = output.outputs[0]
            data = {
                'prompt': output.prompt,
                "input persona": persona,
                "finish_reason": completion.finish_reason,
                'synthesized text': completion.text,
            }
            out.write(json.dumps(data, ensure_ascii=False) + '\n')

    print(f"Outputted the results to: {OUT_PATH}")

if __name__ == "__main__":
    main()

## 👀 View the generated Synthetic Dataset

In [None]:
# Reload the generated JSON Lines file and preview its first record.
# To inspect a different file, pass its path as data_files instead,
# e.g. data_files="comedy_synthesis_10.jsonl".
dataset = load_dataset("json", data_files=OUT_PATH)['train']
print(dataset)

first_record = dataset[0]
print(f"\n\nInput Prompt: \n\n{first_record['prompt']}")
print(f"Synthesized Text: \n\n{first_record['synthesized text']}")