In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
```bash 
git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
cd sdg_hub
pip install .[examples]
```
**⚠️ If you haven't already, run the document pre-processing notebook to create the seed data.**

In [None]:
# Third Party
from datasets import load_dataset

# First Party
from sdg_hub import Flow, FlowRegistry

In [None]:
# Required to run the flow in async mode (see `async_mode` in the model
# configuration below). nest_asyncio.apply() patches asyncio so an event loop
# that is already running (as in Jupyter) can be re-entered.
import nest_asyncio

nest_asyncio.apply()  

In [None]:
# Record when this notebook run started. `timestamp` is a compact
# YYYYMMDD-HHMMSS string that can be appended to output names to keep runs apart.
from datetime import datetime

now = datetime.now()
timestamp = f"{now:%Y%m%d-%H%M%S}"

### Configurations

#### Configure seed data

In [None]:
# Label appended (as "_<data_name>") to the seed-data and output directory names
# built in the next cell; leave empty to use the plain directory names.
data_name = ""  # an arbitrary label for seed data (can be empty)

# Language of the seed data. "ja" selects the Japanese generation flow further
# below; any other value selects the default flow.
data_lang = "ja"  # for Japanese seed data
# data_lang = ""

# How many times the seed data is repeated before generation. Values above 1
# duplicate the dataset and add an "_r<N>" suffix to the output directory name.
repeat_times = 1
# repeat_times = 5  # for using the same seed data multiple times

In [None]:
# Optional name suffixes: each stays empty unless the matching setting is in use.
_data_name = ""
if data_name:
    _data_name = f"_{data_name}"

_repeat_times = ""
if repeat_times > 1:
    _repeat_times = f"_r{repeat_times}"

# Base directory name shared by the seed data and the generated outputs.
sdg_demo_output = "sdg_demo_output"

# The seed data location ignores the repeat suffix; the output prefix includes it.
seed_data_path = sdg_demo_output + _data_name + "/seed_data.jsonl"
output_dir_prefix = "".join((sdg_demo_output, _data_name, _repeat_times))

#### Configure model

In [None]:
# HuggingFace model names (full repository ids), one per supported model.
phi4_model_name_hf = "microsoft/phi-4"
llama3_model_name_hf = "meta-llama/Llama-3.3-70B-Instruct"
mixtral_model_name_hf = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Model short names: used to select a model below and as a suffix of the
# output directory name.
phi4_short_name = "phi4"
llama3_short_name = "llama3"
mixtral_short_name = "mixtral"

In [None]:
# Select a model by its short name. Must be one of phi4_short_name,
# llama3_short_name or mixtral_short_name; the next cell raises ValueError otherwise.
short_name = phi4_short_name

In [None]:
# Resolve the selected short name to its HuggingFace model name.
# The table is scanned in order, so the first matching entry wins.
_model_choices = (
    (phi4_short_name, phi4_model_name_hf),
    (llama3_short_name, llama3_model_name_hf),
    (mixtral_short_name, mixtral_model_name_hf),
)
_matching_models = [hf_name for name, hf_name in _model_choices if short_name == name]
if not _matching_models:
    raise ValueError(f"Invalid short_name: {short_name}. Must be one of: {phi4_short_name}, {llama3_short_name}, {mixtral_short_name}")
model_name_hf = _matching_models[0]

# The directory name has no timestamp, so re-running after a failure reuses the
# same locations. Switch to the commented variant to get a fresh directory per run.
output_dir = f"{output_dir_prefix}_{short_name}"  # for continued execution after failure
# output_dir = f"{output_dir_prefix}_{short_name}_{timestamp}"

In [None]:
# Both values are passed to flow.set_model_config() further below.
async_mode = True
# async_mode = False  # single worker
timeout = 3600  # request timeout; presumably in seconds -- confirm against the sdg_hub docs

#### Configure generation

In [None]:
# Checkpoints go to a sibling directory of the output directory ("<output_dir>_ckpt").
checkpoint_dir = f"{output_dir}_ckpt"
# See https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/docs/blocks/llm-blocks.md#async-processing--concurrency-control
# All three values are passed to flow.generate() further below.
save_freq = 10  # checkpoint interval, in requests
max_concurrency = 20  # (async_mode only) max in-flight requests

### Run SDG
- This creates the knowledge flow from the provided YAML file.
- We run it on a small dataset for demo purposes.
- For large-scale generation, please use the Python command provided in the next cell.
- You can analyze the generated data to ensure its quality is similar to that of the provided QnA pairs.

#### Discover the available generation flows

In [None]:
# Auto-discover all available flows (no setup needed!)
FlowRegistry.discover_flows()

# List every available flow and print the result
flows = FlowRegistry.list_flows()
print(f"Available flows: {flows}")

# You can also search the flows by tag; here, only those tagged "question-generation"
qa_flows = FlowRegistry.search_flows(tag="question-generation")
print(f"QA flows: {qa_flows}")

In [None]:
# We use the "Advanced Document Grounded Question-Answer Generation Flow for
# Knowledge Tuning" flow, or its Japanese variant when data_lang is "ja".
# A flow is looked up in the registry by its full name and loaded from the
# YAML file that the registry returns for it.
flow_name = (
    "Advanced Japanese Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
    if data_lang == "ja"
    else "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
)
flow_path = FlowRegistry.get_flow_path(flow_name)
flow = Flow.from_yaml(flow_path)

#### Identify the recommended model and set the model config

In [None]:
# Display the value returned by the flow's get_default_model() as the cell output.
flow.get_default_model()

In [None]:
# Display the value returned by the flow's get_model_recommendations() as the cell output.
flow.get_model_recommendations()

In [None]:
# You can dynamically change the model without having to change the flow yaml file.
# Configure the flow to use a vLLM model hosted at localhost:8000/v1.
# The model is addressed as "hosted_vllm/<HuggingFace model name>", using the
# model selected in the configuration cells above.
# NOTE(review): api_key="EMPTY" looks like a placeholder for a local server that
# does not check keys -- confirm, and never put a real key here.
flow.set_model_config(
    model=f"hosted_vllm/{model_name_hf}",
    api_base="http://localhost:8000/v1",
    api_key="EMPTY",
    async_mode=async_mode,
    timeout=timeout,
)

#### Load and prepare seed data

In [None]:
# Load the seed data from the JSON Lines file at seed_data_path.
# split='train' makes load_dataset return a single dataset bound to `ds`.
ds = load_dataset('json', data_files=seed_data_path, split='train')

Repeat (duplicate) seed data

In [None]:
# Repeat the seed data only when repeat_times is greater than 1.
# NOTE: this reassigns `ds`, so running the cell again without reloading the
# seed data repeats the already-repeated dataset.
if repeat_times > 1:
    ds = ds.repeat(repeat_times)

Shuffle seed data

In [None]:
# Shuffle with a fixed seed so the row order is the same on every run.
ds = ds.shuffle(seed=42)

(Optional) sample seed data

In [None]:
# Keep only the first `number_of_samples` rows of the seed data for a quick demo run.
# The count is clamped to the dataset size, so asking for more rows than are
# available keeps every row instead of raising an out-of-range index error.
number_of_samples = 2
ds = ds.select(range(min(number_of_samples, len(ds))))

Add seed id

In [None]:
# Add seed_id column to preserve repetition in seed data
# See https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/42650f1340a2d3576818d68e05508dfe2a8d04bd/src/sdg_hub/checkpointer.py#L103
# Drop any existing seed_id column first: adding a column whose name already
# exists fails, which would break re-running this cell or using seed data that
# already carries a seed_id.
if "seed_id" in ds.column_names:
    ds = ds.remove_columns("seed_id")
# One sequential id per row, so repeated copies of the same seed row stay distinct.
ds = ds.add_column("seed_id", list(range(len(ds))))

In [None]:
# Report how many seed rows will be sent to the flow; flush so the message is emitted immediately.
print(f"Loaded {len(ds)} seed data", flush=True)

In [None]:
# Run the flow over the prepared seed data, using the checkpoint and
# concurrency settings from the configuration cells above.
generated_data = flow.generate(
    ds,
    checkpoint_dir=checkpoint_dir,
    save_freq=save_freq,
    max_concurrency=max_concurrency,
)

### Converting the generated data into training format

In [None]:
from datasets import Dataset

def create_simple_qa_dataset(generated_data: Dataset) -> Dataset:
    """Convert generated QA rows into a deduplicated chat-format dataset.

    Each row's ``question`` and ``response`` fields become a two-turn
    conversation (a ``user`` message followed by an ``assistant`` message)
    stored under a ``messages`` key. Rows whose question/response pair has
    already been seen are dropped; the first occurrence is kept and the
    input order is preserved.

    Args:
        generated_data: Rows to convert; each must provide ``question`` and
            ``response``.

    Returns:
        A ``Dataset`` with one ``messages`` record per unique pair.
    """
    # Keyed by the (question, response) pair; dicts keep insertion order, so
    # the values come out in first-seen order.
    unique_records: dict[tuple, dict] = {}
    for row in generated_data:
        question, answer = row['question'], row['response']
        pair = (question, answer)
        if pair in unique_records:
            continue
        unique_records[pair] = {
            "messages": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        }
    return Dataset.from_list(list(unique_records.values()))

In [None]:
import os

# Convert the generated rows into chat-format messages and write them to
# <output_dir>/messages_data.jsonl, creating the directory if needed.
messages_data = create_simple_qa_dataset(generated_data)
os.makedirs(output_dir, exist_ok=True)
# force_ascii=False keeps non-ASCII text (e.g. Japanese) as readable UTF-8
# instead of \uXXXX escape sequences; the parsed JSON content is unchanged.
messages_data.to_json(f"{output_dir}/messages_data.jsonl", force_ascii=False)