In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

### Install SDG
```bash 
git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
cd sdg_hub
pip install ".[examples]"
```
**⚠️ If you haven't already, run the document pre-processing notebook to create the seed data.**

In [None]:
# Third Party
from datasets import load_dataset

# First Party
from sdg_hub import Flow, FlowRegistry

In [None]:
# Required to run the flow in async mode.
# nest_asyncio patches asyncio so that an already-running event loop (the Jupyter
# kernel has one) can be re-entered.
import nest_asyncio

nest_asyncio.apply()

### Run SDG
- This will create a knowledge flow from the provided YAML file
- We will run this on a small dataset for demo purposes
- For large-scale generation, please use the Python command provided in the next cell
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

#### Discover the available generation flows

In [None]:
# Auto-discover all available flows (no setup needed!)
FlowRegistry.discover_flows()

# List the flows known to the registry and print them
flows = FlowRegistry.list_flows()
print(f"Available flows: {flows}")

# Flows can also be searched by tag; here we look up the "question-generation" tag
qa_flows = FlowRegistry.search_flows(tag="question-generation")
print(f"QA flows: {qa_flows}")

In [None]:
# We will use the "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning" flow.
# A flow is loaded by its full name: the registry maps the name to a flow path,
# and Flow.from_yaml builds the flow object from that path.
flow_name = (
    "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
)
flow_path = FlowRegistry.get_flow_path(flow_name)
flow = Flow.from_yaml(flow_path)

#### Identify the recommended model and set the model config

In [None]:
# Query the flow's default model; the return value is shown as the cell output.
flow.get_default_model()

In [None]:
# Query the flow's model recommendations; the return value is shown as the cell output.
flow.get_model_recommendations()

In [None]:
# The model can be swapped at runtime; the flow YAML file itself stays untouched.
# Here the flow is pointed at a vLLM-hosted model served at localhost:8000/v1.
vllm_model_config = {
    "model": "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    "api_base": "http://localhost:8000/v1",
    "api_key": "EMPTY",
}
flow.set_model_config(**vllm_model_config)

In [None]:
# Load the seed data and keep a small, reproducible sample of it as a DataFrame.
import os

number_of_samples = 2
seed_data_dir = "sdg_demo_output/"
# os.path.join copes with the trailing slash in seed_data_dir, so the path no
# longer contains the doubled "//" that plain string formatting produced.
seed_data_path = os.path.join(seed_data_dir, "seed_data.jsonl")
seed_ds = load_dataset("json", data_files=seed_data_path, split="train")

# Shuffle with a fixed seed so the same rows are picked on every run, and never
# ask for more rows than the file holds (select() fails on out-of-range indices).
sample_count = min(number_of_samples, len(seed_ds))
ds = seed_ds.shuffle(seed=42).select(range(sample_count)).to_pandas()

In [None]:
# Generate data
# Run the configured flow on the sampled seed rows. The result is what the
# conversion step further down turns into the training datasets.
generated_data = flow.generate(ds)

### Converting the generated data into training format

In [None]:
import sys
import os

# knowledge_utils is expected to live one directory above the current working
# directory, so that directory is put on sys.path before importing from it.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))
from knowledge_utils import create_knowledge_regular_ds, create_knowledge_pretraining_ds

from datasets import concatenate_datasets

output_dir = "sdg_demo_output/"
# Make sure the output directory exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

# Create Pretraining Knowledge Dataset (Also known as Phase 0.7/Phase 7)
instructlab_phase_1_ds = create_knowledge_pretraining_ds(generated_data)
# os.path.join handles the trailing slash in output_dir (no doubled "//" in the path).
instructlab_phase_1_ds.to_json(
    os.path.join(output_dir, "instructlab_phase_1_ds.jsonl"),
    orient="records",
    lines=True,
)

# Create Regular Knowledge Dataset (Also known as Phase 1.0/Phase 10)
instructlab_phase_2_ds = create_knowledge_regular_ds(generated_data)

# The regular knowledge dataset is written as-is here. Mixing it with other data
# (pre-computed skills, additional generated knowledge datasets, or generated
# instruction data) is done with concatenate_datasets in the following cell.
# If you only have generated skills, phase 07 generation and training can be skipped.
instructlab_phase_2_ds.to_json(
    os.path.join(output_dir, "instructlab_phase_2_ds.jsonl"),
    orient="records",
    lines=True,
)

In [None]:
# Optional step: if you have any other instruction tuning datasets, you can mix
# them into the phase 2 dataset. Replace the placeholder below with a real path.
instruction_tuning_dataset_path = "<Your instruction tuning dataset path>"

# While the "<...>" placeholder is still in place there is nothing to load, so the
# step is skipped instead of failing; this keeps "Restart & Run All" working.
path_is_placeholder = instruction_tuning_dataset_path.startswith(
    "<"
) and instruction_tuning_dataset_path.endswith(">")

if path_is_placeholder:
    print("No instruction tuning dataset path set; skipping the optional mixing step.")
else:
    instruction_tuning_dataset = load_dataset(
        "json", data_files=instruction_tuning_dataset_path, split="train"
    )
    # NOTE(review): concatenate_datasets expects datasets.Dataset inputs; confirm
    # that instructlab_phase_2_ds is one (and not a pandas DataFrame).
    # Re-running this cell appends the instruction data again, because the result
    # is bound back to instructlab_phase_2_ds.
    instructlab_phase_2_ds = concatenate_datasets(
        [instructlab_phase_2_ds, instruction_tuning_dataset]
    )
    # Overwrites the phase 2 file with the mixed dataset.
    instructlab_phase_2_ds.to_json(
        f"{output_dir}/instructlab_phase_2_ds.jsonl", orient="records", lines=True
    )