### Getting Started

To begin, install the SDG Hub examples.

In [None]:
# Install the sdg-hub package together with its "examples" extra.
%pip install sdg-hub[examples]

In [None]:
from ai_tools.usecase.knowledge_tuning.knowledge_utils import create_knowledge_regular_ds, create_knowledge_pretraining_ds
from pathlib import Path

# Workspace root: the parent of the current working directory.
WORKSPACE = Path.cwd().parent

# Outputs of this step are written under <workspace>/output/step_02;
# the directory is created when it does not exist yet.
OUTPUT_DIR = WORKSPACE.joinpath("output", "step_02")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed data file generated in step 1; stop early when it is missing.
SEED_DATA_FILE = WORKSPACE.joinpath("output", "step_01", "final_seed_data.jsonl")
if not SEED_DATA_FILE.exists():
    raise FileNotFoundError(
        f"\nNot a valid seed data ! {SEED_DATA_FILE}."
        "\nPlease run step 1 to generate the seed data. "
        "\n(or) Provide the correct path to the seed data file."
    )

# CONFIGURE MODEL DETAILS HERE FOR THE FLOW
MODEL_NAME = "openai/llama-4-scout-17b-16e-w4a16"
API_KEY = ""  # Provide your API key here
ENDPOINT = "https://llama-4-scout-17b-16e-w4a16-maas-apicast-production.apps.prod.rhoai.rh-aiservices-bu.com:443/v1"

### Run SDG
- This creates a knowledge flow from the provided YAML file.
- We will run it on a small dataset for demo purposes.
- For large-scale generation, please use the Python command provided in the next cell.
- You can analyze the generated data to ensure its quality is similar to that of the provided QnA pairs.

#### Discover the available generation flows

In [None]:
from datasets import load_dataset
from sdg_hub import Flow, FlowRegistry

# Required to run the flow with async mode
import nest_asyncio

# Needed so the flow can run in async mode from within the notebook.
nest_asyncio.apply()

# Populate the registry by auto-discovering the available flows (no setup needed).
FlowRegistry.discover_flows()

# Show every flow the registry knows about.
flows = FlowRegistry.list_flows()
print("Available flows:", flows)

# Flows can also be looked up by tag.
qa_flows = FlowRegistry.search_flows(tag="question-generation")
print("QA flows:", qa_flows)

In [None]:
# We will use the "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning" flow.
# A flow is loaded by looking up its full name in the registry.
flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
flow_path = FlowRegistry.get_flow_path(flow_name)

# Fail with a clear message when the name is not registered, rather than
# handing an empty path to Flow.from_yaml and getting an obscure error.
if not flow_path:
    raise ValueError(
        f"Flow {flow_name!r} was not found in the registry. "
        "Run FlowRegistry.discover_flows() and check FlowRegistry.list_flows() "
        "for the available flow names."
    )

flow = Flow.from_yaml(flow_path)

#### Identify the recommended model and set the model config

In [None]:
# Display the default model reported by the flow.
flow.get_default_model()

In [None]:
# Display the model recommendations reported by the flow.
flow.get_model_recommendations()

In [None]:
# You can dynamically change the model without having to change the flow yaml file.
# Here the flow is pointed at the model, endpoint and API key defined in the
# setup cell (MODEL_NAME, ENDPOINT, API_KEY).
flow.set_model_config(
    model=MODEL_NAME,
    api_base=ENDPOINT,
    api_key=API_KEY,
)

In [None]:
# Load the seed data and keep a small shuffled sample for the demo run.
number_of_samples = 2

ds = load_dataset('json', data_files=str(SEED_DATA_FILE), split='train')

# Cap the request at the dataset size: select() raises on out-of-range indices
# when the seed file holds fewer rows than number_of_samples.
sample_count = min(number_of_samples, len(ds))
ds = ds.shuffle(seed=42).select(range(sample_count))

In [None]:
# Generate data: run the configured flow over the sampled seed dataset.
generated_data = flow.generate(ds)

### Converting the generated data into training format

In [None]:
from ai_tools.usecase.knowledge_tuning.knowledge_utils import create_knowledge_regular_ds, create_knowledge_pretraining_ds

from datasets import concatenate_datasets



# Create the Pretraining Knowledge Dataset (also known as Phase 0.7/Phase 7)
# from the generated data and save it as JSON Lines in OUTPUT_DIR.
instructlab_phase_1_ds = create_knowledge_pretraining_ds(generated_data)
instructlab_phase_1_ds.to_json(f'{OUTPUT_DIR}/instructlab_phase_1_ds.jsonl', orient='records', lines=True)

# Create the Regular Knowledge Dataset (also known as Phase 1.0/Phase 10).
instructlab_phase_2_ds = create_knowledge_regular_ds(generated_data)

# Save the regular knowledge dataset as JSON Lines. No mixing happens in this cell.
# Pre-computed skills, additional generated datasets, or generated instruction data
# can be concatenated with this dataset before it is used for training.
# If you only have generated skills, phase 07 generation and training can be skipped.
instructlab_phase_2_ds.to_json(f'{OUTPUT_DIR}/instructlab_phase_2_ds.jsonl', orient='records', lines=True)

In [None]:
# Optional: blend another instruction tuning dataset into the phase 2 dataset.
# Replace the placeholder below with the path to your own JSON dataset file first.
instruction_tuning_dataset_path = "<Your instruction tuning dataset path>"
instruction_tuning_dataset = load_dataset(
    'json',
    data_files=instruction_tuning_dataset_path,
    split='train',
)

# Append the extra rows to the phase 2 dataset, then rewrite the phase 2 file
# with the combined data.
datasets_to_mix = [instructlab_phase_2_ds, instruction_tuning_dataset]
instructlab_phase_2_ds = concatenate_datasets(datasets_to_mix)
instructlab_phase_2_ds.to_json(f'{OUTPUT_DIR}/instructlab_phase_2_ds.jsonl', orient='records', lines=True)