In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
```bash 
git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
cd sdg_hub
pip install ".[examples]"
```
**⚠️ If you haven't already, run the document pre-processing notebook to create the seed data.**

In [2]:
from ai_tools.usecase.knowledge_tuning.knowledge_utils import create_knowledge_regular_ds, create_knowledge_pretraining_ds

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
from pathlib import Path

# Workspace root: the parent of the directory the notebook is run from.
WORKSPACE = Path.cwd().parent

# Everything this step produces is written here; created if it doesn't exist.
OUTPUT_DIR = WORKSPACE / "output" / "step_02"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed data file generated in step 1.
SEED_DATA_FILE = WORKSPACE / "output" / "step_01" / "final_seed_data.jsonl"

# Fail fast with an actionable message instead of a confusing loader error later.
# is_file() (rather than exists()) also rejects a directory sitting at this path.
if not SEED_DATA_FILE.is_file():
    raise FileNotFoundError(f"\nNot a valid seed data ! {SEED_DATA_FILE}.\nPlease run step 1 to generate the seed data. \n(or) Provide the correct path to the seed data file.")

# CONFIGURE MODEL DETAILS HERE FOR THE FLOW
MODEL_NAME = "openai/llama-4-scout-17b-16e-w4a16"
# Do not paste a real key into the notebook (it would be saved and shared with
# the file). Export it before starting Jupyter instead:
#     export SDG_API_KEY="..."
# When the variable is unset the key is the empty string, as before.
API_KEY = os.environ.get("SDG_API_KEY", "")
ENDPOINT = "https://llama-4-scout-17b-16e-w4a16-maas-apicast-production.apps.prod.rhoai.rh-aiservices-bu.com:443/v1"

In [4]:
# Third Party
from datasets import load_dataset

# First Party
from sdg_hub import Flow, FlowRegistry

In [5]:
# Required to run the flow with async mode.
# nest_asyncio.apply() patches asyncio so an event loop can be re-entered;
# presumably needed because the notebook kernel already runs a loop — confirm.
import nest_asyncio

nest_asyncio.apply()

### Run SDG
- This will create a knowledge flow from the provided YAML file
- We will run this on a small dataset for demo purposes
- For large-scale generation, please use the python command provided in the next cell
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

#### Discover the available generation flows

In [6]:
# Auto-discover all available flows (no setup needed!).
# This must run before the registry is queried below.
FlowRegistry.discover_flows()

# List every registered flow; each entry is a dict with an 'id' and a 'name'.
flows = FlowRegistry.list_flows()
print(f"Available flows: {flows}")

# You can also search the flows by tag; the result has the same shape as above.
qa_flows = FlowRegistry.search_flows(tag="question-generation")
print(f"QA flows: {qa_flows}")

Available flows: [{'id': 'green-clay-812', 'name': 'Structured Text Insights Extraction Flow'}, {'id': 'small-rock-799', 'name': 'Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning'}, {'id': 'mild-thunder-748', 'name': 'Detailed Summary Knowledge Tuning Dataset Generation Flow'}, {'id': 'heavy-heart-77', 'name': 'Key Facts Knowledge Tuning Dataset Generation Flow'}, {'id': 'epic-jade-656', 'name': 'Extractive Summary Knowledge Tuning Dataset Generation Flow'}]
QA flows: [{'id': 'small-rock-799', 'name': 'Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning'}, {'id': 'mild-thunder-748', 'name': 'Detailed Summary Knowledge Tuning Dataset Generation Flow'}, {'id': 'heavy-heart-77', 'name': 'Key Facts Knowledge Tuning Dataset Generation Flow'}, {'id': 'epic-jade-656', 'name': 'Extractive Summary Knowledge Tuning Dataset Generation Flow'}]


In [7]:
# We will use the "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning" flow.
# A flow is loaded by its full name: the registry resolves the name to the
# path of the flow's YAML file, and the Flow object is then built from that file.
flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
flow_path = FlowRegistry.get_flow_path(flow_name)
flow = Flow.from_yaml(flow_path)

#### Identify the recommended model and set the model config

In [8]:
# Show the model this flow uses by default.
flow.get_default_model()

'meta-llama/Llama-3.3-70B-Instruct'

In [9]:
# Show the flow's model recommendations: the default plus the compatible and experimental alternatives.
flow.get_model_recommendations()

{'default': 'meta-llama/Llama-3.3-70B-Instruct',
 'compatible': ['microsoft/phi-4', 'mistralai/Mixtral-8x7B-Instruct-v0.1'],
 'experimental': []}

In [10]:
# You can dynamically change the model without having to change the flow yaml file.
# Configure the flow to use the model, endpoint and API key defined in the
# configuration cell (MODEL_NAME, ENDPOINT, API_KEY).
flow.set_model_config(
    model=MODEL_NAME,
    api_base=ENDPOINT,
    api_key=API_KEY,
)

In [11]:
# Load the seed data and keep a small, reproducible sample for the demo run.
# Increase number_of_samples to generate from more of the seed data.
number_of_samples = 2

ds = load_dataset('json', data_files=str(SEED_DATA_FILE), split='train')
# Shuffle with a fixed seed so the same rows are picked on every run, then cap
# the sample size at the dataset size: select() raises IndexError when asked
# for more rows than the seed file contains.
ds = ds.shuffle(seed=42).select(range(min(number_of_samples, len(ds))))

In [12]:
# Generate data: run the configured flow over the sampled seed rows.
# This issues many LLM completion requests (through LiteLLM) to the configured
# endpoint, so it needs network access and a valid API key.
generated_data = flow.generate(ds)

[92m15:43:17 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:17 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 2/2 [00:00<00:00, 244.05 examples/s]


[92m15:43:23 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:23 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 2/2 [00:00<00:00, 258.49 examples/s]


[92m15:43:27 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:27 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 8/8 [00:00<00:00, 639.30 examples/s]


[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:31 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 72/72 [00:00<00:00, 3730.34 examples/s]


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:43:50 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Filter: 100%|██████████| 72/72 [00:00<00:00, 4490.16 examples/s]
Filter: 100%|██████████| 72/72 [00:00<00:00, 5589.71 examples/s]


Map: 100%|██████████| 71/71 [00:00<00:00, 2692.47 examples/s]


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:03 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 71/71 [00:00<00:00, 3514.19 examples/s]
Filter: 100%|██████████| 71/71 [00:00<00:00, 6636.70 examples/s]
Filter: 100%|██████████| 71/71 [00:00<00:00, 5435.61 examples/s]


Map: 100%|██████████| 62/62 [00:00<00:00, 2620.01 examples/s]


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai
[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


[92m15:44:16 - LiteLLM:INFO[0m: utils.py:3293 - 
LiteLLM completion() model= llama-4-scout-17b-16e-w4a16; provider = openai


Map: 100%|██████████| 62/62 [00:00<00:00, 2894.49 examples/s]
Filter: 100%|██████████| 62/62 [00:00<00:00, 5141.40 examples/s]
Filter: 100%|██████████| 62/62 [00:00<00:00, 4421.89 examples/s]


### Converting the generated data into training format

In [13]:
from ai_tools.usecase.knowledge_tuning.knowledge_utils import create_knowledge_regular_ds, create_knowledge_pretraining_ds

from datasets import concatenate_datasets



# Destination files for the two training datasets produced by this step.
phase_1_path = f'{OUTPUT_DIR}/instructlab_phase_1_ds.jsonl'
phase_2_path = f'{OUTPUT_DIR}/instructlab_phase_2_ds.jsonl'

# Pretraining knowledge dataset (also known as Phase 0.7 / Phase 7), written as JSON lines.
instructlab_phase_1_ds = create_knowledge_pretraining_ds(generated_data)
instructlab_phase_1_ds.to_json(phase_1_path, orient='records', lines=True)

# Regular knowledge dataset (also known as Phase 1.0 / Phase 10).
instructlab_phase_2_ds = create_knowledge_regular_ds(generated_data)

# The dataset is written as-is here. Pre-computed skills, additional generated
# knowledge datasets or generated instruction data should be concatenated into
# it before training. If you only have generated skills, phase 07 generation
# and training can be skipped.
instructlab_phase_2_ds.to_json(phase_2_path, orient='records', lines=True)

Map (num_proc=10): 100%|██████████| 47/47 [00:00<00:00, 61.20 examples/s]
Map: 100%|██████████| 47/47 [00:00<00:00, 3523.18 examples/s]
Map: 100%|██████████| 47/47 [00:00<00:00, 11404.82 examples/s]
Filter: 100%|██████████| 47/47 [00:00<00:00, 3899.75 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 2199.62 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 2604.35 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 228.98ba/s]
Map (num_proc=10): 100%|██████████| 47/47 [00:00<00:00, 69.00 examples/s]
Map: 100%|██████████| 47/47 [00:00<00:00, 3162.21 examples/s]
Map: 100%|██████████| 47/47 [00:00<00:00, 6464.20 examples/s]
Filter: 100%|██████████| 47/47 [00:00<00:00, 3482.04 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 2146.71 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 376.47ba/s]


312751

In [None]:
# Optional: if you have any other instruction tuning datasets you can mix them
# with the phase 2 dataset. Replace the placeholder below with the path to
# your JSON dataset.
instruction_tuning_dataset_path = "<Your instruction tuning dataset path>"

if instruction_tuning_dataset_path == "<Your instruction tuning dataset path>":
    # Placeholder left untouched: skip this optional step instead of failing
    # inside load_dataset, so the notebook still runs end to end.
    print("No instruction tuning dataset configured; skipping the optional mixing step.")
else:
    instruction_tuning_dataset = load_dataset('json', data_files=instruction_tuning_dataset_path, split='train')
    # NOTE: instructlab_phase_2_ds is rebound to the mixed dataset, so running
    # this cell twice appends the instruction data twice. Re-run the conversion
    # cell first to start again from the un-mixed dataset.
    instructlab_phase_2_ds = concatenate_datasets([instructlab_phase_2_ds, instruction_tuning_dataset])
    # Overwrites the phase 2 file with the mixed dataset.
    instructlab_phase_2_ds.to_json(f'{OUTPUT_DIR}/instructlab_phase_2_ds.jsonl', orient='records', lines=True)