### Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
    !pip install synthetic-data-kit==0.0.3
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
    !pip install synthetic-data-kit==0.0.3


In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Installs Unsloth + vLLM. Outside Colab a plain pip install is used; inside
# Colab the dependencies are installed by hand (see the comments below).
# NOTE(review): stock IPython only recognises a cell magic on the first line of
# a cell, and here %%capture follows the #@title line — confirm it behaves as intended.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab:
    # drop every already-imported module whose name contains "PIL" or "google"
    # from sys.modules before the pip installs below run.
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    # Installed with --no-deps so pip does not pull in their dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Download vLLM's common requirements file from its main branch, remove the
    # transformers / numpy / xformers entries (from the package name up to the
    # end of that line), and install whatever remains.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

## Start vLLM server

In [None]:
from unsloth.dataprep import SyntheticDataKit

# Build the synthetic-data generator.
# - model_name: choose any model from https://huggingface.co/unsloth
# - max_seq_length: longer sequence lengths will be slower!
generator = SyntheticDataKit.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
)

Unsloth: Your GPU cannot handle sequence lengths of 256 due to limited GPU memory.
Unsloth: Your GPU can only handle approximately the maximum sequence length of 256.
Unsloth: Using dtype = torch.float16 for vLLM.
Unsloth: vLLM loading unsloth/Llama-3.2-3B-Instruct with actual GPU utilization = 11.91%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 256. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 0.0 GB. Also swap space = 0 GB.
vLLM STDOUT: INFO 05-21 18:36:42 [__init__.py:239] Automatically detected platform cuda.
vLLM STDOUT: INFO 05-21 18:36:53 [api_server.py:1043] vLLM API server version 0.8.5.post1
vLLM STDOUT: INFO 05-21 18:36:53 [api_server.py:1044] args: Namespace(subparser='serve', model_tag='unsloth/Llama-3.2-3B-Instruct', config='', host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_meth

## Generate QA Pairs + Auto clean data

In [None]:
# Settings used for the QA-pair generation steps further down.
generator.prepare_qa_generation(
    output_folder="data",       # Output location of synthetic data
    temperature=0.7,            # Higher temp makes more diverse datasets
    top_p=0.95,
    overlap=64,                 # Overlap portion during chunking
    max_generation_tokens=512,  # Can increase for longer QA pairs
)

### Sanity Checks

In [None]:
# Ask synthetic-data-kit to check that it can reach the vLLM server
# (it probes http://localhost:8000/v1, as shown in the output below).
!synthetic-data-kit system-check

[2K[31mL VLLM server is not available at [0m[4;94mhttp://localhost:8000/v1[0m
[2KError: [1;35mHTTPConnectionPool[0m[1m([0m[33mhost[0m=[32m'localhost'[0m, [33mport[0m=[1;36m8000[0m[1m)[0m: Read timed out. [1m([0mread 
[33mtimeout[0m=[1;36m2[0m[1m)[0m
[2K
[33mTo start the server, run:[0m
[2K[1;34mvllm serve meta-llama/Llama-[0m[1;36m3.3[0m[1;34m-70B-Instruct --port [0m[1;36m8000[0m
[2K[32m⠼[0m Checking VLLM server at http://localhost:8000/v1...
[1A[2K

In [None]:

import requests

# Send one small chat request to the local server's chat-completions endpoint
# to confirm it answers before starting the long-running generation steps.
url = "http://localhost:8000/v1/chat/completions"

payload = {
    "model": "unsloth/Llama-3.2-3B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"}
    ],
    "temperature": 0.7,
    "max_tokens": 100
}

try:
    # requests has no default timeout: without one this call blocks forever
    # if the server accepts the connection but never replies.
    response = requests.post(url, json=payload, timeout=120)
except requests.exceptions.RequestException as exc:
    # Server not started, still loading, or too slow to answer in time.
    print(f"Error: request to {url} failed: {exc}")
else:
    # Pretty-print the result
    if response.status_code == 200:
        print(response.json()["choices"][0]["message"]["content"])
    else:
        print(f"Error {response.status_code}: {response.text}")


### Ingest

In [None]:
!synthetic-data-kit ingest /content/acuff.pdf
!synthetic-data-kit ingest /content/le.pdf
!synthetic-data-kit ingest /content/paige.pdf
!synthetic-data-kit ingest /content/osborne.pdf

!synthetic-data-kit ingest /content/YT_acuff.pdf
!synthetic-data-kit ingest /content/YT_le.pdf
!synthetic-data-kit ingest /content/YT_paige.pdf
!synthetic-data-kit ingest /content/YT_osborne.pdf

[?25l[32m⠋[0m Processing /content/acuff.pdf...[2K[32m⠙[0m Processing /content/acuff.pdf...[2K[32m⠹[0m Processing /content/acuff.pdf...
[1A[2K[32m Text successfully extracted to [0m[1;32mdata/output/acuff.txt[0m
[2K[32m⠙[0m Processing /content/le.pdf...
[1A[2K[32m Text successfully extracted to [0m[1;32mdata/output/le.txt[0m
[2K[32m⠙[0m Processing /content/paige.pdf...
[1A[2K[32m Text successfully extracted to [0m[1;32mdata/output/paige.txt[0m
[2K[32m⠹[0m Processing /content/osborne.pdf...
[1A[2K[32m Text successfully extracted to [0m[1;32mdata/output/osborne.txt[0m


### Generate

#### Custom Bios text

In [None]:
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/acuff.txt  \
        --num-pairs 25 \
        --type "qa"
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/le.txt  \
        --num-pairs 25 \
        --type "qa"

[2KProcessing 2 chunks to generate QA pairs...
[2KBatch processing complete.
[2KGenerated 25 QA pairs total
[2KSaving result to data/generated/acuff_qa_pairs.json
[2KSuccessfully wrote test file to data/generated/test_write.json
[2KSuccessfully wrote result to data/generated/acuff_qa_pairs.json
[2K[32m⠧[0m Generating qa content from /content/data/output/acuff.txt...
[1A[2K[32m Content saved to [0m[1;32mdata/generated/acuff_qa_pairs.json[0m
[2KProcessing 2 chunks to generate QA pairs...
[2KBatch processing complete.
[2KGenerated 25 QA pairs total
[2KSaving result to data/generated/le_qa_pairs.json
[2KSuccessfully wrote test file to data/generated/test_write.json
[2KSuccessfully wrote result to data/generated/le_qa_pairs.json
[2K[32m⠇[0m Generating qa content from /content/data/output/le.txt...
[1A[2K[32m Content saved to [0m[1;32mdata/generated/le_qa_pairs.json[0m
[2K[32m⠋[0m Generating qa content from /content/data/output/paige.txt...
[1A[2K[31mL Err

In [None]:
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/paige.txt  \
        --num-pairs 25 \
        --type "qa"
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/osborne.txt  \
        --num-pairs 25 \
        --type "qa"

#### YouTube Transcripts

In [None]:
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/YT_acuff.txt  \
        --num-pairs 25 \
        --type "qa"
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/YT_le.txt  \
        --num-pairs 25 \
        --type "qa"

In [None]:
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/YT_paige.txt  \
        --num-pairs 25 \
        --type "qa"
!synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create /content/data/output/YT_osborne.txt  \
        --num-pairs 25 \
        --type "qa"

### Convert to HF format

In [None]:
qa_pairs_filenames = os.listdir('/content/data/generated')
for filename in qa_pairs_filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        save-as {filename} -f ft

[?25l[32m⠋[0m Converting data/generated/arxiv_org_0_qa_pairs.json to ft format with json 
storage...
[1A[2K[1A[2K[32m Converted to ft format and saved to [0m[1;32mdata/final/arxiv_org_0_qa_pairs_ft.json[0m
[?25l[32m⠋[0m Converting data/generated/arxiv_org_1_qa_pairs.json to ft format with json 
storage...
[1A[2K[1A[2K[32m Converted to ft format and saved to [0m[1;32mdata/final/arxiv_org_1_qa_pairs_ft.json[0m
[?25l[32m⠋[0m Converting data/generated/arxiv_org_2_qa_pairs.json to ft format with json 
storage...
[1A[2K[1A[2K[32m Converted to ft format and saved to [0m[1;32mdata/final/arxiv_org_2_qa_pairs_ft.json[0m


In [None]:
# Left commented out so the server is not stopped by accident.
# Uncomment and run once all generation is finished: per the output below,
# this call terminates the vLLM server.
#generator.cleanup()

Attempting to terminate the VLLM server gracefully...
Server terminated gracefully.
