In [None]:
%pip install treelib -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered. The
# notebook kernel runs its own loop, and later cells call the pack through
# both its synchronous and its awaited entry points.
nest_asyncio.apply()

In [None]:
from llama_index.core.instrumentation.span_handlers import SimpleSpanHandler
import llama_index.core.instrumentation as instrument

# Attach a SimpleSpanHandler to the dispatcher returned by get_dispatcher().
# `span_handler` is kept as a notebook-level name because a later cell calls
# span_handler.print_trace_trees() to show the recorded spans.
span_handler = SimpleSpanHandler()
dispatcher = instrument.get_dispatcher()
dispatcher.add_span_handler(span_handler)

In [None]:
from llama_index.core.llama_dataset.simple import LabelledSimpleDataset
from llama_index.packs.diff_private_simple_dataset.base import PromptBundle
from llama_index.packs.diff_private_simple_dataset import DiffPrivateSimpleDatasetPack
from llama_index.llms.openai import OpenAI
import tiktoken

### Load LabelledSimpleDataset

In [None]:
# Load the labelled examples from a local JSON file. The path is relative, so
# agnews.json must sit in the notebook's working directory.
simple_dataset = LabelledSimpleDataset.from_json("./agnews.json")

In [None]:
# Preview only the first five rows rather than rendering the whole frame.
simple_dataset.to_pandas().head(5)

Unnamed: 0,reference_label,text,text_by
0,Business,Wall St. Bears Claw Back Into the Black (Reute...,human
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,human
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...,human
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,human
4,Business,"Oil prices soar to all-time record, posing new...",human


In [None]:
# Number of examples per reference label.
simple_dataset.to_pandas().value_counts(subset="reference_label")

reference_label
Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: count, dtype: int64

### Instantiate Pack

In [None]:
# Single source of truth for the model name: the tokenizer must be the one
# that belongs to the model the LLM calls, so both are derived from this
# constant instead of repeating the string literal.
MODEL_NAME = "gpt-3.5-turbo-instruct"

# The LLM is configured to return a single token per call together with the
# top-5 log-probabilities for that token.
llm = OpenAI(
    model=MODEL_NAME,
    max_tokens=1,
    logprobs=True,
    top_logprobs=5,
)
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

# Prompt pieces handed to the pack. The instruction text refers to 'Text: ',
# which corresponds to the `text_heading` value below.
prompt_bundle = PromptBundle(
    instruction=(
        "Given a label of news type, generate the chosen type of news accordingly.\n"
        "Start your answer directly after 'Text: '. Begin your answer with [RESULT].\n"
    ),
    label_heading="News Type",
    text_heading="Text",
)

# Wire the LLM, tokenizer, prompt bundle and the loaded dataset into the pack.
dp_simple_dataset_pack = DiffPrivateSimpleDatasetPack(
    llm=llm,
    tokenizer=tokenizer,
    prompt_bundle=prompt_bundle,
    simple_dataset=simple_dataset,
)

In [None]:
# Generate one synthetic "Sports" example through the synchronous entry point.
dp_simple_dataset_pack.generate_dp_synthetic_example(
    label="Sports",
    t_max=35,
    sigma=0.1,
    num_splits=2,
    num_samples_per_split=8,
)

100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:30<00:00,  1.14it/s]


LabelledSimpleDataExample(reference_label='Sports', text='The 2021 NBA Finals are set to begin on Tuesday, July 6th, with the Phoenix Suns facing off against the Milwaukee Bucks. The Suns are', text_by=CreatedBy(model_name='gpt-3.5-turbo-instruct', type=<CreatedByType.AI: 'ai'>))

In [None]:
await dp_simple_dataset_pack.agenerate_dp_synthetic_example(
    label="Sports", t_max=35, sigma=0.1, num_splits=2, num_samples_per_split=8
)

100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:30<00:00,  1.17it/s]


LabelledSimpleDataExample(reference_label='Sports', text='The latest sports news: The New York Yankees have clinched the American League East division title for the first time since 2012. The team celebrated their victory', text_by=CreatedBy(model_name='gpt-3.5-turbo-instruct', type=<CreatedByType.AI: 'ai'>))

In [None]:
# Build a small synthetic dataset: one example each for "World" and "Sports",
# none for the other two labels.
synthetic_dataset = dp_simple_dataset_pack.run(
    sizes={
        "World": 1,
        "Sports": 1,
        "Sci/Tech": 0,
        "Business": 0,
    },
    t_max=5,
    sigma=0.5,
    num_splits=3,
    num_samples_per_split=8,
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.30s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.05s/it]


In [None]:
# Display the generated examples as a DataFrame.
synthetic_dataset.to_pandas()

Unnamed: 0,reference_label,text,text_by
0,Sports,[RESULT]The,ai (gpt-3.5-turbo-instruct)
1,World,[RESULT]The United,ai (gpt-3.5-turbo-instruct)


In [None]:
# Print the raw text of the first generated example (positional row 0).
print(synthetic_dataset.to_pandas()["text"].iloc[0])

[RESULT]The 


In [None]:
# Print the raw text of the second generated example (positional row 1).
print(synthetic_dataset.to_pandas()["text"].iloc[1])

 [RESULT]The United


In [None]:
# Print the span trees recorded by the handler registered near the top of the
# notebook; each node shows a span id followed by a number in parentheses.
span_handler.print_trace_trees()

DiffPrivateSimpleDatasetPack.generate_dp_synthetic_example-9656d8cd-d7ab-4e7e-8e69-fce6a5562a42 (30.8202)
└── DiffPrivateSimpleDatasetPack.agenerate_dp_synthetic_example-1f965d26-815f-4caf-a2ad-7033debd485e (30.819737)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-77f10b72-ce52-402a-8198-9811605c789e (0.091823)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-0c1abe3f-6a98-4072-bff7-4f9eec6c8ada (0.006922)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-8dce00d4-6c27-4aef-97b3-5ff01226965a (0.096537)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-7d645e59-4453-460b-ba51-5fe6de72d9fc (0.006868)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-a9f0bd05-b7b2-48e3-bfe8-137868e32b94 (0.088195)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-621ae7be-92d5-4b59-b863-df196e315dc1 (0.006807)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-7106660e-75c7-4064-b942-747182c9e969 (0.092613)
    ├── DiffPrivateSimpleDatasetPack._spl