In [None]:
%pip install treelib -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
import nest_asyncio

# Allow nested use of the already-running notebook event loop.
# NOTE(review): presumably required because the synchronous pack method wraps
# its async counterpart (the trace printed at the end of the notebook shows
# generate_dp_synthetic_example -> agenerate_dp_synthetic_example) — confirm.
nest_asyncio.apply()

In [None]:
from llama_index.core.instrumentation.event_handlers import BaseEventHandler


class MyEventHandler(BaseEventHandler):
    """Demo event handler: prints a short preview of each event and logs it.

    Every handled event is echoed to stdout (truncated) and appended in full,
    one event per line, to ``log.txt`` in the current working directory.
    """

    @classmethod
    def class_name(cls) -> str:
        """Class name."""
        return "MyEventHandler"

    def handle(self, event, **kwargs) -> None:
        """Logic for handling event.

        Args:
            event: The instrumentation event to handle. It must expose a
                ``dict()`` method and a string representation.
            **kwargs: Extra keyword arguments are accepted and ignored so the
                override stays call-compatible with the base handler
                interface, which allows keyword arguments on ``handle``.
        """
        # THIS IS WHERE YOU ADD YOUR LOGIC TO HANDLE EVENTS
        print(
            str(event.dict())[:125] + " ..."
        )  # to avoid printing everything in this notebook
        print("")
        # Explicit UTF-8 so events containing non-ASCII text (e.g. generated
        # completions) are written identically on every platform.
        with open("log.txt", "a", encoding="utf-8") as f:
            f.write(str(event))
            f.write("\n")

In [None]:
from llama_index.core.instrumentation.span_handlers import SimpleSpanHandler
import llama_index.core.instrumentation as instrument

# Register a SimpleSpanHandler on the dispatcher returned by get_dispatcher()
# with no name (presumably the root dispatcher — confirm). `span_handler` is
# kept as a module-level name because a later cell calls
# span_handler.print_trace_trees().
span_handler = SimpleSpanHandler()
dispatcher = instrument.get_dispatcher()
dispatcher.add_span_handler(span_handler)

# Attach the custom event handler to a separate dispatcher named "app".
# NOTE(review): only events that reach the "app" dispatcher will be handled by
# MyEventHandler; verify this is the intended dispatcher for library events.
app_dispatcher = instrument.get_dispatcher("app")
app_dispatcher.add_event_handler(MyEventHandler())

In [None]:
from llama_index.core.llama_dataset.simple import LabelledSimpleDataset
from llama_index.packs.diff_private_simple_dataset.base import PromptBundle
from llama_index.packs.diff_private_simple_dataset import DiffPrivateSimpleDatasetPack
from llama_index.llms.openai import OpenAI
import tiktoken

### Load LabelledSimpleDataset

In [None]:
# Load the labelled dataset from a JSON file located relative to the
# notebook's working directory.
# NOTE(review): no download step for agnews.json is visible in this part of the
# notebook — the file is assumed to already exist locally.
simple_dataset = LabelledSimpleDataset.from_json("./agnews.json")

In [None]:
# Preview the first five rows of the dataset as a DataFrame.
simple_dataset.to_pandas().head(5)

Unnamed: 0,reference_label,text,text_by
0,Business,Wall St. Bears Claw Back Into the Black (Reute...,human
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,human
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...,human
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,human
4,Business,"Oil prices soar to all-time record, posing new...",human


In [None]:
# Count examples per label; the recorded output shows 30000 rows for each of
# the four labels (Business, Sci/Tech, Sports, World).
simple_dataset.to_pandas().value_counts("reference_label")

reference_label
Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: count, dtype: int64

### Instantiate Pack

In [None]:
# Completion LLM configured to emit a single token per call and to return
# log-probabilities for the 5 most likely tokens.
# NOTE(review): presumably the pack builds each synthetic text one token at a
# time from these logprobs — confirm against the pack implementation.
llm = OpenAI(
    model="gpt-3.5-turbo-instruct",
    max_tokens=1,
    logprobs=True,
    top_logprobs=5,
)
# Tokenizer looked up for the same model name as the LLM above.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-instruct")

# Prompt pieces handed to the pack: the instruction text plus the headings used
# for the label ("News Type") and for the text ("Text").
prompt_bundle = PromptBundle(
    instruction=(
        "Given a label of news type, generate the chosen type of news accordingly.\n"
        "Start your answer directly after 'Text: '. Begin your answer with [RESULT].\n"
    ),
    label_heading="News Type",
    text_heading="Text",
)

# Wire the LLM, tokenizer, prompt bundle and source dataset into the pack.
dp_simple_dataset_pack = DiffPrivateSimpleDatasetPack(
    llm=llm,
    tokenizer=tokenizer,
    prompt_bundle=prompt_bundle,
    simple_dataset=simple_dataset,
)

In [None]:
# Synchronously generate one synthetic "Sports" example. The recorded progress
# bar runs for 35 steps, matching t_max=35.
dp_simple_dataset_pack.generate_dp_synthetic_example(
    label="Sports", t_max=35, sigma=0.1, num_splits=2, num_samples_per_split=8
)

100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:19<00:00,  1.75it/s]


LabelledSimpleDataExample(reference_label='Sports', text='The New York Yankees defeat the Tampa Bay Rays 5-3 in a thrilling game at Yankee Stadium. Aaron Judge hits a walk-off', text_by=CreatedBy(model_name='gpt-3.5-turbo-instruct', type=<CreatedByType.AI: 'ai'>))

In [None]:
# Async variant of the single-example generation, awaited directly in the
# notebook cell with the same arguments as the synchronous call.
await dp_simple_dataset_pack.agenerate_dp_synthetic_example(
    label="Sports", t_max=35, sigma=0.1, num_splits=2, num_samples_per_split=8
)

100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:20<00:00,  1.70it/s]


LabelledSimpleDataExample(reference_label='Sports', text='The NBA Finals are set to begin on Tuesday, with the Los Angeles Lakers facing off against the Miami Heat. The Lakers are led by superstar LeBron James, while', text_by=CreatedBy(model_name='gpt-3.5-turbo-instruct', type=<CreatedByType.AI: 'ai'>))

In [None]:
# Generate a small synthetic dataset: one "World" and one "Sports" example,
# none for the other two labels.
# t_max=5 keeps the run short (the recorded progress bars show 5 steps), which
# is why the generated texts in the recorded output are very short or empty.
synthetic_dataset = await dp_simple_dataset_pack.arun(
    sizes={"World": 1, "Sports": 1, "Sci/Tech": 0, "Business": 0},
    t_max=5,
    sigma=0.5,
    num_splits=3,
    num_samples_per_split=8,
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.38s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.96s/it]


In [None]:
# Tabular view of the generated examples (two rows in the recorded run).
synthetic_dataset.to_pandas()

Unnamed: 0,reference_label,text,text_by
0,World,,ai (gpt-3.5-turbo-instruct)
1,Sports,The latest,ai (gpt-3.5-turbo-instruct)


In [None]:
# Row 0 is the "World" example; its text is empty in the recorded run, so
# nothing visible is printed.
print(synthetic_dataset.to_pandas().iloc[0].text)




In [None]:
# Row 1 is the "Sports" example.
print(synthetic_dataset.to_pandas().iloc[1].text)

The latest


In [None]:
# Print the span trees collected by the SimpleSpanHandler registered earlier;
# each node in the recorded output is a span id followed by a number in
# parentheses (presumably its duration in seconds — confirm).
span_handler.print_trace_trees()

DiffPrivateSimpleDatasetPack.generate_dp_synthetic_example-b6a9433d-7122-48ec-8ac9-887c89b5cb24 (19.972498)
└── DiffPrivateSimpleDatasetPack.agenerate_dp_synthetic_example-b427f199-1c9f-49a8-8c8e-d8a3435bbeb9 (19.971644)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-f0f32888-7883-4fa2-87f7-5a29ab34ca82 (0.084942)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-2f50d28b-f16c-4f15-ac7c-27bf4bed48b9 (0.006806)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-80070744-abc5-4f7e-aa07-e41d360b5b79 (0.092045)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-daec71b0-b149-43cc-a378-0cf118dd7982 (0.006882)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-57a5c78b-4ddd-48f6-afca-e44566d6dca9 (0.095239)
    ├── DiffPrivateSimpleDatasetPack._split_dataset-3ec8be81-60d1-4891-a124-cdcf9046cee7 (0.007076)
    ├── DiffPrivateSimpleDatasetPack._filter_dataset_by_label-24316f38-6940-468c-bc6e-f76aa7c3a83c (0.093037)
    ├── DiffPrivateSimpleDatasetPack._s