In [None]:
import tiktoken
from llama_index.core.llama_dataset.simple import LabelledSimpleDataset
from llama_index.llms.openai import OpenAI
from llama_index.packs.diff_private_simple_dataset import DiffPrivateSimpleDatasetPack
from llama_index.packs.diff_private_simple_dataset.base import PromptBundle
from openai import OpenAI as OpenAIAPI

### Load LabelledSimpleDataset

In [None]:
# Load the labelled dataset from a local JSON file; `agnews.json` must be
# present in the notebook's working directory.
simple_dataset = LabelledSimpleDataset.from_json("./agnews.json")

In [None]:
# Peek at the first five rows to confirm the expected columns are present.
simple_dataset.to_pandas().head(5)

Unnamed: 0,reference_label,text,text_by
0,Business,Wall St. Bears Claw Back Into the Black (Reute...,human
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,human
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...,human
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,human
4,Business,"Oil prices soar to all-time record, posing new...",human


In [None]:
# Class balance: number of rows per reference label.
simple_dataset.to_pandas().value_counts(subset="reference_label")

reference_label
Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: count, dtype: int64

### Instantiate Pack

In [None]:
# Single source of truth for the model id, so the LLM and its tokenizer
# always refer to the same model.
MODEL_NAME = "gpt-3.5-turbo-instruct"

# Each completion call returns exactly one token together with the
# log-probabilities of the five most likely candidates.
llm = OpenAI(model=MODEL_NAME, max_tokens=1, logprobs=True, top_logprobs=5)
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

# Instruction text and the two headings that label the "label" and "text"
# parts of the prompt.
prompt_bundle = PromptBundle(
    instruction=(
        "Given a label of news type, generate the chosen type of news accordingly.\n"
        "Start your answer directly after 'Text: '. Begin your answer with [RESULT].\n"
    ),
    label_heading="News Type",
    text_heading="Text",
)

# Wire the LLM, tokenizer, prompt and source dataset into the pack.
dp_simple_dataset_pack = DiffPrivateSimpleDatasetPack(
    llm=llm,
    tokenizer=tokenizer,
    prompt_bundle=prompt_bundle,
    simple_dataset=simple_dataset,
)

In [None]:
# Smoke test: generate one synthetic example for the "Sports" label.
# NOTE(review): t_max presumably caps the number of generated tokens and
# sigma presumably sets the privacy-noise scale — confirm against the pack docs.
dp_simple_dataset_pack.generate_dp_synthetic_example(
    label="Sports",
    t_max=35,
    sigma=0.1,
    num_splits=2,
    num_samples_per_split=8,
)

100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:29<00:00,  1.17it/s]


LabelledSimpleDataExample(reference_label='Sports', text='The latest sports news: The New York Yankees have clinched a spot in the playoffs after defeating the Tampa Bay Rays 5-1 on Wednesday night. This', text_by=CreatedBy(model_name='gpt-3.5-turbo-instruct', type=<CreatedByType.AI: 'ai'>))

In [None]:
# Build a small synthetic dataset with the pack's main entry point and keep
# the result for inspection in the following cells.
synthetic_dataset = dp_simple_dataset_pack.run(
    # presumably the number of examples to generate per label — TODO confirm
    sizes={"World": 1, "Sports": 1, "Sci/Tech": 0, "Business": 0},
    t_max=100,
    sigma=0.5,
    num_splits=3,
    num_samples_per_split=8,
)

  6%|█████                                                                                | 6/100 [00:08<02:06,  1.34s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [01:49<00:00,  1.09s/it]


In [None]:
# Full text of the first synthetic example; print() avoids the truncated
# DataFrame rendering.
print(synthetic_dataset.to_pandas()["text"].iloc[0])

News Type: Sports
Text: In the world of sports today, the NBA Finals concluded with a thrilling match between the Los Angeles Lakers and the Miami Heat. In football news, Real Madrid secured a decisive victory in their latest match. Meanwhile, preparations for the upcoming Olympics are in full swing with athletes from around the globe gearing up for the competition. Stay tuned for more updates and highlights from various sports events happening throughout the day.


In [None]:
# Full text of the second synthetic example.
print(synthetic_dataset.to_pandas()["text"].iloc[1])

News Type: World
Text: In the latest global news, the United Nations has called for a new strategy to address the ongoing conflict in Ukraine. World leaders have gathered to discuss potential solutions and developments. Stay tuned for more updates.


In [None]:
# Raw OpenAI SDK client, used in the cells below to call the completions
# endpoint directly instead of going through the LlamaIndex `OpenAI` wrapper.
# `OpenAIAPI` is the SDK class under an alias (imported in the notebook's first
# cell) so it does not shadow the LlamaIndex `OpenAI` name.
# No key is passed here; the SDK is expected to pick it up from the
# environment (OPENAI_API_KEY) — never hardcode it in the notebook.
client = OpenAIAPI()

In [None]:
# Call two of the pack's private helpers directly. The leading underscore
# marks them as internal, so they may change without notice.
business = dp_simple_dataset_pack._filter_dataset_by_label("Business")
# NOTE(review): 5 and 3 are presumably the number of splits and the number of
# samples per split — confirm against the pack source.
splits = dp_simple_dataset_pack._split_dataset(business, 5, 3)

In [None]:
# Target label, then the partial generation to place after the "Text: " heading.
label = "Business"
synthetic_example = "\n\n[RESULT] Business"

# Assemble the prompt through the pack's private helper.
prompt = dp_simple_dataset_pack._get_public_prompt(synthetic_example, label)

In [None]:
# Show the assembled prompt; print() renders the embedded newlines.
print(prompt)

Given a label of news type, generate the chosen type of news accordingly.
Start your answer directly after 'Text: '. Begin your answer with [RESULT].

News Type: Business
Text: 

[RESULT] Business


In [None]:
# Complete the prompt through the LlamaIndex LLM wrapper; `llm` was configured
# with max_tokens=1, so a single token comes back.
response = llm.complete(prompt)

raw response: Completion(id='cmpl-91qIhnRAOvGt3nfS9M1XDH6fUg7E8', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[196], token_logprobs=[-0.5169217], tokens=[' news'], top_logprobs=[{' news': -0.5169217, ' News': -2.1725404, ' is': -2.6637266, ' Insider': -3.6237085, 'man': -4.2520003}]), text=' news')], created=1710226399, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=43, total_tokens=44))


In [None]:
# Inspect the candidate tokens and their log-probabilities attached to the
# wrapper's response.
response.logprobs

[[LogProb(token=' Sales', logprob=-0.21729077, bytes=[]),
  LogProb(token='ers', logprob=-1.8933505, bytes=[]),
  LogProb(token='er', logprob=-4.769968, bytes=[]),
  LogProb(token=' Spending', logprob=-5.085491, bytes=[]),
  LogProb(token=' Stocks', logprob=-5.312193, bytes=[])]]

In [None]:
# Send the prompt through the raw OpenAI completions endpoint.
# Note: this rebinds `response`, which until now held the result of
# llm.complete(prompt) — a different object type.
response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    max_tokens=2,
    temperature=0,
    logprobs=5,  # request the five most likely alternatives for each token
)

In [None]:
# Display the raw completion object currently bound to `response`.
# NOTE(review): the saved output reports a much larger prompt_tokens count than
# the llm.complete(prompt) run, so it presumably came from a run with a
# different `prompt` — re-run on a fresh kernel or clear the output.
response

Completion(id='cmpl-91pFbxtuIRkDdTAkc30uT4A25s7BC', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[1408, 1414], token_logprobs=[-0.43670404, -1.7844177], tokens=[' Sales', ' Rise'], top_logprobs=[{' Sales': -0.43670404, 'ers': -1.5960592, 'er': -3.2029052, ' Holiday': -4.567468, ' Stocks': -4.951999}, {' Rise': -1.7844177, ' Rose': -2.3044314, ' Re': -2.4139857, ' Fall': -3.0089946, ' Up': -3.07311}]), text=' Sales Rise')], created=1710222363, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=2, prompt_tokens=326, total_tokens=328))

In [None]:
# Repeated inspection of `response`.
# NOTE(review): the saved output has a different completion id than the
# previous display, so it depends on re-running the request cell between
# displays (hidden state) — consider removing this cell.
response

Completion(id='cmpl-91p57xjQwaDPfOWvkVjJZq7ruciIN', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[1401], token_logprobs=[-2.6306255], tokens=[' Retail'], top_logprobs=[{' Retail': -2.6306255, ' Stocks': -2.6668556, ' Job': -2.6953552, ' Economy': -2.8212268, ' Consumer': -3.2094753}]), text=' Retail')], created=1710221713, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=325, total_tokens=326))

In [None]:
# Repeated inspection of `response`.
# NOTE(review): the saved outputs of these repeated cells show prompt_tokens
# differing by one per run, so they look like manual one-token-at-a-time
# steps — confirm, and replace with a loop if the trace is worth keeping.
response

Completion(id='cmpl-91p4tTDEp03V2FXboURBsiHbIZydN', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[1400], token_logprobs=[-0.012113422], tokens=['.'], top_logprobs=[{'.': -0.012113422, '.,': -5.5512695, '.-': -6.629635, ' Retail': -7.7420826, ' Stocks': -7.8746986}]), text='.')], created=1710221699, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=324, total_tokens=325))

In [None]:
# Repeated inspection of `response`.
# NOTE(review): not reproducible with Restart & Run All, since no cell between
# this one and the previous display rebinds `response` — consider removing.
response

Completion(id='cmpl-91p4UPRyoftVdoqxRdOnHylcl3NKh', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[1398], token_logprobs=[-0.02724619], tokens=['.S'], top_logprobs=[{'.S': -0.02724619, 'BS': -4.65904, '.K': -5.094541, '.N': -5.8702025, 'AW': -5.907785}]), text='.S')], created=1710221674, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=323, total_tokens=324))

In [None]:
# Repeated inspection of `response`.
# NOTE(review): the saved output carries the earliest `created` timestamp of
# the repeated displays, so these cells appear to be stored newest-first —
# confirm, then reorder or remove.
response

Completion(id='cmpl-91p3CYqLb6KBCT6eeWdnWGPrGRXCU', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[1396], token_logprobs=[-2.9744303], tokens=[' U'], top_logprobs=[{' U': -2.9744303, '\n': -3.3239577, '<|endoftext|>': -3.4435198, ' US': -3.508074, ' Wall': -3.8395998}]), text=' U')], created=1710221594, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=322, total_tokens=323))