In [1]:
from llama_index.packs.diff_private_simple_dataset.simple_dataset import LabelledSimpleDataset
from llama_index.packs.diff_private_simple_dataset.base import PromptBundle
from llama_index.packs.diff_private_simple_dataset import DiffPrivateSimpleDatasetPack
from llama_index.llms.openai import OpenAI
import tiktoken

### Load LabelledSimpleDataset

In [2]:
# Load the labelled examples from a JSON file in the current working directory.
# NOTE(review): the file name suggests the AG News dataset — confirm provenance.
simple_dataset = LabelledSimpleDataset.from_json("./agnews.json")

In [3]:
# Peek at the first five rows only; never dump the whole frame.
simple_dataset.to_pandas().head(5)

Unnamed: 0,reference_label,text,text_by
0,Business,Wall St. Bears Claw Back Into the Black (Reute...,human
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,human
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters...,human
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,human
4,Business,"Oil prices soar to all-time record, posing new...",human


### Instantiate Pack

In [17]:
# Second model, passed to the pack as `refine_llm`.
# NOTE(review): presumably used to polish the raw token-by-token output (the
# final example further down is attributed to 'gpt-4') — confirm in the pack.
refine_llm = OpenAI(model="gpt-4")

# Token-level model: one completion token per call, returning the log
# probabilities of the 20 most likely candidates for that token.
llm = OpenAI(model="gpt-3.5-turbo", max_tokens=1, logprobs=True, top_logprobs=20)

# Tokenizer matching the token-level model above.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")


# Prompt template handed to the pack. `label_heading` / `text_heading` are the
# same "News Type" / "Text" headings used inside the instruction's format
# section.
#
# The instruction is built with implicit string concatenation, so each fragment
# must carry exactly one separating space. The previous version ended one
# fragment with a space and started the next with another, which sent
# "partial  assistant" (two spaces) to the model. Outputs recorded before this
# fix still show the doubled space until the notebook is re-run.
prompt_bundle = PromptBundle(
    instruction=(
        "Given a label of news type, generate the chosen type of news accordingly."
        " Always start your answer with the format provided below. Don't provide any"
        " other text, go straight to answering in the provided format and continue any partial"
        " assistant messages.\n"
        "\nNews Type: [repeat provided label]\n"
        "Text: [provide your answer here].\n"
        "------------------------------\n"
    ),
    label_heading="News Type",
    text_heading="Text",
)

# Wire the dataset, prompt template, both models and the tokenizer into the pack.
dp_simple_dataset_pack = DiffPrivateSimpleDatasetPack(
    simple_dataset=simple_dataset,
    prompt_bundle=prompt_bundle,
    llm=llm,
    refine_llm=refine_llm,
    tokenizer=tokenizer,
)

In [13]:
# Keep only the examples labelled "Business". This calls a private helper
# (leading underscore) of the pack, so it may change between versions.
business = dp_simple_dataset_pack._filter_dataset_by_label("Business")

In [6]:
# Partition the "Business" subset using the pack's private splitter.
# NOTE(review): the positional 5 and 1 look like "number of splits" and
# "samples per split" (each split displayed below holds one example) —
# confirm against the `_split_dataset` signature.
splits = dp_simple_dataset_pack._split_dataset(business, 5, 1)

In [7]:
# Display the list of splits produced above (one LabelledSimpleDataset each).
splits

[LabelledSimpleDataset(examples=[LabelledSimpleDataExample(reference_label='Business', text='Hilfiger Sales and Net Fall Amid Inquiry More bad news poured out of the Tommy Hilfiger Corporation yesterday, as the clothing company - facing an investigation by the United States attorney #39;s office in Manhattan - reported disappointing sales and declining operating margins.', text_by=CreatedBy(model_name='', type=<CreatedByType.HUMAN: 'human'>))]),
 LabelledSimpleDataset(examples=[LabelledSimpleDataExample(reference_label='Business', text='Dollar Steadies, More Weakness Expected (Reuters) Reuters - The U.S. dollar paused near recent\\lows Friday but dealers expected selling to resume after a\\weekend meeting of G20 finance ministers, while stock investors\\sought cover in companies immune to currency risks.', text_by=CreatedBy(model_name='', type=<CreatedByType.HUMAN: 'human'>))]),
 LabelledSimpleDataset(examples=[LabelledSimpleDataExample(reference_label='Business', text='US Airways Talk

In [50]:
label = "Business"
synthetic_example = ""  # nothing generated yet

# Build the chat prompt for one split (index 1) via the pack's private helper;
# the messages are printed in the next cell.
chat_messages = dp_simple_dataset_pack._get_messages_for_synthetic_generation(
    splits[1],
    synthetic_example,
    label,
)

In [51]:
# Dump each message as "<role>\n<content>\n" followed by a blank line.
# `print` renders every argument with str(), the same as separate print calls.
for message in chat_messages:
    print(message.role, message.content, "", sep="\n")

MessageRole.SYSTEM
You are a helpful assistant that follows instructions and formatting strictly.

MessageRole.USER
Given a label of news type, generate the chosen type of news accordingly. Always start your answer with the format provided below. Don't provide any other text, go straight to answering in the provided format and continue any partial  assistant messages.

News Type: [repeat provided label]
Text: [provide your answer here].
------------------------------

News Type: Business

MessageRole.ASSISTANT
News Type: Business
Text: Peoplesoft bounces back to profit Peoplesoft, the software maker that is the target of a hostile takeover bid from industry heavyweight Oracle, returns to profit in the third quarter.

MessageRole.USER
Given a label of news type, generate the chosen type of news accordingly. Always start your answer with the format provided below. Don't provide any other text, go straight to answering in the provided format and continue any partial  assistant messages.



In [52]:
# Allow a 10-token reply so the preview below is readable.
# NOTE(review): this mutates the shared `llm` object in place, so the setting
# persists for every later cell that uses it (including calls made through the
# pack) until it is changed again.
llm.max_tokens = 10

In [53]:
# Send the few-shot prompt to the chat model and show its short reply.
preview_response = llm.chat(chat_messages)
print(preview_response)

assistant: Tech giant, Apple Inc., has announced a groundbreaking


In [19]:
# Same prompt, this time going through the pack's `llm` attribute.
response = dp_simple_dataset_pack.llm.chat(chat_messages)
# Inspect the candidate tokens and their log probabilities returned with the reply.
response.logprobs

[[LogProb(token='In', logprob=-0.63690376, bytes=[73, 110]),
  LogProb(token='Tech', logprob=-2.1681538, bytes=[84, 101, 99, 104]),
  LogProb(token='The', logprob=-2.4806538, bytes=[84, 104, 101]),
  LogProb(token='Global', logprob=-2.6056538, bytes=[71, 108, 111, 98, 97, 108]),
  LogProb(token='Major', logprob=-3.7775288, bytes=[77, 97, 106, 111, 114]),
  LogProb(token='Stock', logprob=-3.9806538, bytes=[83, 116, 111, 99, 107]),
  LogProb(token='Shares', logprob=-4.199404, bytes=[83, 104, 97, 114, 101, 115]),
  LogProb(token='E', logprob=-4.699404, bytes=[69]),
  LogProb(token='Apple', logprob=-4.746279, bytes=[65, 112, 112, 108, 101]),
  LogProb(token='Microsoft', logprob=-4.902529, bytes=[77, 105, 99, 114, 111, 115, 111, 102, 116]),
  LogProb(token='Today', logprob=-4.965029, bytes=[84, 111, 100, 97, 121]),
  LogProb(token='Invest', logprob=-4.996279, bytes=[73, 110, 118, 101, 115, 116]),
  LogProb(token='Technology', logprob=-5.074404, bytes=[84, 101, 99, 104, 110, 111, 108, 111, 1

In [20]:
# Build the prompt variant that takes only the partial synthetic example and
# the label (no dataset split), via the pack's private helper.
zero_shot_chat_messages = (
    dp_simple_dataset_pack._get_messages_for_reduced_token_universe(
        synthetic_example,
        label,
    )
)

In [21]:
# Show role and content of every message, each followed by a blank line.
# str() is applied explicitly (not an f-string) so the role is rendered
# exactly as a plain print() call would render it.
for msg in zero_shot_chat_messages:
    print("\n".join([str(msg.role), str(msg.content), ""]))

MessageRole.SYSTEM
You are a helpful assistant that follows instructions and formatting strictly.

MessageRole.USER
Given a label of news type, generate the chosen type of news accordingly. Always start your answer with the format provided below. Don't provide any other text, go straight to answering in the provided format and continue any partial  assistant messages.

News Type: [repeat provided label]
Text: [provide your answer here].
------------------------------

News Type: Business

MessageRole.ASSISTANT
News Type: Business
Text: 



In [22]:
# Allow a 10-token reply for this preview as well.
# NOTE(review): in-place change to the shared `llm` object; it stays in effect
# for later cells.
llm.max_tokens = 10
# Query the model through the pack with the zero-shot prompt.
response = dp_simple_dataset_pack.llm.chat(zero_shot_chat_messages)

In [24]:
# Show the text of the zero-shot reply.
print(response)

assistant: In a surprising turn, Tech-Start Corporation,


In [23]:
# Candidate tokens and log probabilities for the zero-shot reply, for
# comparison with the few-shot logprobs shown earlier.
response.logprobs

[[LogProb(token='In', logprob=-0.22761486, bytes=[73, 110]),
  LogProb(token='The', logprob=-2.6494899, bytes=[84, 104, 101]),
  LogProb(token='Shares', logprob=-3.9463649, bytes=[83, 104, 97, 114, 101, 115]),
  LogProb(token='Today', logprob=-4.14949, bytes=[84, 111, 100, 97, 121]),
  LogProb(token='News', logprob=-4.24324, bytes=[78, 101, 119, 115]),
  LogProb(token='Global', logprob=-4.49324, bytes=[71, 108, 111, 98, 97, 108]),
  LogProb(token='Stock', logprob=-5.102615, bytes=[83, 116, 111, 99, 107]),
  LogProb(token='Tech', logprob=-5.196365, bytes=[84, 101, 99, 104]),
  LogProb(token='Major', logprob=-5.227615, bytes=[77, 97, 106, 111, 114]),
  LogProb(token='Market', logprob=-5.571365, bytes=[77, 97, 114, 107, 101, 116]),
  LogProb(token='Following', logprob=-5.58699, bytes=[70, 111, 108, 108, 111, 119, 105, 110, 103]),
  LogProb(token='Invest', logprob=-5.74324, bytes=[73, 110, 118, 101, 115, 116]),
  LogProb(token='Leading', logprob=-5.80574, bytes=[76, 101, 97, 100, 105, 110,

In [16]:
# The preview cells above raise `llm.max_tokens` to 10 on the same object the
# pack calls (a chat through `dp_simple_dataset_pack.llm` returned a ~10-token
# reply). Restore the one-token setting the pack was constructed with, so that
# a top-to-bottom run requests a single token per generation step.
llm.max_tokens = 1

# Generate one differentially private synthetic "Sports" example:
#   t_max                 - 42 here; the recorded run logs 42 "Generating token" steps
#   sigma                 - NOTE(review): presumably the DP noise scale — confirm
#   num_splits            - NOTE(review): presumably how many disjoint splits of the
#   num_samples_per_split   label's data are used, and examples per split — confirm
dp_simple_dataset_pack.generate_dp_synthetic_example(
    label="Sports",
    t_max=42,
    sigma=0.5,
    num_splits=3,
    num_samples_per_split=2,
)

Generating token: 1
Generating token: 2
Generating token: 3
Generating token: 4
Generating token: 5
Generating token: 6
Generating token: 7
Generating token: 8
Generating token: 9
Generating token: 10
Generating token: 11
Generating token: 12
Generating token: 13
Generating token: 14
Generating token: 15
Generating token: 16
Generating token: 17
Generating token: 18
Generating token: 19
Generating token: 20
Generating token: 21
Generating token: 22
Generating token: 23
Generating token: 24
Generating token: 25
Generating token: 26
Generating token: 27
Generating token: 28
Generating token: 29
Generating token: 30
Generating token: 31
Generating token: 32
Generating token: 33
Generating token: 34
Generating token: 35
Generating token: 36
Generating token: 37
Generating token: 38
Generating token: 39
Generating token: 40
Generating token: 41
Generating token: 42
synthetic_example: In Type: In 
Text: The last yesterday from of the last theanticipateddog team another New to score late game

LabelledSimpleDataExample(reference_label='Sports', text="News Type: Sports\nText: In the latest sports news, the New York Yankees managed to seize a late-game victory yesterday. This win has further ignited their perfect season, showing their resilience and ability to overcome deficits. The team's performance has been nothing short of spectacular, setting a high bar for their competitors.", text_by=CreatedBy(model_name='gpt-4', type=<CreatedByType.AI: 'ai'>))

In [10]:
# Scratch cell: render a hard-coded, unrefined sample string so its line
# breaks are visible. One literal fragment per printed line.
print(
    "As\n"
    " Type  \n"
    "Text: The: market continue  continues show today in gains major key earnings key  investor earnings."
)

As
 Type  
Text: The: market continue  continues show today in gains major key earnings key  investor earnings.
