# Exp009: Adaptive training data generation
The goal is three-fold: Generate good adversarial examples
1. based on frequent unigrams and bigrams in positive text
2. based on maximum scoring token in classification
3. based on reclassified positive examples

In [193]:
from tqdm import tqdm
import pandas as pd
import importlib
import re
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

import sys
sys.path.append('../source')
import models
import random
import data
import helpers
import api

# Re-import the helpers module so that edits made to its source file are picked
# up in this session.
importlib.reload(helpers)

<module 'helpers' from '/mnt/qb/work/meurers/mpb672/grammarctg/experiments/../source/helpers.py'>

Load augmented EGP dataset

In [2]:
# Read the augmented EGP examples. Later cells use the columns '#',
# 'Can-do statement', 'Example', 'augmented_examples' and
# 'augmented_negative_examples'.
egp_examples = pd.read_json("../data/egp_examples.json")

Load corpora to test classifier on

In [335]:
# Build the evaluation corpus: the first DailyDialog sentences followed by
# DialogSum sentences, capped at a fixed total size.
MAX_DAILYDIALOG_SENTS = 10000
MAX_CORPUS_SENTS = 20000
CORPUS_SHUFFLE_SEED = 42

dd = data.DailyDialog()
sents = dd.get_all_sentences()[:MAX_DAILYDIALOG_SENTS]
ds = data.DialogSum()
sents += ds.get_all_sentences()
sents = sents[:MAX_CORPUS_SENTS]
print("Shuffling sentences...")
# Shuffle with a dedicated seeded generator so the corpus order (and therefore
# the hits found in later cells) is the same on every run, without touching the
# global `random` state used by the sampling cells.
random.Random(CORPUS_SHUFFLE_SEED).shuffle(sents)

encoded_inputs = models.bert_tokenizer(sents, return_tensors='pt', max_length=64, padding='max_length', truncation=True)
dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
batch_size = 64
# shuffle=False keeps batches in the same order as `sents`, which later cells
# rely on when they pair scores with sents[:len(scores)].
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

Shuffling sentences...


In [10]:
def probe_model(model, probes):
    """Score one or more probe sentences and report each one's max token.

    Args:
        model: Callable taking (input_ids, attention_mask, diagnose=...) and
            returning a (values, indices) pair.
        probes: A sentence or a list of sentences; both forms are passed
            straight to models.bert_tokenizer.

    Returns:
        A tuple (values, max_tokens): `values` is the first model output moved
        to the CPU, and `max_tokens` holds, per probe, the token found at the
        position given by the second model output.
    """
    # Match probe_model_from_loader: score in eval mode so that training-only
    # behaviour (e.g. dropout, if the model has any) cannot perturb the result.
    model.eval()
    encoded_input = models.bert_tokenizer(probes, return_tensors='pt', max_length=64, padding='max_length', truncation=True).to(models.device)
    with torch.no_grad():
        values, indices = model(encoded_input['input_ids'], encoded_input['attention_mask'], diagnose=False)
    # Turn the ids back into token strings so the indices can be read as tokens.
    tokens = [models.bert_tokenizer.convert_ids_to_tokens(ids) for ids in encoded_input['input_ids']]
    max_tokens = [token[indices[i]] for i, token in enumerate(tokens)]
    return values.cpu(), max_tokens
def probe_model_from_loader(model, dataloader, n_batches=3):
    """Run `model` over the first `n_batches` batches of `dataloader`.

    Each batch is expected to unpack into (input_ids, attention_mask).

    Returns:
        A tuple (scores, max_tokens) of flat Python lists covering every
        example of the processed batches, in loader order. An index that falls
        outside the token list is reported as '[PAD]'.
    """
    model.eval()
    collected_scores, collected_tokens = [], []

    with torch.no_grad():
        for batch_no, (input_ids, attention_mask) in enumerate(tqdm(dataloader)):
            # Stop once the requested number of batches has been processed.
            if batch_no >= n_batches:
                break
            # Decode ids to token strings before the tensors move to the device.
            batch_tokens = [models.bert_tokenizer.convert_ids_to_tokens(ids) for ids in input_ids]
            input_ids = input_ids.to(models.device)
            attention_mask = attention_mask.to(models.device)

            values, indices = model(input_ids, attention_mask)
            positions = indices.cpu().tolist()
            batch_max_tokens = [
                batch_tokens[row][pos] if pos < len(batch_tokens[row]) else '[PAD]'
                for row, pos in enumerate(positions)
            ]

            collected_scores.extend(values.cpu().tolist())
            collected_tokens.extend(batch_max_tokens)
    return collected_scores, collected_tokens

Sample a random EGP rule and create dataset and classifier

In [447]:
# Draw one EGP rule at random and show what it describes.
rule = egp_examples.sample(1).iloc[0]
print(rule['Can-do statement'])
print(rule['Example'])

# Augmented positive / negative examples of the sampled rule.
positive = rule['augmented_examples']
negative = rule['augmented_negative_examples']

# Augmented examples of every other rule, flattened into a single list.
others = []
for sublist in egp_examples.loc[egp_examples['#'] != rule['#'], 'augmented_examples'].to_list():
    others.extend(sublist)

print(random.sample(positive, 5))
print(random.sample(negative, 5))

# Fresh detector plus train/validation loaders for this rule.
classifier = models.RuleDetector(models.bert_encoder)
dataset = data.get_dataset(positive, negative, others, models.bert_tokenizer, 64)
train_dataloader, val_dataloader = data.get_loaders(dataset)

Can use the affirmative form with 'like'. 
I'd like to invite you to dinner. 

I would like to eat some Spanish food.
["He'd like to watch a movie tonight.", "They'd like to buy a new car.", "She'd like to visit the zoo.", 'The neighbors would like to organize a street party.', 'She would like to buy a birthday present for her friend.']
['I like to attend a concert.', 'She enjoys taking long walks in the evening.', 'She likes going for a walk in the park.', 'She would prefer to read a book this afternoon.', 'She wants to learn how to cook a new recipe.']


In [231]:
#classifier = models.RuleDetector(models.bert_encoder)
#dataset = data.get_dataset(positive, negative, others, models.bert_tokenizer, 64) 
#train_dataloader, val_dataloader = data.get_loaders(dataset)

In [448]:
# Train the freshly created classifier. The return value is kept as `optimizer`
# so that later cells can hand it back to models.train.
optimizer = models.train(classifier, train_dataloader, val_dataloader)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 37.04it/s]


Training loss: 0.41380715906620025
Accuracy: 0.94


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 37.80it/s]


Training loss: 0.2068227358162403
Accuracy: 0.945


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 38.04it/s]


Training loss: 0.1581458967924118
Accuracy: 0.94


Test the trained model

Start with training example and random negative

In [449]:
# Probe one randomly chosen augmented positive example of the sampled rule.
probe_model(classifier, random.sample(positive, 1))

(tensor([0.2859]), ['like'])

In [450]:
# Probe one randomly chosen augmented example that belongs to a different rule.
probe_model(classifier, random.sample(others, 1))

(tensor([0.0575]), ['thing'])

Continue with EGP examples

In [451]:
# Score each original EGP example of the rule; examples are separated by a
# blank line inside the 'Example' field.
for egp_example in rule['Example'].split("\n\n"):
    print(egp_example)
    probe_scores, probe_tokens = probe_model(classifier, egp_example)
    print(f'Score: {probe_scores.item()}, Max Token: {probe_tokens[0]}')

I'd like to invite you to dinner. 
Score: 0.9895182251930237, Max Token: d
I would like to eat some Spanish food.
Score: 0.9806193113327026, Max Token: would


Apply classifier to corpus

In [452]:
# TODO: stop scanning as soon as enough hits have been found.
scores, tokens = probe_model_from_loader(classifier, dataloader, n_batches=100)
# Print every scanned sentence the classifier accepts (score above 0.5); the
# sentences are paired with the scores by position.
for score, token, sample in zip(scores, tokens, sents[:len(scores)]):
    if not score > 0.5:
        continue
    print(f"Sample: {sample}, Score: {score}, Max Token: {token}")

 32%|███████████████████████████████████████████▏                                                                                           | 100/313 [00:08<00:17, 12.14it/s]

Sample: Have you decided what you'd like, sir?, Score: 0.962340235710144, Max Token: d
Sample: I would like hair spray, please., Score: 0.9699370265007019, Max Token: would
Sample: Yes, I would like a roast duck and my friend wants steaks., Score: 0.9752779006958008, Max Token: would
Sample: I would like a cup of decaf with cream and no sugar., Score: 0.9675355553627014, Max Token: would
Sample: I'd like to send this parcel to Changchun., Score: 0.9889812469482422, Max Token: d
Sample: Yes, I'd like to exchange some US dollars into RIB., Score: 0.9844177961349487, Max Token: d
Sample: Let me tell the lady what we would like to order in front of the seafood fridge first., Score: 0.9828557372093201, Max Token: would
Sample: Is there anything else you would like to have?, Score: 0.9829109311103821, Max Token: would
Sample: I'd just like to trim., Score: 0.8719457387924194, Max Token: d
Sample: Hi, I'd like to get your store credit card., Score: 0.9926693439483643, Max Token: d
Sample: we'




Let's ask GPT4 to classify hits

In [453]:
# Corpus sentences the classifier accepted (score above 0.5).
hits = [sample for score, sample in zip(scores, sents[:len(scores)]) if score > 0.5]

# Rule description, followed by up to 25 numbered hits and the answer format.
rule_prompt = helpers.get_prompt(rule, n_examples=0, mark_words=False)
numbered_hits = "".join(f"{position}. {hit}\n" for position, hit in enumerate(hits[:25], start=1))
prompt = (
    rule_prompt
    + "\nDecide for each of these sentences if they use this rule:\n"
    + numbered_hits
    + "Output format:\n1. [Yes/No]\n2. [Yes/No]\n..."
)
messages = [helpers.SYSTEM_MESSAGE, {"role": "user", "content": prompt}]
completions = api.get_openai_chat_completion(messages, max_tokens=512, model="gpt-4-0125-preview")
response = completions[0]
print(response)

1. Yes
2. Yes
3. Yes
4. Yes
5. Yes
6. Yes
7. Yes
8. Yes
9. Yes
10. Yes
11. No
12. Yes
13. Yes
14. Yes
15. Yes
16. Yes
17. Yes
18. Yes
19. Yes
20. Yes
21. Yes
22. Yes
23. Yes
24. Yes
25. Yes


In [444]:
# Parse the numbered Yes/No verdicts and split the submitted hits into confirmed
# (true) and rejected (false) positives.
judged_hits = hits[:25]
# Key each verdict by its printed number so that a skipped or truncated answer
# cannot shift later verdicts onto the wrong sentence (and cannot cause a
# mask-length mismatch).
verdicts = {int(num): answer == 'Yes' for num, answer in re.findall(r'(\d+)\.\s(Yes|No)', response)}
# Boolean mask over judged_hits; hits without a verdict are False here.
true_positives_idx = np.array([verdicts.get(i, False) for i in range(1, len(judged_hits) + 1)], dtype=bool)
true_positives = [hit for i, hit in enumerate(judged_hits, start=1) if verdicts.get(i) is True]
print(true_positives)
# Only an explicit "No" counts as a false positive; hits that received no
# verdict are left out of both lists.
false_positives = [hit for i, hit in enumerate(judged_hits, start=1) if verdicts.get(i) is False]
false_positives

['I think it unwise for either of us to be inflexible.', 'I always seem to chop them into pieces that are either too big or too small.']


["My wife doesn't like it either.",
 "I don't like the colour either.",
 'Yea, I never see theaters turn people away for their age anymore either.',
 'Neither too cold, nor too hot.',
 "I don't think so either.",
 "I don't like seafood, neither.",
 'Neither am I.']

In [445]:
# Pair every confirmed hit with one randomly drawn original positive, and every
# rejected hit with one randomly drawn original negative.
sampled_positives = random.sample(positive, len(true_positives))
new_positives = sampled_positives + true_positives
print(len(new_positives))

sampled_negatives = random.sample(negative, len(false_positives))
new_negatives = sampled_negatives + false_positives
print(len(new_negatives))

# Rebuild the dataset and loaders from the feedback examples, this time with
# random_negatives=False.
dataset = data.get_dataset(
    new_positives,
    new_negatives,
    others,
    models.bert_tokenizer,
    64,
    random_negatives=False,
)
train_dataloader, val_dataloader = data.get_loaders(dataset)

4
14


In [446]:
# Continue training the same classifier on the feedback loaders for 5 epochs,
# passing in the optimizer returned by the previous models.train call.
optimizer = models.train(classifier, train_dataloader, val_dataloader, num_epochs=5, optimizer=optimizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 29.11it/s]


Training loss: 1.0718133449554443
Accuracy: 0.5


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.90it/s]


Training loss: 0.8767577409744263
Accuracy: 0.5


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.97it/s]


Training loss: 0.485191285610199
Accuracy: 1.0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.86it/s]


Training loss: 0.36909806728363037
Accuracy: 1.0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.25it/s]

Training loss: 0.3678267002105713
Accuracy: 1.0





In [419]:
# Re-scan a larger part of the corpus (150 batches) with the retrained
# classifier and print every sentence it accepts.
scores, tokens = probe_model_from_loader(classifier, dataloader, n_batches=150)
# No progress bar around this loop: it only filters and prints, so a bar would
# just be interleaved with the printed hits (same form as the first scan).
for score, token, sample in zip(scores, tokens, sents[:len(scores)]):
    if score > 0.5:
        print(f"Sample: {sample}, Score: {score}, Max Token: {token}")

 48%|████████████████████████████████████████████████████████████████▋                                                                      | 150/313 [00:12<00:13, 12.05it/s]
9600it [00:00, 1505808.47it/s]

Sample: The project I'm in charged of now will be done by the end of this week., Score: 0.779354453086853, Max Token: m
Sample: Sorry Tom, We're having problems hearing you here., Score: 0.9634317755699158, Max Token: re
Sample: Yes, they're right., Score: 0.6047001481056213, Max Token: re
Sample: You're going to love singing with them., Score: 0.7732325792312622, Max Token: re
Sample: I'm reading a lot of books about the two countries., Score: 0.9844804406166077, Max Token: m
Sample: I'm from Lexington Software and I'm here to collect an export L / C., Score: 0.7105753421783447, Max Token: m
Sample: I'm freezing., Score: 0.6661328077316284, Max Token: m
Sample: I'm just longing for some of the comforts of home, like my mom's cooking and being around my family., Score: 0.9250532984733582, Max Token: m
Sample: I take it you're looking for temporary office work., Score: 0.9683218598365784, Max Token: re
Sample: I'm trying to choose a gift., Score: 0.9817553758621216, Max Token: m
Sample:




Next strategy: generate harder minimal negative examples built around the token the classifier most often picks as its max-scoring token

In [420]:
from collections import Counter

# Max-scoring token of every scanned sentence the classifier accepted.
pos_tokens = [token for score, token in zip(scores, tokens) if score > 0.5]
# Count once and reuse the counter for both the printout and the lookup.
token_counts = Counter(pos_tokens)
print(token_counts)
# Fail with an explicit message instead of an IndexError when nothing was accepted.
if not token_counts:
    raise ValueError("No sentence scored above 0.5, so there is no top token to build hard negatives from.")
top_token = token_counts.most_common(1)[0][0]

Counter({'m': 183, 're': 70, 'are': 39, 's': 30, 'am': 17, 'is': 7, 've': 7})


In [421]:
# Ask the chat model for sentences that contain the classifier's favourite
# token but do not follow the rule.
rule_prompt = helpers.get_prompt(rule, n_examples=0, mark_words=False)
prompt = f'{rule_prompt}Create 25 sentences that do NOT fulfill that rule but include "{top_token}".'
messages = [helpers.SYSTEM_MESSAGE, {"role": "user", "content": prompt}]
completions = api.get_openai_chat_completion(messages, max_tokens=256)
response = completions[0]
print(response)

1. My mom makes delicious meals every day.
2. I like movies that make me laugh.
3. My brother enjoys playing the guitar in his free time.
4. She misses her family when she's away.
5. We sometimes go for walks in the park.
6. The cat meowed loudly at night.
7. My friend likes reading mystery novels.
8. I often dream about traveling the world.
9. They always go fishing on weekends.
10. The rain melted the snow on the ground.
11. The dog barks at strangers passing by.
12. He usually wears a hat to protect himself from the sun.
13. The baby smiled at me, melting my heart.
14. We play board games on rainy days.
15. She writes poetry to express her emotions.
16. He helps his parents with household chores.
17. The bike ride up the hill was exhausting.
18. My sister plays the piano beautifully.
19. The smell of freshly baked cookies fills the kitchen.
20. The flowers bloom in the springtime.
21. The thunderstorm frightened the children.
22. The kids laugh when they see something funny.
23. The

In [407]:
# Extract the generated sentences from the model's reply. The second value
# returned by helpers.parse_response is not used here.
hard_negatives, _  = helpers.parse_response(response)
# Display the parsed sentences as the cell output.
hard_negatives

['Each student prepared their own presentation.',
 'Each member of the team had a different opinion.',
 'They brought their own food, so they each had something different to eat.',
 'Each book on the shelf has a different story to tell.',
 'They sat at each end of the table to avoid any arguments.',
 'They each received a gift at the party.',
 'The children picked out a toy each from the store.',
 'Each participant in the study was given a questionnaire to fill out.',
 'Each car in the parking lot had a different color.',
 'The siblings decided to split the cake into three equal parts, so each got a fair share.',
 'Each plant in the garden requires different levels of sunlight.',
 'Each member of the group had a different responsibility.',
 'They each chose a different movie to watch on movie night.',
 'Each child has their own unique talent.',
 'They each wrote a letter to express their gratitude.',
 'The team competed against each other in a friendly match.',
 'The students lined up 

In [408]:
# Second feedback round: confirmed hits plus an equal number of sampled
# positives, and generated hard negatives plus an equal number of sampled
# negatives.
sampled_positives = random.sample(positive, len(true_positives))
new_positives = sampled_positives + true_positives
sampled_negatives = random.sample(negative, len(hard_negatives))
new_negatives = sampled_negatives + hard_negatives
print(len(new_negatives))

dataset = data.get_dataset(
    new_positives,
    new_negatives,
    others,
    models.bert_tokenizer,
    64,
    random_negatives=False,
)
print(len(dataset))
train_dataloader, val_dataloader = data.get_loaders(dataset)

# Keep training the same classifier with the existing optimizer, then re-scan
# the corpus and print every accepted sentence.
optimizer = models.train(classifier, train_dataloader, val_dataloader, num_epochs=5, optimizer=optimizer)
scores, tokens = probe_model_from_loader(classifier, dataloader, n_batches=150)
for score, token, sample in tqdm(zip(scores, tokens, sents[:len(scores)])):
    if not score > 0.5:
        continue
    print(f"Sample: {sample}, Score: {score}, Max Token: {token}")

36
16


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 23.42it/s]


Training loss: 0.05150964856147766
Accuracy: 0.75


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.19it/s]


Training loss: 0.08152154088020325
Accuracy: 0.75


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 36.75it/s]


Training loss: 0.0877128466963768
Accuracy: 0.75


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.38it/s]

Training loss: 0.0583927147090435





Accuracy: 0.75


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.94it/s]


Training loss: 0.03087630495429039
Accuracy: 0.75


 48%|████████████████████████████████████████████████████████████████▋                                                                      | 150/313 [00:12<00:13, 11.87it/s]
9600it [00:00, 1927215.74it/s]

Sample: We are not seeing each other any more., Score: 0.9773754477500916, Max Token: each
Sample: Well, I suggest we meet each other halfway., Score: 0.9794229865074158, Max Token: each
Sample: Let's meet each other half-away again and split the difference ; I think this is a price we can both be satisfied with., Score: 0.9830936193466187, Max Token: each
Sample: Do people in different departments get along with each other?, Score: 0.9656620621681213, Max Token: each
Sample: Let's meet each other half way once more, then the gap will be closed and our business completed., Score: 0.9834455847740173, Max Token: each
Sample: When I suggested we meet each other half way, I meant it literally., Score: 0.9747605323791504, Max Token: each
Sample: How about meeting each other half way so that business can be concluded?, Score: 0.9503453373908997, Max Token: each



