# Exp012: Train classifiers mainly from corpus examples

In [1]:
import re
import os
import random
import numpy as np
from torch import tensor, Tensor
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

import sys
sys.path.append('../source')
import models
import data
import importlib
#importlib.reload(data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load the mixed corpus; the requested count (1,000,000) is meant to cover every sentence.
sentences = data.get_mixed_sentences(1000000)

# Tokenize a fixed-size prefix of the corpus (8 * max_batches * batch_size sentences)
# that trained classifiers are scored against later on.
max_batches = 250
batch_size = 64
encoded_inputs = models.bert_tokenizer(
    sentences[:8 * max_batches * batch_size],
    return_tensors='pt',
    max_length=64,
    padding='max_length',
    truncation=True,
)
# Keep the raw text next to the tensors so scores can be mapped back to sentences.
encoded_inputs['sentences'] = sentences[:8 * max_batches * batch_size]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:37<00:00,  9.42s/it]


In [5]:
# Annotated instances: resume from the JSON file when it exists, otherwise start empty.
output_path = '../data/annotated_corpus.json'
if os.path.exists(output_path):
    instances = pd.read_json(output_path)
else:
    instances = pd.DataFrame(columns=['#', 'sentence', 'positive'])
# A boolean column is needed so the `&` / `~` masks in the helper functions work.
instances['positive'] = instances['positive'].astype(bool)

# Per-rule generated examples (GPT-3.5, judging by the file name), looked up by '#'.
egp_gpt = pd.read_json("../data/egp_gpt35.json")

In [6]:
# manipulation functions for instance dataset
def get_positives(instances, nr):
    """Return, as a list, the sentences labelled positive for rule number `nr`."""
    is_rule = instances['#'] == nr
    is_positive = instances['positive']
    return list(instances[is_rule & is_positive]['sentence'])
def get_negatives(instances, nr):
    """Return, as a list, the sentences labelled negative for rule number `nr`."""
    is_rule = instances['#'] == nr
    is_negative = ~instances['positive']
    return list(instances[is_rule & is_negative]['sentence'])
def get_others(sentences, matches):
    """Return the unique sentences that do not occur in `matches`.

    The result keeps the first-occurrence order of `sentences`, so it is
    identical on every run. (Iterating a set of strings is not stable across
    interpreter sessions because of hash randomization, which would make any
    later seeded sampling irreproducible.)
    """
    excluded = set(matches)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [sentence for sentence in dict.fromkeys(sentences) if sentence not in excluded]
def add_to_instances(sentences, nr, positive=True):
    """Append labelled sentences to the global `instances` frame and save it.

    Args:
        sentences: a single sentence, or a list/tuple of sentences.
        nr: rule number stored in the '#' column of every new row.
        positive: label stored in the 'positive' column of every new row.

    Side effects:
        Rebinds the module-level `instances` and writes it to `output_path`
        as JSON (also when nothing was added).
    """
    global instances
    if isinstance(sentences, (list, tuple)):
        new_sentences = list(sentences)
    else:
        new_sentences = [sentences]
    if new_sentences:
        # Build all new rows at once: one concat per call instead of one per sentence.
        new_rows = pd.DataFrame({
            '#': [nr] * len(new_sentences),
            'sentence': new_sentences,
            'positive': [positive] * len(new_sentences),
        })
        instances = pd.concat([instances, new_rows], ignore_index=True)
    instances.to_json(output_path)
# data preparation
def get_dataset(positives, negatives, others, tokenizer, max_len, others_ratio = 3):
    """Build a labelled sentence dataset for one rule.

    Args:
        positives: sentences labelled 1; duplicates are dropped.
        negatives: sentences labelled 0; duplicates and any sentence that also
            occurs in `positives` are dropped.
        others: pool of sentences from which random extra negatives are drawn.
            The caller's list is left untouched.
        tokenizer: passed through to `data.SentenceDataset`.
        max_len: passed through to `data.SentenceDataset`.
        others_ratio: number of random extra negatives per unique negative.

    Returns:
        The `data.SentenceDataset` built from the selected sentences and labels.
    """
    positive_set = set(positives)
    unique_positive = list(dict.fromkeys(positives))
    unique_negative = [s for s in dict.fromkeys(negatives) if s not in positive_set]
    num_rands = int(others_ratio * len(unique_negative))

    # Never draw an already-labelled sentence as a random negative: a positive
    # drawn from `others` would otherwise enter the dataset with label 0 as well.
    labelled = positive_set.union(unique_negative)
    pool = [s for s in dict.fromkeys(others) if s not in labelled]
    # Shuffle a private copy so the caller's `others` list is not reordered.
    random.shuffle(pool)
    sampled_others = pool[:num_rands]

    sentences = unique_positive + unique_negative + sampled_others
    labels = [1] * len(unique_positive) + [0] * (len(unique_negative) + len(sampled_others))
    if labels:
        # Share of positive examples in the dataset.
        print(sum(labels) / len(labels))
    return data.SentenceDataset(sentences, labels, tokenizer, max_len)
# model training
def get_trained_classifer(positive, negative, others, classifier=None, num_epochs=3, ratio=3):
    """Train a rule classifier on the given sentences and return it.

    Args:
        positive: sentences labelled as matching the rule.
        negative: sentences labelled as not matching the rule.
        others: pool for additional random negatives (see `get_dataset`).
        classifier: model to train. When omitted, a new
            `models.RuleDetector(models.bert_encoder)` is created for this call.
        num_epochs: number of epochs passed to `models.train`.
        ratio: `others_ratio` passed to `get_dataset`.

    Returns:
        The trained classifier (the object passed in, or the new one).
    """
    # Build the default inside the call. A default argument is evaluated once at
    # definition time, so the previous default shared one model across all calls
    # and each training silently continued from the last one.
    # NOTE(review): `models.bert_encoder` itself is still shared between
    # detectors - confirm whether training updates its weights.
    if classifier is None:
        classifier = models.RuleDetector(models.bert_encoder)
    dataset = get_dataset(positive, negative, others, models.bert_tokenizer, 64, ratio)
    train_dataloader, val_dataloader = data.get_loaders(dataset)
    models.train(classifier, train_dataloader, val_dataloader, num_epochs)
    return classifier

Find examples for the "would" rules (rule numbers 616–638)

In [7]:
nr = 628

In [8]:
# Candidate sentences: corpus sentences matching the regex, offered in random order.
pattern = r"(W|w)ould.*like\?"
matches = list(filter(lambda sentence: re.search(pattern, sentence), sentences))
random.shuffle(matches)
candidates = iter(matches)
threshold = min(50, len(matches))

In [120]:
# Alternative candidate source: the first 50 'augmented_examples' stored for rule `nr`.
# NOTE: this rebinds `matches`, `candidates` and `threshold` from the regex cell, so
# whichever cell ran last decides what the annotation loop and get_others() see.
gpt35 = egp_gpt.loc[egp_gpt['#'] == nr].iloc[0]
matches = gpt35['augmented_examples'][:50]
candidates = iter(matches)
threshold = min(50, len(matches))

In [15]:
# Candidates from a second regex (the variable names suggest these are likely negatives),
# offered in random order. `matches` is left untouched; only `candidates` is rebound.
anti_pattern = r"('d|would).*love"
anti_matches = list(filter(lambda sentence: re.search(anti_pattern, sentence), sentences))
random.shuffle(anti_matches)
candidates = iter(anti_matches)

In [54]:
len(instances)

5501

In [53]:
# NOTE(review): drops the last 103 rows of the in-memory frame. This is not idempotent
# (every re-run removes another 103 rows) and nothing is written to `output_path` here,
# so the JSON file keeps those rows until the next save. Do not re-run blindly.
instances = instances.iloc[:-103]

In [121]:
# Interactive labelling loop: show every not-yet-labelled candidate and store the answer.
#   "2"   -> store the candidate as a positive; any other answer stores it as a negative
#   "del" -> undo: drop the most recently stored row (the current candidate is skipped)
#   "c"   -> stop
# A former stopping condition, kept for reference:
#   len(get_positives(instances, nr)) < 1 * threshold or len(get_negatives(instances, nr)) < 2 * threshold
while True:
    if len(get_positives(instances, nr)) == 50: print("** REACHED 50 POSITIVES **")
    try:
        candidate = next(candidates)
    except StopIteration:
        print("No candidates left.")
        break
    # Skip sentences that already carry a label for this rule.
    if candidate in list(instances[instances['#'] == nr]['sentence']): continue
    user_response = input(f"{candidate}")
    if user_response == "c": break
    if user_response == "del":
        instances = instances.iloc[:-1]
        # Persist the undo immediately; otherwise the file keeps the removed row
        # until the next label is saved (or for good if the loop ends here).
        instances.to_json(output_path)
        continue
    # Reuse the shared helper: it appends one row and writes the frame to disk.
    add_to_instances(candidate, nr, user_response == '2')

Based on his technical skills and innovative thinking, I would especially propose Andrew for the leadership position. 2
Considering the economic situation, I would actually invest in real estate at this time. 2
Despite the challenges, we would absolutely attend the event if given the opportunity. 2
Given her expertise and dedication, she would gladly take on the new project. 2
Knowing his attention to detail, I would strongly suggest Martin for the quality control role. 2
Due to the company's ethical standards, they would easily reject the proposal. 2
In light of recent developments, we would especially focus on customer satisfaction. 2
Taking into account the market trends, I would actually diversify our investment portfolio. 2
Knowing his love for adventure, he would absolutely enjoy the outdoor expedition. 2
Given her experience and knowledge, she would gladly accept the teaching position. 2
Reflecting on her talent and commitment, I would easily hire her for the creative role. 2
Ba

In [10]:
print(f'Positive: {len(get_positives(instances, nr))}, Negative: {len(get_negatives(instances, nr))}')

Positive: 123, Negative: 365


In [43]:
# Negatives of rule 621 that contain no question mark.
[sent for sent in get_negatives(instances, 621) if "?" not in sent]

['How could you go without me for all of those years.',
 'Jórlaug works OK, as do Obba, Sigurfljóð, Úranía and – should you choose – Vagna.',
 'Hi, taxi.Could you take me to the financial street, please',
 "Well, I don't like climbing many stairs when there is a power cut.",
 'It does seem like a fun alternative to other dating activities!',
 'I think my girls liked them because of the bears - they are both big animal fans.',
 'More like a dance club but not quite sure how to get started.',
 "I think you ' ll like it if you give it a chance.",
 'I like redheaded girls so much.',
 'The girl coughed the water up almost immediately, and just like that, it was over.',
 "It's frustrating that they feel the need to monitor what we do so closely instead of judging us based on our task performance, like most companies do these days.",
 'As many as you like, sir.',
 'We would like to start you off at 2, 000 yuan a month, excluding bonus and overtime pay.',
 'I would like to ask if those four da

In [12]:
# Train on all labelled instances of rule `nr`. With ratio=0.5, get_dataset adds
# int(0.5 * number of unique negatives) random corpus sentences outside `matches`
# as extra negatives.
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr), get_others(sentences, matches), num_epochs=3, ratio=0.5)

0.18385650224215247


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:01<00:00, 11.61it/s]


Training loss: 0.2258114060934852
Accuracy: 0.9701492537313433


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:01<00:00, 11.94it/s]


Training loss: 0.1526809260249138
Accuracy: 0.9701492537313433


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:01<00:00, 11.44it/s]


Training loss: 0.12813870477325776
Accuracy: 0.9850746268656716


In [32]:
models.probe_model(classifier, 'I would want you to')

(tensor([0.0331]), ['would'])

Check the trained classifier on the entire corpus (scoring stops early once `max_batches` or `max_positive` is reached)

In [57]:
# Shuffle the tokenized corpus; tensors and the parallel sentence list share one permutation.
shuffled_index = np.random.permutation(encoded_inputs['input_ids'].size(0))
for key, value in encoded_inputs.items():
    if isinstance(value, Tensor):
        encoded_inputs[key] = value[shuffled_index]
    else:
        encoded_inputs[key] = [value[i] for i in shuffled_index]

# Score in stored order: shuffle=False keeps the scores aligned with the sentence list.
corpus_dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
corpus_dataloader = DataLoader(corpus_dataset, batch_size=batch_size, shuffle=False)
scores, tokens = models.score_corpus(
    classifier,
    corpus_dataloader,
    max_positive=250,
    max_batches=250,
    threshold=0.5,
)
# Pair each score with its sentence; only the first len(scores) sentences were scored.
results = list(zip(scores, tokens, encoded_inputs['sentences'][:len(scores)]))

 12%|█████████████▎                                                                                            | 250/2000 [00:40<04:45,  6.13it/s]


In [62]:
# Keep the results scored above the decision threshold, then list those containing "like".
threshold = 0.5

subset = [hit for hit in results if hit[0] > threshold]
subset_sentences = [hit[2] for hit in subset if "like" in hit[2]]
subset_sentences

['Anne, would you please come in for a while?',
 'Would you mind taking a look at the layout?',
 'Would you still want full coverage?',
 'Would Dr.Black be able to see me at 9:00 a. m. tomorrow?',
 'Why would the person who told have problems with the law?',
 'Would you mind telling me the purpose of your visit to the U. K.?',
 'Would you consider an offer of $ 56,000 per year?',
 'Would you mind speaking slowly?',
 'Why would the officer lie?',
 "If I don't pay my taxes, would the tax officials discover it?",
 'Would you mind if I ask some personal questions?',
 'Would you please let the next applicant come in on your way out?',
 'Thanks.Would you do me another favor?',
 'Would you please bring me the black suit?',
 'What would you do to achieve that?',
 'How would you make that pay?',
 'Yes, would you please call me a taxi first?',
 'Excuse me, would you get me some paper napkins?',
 'Would you please send them to us by next Wednesday?',
 'Would you please weight this letter for me?'

In [56]:
# Store the sentences selected in `subset_sentences` as negatives for rule `nr`, then
# retrain. The commented-out calls are other instance sources (presumably used in
# earlier iterations of this cell).
#add_to_instances(gpt35['augmented_examples'][100:150], nr, True)
#add_to_instances(random.sample(subset_sentences, 25), nr, False)
add_to_instances(subset_sentences, nr, False)
#add_to_instances(random.sample(get_positives(instances, 619), 50), nr, False)
#add_to_instances(list(set(get_positives(instances, 621)).difference(get_negatives(instances, nr))), nr, False)
# Only the last 200 negatives of this rule (the most recently appended rows) are used.
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr)[-200:], get_others(sentences, matches), num_epochs=5, ratio=0.5)

0.2949640287769784


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 11.13it/s]


Training loss: 0.10598782179030505
Accuracy: 1.0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.38it/s]


Training loss: 0.08268489519303496
Accuracy: 1.0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.44it/s]


Training loss: 0.05490157820961692
Accuracy: 1.0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.43it/s]


Training loss: 0.06297581778331236
Accuracy: 1.0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.36it/s]


Training loss: 0.04817179421132261
Accuracy: 1.0


In [63]:
models.save_classifier(classifier, nr, "corpus_training")

In [5]:
exp =models.load_classifier(616, "corpus_training")

In [9]:
models.probe_model(exp, "Would you like to invite me?")

(tensor([0.1237]), ['would'])