In [1]:
import re
import os
import random
import numpy as np
from torch import tensor, Tensor
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

import sys
sys.path.append('../source')
import models
import data
import importlib
#importlib.reload(data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Error loading punkt: <urlopen error [Errno 101] Network is
[nltk_data]     unreachable>


In [2]:
# Load the corpus sentences and pre-tokenize the slice that is scored later on.
sentences = data.get_mixed_sentences(1000000) # get all sentences
# De-duplicate while keeping first-seen order. list(set(...)) yields an order
# that depends on string hashing, which is randomized per Python process, so
# the slice taken below would select a different sub-corpus after every
# kernel restart.
sentences = list(dict.fromkeys(sentences))

# initialize corpus to check against
max_batches = 250
batch_size = 64
# Take the slice once so the tokenized tensors and the raw 'sentences' entry
# are guaranteed to describe exactly the same rows.
corpus_sentences = sentences[:8*max_batches*batch_size]
encoded_inputs = models.bert_tokenizer(corpus_sentences, return_tensors='pt', max_length=64, padding='max_length', truncation=True)
encoded_inputs['sentences'] = corpus_sentences

100%|████████████████████████████████| 3/3 [00:32<00:00, 10.91s/it]


In [3]:
# Annotated instances: resume from the JSON file if it exists, otherwise start
# with an empty frame holding the three expected columns.
output_path = '../data/annotated_corpus.json'
if os.path.exists(output_path):
    instances = pd.read_json(output_path)
else:
    instances = pd.DataFrame(columns=['#', 'sentence', 'positive'])
# The label column is cast to bool; get_positives / get_negatives use it
# directly as a boolean mask.
instances['positive'] = instances['positive'].astype(bool)

# Table read from the GPT-3.5 JSON file; later cells look rows up by '#'.
egp_gpt = pd.read_json("../data/egp_gpt35.json")

In [4]:
# manipulation functions for instance dataset
def get_positives(instances, nr):
    """Return, as a list, the sentences labelled positive for rule number `nr`.

    `instances` is a DataFrame with '#', 'sentence' and boolean 'positive'
    columns.
    """
    rule_mask = instances['#'] == nr
    selected = instances.loc[rule_mask & instances['positive'], 'sentence']
    return list(selected)
def get_negatives(instances, nr):
    """Return, as a list, the sentences labelled negative for rule number `nr`.

    `instances` is a DataFrame with '#', 'sentence' and boolean 'positive'
    columns; the 'positive' column is inverted with `~`, so it must be bool.
    """
    rule_mask = instances['#'] == nr
    not_positive = ~instances['positive']
    selected = instances.loc[rule_mask & not_positive, 'sentence']
    return list(selected)
def get_others(sentences, matches):
    """Return the distinct sentences that do not occur in `matches`.

    The result is a list built from a set, so duplicates are dropped and the
    order is arbitrary.
    """
    remaining = set(sentences) - set(matches)
    return list(remaining)
def add_to_instances(sentences, nr, positive=True):
    """Append annotated sentence(s) to the global `instances` frame and save it.

    Args:
        sentences: either a list of sentences or a single sentence. Only a
            `list` is treated as a collection; any other value is stored as
            one sentence.
        nr: rule number written to the '#' column of every new row.
        positive: label written to the 'positive' column of every new row.

    Side effects:
        Rebinds the module-level `instances` and writes it to `output_path`
        as JSON (also when nothing was added).
    """
    global instances
    batch = sentences if isinstance(sentences, list) else [sentences]
    if batch:
        # Build all new rows at once and concatenate a single time; growing
        # the frame row by row copies the whole frame on every iteration.
        new_rows = pd.DataFrame({
            '#': [nr] * len(batch),
            'sentence': list(batch),
            'positive': [positive] * len(batch),
        })
        instances = pd.concat([instances, new_rows], ignore_index=True)
    instances.to_json(output_path)
# data preparation
def get_dataset(positives, negatives, others, tokenizer, max_len, others_ratio = 3):
    """Build a labelled `data.SentenceDataset` from annotated and random sentences.

    Args:
        positives: sentences labelled 1 (duplicates are removed).
        negatives: sentences labelled 0 (duplicates and anything that is also
            a positive are removed).
        others: pool of additional sentences; a random sample of them is added
            with label 0. The list is not modified.
        tokenizer: passed through to `data.SentenceDataset`.
        max_len: passed through to `data.SentenceDataset`.
        others_ratio: number of sampled `others` per unique negative.

    Returns:
        `data.SentenceDataset(sentences, labels, tokenizer, max_len)`.

    Prints the fraction of positive labels when the dataset is non-empty.
    """
    unique_positive = list(set(positives)) # remove duplicates
    unique_negative = list(set(negatives).difference(unique_positive)) # remove duplicates and positives
    # Sentences that already carry a label must not be drawn again from the
    # pool: a positive re-added as a random "other" would get label 0.
    labelled = set(unique_positive) | set(unique_negative)
    pool = [sentence for sentence in others if sentence not in labelled]
    num_rands = max(0, min(int(others_ratio * len(unique_negative)), len(pool)))
    # random.sample draws without replacement and leaves the caller's list
    # untouched (random.shuffle would reorder it in place).
    sampled_others = random.sample(pool, num_rands)
    sentences = unique_positive + unique_negative + sampled_others
    labels = [1] * len(unique_positive) + [0] * (len(unique_negative) + len(sampled_others))
    if labels: # avoid ZeroDivisionError when there is nothing to train on
        print(sum(labels) / len(labels))
    return data.SentenceDataset(sentences, labels, tokenizer, max_len)
# model training
def get_trained_classifer(positive, negative, others, classifier=None, num_epochs=3, ratio=3):
    """Train a rule classifier on the given sentences and return it.

    Args:
        positive: positive example sentences.
        negative: negative example sentences.
        others: pool of extra sentences handed to `get_dataset`.
        classifier: model to train. When omitted, a fresh
            `models.RuleDetector(models.bert_encoder)` is created for this
            call. Pass a previously returned classifier explicitly to continue
            training it.
        num_epochs: forwarded to `models.train`.
        ratio: forwarded to `get_dataset` as `others_ratio`.

    Returns:
        The classifier after `models.train` has run on it.
    """
    # A default written as `classifier=models.RuleDetector(...)` is evaluated
    # once, when the function is defined, so every call without an explicit
    # classifier would keep training the same shared model instance.
    if classifier is None:
        classifier = models.RuleDetector(models.bert_encoder)
    dataset = get_dataset(positive, negative, others, models.bert_tokenizer, 64, ratio)
    train_dataloader, val_dataloader = data.get_loaders(dataset)
    models.train(classifier, train_dataloader, val_dataloader, num_epochs)
    return classifier

Find examples for would rules (616-638)

In [315]:
# Rule number currently being worked on; the cells below use it to filter the
# '#' column of `instances` and `egp_gpt`.
nr = 636

In [317]:
# Candidate pool: every corpus sentence in which the regex finds a match
# ("ed", "en" or "nt" followed later on the same line by "would").
pattern = r"(ed|en|nt).*would"
matches = list(filter(lambda sentence: re.search(pattern, sentence), sentences))
random.shuffle(matches)
# Annotation target derived from the pool size, capped at 50.
threshold = min(50, len(matches))
# Iterator consumed by the interactive annotation loop.
candidates = iter(matches)

In [343]:
# First row of the GPT-3.5 table whose '#' equals the current rule number
# (raises IndexError if there is no such row).
gpt35 = egp_gpt[egp_gpt['#']==nr].iloc[0]
# NOTE(review): this assignment is dead - `matches` is overwritten on the next line.
matches = gpt35['augmented_examples'][:50]
# NOTE(review): `subset_sentences` is only assigned in the "Check on entire corpus"
# section further down, so this cell raises NameError under Restart & Run All.
matches = subset_sentences
# Restart the candidate iterator over the new matches (not shuffled here).
candidates = iter(matches)
# Annotation target derived from the pool size, capped at 50.
threshold = min(len(matches), 50)

In [325]:
# Alternative candidate pool: every corpus sentence containing "'d" or "would".
anti_pattern = r"('d|would)"
anti_matches = list(filter(lambda sentence: re.search(anti_pattern, sentence), sentences))
random.shuffle(anti_matches)
# Replaces the iterator used by the annotation loop; `matches` and `threshold`
# are left as they were.
candidates = iter(anti_matches)

In [344]:
# Interactive annotation loop for rule `nr`.
# Each candidate is shown via input(); the typed response decides what happens:
#   "2"   -> store the candidate as a positive example
#   "c"   -> stop annotating
#   "del" -> undo the most recent annotation of this rule
#   other -> store the candidate as a negative example
# The loop ends once both classes hold 2 * threshold examples or the
# candidates run out.
announced_50 = False
while len(get_positives(instances, nr)) < 2 * threshold or len(get_negatives(instances, nr)) < 2 * threshold:
    # Announce the milestone once instead of on every iteration while the
    # positive count stays at 50.
    if not announced_50 and len(get_positives(instances, nr)) == 50:
        print("** REACHED 50 POSITIVES **")
        announced_50 = True
    try:
        candidate = next(candidates)
    except StopIteration:
        print("No candidates left.")
        break
    # Skip sentences that already carry an annotation for this rule.
    if candidate in set(instances.loc[instances['#'] == nr, 'sentence']): continue
    user_response = input(f"{candidate}")
    if user_response == "c": break
    if user_response == "del":
        # Only undo when the last stored row belongs to this rule, so another
        # rule's annotation is never dropped by accident, and persist the undo
        # immediately instead of waiting for the next annotation to be saved.
        if len(instances) > 0 and instances.iloc[-1]['#'] == nr:
            instances = instances.iloc[:-1]
            instances.to_json(output_path)
        continue
    # Shared helper: appends one row to `instances` and writes the JSON file.
    add_to_instances(candidate, nr, user_response == '2')

** REACHED 50 POSITIVES **


Because of the limitations on the speed of aircraft at the time, it would take many hours to make the trip; it was generally accepted that a pilot could not make it alone. 1


** REACHED 50 POSITIVES **


They say that if you didn't have a monopoly, you wouldn't be able to do the things you do. 1


** REACHED 50 POSITIVES **


They would be his friends. 2
So the playing field would mostly be even. 2
I was worried you would think it was too far to drive. 1
Yes, it would be. 1
But if our ambitions were so easy to achieve, we would soon get bored. 2
Seeing how she handled it -- she would talk about it if people wanted to -- showed me this was meant to be. 2
But they would just raise the price of the units to cover their costs. 1
I knew he would win. 1
I like those too, I just wish that they would release more songs c


In [327]:
# Report how many positive and negative annotations exist for the current rule.
print(f'Positive: {len(get_positives(instances, nr))}, Negative: {len(get_negatives(instances, nr))}')

Positive: 50, Negative: 50


In [328]:
# Train on the annotations for the current rule for 5 epochs. With ratio=1,
# get_dataset adds as many random non-matching corpus sentences (label 0) as
# there are unique negatives.
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr), get_others(sentences, matches), num_epochs=5, ratio=1)

0.3333333333333333


100%|█| 4/4 [00:00<00:00,  9.48it/


Training loss: 1.943656712770462
Accuracy: 0.7333333333333333


100%|█| 4/4 [00:00<00:00, 11.93it/


Training loss: 0.9839863628149033
Accuracy: 0.9333333333333333


100%|█| 4/4 [00:00<00:00, 12.11it/


Training loss: 0.2726943977177143
Accuracy: 1.0


100%|█| 4/4 [00:00<00:00, 12.15it/


Training loss: 0.12760697212070227
Accuracy: 1.0


100%|█| 4/4 [00:00<00:00, 12.07it/


Training loss: 0.17192151863127947
Accuracy: 1.0


In [338]:
# Spot-check the trained classifier on a single hand-written sentence.
models.probe_model(classifier, "If I could, I would play the piano.")

(tensor([0.6954]), ['would'])

Check on entire corpus

In [351]:
# Shuffle every entry of `encoded_inputs` with one shared permutation so the
# tensors and the raw sentence list stay aligned row by row.
num_rows = encoded_inputs['input_ids'].size(0)
shuffled_index = np.random.permutation(num_rows)
for key, value in encoded_inputs.items():
    if isinstance(value, Tensor):
        encoded_inputs[key] = value[shuffled_index]
    else:
        # Non-tensor entries (the sentence list) are reordered element by element.
        encoded_inputs[key] = [value[i] for i in shuffled_index]

# Score the shuffled corpus in its stored order (shuffle=False).
corpus_dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
corpus_dataloader = DataLoader(dataset=corpus_dataset, batch_size=batch_size, shuffle=False)
scores, tokens = models.score_corpus(classifier, corpus_dataloader, max_positive=250, max_batches=250, threshold=0.5)
# Pair each score with the sentence at the same position.
# NOTE(review): assumes score_corpus returns one score per input row, in order - confirm.
scored_sentences = encoded_inputs['sentences'][:len(scores)]
results = list(zip(scores, tokens, scored_sentences))

 25%|▎| 250/1000 [00:39<01:58,  6.


In [352]:
# Minimum classifier score for a scored sentence to be kept.
# NOTE(review): this reuses the name `threshold`, which the annotation cells set
# to min(len(matches), 50) and use as an example count - consider a distinct name.
threshold = 0.5

# Keep the scored results above the threshold whose sentence contains "would".
subset = [
    (score, token, sample)
    for (score, token, sample) in results
    if score > threshold and "would" in sample
]
subset_sentences = [entry[2] for entry in subset]
subset_sentences

['Tomorrow she would ask someone else to help her.',
 'She would check his text messages and calls.',
 'I would take biology with Mr. Green.',
 'She would be laughing one moment, and if I said something insensitive, she would start crying.',
 'I had a coworker that would take her dogs to a seniors home every weekend.',
 'People would see them and begin crying.',
 'Every weekend Jeffrey and his best friend Chad would head to Santa Monica Beach, where they would surf big waves.',
 'He remembers watching his favorite players on TV when he was young, and how his father would take him to see the Giants play at Candlestick Park.',
 "It's so named because in the pavilion over there the great poet Bai Buyi of the Tang Dynasty would take a rest after drinking a little too much, and watch the moon over the lake.",
 'She would tell people he was a deadbeat dad when he would call to get me and she would refuse.',
 'He sang the lead, so he would do the main melody.',
 'He would look at it again.',


In [350]:
# Add entries 100-149 of the GPT-3.5 augmented examples as positives for the current rule.
# NOTE(review): add_to_instances appends without checking for duplicates, so
# re-running this cell stores the same rows again.
add_to_instances(gpt35['augmented_examples'][100:150], nr, True)
#add_to_instances(subset_sentences, nr, False)
# Reuse the sentences annotated as positive for rule 634 as negatives for the current rule.
add_to_instances(get_positives(instances, 634), nr, False)
# Retrain for 5 epochs; with ratio=0.5 get_dataset adds half as many random
# other sentences as there are unique negatives.
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr), get_others(sentences, matches), num_epochs=5, ratio=0.5)

0.38817480719794345


100%|█| 10/10 [00:00<00:00, 10.98i


Training loss: 0.28319564908742906
Accuracy: 0.9871794871794872


100%|█| 10/10 [00:00<00:00, 12.14i


Training loss: 0.21335369497537612
Accuracy: 1.0


100%|█| 10/10 [00:00<00:00, 12.08i


Training loss: 0.16749295704066752
Accuracy: 1.0


100%|█| 10/10 [00:00<00:00, 12.14i


Training loss: 0.14245141297578812
Accuracy: 1.0


100%|█| 10/10 [00:00<00:00, 12.13i


Training loss: 0.11716953404247761
Accuracy: 1.0


In [353]:
# Save the trained classifier for the current rule number.
# NOTE(review): "corpus_training" is presumably a tag or target folder name - confirm in models.save_classifier.
models.save_classifier(classifier, nr, "corpus_training")