In [1]:
import re
import os
import random

import pandas as pd
import sys
sys.path.append('../source')
import models
import data
import importlib
#importlib.reload(data)
from torch.utils.data import DataLoader, TensorDataset

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the corpus sentences and build the dataloader that trained classifiers are checked against.
CORPUS_SEED = 42  # fixed seed so the same corpus sample is drawn after a kernel restart
max_batches = 250
batch_size = 64
corpus_size = 4 * max_batches * batch_size  # number of sentences that get tokenized

# Deduplicate while keeping the order returned by the loader. list(set(...)) would
# order the strings differently in every interpreter run (hash randomisation), which
# would make even a seeded shuffle irreproducible.
# NOTE(review): full reproducibility also assumes data.get_mixed_sentences is deterministic - confirm.
sentences = list(dict.fromkeys(data.get_mixed_sentences(1000000)))  # 1000000: get all sentences

# A private Random instance is used so the global `random` state seen by later cells is untouched.
random.Random(CORPUS_SEED).shuffle(sentences)

# initialize corpus to check against
encoded_inputs = models.bert_tokenizer(sentences[:corpus_size], return_tensors='pt', max_length=64, padding='max_length', truncation=True)
corpus_dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
# shuffle=False keeps the batches in the order of `sentences`; later cells pair scores
# with `sentences` by position.
corpus_dataloader = DataLoader(corpus_dataset, batch_size=batch_size, shuffle=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:29<00:00,  9.95s/it]


In [3]:
# Annotated instances: resume from the JSON file when it exists, otherwise start with an empty frame.
output_path = '../data/annotated_corpus.json'
if os.path.exists(output_path):
    instances = pd.read_json(output_path)
else:
    instances = pd.DataFrame(columns=['#', 'sentence', 'positive'])
# Force a boolean label column; the helper functions combine it with boolean masks (`&`, `~`).
instances = instances.assign(positive=instances['positive'].astype(bool))

In [4]:
# manipulation functions for instance dataset
def get_positives(instances, nr):
    """Return, as a list, the sentences of `instances` whose '#' equals `nr` and whose 'positive' flag is set."""
    belongs_to_rule = instances['#'] == nr
    selected = instances.loc[belongs_to_rule & instances['positive'], 'sentence']
    return list(selected)
def get_negatives(instances, nr):
    """Return, as a list, the sentences of `instances` whose '#' equals `nr` and whose 'positive' flag is not set."""
    belongs_to_rule = instances['#'] == nr
    is_negative = ~instances['positive']  # relies on 'positive' having boolean dtype
    selected = instances.loc[belongs_to_rule & is_negative, 'sentence']
    return list(selected)
def get_others(sentences, matches):
    """Return the distinct entries of `sentences` that do not occur in `matches`.

    The result is a list built from a set, so its order is arbitrary.
    """
    excluded = set(matches)
    remaining = set(sentences) - excluded
    return list(remaining)
def add_to_instances(sentences, nr, positive=True):
    """Append annotated sentences to the global `instances` frame and save it.

    Args:
        sentences: a list of sentences, or a single sentence (any non-list
            value is stored as one row).
        nr: value written to the '#' column of every new row.
        positive: value written to the 'positive' column of every new row.

    Side effects:
        Rebinds the global `instances` and writes it to `output_path` as JSON
        (the file is written even when nothing was added).
    """
    global instances
    new_sentences = sentences if isinstance(sentences, list) else [sentences]
    if new_sentences:
        # Build all rows at once and concatenate a single time; concatenating one
        # row per sentence copies the whole frame on every iteration (quadratic).
        new_rows = pd.DataFrame({
            '#': [nr] * len(new_sentences),
            'sentence': new_sentences,
            'positive': [positive] * len(new_sentences),
        })
        instances = pd.concat([instances, new_rows], ignore_index=True)
    instances.to_json(output_path)

In [11]:
def get_dataset(positives, negatives, others, tokenizer, max_len, others_ratio = 3, max_positive_examples=500):
    """Build a labelled sentence dataset for training a rule classifier.

    Label 1 is given to the unique `positives`. Label 0 is given to the unique
    `negatives` that are not also positives, followed by up to
    `others_ratio * (number of those negatives)` sentences from the front of `others`.

    Args:
        positives: sentences labelled 1.
        negatives: sentences labelled 0 (a sentence that is also in `positives` stays positive).
        others: additional sentences used as label-0 filler; sentences that
            already appear among the positives/negatives are skipped.
        tokenizer, max_len: passed through to `data.SentenceDataset`.
        others_ratio: filler sentences per unique negative.
        max_positive_examples: currently unused; kept for interface compatibility.

    Returns:
        `data.SentenceDataset(sentences, labels, tokenizer, max_len)`.
    """
    unique_positive = list(set(positives)) # remove duplicates
    unique_negative = list(set(negatives).difference(set(positives))) # remove duplicates and positives

    num_rands = int(others_ratio * len(unique_negative))
    already_labelled = set(unique_positive).union(unique_negative)
    # Collect at most num_rands filler sentences. Skipping already-labelled sentences
    # prevents a positive from being added a second time with label 0.
    sampled_others = []
    for sentence in others:
        if len(sampled_others) >= num_rands:
            break
        if sentence not in already_labelled:
            sampled_others.append(sentence)

    sentences = unique_positive + unique_negative + sampled_others
    # Labels are sized from what was actually collected, so they stay aligned with
    # `sentences` even when `others` holds fewer than num_rands usable sentences.
    labels = [1] * len(unique_positive) + [0] * (len(unique_negative) + len(sampled_others))
    return data.SentenceDataset(sentences, labels, tokenizer, max_len)

def get_trained_classifer(positive, negative, others, classifier=None, num_epochs=3):
    """Train a rule classifier on the given sentences and return it.

    Args:
        positive: sentences labelled as positive examples.
        negative: sentences labelled as negative examples.
        others: additional sentences passed to `get_dataset` as label-0 filler.
        classifier: model to train. When None (the default) a new
            `models.RuleDetector(models.bert_encoder)` is created for this call.
        num_epochs: number of epochs passed to `models.train`.

    Returns:
        The trained classifier (the object passed in, or the newly created one).
    """
    if classifier is None:
        # Create the detector per call. A default built in the signature is evaluated
        # once at definition time, so every call would keep training the same instance.
        # NOTE(review): models.bert_encoder itself is still shared between detectors;
        # whether training updates it is not visible here - confirm.
        classifier = models.RuleDetector(models.bert_encoder)
    dataset = get_dataset(positive, negative, others, models.bert_tokenizer, 64)
    train_dataloader, val_dataloader = data.get_loaders(dataset)
    models.train(classifier, train_dataloader, val_dataloader, num_epochs)
    return classifier

Find examples for the "would" rules (616-638)

In [6]:
# Number of the rule currently being annotated; it is the value stored in / filtered on
# the '#' column of `instances` by the cells below.
nr = 627

In [9]:
# Annotation candidates: all corpus sentences matching the pattern, in random order.
pattern = r"(would).*(n't|not).*(have).*"
matches = list(filter(lambda sentence: re.search(pattern, sentence), sentences))
random.shuffle(matches)
candidates = iter(matches)
# Number of positives to collect in the annotation loop: 50, or fewer if there are fewer matches.
# (A later cell rebinds `threshold` to a score cut-off; re-run this cell before annotating again.)
threshold = min(50, len(matches))

In [393]:
# Second candidate pool: sentences with "would"/"Would" ... "have" ... followed by a word
# ending in ed/en/id/ght and a space. This replaces the `candidates` iterator built from `matches`.
anti_pattern = r"(W|w)(ould).*(have).*(ed|en|id|ght) "
anti_matches = list(filter(lambda sentence: re.search(anti_pattern, sentence), sentences))
random.shuffle(anti_matches)
candidates = iter(anti_matches)

In [394]:
# Interactive annotation for rule `nr`: candidates are shown one at a time until there are
# at least `threshold` positives and 1.5 * `threshold` negatives.
# Answers: '2' -> positive, 'c' -> stop, anything else -> negative.
while True:
    n_positive = len(get_positives(instances, nr))
    n_negative = len(get_negatives(instances, nr))
    if n_positive >= threshold and n_negative >= 1.5 * threshold:
        break
    if n_positive == 50:
        print("** REACHED 50 POSITIVES **")

    candidate = next(candidates, None)
    if candidate is None:
        print("No candidates left.")
        break

    # skip sentences that already carry an annotation for this rule
    if candidate in set(instances.loc[instances['#'] == nr, 'sentence']):
        continue

    user_response = input(str(candidate))
    if user_response == "c":
        break
    # appends one row to `instances` and writes the frame to `output_path`
    add_to_instances(candidate, nr, user_response == '2')

The EPA would know best, although they have been hindered this presidency. 1
Would you have wanted them to adopt another child? 1
i would loved to have and  listen to Katy Perry in the shower, but no, i do not 1
I would have never imagined they come from persia, i love eating them specially in salads 1
I would have loved some pork chops off the grill but my husband doesn't like pork chops not even the leaner piece cuts. 1
I would have respected Al Gore a bit more if he hadn't tried to make this a film about himself as well. 1
The next change would have been to put those words into sentences, similar to the 'protolanguage' children use when they first learn to speak. 1
But I think it would have been wiser, far wiser, for the administration to have notified, certainly the leadership of Congress in the interest of having good relations. 1
I would never have guessed that many people wore them! 1
Wow, who would have thought an Arkansas company would get so big! 1
That would have been boring

In [384]:
# Drop the most recently added row (e.g. after a mistyped answer).
# Every run removes one more row, and nothing is written to disk here; the JSON file
# only changes the next time `instances` is saved.
instances = instances.head(-1)
instances

Unnamed: 0,#,sentence,positive
0,616,I would like to go there and pick one up too.,True
1,616,"I would like to see cute pandas, too.",True
2,616,"One more thing, we'd like you to quote us on C...",True
3,616,I'd like to invite you and your photographic t...,True
4,616,Maybe he would like to play with us.,True
...,...,...,...
2140,627,Besides you wouldn't have to do much preparati...,False
2141,627,I would not have thought that.,True
2142,627,"I would, but I don't have any in my tackle box.",False
2143,627,I really shouldn't have given her so much candy.,False


In [7]:
# Report how many positive and negative annotations exist so far for rule `nr`.
print(f'Positive: {len(get_positives(instances, nr))}, Negative: {len(get_negatives(instances, nr))}')

Positive: 55, Negative: 135


In [12]:
# Train a classifier for rule `nr` for 5 epochs on the annotated positives and negatives.
# Corpus sentences that did not match `pattern` are passed as `others`, which
# get_dataset labels 0 as extra negatives.
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr), get_others(sentences, matches), num_epochs=5)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00,  9.14it/s]


Training loss: 0.6508052984873454
Accuracy: 0.8956521739130435


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.26it/s]


Training loss: 0.37286275029182436
Accuracy: 0.9043478260869565


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.31it/s]


Training loss: 0.23663161943356195
Accuracy: 0.9043478260869565


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.31it/s]


Training loss: 0.1766846348841985
Accuracy: 0.9652173913043478


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.19it/s]


Training loss: 0.13269841820001602
Accuracy: 0.9565217391304348


In [13]:
# Sanity check on a hand-written sentence; the recorded output is a (score tensor, token list) pair.
models.probe_model(classifier, "I wouldn't have guessed that.")

(tensor([0.7191]), ['have'])

Check on the entire corpus

In [414]:
# Score the corpus batches with the trained classifier; later cells pair `scores` and
# `tokens` with `sentences` by position.
# NOTE(review): the exact meaning of max_positive / max_batches / threshold is defined in
# models.score_corpus and is not visible here - confirm before changing them.
scores, tokens = models.score_corpus(classifier, corpus_dataloader, max_positive=250, max_batches=500, threshold=0.5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:40<00:00,  6.17it/s]


In [413]:
# List the corpus sentences scored above the cut-off whose reported token is not whitelisted.
# Note: this rebinds `threshold`, which the annotation cells use as an example count.
whitelist = []
threshold = 0.5

# zip stops at the shortest input, so only the first len(scores) sentences are paired.
[
    (score, token, sample)
    for score, token, sample in zip(scores, tokens, sentences)
    if score > threshold and token not in whitelist
]

[(0.5522863268852234,
  'other',
  "On any other day, it would cost me a fortune, but it ' s on special offer today."),
 (0.5168756246566772, 'have', 'Thats more than I would have thought.'),
 (0.7443615794181824,
  'have',
  'I would not have thought about forests, that is interesting.'),
 (0.9253101944923401,
  'otherwise',
  'The Internet has connected people around the world that would never have been able to meet otherwise.')]

In [367]:
# Sentences scored above the cut-off (token not whitelisted) that do not contain "would" at all.
false_pos = [
    sample
    for score, token, sample in zip(scores, tokens, sentences)
    if score > threshold and token not in whitelist and "would" not in sample
]
print(false_pos)

["If I had to guess, I'd say the Miss Universe pageant is probably the biggest.", "If I were you, I'd just sleep, read a book, or watch TV.", "If I were you I'd check the bus schedule."]


In [406]:
# Use the positives of rule 626 as additional negatives for rule `nr`, then retrain.
# Sentences that already have an annotation for `nr` are skipped, so re-running this cell
# neither appends duplicate rows nor adds a negative label to an existing positive.
already_annotated = set(get_positives(instances, nr)) | set(get_negatives(instances, nr))
add_to_instances(
    # dict.fromkeys removes repeated sentences while keeping their order
    [sentence for sentence in dict.fromkeys(get_positives(instances, 626)) if sentence not in already_annotated],
    nr,
    False,
)
classifier = get_trained_classifer(get_positives(instances, nr), get_negatives(instances, nr), get_others(sentences, matches), num_epochs=5)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 11.29it/s]


Training loss: 0.11018610124786694
Accuracy: 0.9739130434782609


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.18it/s]


Training loss: 0.09457445945590734
Accuracy: 0.9652173913043478


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.27it/s]


Training loss: 0.0772305446366469
Accuracy: 0.9826086956521739


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.16it/s]


Training loss: 0.07244106183449427
Accuracy: 0.9652173913043478


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.21it/s]


Training loss: 0.05815803979833921
Accuracy: 0.991304347826087


In [374]:
# Persist the trained classifier for rule `nr`.
# NOTE(review): "corpus_training" is interpreted by models.save_classifier - presumably a
# sub-directory or tag for this training run; confirm there.
models.save_classifier(classifier, nr, "corpus_training")