In [1]:
import re
import os
import random
import numpy as np
from torch import tensor, Tensor
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

import sys
sys.path.append('../source')
import models
import data
import importlib
#importlib.reload(data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the corpus; the large cap is meant to request every available sentence.
sentences = data.get_mixed_sentences(1000000)

# Tokenise a fixed-size prefix of the corpus that classifiers are later checked against.
max_batches = 250
batch_size = 64
corpus_prefix = sentences[:8 * max_batches * batch_size]
encoded_inputs = models.bert_tokenizer(
    corpus_prefix,
    return_tensors='pt',
    max_length=64,
    padding='max_length',
    truncation=True,
)
# Keep the raw text alongside the tensors so scores can be mapped back to sentences.
encoded_inputs['sentences'] = corpus_prefix

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:36<00:00,  9.16s/it]


In [3]:
# Annotated instances: resume from the JSON file if it exists, otherwise start empty.
output_path = '../data/annotated_corpus.json'
if os.path.exists(output_path):
    instances = pd.read_json(output_path)
else:
    instances = pd.DataFrame(columns=['#', 'sentence', 'positive'])
# Force a boolean label column so it can be used directly as a mask (and negated with ~).
instances['positive'] = instances['positive'].astype(bool)

# Table that is later filtered by its '#' column and read for 'augmented_examples'.
egp_gpt = pd.read_json("../data/egp_gpt35.json")

In [4]:
# manipulation functions for instance dataset
def get_positives(instances, nr):
    """Return the sentences labelled positive for rule number ``nr``.

    Args:
        instances: DataFrame with the columns '#', 'sentence' and 'positive'.
        nr: Rule number matched against the '#' column.

    Returns:
        list: The matching 'sentence' values, in row order.
    """
    mask = (instances['#'] == nr) & instances['positive']
    return instances.loc[mask, 'sentence'].tolist()
def get_negatives(instances, nr):
    """Return the sentences labelled negative for rule number ``nr``.

    Args:
        instances: DataFrame with the columns '#', 'sentence' and 'positive'.
        nr: Rule number matched against the '#' column.

    Returns:
        list: The matching 'sentence' values, in row order.
    """
    mask = (instances['#'] == nr) & ~instances['positive']
    return instances.loc[mask, 'sentence'].tolist()
def get_others(sentences, matches):
    """Return the unique sentences that do not occur in ``matches``.

    The result keeps the first-occurrence order of ``sentences``. A plain set
    difference would yield an order that changes between interpreter runs
    (string hashing is randomised), which makes any seeded shuffle or sample
    applied to the result irreproducible.

    Args:
        sentences: Iterable of candidate sentences (duplicates allowed).
        matches: Iterable of sentences to exclude.

    Returns:
        list: De-duplicated sentences not present in ``matches``.
    """
    excluded = set(matches)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(s for s in sentences if s not in excluded))
def add_to_instances(sentences, nr, positive=True):
    """Append labelled sentences to the global ``instances`` frame and save it.

    All new rows are built as one DataFrame and concatenated once, instead of
    one ``pd.concat`` per sentence (which copies the whole frame every time).

    Args:
        sentences: A list of sentences, or a single sentence (any non-list
            value is stored as one row).
        nr: Rule number written to the '#' column.
        positive: Label written to the 'positive' column for every new row.

    Side effects:
        Rebinds the module-level ``instances`` and writes it to ``output_path``
        as JSON (the file is written even when nothing was added).
    """
    global instances
    batch = list(sentences) if isinstance(sentences, list) else [sentences]
    if batch:  # skip the concat for an empty list; the frame is unchanged
        new_rows = pd.DataFrame({
            '#': [nr] * len(batch),
            'sentence': batch,
            'positive': [positive] * len(batch),
        })
        instances = pd.concat([instances, new_rows], ignore_index=True)
    instances.to_json(output_path)
# data preparation
def get_dataset(positives, negatives, others, tokenizer, max_len, others_ratio = 3):
    """Build a labelled sentence dataset from positives, negatives and random fillers.

    Positives get label 1. Negatives and a random sample of ``others`` get
    label 0. The number of sampled ``others`` is
    ``int(others_ratio * number_of_unique_negatives)``.

    Args:
        positives: Sentences labelled positive (duplicates are removed).
        negatives: Sentences labelled negative (duplicates and any sentence
            also present in ``positives`` are removed).
        others: Pool of unlabelled sentences to sample extra negatives from.
            The caller's list is not modified.
        tokenizer: Passed through to ``data.SentenceDataset``.
        max_len: Passed through to ``data.SentenceDataset``.
        others_ratio: Sampled ``others`` per unique negative.

    Returns:
        The object returned by ``data.SentenceDataset(sentences, labels, tokenizer, max_len)``.

    Raises:
        ValueError: If no sentence is left to build the dataset from.
    """
    unique_positive = list(set(positives)) # remove duplicates
    unique_negative = list(set(negatives).difference(unique_positive)) # remove duplicates and positives
    num_rands = int(others_ratio * len(unique_negative))

    # Never sample an already-labelled sentence as a filler: a labelled positive
    # would otherwise be able to re-enter the dataset with label 0.
    labelled = set(unique_positive).union(unique_negative)
    pool = [sentence for sentence in others if sentence not in labelled]
    # Shuffle a private copy so the caller's list keeps its order.
    random.shuffle(pool)
    sampled_others = pool[:num_rands]

    sentences = unique_positive + unique_negative + sampled_others
    labels = [1] * len(unique_positive) + [0] * len(unique_negative) + [0] * len(sampled_others)
    if not labels:
        raise ValueError("get_dataset needs at least one positive or negative sentence")
    print(sum(labels) / len(labels))  # share of positive examples
    return data.SentenceDataset(sentences, labels, tokenizer, max_len)
# model training
def get_trained_classifer(positive, negative, others, classifier=None, num_epochs=3, ratio=3):
    """Train a rule classifier on the given sentences and return it.

    Args:
        positive: Sentences labelled positive.
        negative: Sentences labelled negative.
        others: Pool of unlabelled sentences used as additional negatives.
        classifier: Model to train. When omitted, a new
            ``models.RuleDetector(models.bert_encoder)`` is created for this call.
            Pass an existing classifier explicitly to continue training it.
        num_epochs: Forwarded to ``models.train``.
        ratio: Forwarded to ``get_dataset`` as ``others_ratio``.

    Returns:
        The classifier after ``models.train`` has run on it.
    """
    # A default of `models.RuleDetector(...)` in the signature is evaluated only
    # once, at definition time, so every call would keep training the same object.
    # NOTE(review): the detector is still built on the shared `models.bert_encoder`;
    # if training updates the encoder, that state carries over between calls — confirm.
    if classifier is None:
        classifier = models.RuleDetector(models.bert_encoder)
    dataset = get_dataset(positive, negative, others, models.bert_tokenizer, 64, ratio)
    train_dataloader, val_dataloader = data.get_loaders(dataset)
    models.train(classifier, train_dataloader, val_dataloader, num_epochs)
    return classifier

Find examples for the "would" rules (#616–638)

In [98]:
# Rule number currently being annotated; the cells below compare it against the '#' column.
nr = 637

In [99]:
# Candidate sentences: "would" or "'d" followed somewhere later by one of the listed adverbs.
pattern = r"(would|'d).*(strongly|easily|especially|actually|absolutely|gladly)"
pattern_re = re.compile(pattern)
matches = [sentence for sentence in sentences if pattern_re.search(sentence)]
random.shuffle(matches)
candidates = iter(matches)
threshold = min(50, len(matches))

In [120]:
# Alternative candidate source: the first 50 augmented examples stored for this rule.
gpt35 = egp_gpt.loc[egp_gpt['#'] == nr].iloc[0]
matches = gpt35['augmented_examples'][:50]
candidates = iter(matches)
threshold = min(50, len(matches))

In [15]:
# Candidates from a second regex: "'d" or "would" followed somewhere later by "love".
anti_pattern = r"('d|would).*love"
anti_pattern_re = re.compile(anti_pattern)
anti_matches = [sentence for sentence in sentences if anti_pattern_re.search(sentence)]
random.shuffle(anti_matches)
candidates = iter(anti_matches)

In [121]:
# Interactive labelling loop over `candidates` for rule `nr`.
# Responses: "2" stores the candidate as positive, "c" stops the loop,
# "del" removes the most recently stored row, anything else stores it as negative.
# (Earlier stop condition, kept for reference:
#  len(get_positives(instances, nr)) < 1 * threshold or len(get_negatives(instances, nr)) < 2 * threshold)
while True:
    if len(get_positives(instances, nr)) == 50: print("** REACHED 50 POSITIVES **")
    try:
        candidate = next(candidates)
    except StopIteration:
        print("No candidates left.")
        break
    # Skip sentences that already carry a label for this rule.
    if candidate in list(instances[instances['#'] == nr]['sentence']): continue
    user_response = input(f"{candidate}")
    if user_response == "c": break
    if user_response == "del":
        instances = instances.iloc[:-1]
        # Save the deletion right away; otherwise "del" followed by "c"
        # leaves the removed row in the JSON file.
        instances.to_json(output_path)
        continue
    # The helper appends one row to `instances` and writes the JSON file.
    add_to_instances(candidate, nr, user_response == '2')

Based on his technical skills and innovative thinking, I would especially propose Andrew for the leadership position. 2
Considering the economic situation, I would actually invest in real estate at this time. 2
Despite the challenges, we would absolutely attend the event if given the opportunity. 2
Given her expertise and dedication, she would gladly take on the new project. 2
Knowing his attention to detail, I would strongly suggest Martin for the quality control role. 2
Due to the company's ethical standards, they would easily reject the proposal. 2
In light of recent developments, we would especially focus on customer satisfaction. 2
Taking into account the market trends, I would actually diversify our investment portfolio. 2
Knowing his love for adventure, he would absolutely enjoy the outdoor expedition. 2
Given her experience and knowledge, she would gladly accept the teaching position. 2
Reflecting on her talent and commitment, I would easily hire her for the creative role. 2
Ba

In [122]:
# Number of labelled examples collected so far for this rule.
n_positive = len(get_positives(instances, nr))
n_negative = len(get_negatives(instances, nr))
print(f'Positive: {n_positive}, Negative: {n_negative}')

Positive: 96, Negative: 248


In [163]:
# Negatives for this rule that contain the substring "ould".
[
    negative
    for negative in get_negatives(instances, nr)
    if "ould" in negative
]

['Would you guess Drake is actually his middle name?',
 'Those waves would definitely work especially if they were a little bigger.',
 "I wouldn't say that.They seem to be on good terms but actually they always speak ill of each other.",
 'I would love the bird attractions especially humming birds but not so much of the bumblebees unless they are sting-less lol.',
 "Mountains would be so beautiful right now for me especially since I'm reading novels about warriors by Erin Hunter it would be peaceful to be in that setting.",
 'I feel like signaling drivers would be hard, especially since the methods of signaling them differ around the world.',
 '“But I don’t think I would ever say I’m absolutely, 100% done,” he added, noting the sudden successes of private spaceflight companies such as SpaceX.',
 "However, I still wouldn't mind the cold if I lived in Canada, I actually think I would prefer it to my home country!",
 'I would assume that most truckers actually enjoy what they do.',
 "I wo

In [124]:
# First training pass: three epochs, one sampled filler sentence per labelled negative.
classifier = get_trained_classifer(
    positive=get_positives(instances, nr),
    negative=get_negatives(instances, nr),
    others=get_others(sentences, matches),
    num_epochs=3,
    ratio=1,
)

0.1696113074204947


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.21it/s]


Training loss: 0.14989060312509536
Accuracy: 0.9736842105263158


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.44it/s]


Training loss: 0.14846555888652802
Accuracy: 0.9824561403508771


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.42it/s]


Training loss: 0.1283236013104518
Accuracy: 0.9736842105263158


In [158]:
# Spot-check the classifier on a single hand-written sentence.
probe_sentence = "I would actually like to have it."
models.probe_model(classifier, probe_sentence)

(tensor([0.2408]), ['would'])

Check the classifier on the entire corpus

In [112]:
# Apply one random permutation to every entry of `encoded_inputs`
# (tensors and the plain sentence list alike) so they stay aligned.
n_rows = encoded_inputs['input_ids'].size(0)
shuffled_index = np.random.permutation(n_rows)
for key, value in encoded_inputs.items():
    if isinstance(value, Tensor):
        encoded_inputs[key] = value[shuffled_index]
    else:
        encoded_inputs[key] = [value[i] for i in shuffled_index]

# The loader must not shuffle again, otherwise scores no longer line up with the sentences.
corpus_dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
corpus_dataloader = DataLoader(corpus_dataset, batch_size=batch_size, shuffle=False)
scores, tokens = models.score_corpus(
    classifier,
    corpus_dataloader,
    max_positive=250,
    max_batches=250,
    threshold=0.5,
)
# Pair each score/token with the sentence at the same position in the shuffled corpus.
scored_sentences = encoded_inputs['sentences'][:len(scores)]
results = list(zip(scores, tokens, scored_sentences))

 12%|█████████████▎                                                                                            | 250/2000 [00:39<04:36,  6.33it/s]


In [119]:
# Minimum score for a sentence to count as a predicted hit.
threshold = 0.5

subset = [
    (score, token, sample)
    for (score, token, sample) in results
    if score > threshold
]
# Of the predicted hits, keep the sentences containing "ly " for manual review.
subset_sentences = [hit[2] for hit in subset if "ly " in hit[2]]
subset_sentences

['He thus joins Lyotard in promoting creative experimentation as a leading power of thought, a power that surpasses reason, narrowly defined, and without which thought would be inert.',
 'If mine saw a mud puddle she would immediately be in it.',
 'The opposite person would be an early bird',
 'Anderson would have replied imperiously as he did to Gilbert Ryle’s “representationist” criticisms: “That is nothing to me”.',
 'Indeed thats among the most cherished item i would proudly display next to my other trophies.',
 'she rarely left her bedroom and would almost never leave her house in  her later years']

In [147]:
#add_to_instances(gpt35['augmented_examples'][100:150], nr, True)
#add_to_instances(random.sample(subset_sentences, 25), nr, False)
#add_to_instances(subset_sentences, nr, False)
#add_to_instances(get_positives(instances, 629), nr, False)
# Training pass with five epochs and half a sampled filler sentence per labelled negative.
classifier = get_trained_classifer(
    positive=get_positives(instances, nr),
    negative=get_negatives(instances, nr),
    others=get_others(sentences, matches),
    num_epochs=5,
    ratio=0.5,
)

0.21428571428571427


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12.37it/s]


Training loss: 0.10243181635936101
Accuracy: 0.9666666666666667


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12.40it/s]


Training loss: 0.09866684628650546
Accuracy: 0.9555555555555556


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12.56it/s]


Training loss: 0.08372893316360812
Accuracy: 0.9666666666666667


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12.66it/s]


Training loss: 0.08029156876727939
Accuracy: 0.9555555555555556


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12.69it/s]


Training loss: 0.09256573549161355
Accuracy: 0.9555555555555556


In [97]:
# Save the trained classifier for rule `nr`.
# NOTE(review): "corpus_training" is presumably a tag or sub-directory name — confirm in models.save_classifier.
models.save_classifier(classifier, nr, "corpus_training")