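Reference: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (Devlin et al., 2018), https://arxiv.org/pdf/1810.04805.pdf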

In [1]:
import os
import sys

# Put the parent directory on the import path; presumably that is where the
# project-local modules imported below (tokenization, modeling, ...) live.
# Guarded so that re-running this cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import logging
import json
import math
import os
import random
import six
from tqdm import tqdm_notebook as tqdm
from IPython.display import HTML, display

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

import tokenization
from modeling import BertConfig, BertForMaskedLanguageModelling
from optimization import BERTAdam
from masked_language_model import notqdm, convert_tokens_to_features, LMProcessor, predict_masked_words, predict_next_words, improve_words_recursive


In [4]:

# Notebook-wide logging: INFO and above, each record carrying a timestamp,
# the level and the logger name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

# Args

In [5]:
parser = argparse.ArgumentParser()

# Required parameters: every one is a string option without a usable default.
_required_options = (
    ("--data_dir",
     "The input data dir. Should contain the .tsv files (or other data files) for the task."),
    ("--bert_config_file",
     "The config json file corresponding to the pre-trained BERT model. \n"
     "This specifies the model architecture."),
    ("--task_name",
     "The name of the task to train."),
    ("--vocab_file",
     "The vocabulary file that the BERT model was trained on."),
    ("--output_dir",
     "The output directory where the model checkpoints will be written."),
)
for _flag, _help in _required_options:
    parser.add_argument(_flag, default=None, type=str, required=True, help=_help)

# Other parameters, registered in the order they should appear in --help:
# (flag, keyword arguments for add_argument).
_optional_options = (
    ("--init_checkpoint", dict(
        default=None, type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")),
    ("--do_lower_case", dict(
        default=False, action='store_true',
        help="Whether to lower case the input text. True for uncased models, False for cased models.")),
    ("--max_seq_length", dict(
        default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
             "Sequences longer than this will be truncated, and sequences shorter \n"
             "than this will be padded.")),
    ("--do_train", dict(
        default=False, action='store_true',
        help="Whether to run training.")),
    ("--do_eval", dict(
        default=False, action='store_true',
        help="Whether to run eval on the dev set.")),
    ("--train_batch_size", dict(
        default=32, type=int,
        help="Total batch size for training.")),
    ("--eval_batch_size", dict(
        default=8, type=int,
        help="Total batch size for eval.")),
    ("--learning_rate", dict(
        default=5e-5, type=float,
        help="The initial learning rate for Adam.")),
    ("--num_train_epochs", dict(
        default=3.0, type=float,
        help="Total number of training epochs to perform.")),
    ("--warmup_proportion", dict(
        default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
             "E.g., 0.1 = 10%% of training.")),
    ("--no_cuda", dict(
        default=False, action='store_true',
        help="Whether not to use CUDA when available")),
    ("--local_rank", dict(
        type=int, default=-1,
        help="local_rank for distributed training on gpus")),
    ("--seed", dict(
        type=int, default=42,
        help="random seed for initialization")),
)
for _flag, _kwargs in _optional_options:
    parser.add_argument(_flag, **_kwargs)

# Registered last and kept as the cell's final expression, so the notebook
# still echoes the resulting action object.
parser.add_argument(
    '--gradient_accumulation_steps',
    type=int,
    default=1,
    help="Number of updates steps to accumualte before performing a backward/update pass.")

_StoreAction(option_strings=['--gradient_accumulation_steps'], dest='gradient_accumulation_steps', nargs=None, const=None, default=1, type=<class 'int'>, choices=None, help='Number of updates steps to accumualte before performing a backward/update pass.', metavar=None)

In [6]:
experiment_name = 'poetry_uncased_5_tied_mlm'

# Stand-in for a command line: the flags are spelled out as a flat token list
# and handed to the parser built earlier in the notebook.
_bert_base_dir = '../data/weights/cased_L-12_H-768_A-12'
_data_dir = '../data/input/poetry_gutenberg'

argv = [
    '--task_name', 'lm',
    '--data_dir', _data_dir,
    '--vocab_file', '{}/vocab.txt'.format(_bert_base_dir),
    '--bert_config_file', '{}/bert_config.json'.format(_bert_base_dir),
    '--init_checkpoint', '{}/pytorch_model.bin'.format(_bert_base_dir),
    '--do_train',
    '--do_eval',
    '--gradient_accumulation_steps', '2',
    '--train_batch_size', '16',
    '--learning_rate', '3e-5',
    '--num_train_epochs', '3.0',
    '--max_seq_length', '128',
    '--output_dir', '../outputs/{name}/'.format(name=experiment_name),
]
print(argv)
args = parser.parse_args(argv)

['--task_name', 'lm', '--data_dir', '../data/input/poetry_gutenberg', '--vocab_file', '../data/weights/cased_L-12_H-768_A-12/vocab.txt', '--bert_config_file', '../data/weights/cased_L-12_H-768_A-12/bert_config.json', '--init_checkpoint', '../data/weights/cased_L-12_H-768_A-12/pytorch_model.bin', '--do_train', '--do_eval', '--gradient_accumulation_steps', '2', '--train_batch_size', '16', '--learning_rate', '3e-5', '--num_train_epochs', '3.0', '--max_seq_length', '128', '--output_dir', '../outputs/poetry_uncased_5_tied_mlm/']


# Init

In [7]:
# Select the compute device.
# - Single-process runs (local_rank == -1) or runs with --no_cuda use CUDA
#   when it is available and allowed, otherwise the CPU.
# - Distributed runs bind this process to the GPU named by local_rank.
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Only count GPUs when we are actually going to use them; otherwise a
    # CPU run on a multi-GPU host would later be wrapped in DataParallel.
    n_gpu = torch.cuda.device_count() if device.type == "cuda" else 0
else:
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

if args.gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                        args.gradient_accumulation_steps))

# From here on args.train_batch_size is the per-forward-pass (micro) batch size.
# The requested total is remembered on first execution so that re-running this
# cell does not divide the batch size a second time.
if not hasattr(args, 'total_train_batch_size'):
    args.total_train_batch_size = args.train_batch_size
args.train_batch_size = args.total_train_batch_size // args.gradient_accumulation_steps
if args.train_batch_size < 1:
    raise ValueError("train_batch_size ({}) must be >= gradient_accumulation_steps ({})".format(
                        args.total_train_batch_size, args.gradient_accumulation_steps))

# Seed every random number generator used in the notebook.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

if not args.do_train and not args.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

12/12/2018 22:31:25 - INFO - __main__ -   device cuda n_gpu 1 distributed training False


In [8]:
bert_config = BertConfig.from_json_file(args.bert_config_file)

# Refuse sequence lengths beyond the configured number of position embeddings.
_too_long = args.max_seq_length > bert_config.max_position_embeddings
if _too_long:
    raise ValueError(
        "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length,
            bert_config.max_position_embeddings,
        )
    )

In [9]:
# Warn, but carry on, when the output directory already holds files;
# then make sure it exists and derive the checkpoint path inside it.
_existing_entries = os.listdir(args.output_dir) if os.path.exists(args.output_dir) else []
if _existing_entries:
    print("Output directory ({}) already exists and is not empty.".format(args.output_dir))

os.makedirs(args.output_dir, exist_ok=True)
save_path = os.path.join(args.output_dir, 'state_dict.pkl')
save_path

Output directory (../outputs/poetry_uncased_5_tied_mlm/) already exists and is not empty.


'../outputs/poetry_uncased_5_tied_mlm/state_dict.pkl'

# Load data

In [10]:
tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                       do_lower_case=args.do_lower_case)

# Inverted wordpiece vocabulary (value -> key); presumably id -> token, confirm
# against tokenization.py.
decoder = dict(
    (value, key) for key, value in tokenizer.wordpiece_tokenizer.vocab.items()
)

In [11]:
# Registry of supported tasks; only language modelling is available here.
processors = dict(lm=LMProcessor)

task_name = args.task_name.lower()
_processor_cls = processors.get(task_name)
if _processor_cls is None:
    raise ValueError("Task not found: %s" % (task_name))

processor = _processor_cls(tokenizer=tokenizer)
label_list = processor.get_labels()

In [12]:
train_examples = processor.get_train_examples(args.data_dir, skip=30, tqdm=tqdm)

# Number of optimizer updates over the whole run (passed to the optimizer as
# t_total). args.train_batch_size has already been divided by
# gradient_accumulation_steps in the init cell, so len/batch_size counts
# micro-batches; one update happens every gradient_accumulation_steps of them.
num_train_steps = int(
    len(train_examples)
    / args.train_batch_size
    / args.gradient_accumulation_steps
    * args.num_train_epochs)

HBox(children=(IntProgress(value=0, description='tokenising', max=169371, style=ProgressStyle(description_widt…

KeyboardInterrupt: 

In [None]:
train_features = convert_tokens_to_features(
    train_examples,
    label_list,
    args.max_seq_length,
    tokenizer,
    tqdm=tqdm,
)


def _stack_long(attribute):
    """Collect one attribute from every training feature into a single int64 tensor."""
    return torch.tensor([getattr(feature, attribute) for feature in train_features], dtype=torch.long)


all_input_ids = _stack_long('input_ids')
all_input_mask = _stack_long('input_mask')
all_segment_ids = _stack_long('segment_ids')
all_label_ids = _stack_long('label_id')
all_label_weights = _stack_long('label_weights')

In [None]:
train_data = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_label_ids,
    all_label_weights,
)
# Random order for single-process runs; DistributedSampler when local_rank is set.
train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
)

# Load model

In [None]:
model = BertForMaskedLanguageModelling(bert_config)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
# First the pre-trained weights for the `bert` sub-module ...
if args.init_checkpoint is not None:
    model.bert.load_state_dict(
        torch.load(args.init_checkpoint, map_location='cpu'))

# ... then, if an earlier run left a checkpoint in the output directory,
# the full model state from that file.
if os.path.isfile(save_path):
    model.load_state_dict(
        torch.load(save_path, map_location='cpu'))

model = model.to(device)

# Wrap for multi-device execution: DataParallel for several local GPUs in a
# single process, DistributedDataParallel when local_rank is set.
if args.local_rank == -1:
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
else:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )

model

# Opt

In [None]:
# Name fragments identifying parameters that must not be weight-decayed.
no_decay = ['bias', 'gamma', 'beta']


def _is_no_decay(param_name):
    """True when the (dotted) parameter name contains one of the no_decay fragments."""
    # named_parameters() yields full dotted names, so test by substring;
    # an equality/membership test against no_decay would never match.
    return any(fragment in param_name for fragment in no_decay)


optimizer_parameters = [
    {'params': [p for n, p in model.named_parameters() if not _is_no_decay(n)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in model.named_parameters() if _is_no_decay(n)], 'weight_decay_rate': 0.0}
    ]

optimizer = BERTAdam(optimizer_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_steps)

# Train 3

In [None]:
# Validation prompt reused by the preview cells below and inside the training loop.
val_test = """What is more gentle than a wind in summer? What is more soothing than the pretty hummer That stays one moment in an open flower, And buzzes cheerily from bower to bower? What is more tranquil than a musk-rose blowing In a green island, far from all men's knowing?"""
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=20,
    T=1,
    device=device,
))

In [None]:
# Alternative prose prompt, currently disabled; the call below therefore
# continues whatever val_test was last assigned.
# val_test="""The next day I was somewhat somnolent, of which you may be sure Miss Frankland took no notice. She retired to her own room when we went for our recreation. My sisters scolded me for not coming to them the previous night, but I told them that Miss F. had continued to move about her room for so long a time that I had fallen fast asleep, and even then had not had enough, as they might have observed how sleepy I had been all"""
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=20,
    T=1,
    device=device,
))

In [None]:
# Show the model's masked-word predictions for the current validation prompt.
display(predict_masked_words(
    val_test,
    processor,
    tokenizer,
    model,
    device=device,
    max_seq_length=args.max_seq_length,
))

In [None]:
# Step counter for the training run; the training cell below adds to it.
# Kept in its own cell so re-running the training cell does not reset it.
global_step = 0

In [None]:
# Fine-tuning loop.
# Per epoch: iterate over the micro-batches, accumulate gradients, and apply an
# optimizer update every `gradient_accumulation_steps` micro-batches. Sample
# predictions are displayed at step 10 and every 3000 steps thereafter, and a
# checkpoint is written to `save_path` at the end of each epoch.
model.train()
for _ in tqdm(range(int(args.num_train_epochs)), desc="Epoch"):
    tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0
    with tqdm(total=len(train_dataloader), desc='Iteration', mininterval=60) as prog:
        for step, batch in enumerate(train_dataloader):

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, label_weights = batch
            loss, _logits = model(input_ids, segment_ids, input_mask, label_ids, label_weights)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient matches one full-size batch.
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()    # We have accumulated enough gradients
                model.zero_grad()
                # Count optimizer updates (previously this ran once, after all epochs).
                global_step += 1
            prog.update(1)
            prog.desc = 'Iter. loss={:2.6f}'.format(tr_loss/nb_tr_examples)
            if step%3000==10:

                print('step', step, 'loss', tr_loss/nb_tr_examples)
                display(predict_masked_words(val_test, processor, tokenizer, model, device=device, max_seq_length=args.max_seq_length))
                display(predict_next_words(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, n=10, device=device))
                # NOTE(review): the prediction helpers may switch the model to
                # eval mode; re-enter training mode so dropout stays active. Confirm.
                model.train()
                # Restart the running averages for the next reporting window.
                tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0

            # TODO validation test at end of each epoch to check for overfitting

    # Save the underlying module: a DataParallel/DistributedDataParallel wrapper
    # prefixes every key with "module.", which the load cell (it loads into the
    # unwrapped model) would reject.
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), save_path)

In [None]:
# Continuation of the current validation prompt after training, without debug output.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=10,
    device=device,
    debug=False,
))

In [None]:
# Manual checkpoint. Unwrap DataParallel/DistributedDataParallel first so the
# saved keys carry no "module." prefix and can be loaded into the bare model.
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), save_path)

In [None]:
val_test = """Frank could no longer resist"""
# Continue the prompt (n=100, T=0.1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=0.1,
    device=device,
))

In [None]:
val_test = """There was no doubt the lad had seen everything"""
# Continue the prompt (n=26, T=0.5) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=26,
    T=0.5,
    device=device,
))

In [None]:
val_test = """The next night I had been asleep about a couple of hours when I was suddenly awakened by"""
# Continue the prompt (n=100, T=1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=1,
    device=device,
))

In [None]:
val_test = """His mind spun in on itself"""
# Continue the prompt (n=150, T=1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=150,
    T=1,
    device=device,
))

In [None]:
val_test = """A giant spider descended on to"""
# Continue the prompt (n=100, T=0.5) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=0.5,
    device=device,
))

In [None]:
val_test = """Madness enveloped his mind as"""
# Continue the prompt (n=100, T=0.1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=0.1,
    device=device,
))

In [None]:
val_test = """A thin film of"""
# Continue the prompt (n=100, T=1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=1,
    device=device,
))

In [None]:
val_test = """Quivering with fear, he trembled as"""
# Continue the prompt (n=100, T=1) and render the result.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=1,
    device=device,
))

In [None]:
val_test = """Madness enveloped his mind as"""
# Same prompt as the earlier "Madness" cell, with a longer run (n=300) and T=1.
display(predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=300,
    T=1,
    device=device,
))

In [None]:
val_test = """Roses are red, violets are blue, you are something but something"""
# The call is the cell's last expression, so its return value is what gets shown.
predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=100,
    T=1,
    device=device,
)

In [None]:
val_test = """The sun shone on the great home """
h = predict_next_words(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    n=12,
    T=1,
    device=device,
)
# Insert an empty paragraph element after every full stop in the returned
# object's text (presumably HTML markup, confirm) so sentences are spaced apart.
h.data = '.<p></p>'.join(h.data.split('.'))
h

# Generate here

In [None]:
# IPython introspection: shows the signature and docstring of
# improve_words_recursive. This `?` syntax only works inside IPython/Jupyter.
improve_words_recursive?

In [None]:
val_test="""Quivering with fear, he trembled, he, but lo! Fell he, Again until emptied of life, Again; Till, when Love sighed-- Love dreamed, but dreaming his; and the still stream he very much esteems I plac, or lie large in size Balyms others attributed to Dashdims-- saint of Bat. 4. 143 Bb version-Morgen 144 3 5 sing the Heathings 48 8 singbi 79 12 13 5 4 5 6 Vuivering with fear, he trembled, he, but lo! Fell he, Again until emptied of life, Again; Till, when Love sighed-- Love dreamed, but dreaming his; and the still stream he very much esteems I plac, or lie large in size Balyms others attributed to Dashdims-- saint of Bat. 4. 143 Bb version-Morgen 144 3 5 sing the Heathings"""
display(improve_words_recursive(
    val_test,
    processor,
    tokenizer,
    model,
    max_seq_length=args.max_seq_length,
    iterations=50, # How many rounds of replacement to go through
    T=1, # Temperature - higher gives more random, but less stable, output
    device=device, # use the device selected at init instead of hard-coding "cuda"
    debug=1
))

In [None]:
val_test="""What is more gentle than a wind in summer. How is more hearty than a winters soul. Why are we glum unto the Autumn. And where are we that we may skip in the spring. Unto thee I ask the question of the seasons, so you furnish me with empty lies and we may hunt over all for the final lie so we my sleep"""
# Iteratively rewrite the text; runs on the device selected at init rather
# than a hard-coded "cuda", so it also works with --no_cuda.
display(improve_words_recursive(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, iterations=50, T=1, device=device, debug=1))

In [None]:
text="""It's strange to think that we will live and die
in the eternal grey,
After years of the same things; sin and toil
and empty delight.
I will wed, then father and perish
what else can I do?
I bore as did those who came before me
without question or blame.
And go hither unto oblivion, leaving neither trace nor name
"""
# Iteratively rewrite the text; runs on the device selected at init rather
# than a hard-coded "cuda", so it also works with --no_cuda.
improve_words_recursive(text, processor, tokenizer, model, iterations=100, max_seq_length=128, n=10, T=0.9, device=device, debug=1)

In [None]:
text="""Mary had a little lamb
Little lamb, little lamb
Mary had a little lamb
Its fleece was white as snow
And everywhere that Mary went
Mary went, Mary went
Everywhere that Mary went
The lamb was sure to go

He followed her to school one day
School one day, school one day
He followed her to school one day
Which was against the rule
It made the children laugh and play
Laugh and play, laugh and play
It made the children laugh and play
To see a lamb at school"""
# Iteratively rewrite the text; runs on the device selected at init rather
# than a hard-coded "cuda", so it also works with --no_cuda.
improve_words_recursive(text, processor, tokenizer, model, ITERATIVE_MASK_FRAC=0.1, iterations=100, max_seq_length=128, n=10, T=0.3, device=device, debug=10)

In [None]:
text="""The rose is red, the violet's blue,
    The honey's sweet, and so are you.
    Thou are my love and I am thine;
    I drew thee to my Valentine:
    The lot was cast and then I drew,
    And Fortune said it shou'd be you. The rose is red, the violet's blue,
    The honey's sweet, and so are you.
    Thou are my love and I am thine;
    I drew thee to my Valentine:
    The lot was cast and then I drew,
    And Fortune said it shou'd be you."""
# Iteratively rewrite the text; runs on the device selected at init rather
# than a hard-coded "cuda", so it also works with --no_cuda.
improve_words_recursive(text, processor, tokenizer, model, ITERATIVE_MASK_FRAC=0.2, iterations=100, max_seq_length=128, T=0.7, device=device, debug=1)

In [None]:
text="""I saw the best minds of my generation destroyed by madness, starving hysterical naked,

dragging themselves through the negro streets at dawn looking for an angry fix,

angelheaded hipsters burning for the ancient heavenly connection to the starry dynamo in the machinery of night,

who poverty and tatters and hollow-eyed and high sat up smoking in the supernatural darkness of cold-water flats floating across the tops of cities contemplating jazz,

who bared their brains to Heaven under the El and saw Mohammedan angels staggering on tenement roofs illuminated,

who passed through universities with radiant eyes hallucinating Arkansas and Blake-light tragedy among the scholars of war,

who were expelled from the academies for crazy & publishing obscene odes on the windows of the skull,

who cowered in unshaven rooms in underwear, burning their money in wastebaskets and listening to the Terror through the wall,

who got busted in their pubic beards returning through Laredo with a belt of marijuana for New York,

who ate fire in paint hotels or drank turpentine in Paradise Alley, death, or purgatoried their torsos night after night

with dreams, with drugs, with waking nightmares, alcohol and cock and endless balls,

incomparable blind streets of shuddering cloud and lightning in the mind leaping towards poles of Canada & Paterson, illuminating all the motionless world of Time between,

Peyote solidities of halls, backyard green tree cemetery dawns, wine drunkenness over the rooftops, storefront boroughs of teahead joyride neon blinking traffic light, sun and moon and tree vibrations in the roaring winter dusks of Brooklyn, ashcan rantings and kind king light of mind,"""
# Iteratively rewrite the text; runs on the device selected at init rather
# than a hard-coded "cuda", so it also works with --no_cuda.
improve_words_recursive(text, processor, tokenizer, model, ITERATIVE_MASK_FRAC=0.2, iterations=100, max_seq_length=128, n=10, T=1, device=device, debug=1)

And, as, and the sky's blue, And love's sea, and I was love; And and I of love: I was bide: Love gave me to the sea: Love gave me, and I was sea, And ay it was Fortune's betead. The sky is blue, and man's sea, The sea of the star, and of the sea; And I was deep, though I sing it; And I did say of the sea. And and and and and and I said; And then and then, of the hills and the sea, and and and,

And stan'd down on the sea. But ere the the, while they slept, He left them, give'd them upon the dew, He lay'd their bodies in the dew. On the shore were the boars'slings in the dew, Roasting o'er all the hot dew; He rose, and gave them the dew. He bore them down to the sea, He lay'd them with them, On the spray of the and water. Twowain and twas, He led their heads to the shore, And bounding

reason's it I think that we will live and engage in the same works, The works of the same, with fire and toil and empty heart. I was a woman then, but now for what else can I hope. We talk nothing, all that I go under to do... and achieve at all some oar, but do know the one and, to, do have, I after know. things I and that of, of can. I another. do or also over of, spring, have of, little secure, a-and at, at work of one yours,, heart and I

quivering with fear, But he stood there, and ah! saw him, he was void of sight, O!-- and he prayed-- he prayed, and raised his hand to the holy book to see, Too little a mythismen or dream: and the psalm Of living things do to you, my saintly friend. VI. " Bless ye the holyrysm-- and the book he held him,-- He saw it, " ( quivering with fear ) He saw, he prayed, lo! but he lived, was void of life, O; he

carries me out, for one good in each. He is more hearty than the single thing. They are under the darkness of the heaven. And, am I, I may go down the roads, tiptoed off, up out of the mountains. Never you will lead me with a child and we may hold it over me or you or over-night me all night crying night of and and it, me, any, you and and-everywhere and and me in all you a you night, We. you and the me under together up still and and of the thing thing.. up all we ahead andto

it's enough to talk with spirits that see and know in the eternal things of the great, the great, that love to toil and toil. I am one of their friends and foes, and am I one? I am as a one who had it. I am the voice, that passes over to the turner, that is the voice of the life one that it of, the the of,, it is me the man it all, the you., the to living the, of of voice and the the the, to is and, the the one, the great, that love, to

# TODO

- show probability in next word logging. Record probability of each letter, then use them when displaying as html
- try other ways of doing next word. E.g. going back and redoing, doing more than 1 at once
- make the masked language generator often mask last word
- should I be doing loss on just the masked words, or all? It's hard to tell from the tensorflow repo. This is marked with a TODO or FIXME in the code
- add validation loss, since overfitting seems to be a factor
- for eval, don't pad, just have a batch size of one. That may lead to better results
- for eval, add some words, and let it fill in the blanks
  - recursively replace low-confidence words?
  - for this I may have to make it predict unmasked words?

# Search for a result in the training text 

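To check that the model is not just memorising its training data, look for the closest match to a generated line in the training text.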

In [None]:
# Read the raw training text line by line; the context manager closes the
# file handle as soon as the lines are in memory.
with open(os.path.join(args.data_dir, 'train.txt')) as train_file:
    train_txt = train_file.readlines()

In [None]:
import difflib

# Generated phrase to look up in the training text.
candidate_test = "of the morning of the morning and the evening"

In [None]:
# Score every training line against the candidate phrase (case-insensitive).
# SequenceMatcher caches its analysis of the second sequence, so the constant
# candidate is set once as seq2 and only seq1 changes per line; the a/b order,
# and therefore each ratio, is the same as building a new matcher every time.
_matcher = difflib.SequenceMatcher(None, '', candidate_test.lower())
matches = []
for cand_train in tqdm(train_txt):
    _matcher.set_seq1(cand_train.lower().strip())
    matches.append([_matcher.ratio(), cand_train])

In [None]:
import pandas as pd

# Tabulate the similarity scores, best match first.
df = (
    pd.DataFrame(matches, columns=['ratio', 'str'])
    .sort_values('ratio', ascending=False)
)
df

In [None]:
# `df` (similarity scores, sorted best-first) was built by the cell above.
# This cell used to repeat that cell verbatim; show just the closest matches.
df.head(20)