In [1]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path


from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

I0508 04:29:29.438768 139738724797312 file_utils.py:38] PyTorch version 1.5.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Notebook-wide handles: the user's home directory (as a plain string), a
# module logger, and the model-type names of every config class that has an
# LM-head model registered in transformers.
logger = logging.getLogger(__name__)
home = str(Path.home())
MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)

In [3]:
@dataclass
class ModelArguments:
    """
    Arguments selecting which model, config and tokenizer to load.

    ``model_name_or_path`` has no default and must be supplied; every other
    field defaults to ``None``. Each field carries its description in the
    ``help`` entry of its metadata.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    model_type: Optional[str] = field(
        metadata={
            # The list of valid types is rendered once, when the class is defined.
            "help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)
        },
        default=None,
    )
    config_name: Optional[str] = field(
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
        default=None,
    )


In [4]:
@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data that is fed to the model for training and evaluation.

    Every field has a default, so the class can be instantiated without
    arguments. Each field carries its description in the ``help`` entry of
    its metadata.
    """

    # Paths to the raw text files.
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    # Language-modeling objective.
    mlm: bool = field(
        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
    )
    mlm_probability: float = field(
        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
    )

    # A non-positive block_size (the default is -1) is replaced later in the
    # notebook by the tokenizer's maximum length.
    block_size: int = field(
        default=-1,
        metadata={
            # Adjacent string literals are concatenated verbatim, so each
            # sentence needs its own trailing space to keep the help readable.
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )


In [5]:
# Despite the *_DIR names, both constants point at individual raw text files
# under <home>/data/wikitext-2-raw.
TRAIN_DATA_DIR, EVAL_DATA_DIR = (
    os.path.join(home, 'data', 'wikitext-2-raw', split_file)
    for split_file in ('wiki.train.raw', 'wiki.valid.raw')
)

In [45]:
# Model selection: the pretrained Reformer checkpoint identified by name.
model_args = ModelArguments(
    model_name_or_path='google/reformer-crime-and-punishment',
    model_type='reformer',
)

# Trainer settings: train and evaluate, writing outputs under <home>/saved_models.
training_args = TrainingArguments(
    output_dir=os.path.join(home, 'saved_models'),
    do_train=True,
    do_eval=True,
    do_predict=True,
    evaluate_during_training=True,
    per_gpu_train_batch_size=4,
    per_gpu_eval_batch_size=4,
)

# Data settings: the train/validation text files, not split line by line.
data_args = DataTrainingArguments(
    train_data_file=TRAIN_DATA_DIR,
    eval_data_file=EVAL_DATA_DIR,
    line_by_line=False,
)

In [9]:
# Echo the dataclass repr to confirm the resolved file paths and the defaults in effect.
data_args

DataTrainingArguments(train_data_file='/home/u37216/data/wikitext-2-raw/wiki.train.raw', eval_data_file='/home/u37216/data/wikitext-2-raw/wiki.valid.raw', line_by_line=False, mlm=False, mlm_probability=0.15, block_size=-1, overwrite_cache=False)

In [10]:
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    """
    Build the training or evaluation dataset described by ``args``.

    Args:
        args: Data arguments; ``train_data_file`` / ``eval_data_file``,
            ``line_by_line`` and ``block_size`` are read.
        tokenizer: Tokenizer handed to the dataset class.
        evaluate: When true, read ``args.eval_data_file`` instead of
            ``args.train_data_file``.
        local_rank: Forwarded unchanged to the dataset class.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.
    """
    dataset_cls = LineByLineTextDataset if args.line_by_line else TextDataset
    source_file = args.eval_data_file if evaluate else args.train_data_file
    return dataset_cls(
        tokenizer=tokenizer,
        file_path=source_file,
        block_size=args.block_size,
        local_rank=local_rank,
    )

In [11]:
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

I0508 04:30:39.634112 139738724797312 training_args.py:127] PyTorch: setting up devices
W0508 04:30:39.652303 139738724797312 <ipython-input-11-3c50b0fce79f>:13] Process rank: -1, device: cpu, n_gpu: 0, distributed training: False, 16-bits training: False
I0508 04:30:39.654711 139738724797312 <ipython-input-11-3c50b0fce79f>:15] Training/evaluation parameters TrainingArguments(output_dir='/tmp', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=True, per_gpu_train_batch_size=8, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False)


In [13]:
 # Load the model configuration for the selected checkpoint, using the
 # cache directory from model_args (None here, i.e. the library default).
 config = AutoConfig.from_pretrained(
     model_args.model_name_or_path, cache_dir=model_args.cache_dir
 )

I0508 04:31:30.932655 139738724797312 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/google/reformer-crime-and-punishment/config.json from cache at /home/u37216/.cache/torch/transformers/74be8efac14e2092c5e89b76a2e512b088b4516b3971c6328db52a22278ab3a6.c45c6a743fd11cbe6b5e4c683093a9b5aa7441857744454e0cb36d6168d704b9
I0508 04:31:30.936940 139738724797312 configuration_utils.py:321] Model config ReformerConfig {
  "architectures": [
    "ReformerModelWithLMHead"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "attn_layers": [
    "local",
    "lsh",
    "local",
    "lsh",
    "local",
    "lsh"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": true,
  "axial_pos_embds_dim": [
    64,
    192
  ],
  "axial_pos_shape": [
    512,
    1024
  ],
  "chunk_size_feed_forward": 0,
  "chunk_size_lm_head": 0,
  "eos_token_id": 2,
  "feed_forward_size": 512,
  "hash_seed": 0,
  "hidden_act": "relu",
  "hidden_dr

In [14]:
# Load the tokenizer that belongs to the same checkpoint as the config.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

I0508 04:32:00.536121 139738724797312 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/google/reformer-crime-and-punishment/config.json from cache at /home/u37216/.cache/torch/transformers/74be8efac14e2092c5e89b76a2e512b088b4516b3971c6328db52a22278ab3a6.c45c6a743fd11cbe6b5e4c683093a9b5aa7441857744454e0cb36d6168d704b9
I0508 04:32:00.538853 139738724797312 configuration_utils.py:321] Model config ReformerConfig {
  "architectures": [
    "ReformerModelWithLMHead"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "attn_layers": [
    "local",
    "lsh",
    "local",
    "lsh",
    "local",
    "lsh"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": true,
  "axial_pos_embds_dim": [
    64,
    192
  ],
  "axial_pos_shape": [
    512,
    1024
  ],
  "chunk_size_feed_forward": 0,
  "chunk_size_lm_head": 0,
  "eos_token_id": 2,
  "feed_forward_size": 512,
  "hash_seed": 0,
  "hidden_act": "relu",
  "hidden_dr

In [34]:
# Load the pretrained LM-head model using the config loaded earlier.
# cache_dir is passed here exactly as it is for the config and tokenizer, so
# all three artifacts resolve through the same cache location; with the
# current cache_dir=None this is the library default and nothing changes.
model = AutoModelWithLMHead.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)


I0508 04:56:28.640970 139738724797312 modeling_utils.py:617] loading weights file https://cdn.huggingface.co/google/reformer-crime-and-punishment/pytorch_model.bin from cache at /home/u37216/.cache/torch/transformers/d9a22bfe046ba8e5a5024c1e865fb6427fa65d89dea0ab5ce04203f62b547514.13b78d45da98951635d331bd43a4359c5b4030b5a4d597f9a2a3682600457b46
I0508 04:56:28.742543 139738724797312 modeling_utils.py:708] Weights of ReformerModelWithLMHead not initialized from pretrained model: ['reformer.encoder.layers.0.attention.self_attention.mask_value_float16', 'reformer.encoder.layers.0.attention.self_attention.mask_value_float32', 'reformer.encoder.layers.1.attention.self_attention.self_mask_value_float16', 'reformer.encoder.layers.1.attention.self_attention.self_mask_value_float32', 'reformer.encoder.layers.1.attention.self_attention.mask_value_float16', 'reformer.encoder.layers.1.attention.self_attention.mask_value_float32', 'reformer.encoder.layers.2.attention.self_attention.mask_value_float1

In [18]:
# Match the embedding matrix to the tokenizer's vocabulary size. The call is
# left as the last expression so the resulting embedding module is displayed.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(320, 256)

In [20]:
# A non-positive block_size means "use the tokenizer's maximum length";
# a positive one is capped at that maximum.
data_args.block_size = (
    min(data_args.block_size, tokenizer.max_len)
    if data_args.block_size > 0
    else tokenizer.max_len
)

In [21]:
# Build each dataset only when the corresponding phase is enabled;
# a disabled phase leaves its dataset as None.
train_dataset = None
if training_args.do_train:
    train_dataset = get_dataset(
        data_args,
        tokenizer=tokenizer,
        local_rank=training_args.local_rank,
    )

eval_dataset = None
if training_args.do_eval:
    eval_dataset = get_dataset(
        data_args,
        tokenizer=tokenizer,
        local_rank=training_args.local_rank,
        evaluate=True,
    )

I0508 04:34:17.513707 139738724797312 language_modeling.py:43] Loading features from cached file /home/u37216/data/wikitext-2-raw/cached_lm_ReformerTokenizer_524288_wiki.train.raw [took 0.199 s]
I0508 04:34:17.545947 139738724797312 language_modeling.py:43] Loading features from cached file /home/u37216/data/wikitext-2-raw/cached_lm_ReformerTokenizer_524288_wiki.valid.raw [took 0.029 s]


In [24]:
# Collator that batches examples for language modeling; whether masking is
# applied, and at what rate, comes from data_args.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=data_args.mlm,
    mlm_probability=data_args.mlm_probability,
)

In [22]:
# Sanity check: display the first training example (shown as a tensor; presumably token ids).
train_dataset[0]

tensor([258,   0, 258,  ..., 210, 262,   6])

In [35]:
# Wire the model, arguments, collator and both datasets into a Trainer.
# prediction_loss_only=True is passed through unchanged.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

In [26]:
# model_path is forwarded to trainer.train() as a checkpoint location to
# resume from. A Hub identifier such as 'google/reformer-crime-and-punishment'
# is not a local checkpoint directory, so only pass the value through when it
# names an existing directory (as the upstream run_language_modeling.py
# example does); otherwise pass None.
model_path = (
    model_args.model_name_or_path
    if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
    else None
)

In [36]:
# Run training. The call is the cell's last expression, so the returned
# TrainOutput is displayed.
trainer.train(model_path=model_path)

E0508 04:56:36.459468 139738724797312 jupyter.py:106] Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
I0508 04:56:37.079992 139738724797312 run_manager.py:927] system metrics and metadata threads started
I0508 04:56:37.083288 139738724797312 run_manager.py:936] checking resume status, waiting at most 10 seconds
I0508 04:56:37.203402 139738724797312 run_manager.py:954] resuming run from id: UnVuOnYxOjJndGR0amZhOmZsdWVuY2UtZXhhbXBsZXM6Y2Fsdmlu
I0508 04:56:37.249498 139738724797312 run_manager.py:966] upserting run before process can begin, waiting at most 10 seconds
I0508 04:56:37.382089 139734131074816 run_manager.py:1051] saving patches
I0508 04:56:37.620789 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/config.yaml
I0508 04:56:40.993094 139734131074816 run_manager.py:1055] saving pip packages
I0508 04:56:41.004132 139734131074816 run_manager.py:1057] in

A Jupyter Widget

A Jupyter Widget

I0508 04:56:41.243750 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-summary.json
I0508 04:56:41.247102 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 04:56:41.249053 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 04:56:41.251227 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-history.jsonl
I0508 04:56:41.253332 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/diff.patch
I0508 04:56:41.254809 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/diff.patch
I0508 04:56:41.256131 139734729029376 run_manager.py:




A Jupyter Widget

I0508 05:19:15.118388 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:19:29.276127 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 05:19:31.279046 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:19:47.315123 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:19:59.348497 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 05:20:03.353379 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:20:19.499347 13973472




A Jupyter Widget

I0508 05:42:09.411781 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:42:23.719567 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 05:42:25.722481 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:42:41.774817 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:42:54.815770 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 05:42:57.936394 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metadata.json
I0508 05:43:14.347807 13973472





TrainOutput(global_step=9, training_loss=6.104192998674181)

I0508 06:04:21.543061 139738724797312 run_manager.py:1071] shutting down system stats and metadata service
I0508 06:04:21.547311 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/config.yaml
I0508 06:04:21.698238 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/code/_session_history.ipynb
I0508 06:04:21.700689 139734729029376 run_manager.py:680] file/dir created: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/code
I0508 06:04:22.553502 139738724797312 run_manager.py:1085] stopping streaming files and file change observer
I0508 06:04:22.554096 139734729029376 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-events.jsonl
I0508 06:04:22.559178 139738724797312 run_manager.py:691] file/dir modified: /home/u37216/fluence/examples/wandb/run-20200508_115635-2gtdtjfa/wandb-metada

In [38]:
# Evaluate with the Trainer and keep the returned metrics; the "loss" entry is read below.
eval_output = trainer.evaluate()

I0508 06:55:16.517558 139738724797312 trainer.py:609] ***** Running Evaluation *****
I0508 06:55:16.519285 139738724797312 trainer.py:610]   Num examples = 1
I0508 06:55:16.520593 139738724797312 trainer.py:611]   Batch size = 4


A Jupyter Widget




In [39]:
# Perplexity is the exponential of the evaluation loss.
result = {"perplexity": math.exp(eval_output["loss"])}
perplexity = result["perplexity"]

In [47]:
# Persist the trained model to the output directory configured in training_args.
trainer.save_model(output_dir=training_args.output_dir)

I0508 07:02:09.048841 139738724797312 trainer.py:512] Saving model checkpoint to /home/u37216/saved_models
I0508 07:02:09.054607 139738724797312 configuration_utils.py:144] Configuration saved in /home/u37216/saved_models/config.json
I0508 07:02:09.215681 139738724797312 modeling_utils.py:450] Model weights saved in /home/u37216/saved_models/pytorch_model.bin


{'perplexity': 327.21500614323526}