In [1]:
#%pip install --user lightning-bolts -q

In [2]:
import argparse
import time
import os
import gc
import math
import multiprocessing
import io
import logging 
import itertools
import shutil
import pysnooper
import warnings
import glob
import pendulum
import random
import json
import sys
import wandb
import matplotlib
import matplotlib.pyplot as plt
import scikitplot as skplt
import seaborn as sns
import numpy as np
import pandas as pd

from icecream import ic
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from pathlib import Path
from IPython.core.interactiveshell import InteractiveShell

import torch
import torch.nn.functional as F
import transformers
import torchmetrics
from torch import nn
from torch import cuda


InteractiveShell.ast_node_interactivity = "all"

matplotlib.use('Agg')
warnings.filterwarnings("ignore")

seed = 9527
np.set_printoptions(suppress=True)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

device = 'cuda' if cuda.is_available() else 'cpu'
print('Torch version: ', torch.__version__)
print('Device available:', torch.cuda.is_available())
print('Device name:', torch.cuda.get_device_name(0))
torch.set_printoptions(precision=8)


Torch version:  1.8.0
Device available: True
Device name: Tesla P100-PCIE-16GB


In [3]:
'''
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
torch.distributed.init_process_group(backend='nccl')

local_rank = -1
if local_rank not in [-1, 0]:
    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
'''    

'\nfrom torch.utils.data.distributed import DistributedSampler\nfrom torch.nn.parallel import DistributedDataParallel as DDP\ntorch.distributed.init_process_group(backend=\'nccl\')\n\nlocal_rank = -1\nif local_rank not in [-1, 0]:\n    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab\n    \ndef setup(rank, world_size):\n    os.environ[\'MASTER_ADDR\'] = \'localhost\'\n    os.environ[\'MASTER_PORT\'] = \'12355\'\n\n    # initialize the process group\n    dist.init_process_group("gloo", rank=rank, world_size=world_size)\n'

In [4]:
seed_value = 9527
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(seed_value)

In [5]:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_file', default=None, help="Input raw text file (or comma-separated list of files).")
parser.add_argument('--output_file', default=None, help="Output TF example file (or comma-separated list of files).")
parser.add_argument('--vocab_file', default=None, help="The vocabulary file that the ALBERT model was trained on.")
parser.add_argument('--spm_model_file', default=None, help="The model file for sentence piece tokenization.")
parser.add_argument('--input_file_mode', default="r",  help="The data format of the input file.")
parser.add_argument('--do_lower_case', default=True, help="Whether to lower case the input text. Should be True for uncased models and False for cased models.")
parser.add_argument('--do_whole_word_mask', default=True, help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument('--do_permutation', default=False, help="Whether to do the permutation training.")
parser.add_argument('--favor_shorter_ngram', default=True, help="Whether to set higher probabilities for sampling shorter ngrams.")
parser.add_argument('--random_next_sentence', default=False, help="Whether to use the sentence that's right before the current sentence "
                    "as the negative sample for next sentence prection, rather than using "
                    "sentences from other random documents.")
parser.add_argument('--max_seq_length', default=512, help="Maximum sequence length.")
parser.add_argument('--ngram', default=3, help="Maximum number of ngrams to mask.")
parser.add_argument('--max_predictions_per_seq', default=20, help="Maximum number of masked LM predictions per sequence.")
parser.add_argument('--random_seed', default=12345, help="Random seed for data generation.")
parser.add_argument('--dupe_factor', default=5, help="Number of times to duplicate the input data (with different masks).")
parser.add_argument('--masked_lm_prob', default=0.15, help="Masked LM probability.")
parser.add_argument('--short_seq_prob', default=0.1, help="Probability of creating sequences which are shorter than the maximum length.")


opt = parser.parse_args(args=[
    '--input_file', '1995_income',  
    '--output_file', 'MLP',
    '--spm_model_file', './wiki-ja_albert.model',
    '--vocab_file', './wiki-ja_albert.vocab',
    '--do_whole_word_mask', False,
    '--do_permutation', False,
    '--favor_shorter_ngram', False,
    '--random_next_sentenc', False
])


_StoreAction(option_strings=['--input_file'], dest='input_file', nargs=None, const=None, default=None, type=None, choices=None, help='Input raw text file (or comma-separated list of files).', metavar=None)

_StoreAction(option_strings=['--output_file'], dest='output_file', nargs=None, const=None, default=None, type=None, choices=None, help='Output TF example file (or comma-separated list of files).', metavar=None)

_StoreAction(option_strings=['--vocab_file'], dest='vocab_file', nargs=None, const=None, default=None, type=None, choices=None, help='The vocabulary file that the ALBERT model was trained on.', metavar=None)

_StoreAction(option_strings=['--spm_model_file'], dest='spm_model_file', nargs=None, const=None, default=None, type=None, choices=None, help='The model file for sentence piece tokenization.', metavar=None)

_StoreAction(option_strings=['--input_file_mode'], dest='input_file_mode', nargs=None, const=None, default='r', type=None, choices=None, help='The data format of the input file.', metavar=None)

_StoreAction(option_strings=['--do_lower_case'], dest='do_lower_case', nargs=None, const=None, default=True, type=None, choices=None, help='Whether to lower case the input text. Should be True for uncased models and False for cased models.', metavar=None)

_StoreAction(option_strings=['--do_whole_word_mask'], dest='do_whole_word_mask', nargs=None, const=None, default=True, type=None, choices=None, help='Whether to use whole word masking rather than per-WordPiece masking.', metavar=None)

_StoreAction(option_strings=['--do_permutation'], dest='do_permutation', nargs=None, const=None, default=False, type=None, choices=None, help='Whether to do the permutation training.', metavar=None)

_StoreAction(option_strings=['--favor_shorter_ngram'], dest='favor_shorter_ngram', nargs=None, const=None, default=True, type=None, choices=None, help='Whether to set higher probabilities for sampling shorter ngrams.', metavar=None)

_StoreAction(option_strings=['--random_next_sentence'], dest='random_next_sentence', nargs=None, const=None, default=False, type=None, choices=None, help="Whether to use the sentence that's right before the current sentence as the negative sample for next sentence prection, rather than using sentences from other random documents.", metavar=None)

_StoreAction(option_strings=['--max_seq_length'], dest='max_seq_length', nargs=None, const=None, default=512, type=None, choices=None, help='Maximum sequence length.', metavar=None)

_StoreAction(option_strings=['--ngram'], dest='ngram', nargs=None, const=None, default=3, type=None, choices=None, help='Maximum number of ngrams to mask.', metavar=None)

_StoreAction(option_strings=['--max_predictions_per_seq'], dest='max_predictions_per_seq', nargs=None, const=None, default=20, type=None, choices=None, help='Maximum number of masked LM predictions per sequence.', metavar=None)

_StoreAction(option_strings=['--random_seed'], dest='random_seed', nargs=None, const=None, default=12345, type=None, choices=None, help='Random seed for data generation.', metavar=None)

_StoreAction(option_strings=['--dupe_factor'], dest='dupe_factor', nargs=None, const=None, default=5, type=None, choices=None, help='Number of times to duplicate the input data (with different masks).', metavar=None)

_StoreAction(option_strings=['--masked_lm_prob'], dest='masked_lm_prob', nargs=None, const=None, default=0.15, type=None, choices=None, help='Masked LM probability.', metavar=None)

_StoreAction(option_strings=['--short_seq_prob'], dest='short_seq_prob', nargs=None, const=None, default=0.1, type=None, choices=None, help='Probability of creating sequences which are shorter than the maximum length.', metavar=None)

In [6]:
seed = 202105

# main'
main_path = Path('/home/jupyter/gogolook')
main_cached_path = Path('/home/jupyter/gogolook/data')

# general setting
main_data_path = main_path / 'data' / 'jp_data' 
main_model_path = main_path / 'models'
cache_data_path = main_cached_path / 'cache_data_dir'
cache_models_path = main_cached_path / 'cache_models_fir'

# models
albert_zh_path = main_model_path / 'albert_zh'

# data
regex_file_format = '*.json'
data_tag = 'pretraining_data'
valid_data_tag = 'pretraining_data'
test_data_tag = 'pretraining_data'

experiment_train_data_path = main_data_path / f'train_{data_tag}'    
experiment_valid_data_path = main_data_path / f'valid_{valid_data_tag}'
experiment_test_data_path = main_data_path / f'test_{test_data_tag}'

training_data_path = experiment_train_data_path
validation_data_path = experiment_valid_data_path
testing_data_path = experiment_test_data_path


In [7]:
project = 'jp-pretrain-model'
project_shortname = 'jp-sms'
group_tag = 'experiment' # 1. functional 2. experiment 3. staging 4. production
job_type = 'baseline' # 1. baseline 2. optimize 3. hyper-tuning
addition_tag = [data_tag, 'pytorch'] # exponential_decay
method_tag = 'pretrain' # pretrain / finetune / pretrain_finetune
time_tag = pendulum.now(tz='Asia/Taipei').strftime('%Y%m%d%H%M%S')
run_id = '{}_{}_{}'.format(project, job_type, time_tag)
print('Run id is {}'.format(run_id))



Run id is jp-pretrain-model_baseline_20210902155112


In [8]:
import pyarrow as pa
from datasets import load_dataset
from datasets import total_allocated_bytes
from dataclasses import dataclass, field
from typing import Dict, Optional, Union, List
from transformers import (
    BertTokenizer, AlbertForPreTraining, AlbertModel, AlbertConfig, PreTrainedTokenizer)

from transformers import AutoModel, AutoTokenizer, BertJapaneseTokenizer
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

from torch.cuda.amp import GradScaler, autocast
from torch import optim
from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR, CyclicLR
from transformers import get_polynomial_decay_schedule_with_warmup
        
#import datasets
#datasets.logging.set_verbosity_info()
#datasets.logging.get_verbosity()


In [9]:
mecab_tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese", word_tokenizer_type="mecab", cache_dir=cache_models_path)
# Input Japanese Text
line = "アンパサンド (&、英語名：) とは並立助詞「…と…」を意味する記号である。ラテン語の の合字で、Trebuchet MSフォントでは、と表示され \"et\" の合字であることが容易にわかる。"
mecab_inputs = mecab_tokenizer(line, return_tensors="pt")
print(mecab_tokenizer.decode(mecab_inputs['input_ids'][0]))
corpus_size = len(mecab_tokenizer)

[CLS] アンパサンド (&、 英語 名 :) と は 並立 助詞 「... と...」 を 意味 する 記号 で ある 。 ラテン語 の の 合 字 で 、 Trebuchet MS フォント で は 、 と 表示 さ れ " et " の 合 字 で ある こと が 容易 に わかる 。 [SEP]


In [10]:
mecab_tokenizer.max_model_input_sizes

{'cl-tohoku/bert-base-japanese': 512,
 'cl-tohoku/bert-base-japanese-whole-word-masking': 512,
 'cl-tohoku/bert-base-japanese-char': 512,
 'cl-tohoku/bert-base-japanese-char-whole-word-masking': 512}

# Model config

In [11]:
optimizer_config = {
    "SGD": {
        "init_learning_rate": 1e-1, 
        "pre_finetune_learning_rate": 1e-3
    },
    "Adam": {
        "init_learning_rate": 1e-3,
        "pre_finetune_learning_rate": 1e-5
    }, 
    "RAdam": {
        "init_learning_rate": 1e-3,
        "pre_finetune_learning_rate": 2e-5
    }     
}

model_config = {
    "epochs": 3,
    "initial_epochs": 20,
    "batch_size": 128,
    "max_tokens_length": 512,
    "threshold": 0.5,
    "optimizer_method": "Adam",
    "init_learning_rate": optimizer_config['Adam']['init_learning_rate'],
    "pre_finetune_learning_rate": optimizer_config['Adam']['pre_finetune_learning_rate'],
    "end_learning_rate": 1e-5,
    "lsm": 0.0,
    "hidden_dropout_prob": 0.1,
    "use_warmup": True,
    "use_multi_gpus": True
}



# Data loader pipeline

In [12]:
torch.multiprocessing.set_start_method('spawn')
#datasets.config.IN_MEMORY_MAX_SIZE
@dataclass(eq=False)
class GenerateDatasets: 
    #files_list: str = field(
    #    default=None, metadata={"help": "The files list of data path"}
    #)
    data_path: str = field(
        default=None, metadata={"help": "The prefix path of files location"}
    )
    regex_file_format: str = field(
        default='*.parquet', metadata={"help": "The files format."}
    )
    batch_size: int = field(
        default=128, metadata={"help": "Batch size"}
    )
    is_training: bool = field(
        default=True, metadata={"help": "Is use training mode to create data pipeline"}
    )
    device: str = field(
        default='cpu', metadata={"help": "Which device to use [cpu, cuda]"}
    )
    cache_data_path: str = field(
        default=None, metadata={"help": "The path to cache data."}
    )
        
    def __post_init__(self):
        self.get_files_list = glob.glob(os.path.join(str(self.data_path), self.regex_file_format))
        #self.get_files_list = '/home/jupyter/gogolook/data/jp_data/valid_pretraining_data/valid_all-maxseq512_BG.parquet'
        self.encoding_columns = ['input_ids', 'token_type_ids', 'attention_mask']
        self.target_columns = ['masked_lm_labels', 'next_sentence_labels']
        
    def __call__(self, **kwargs):
        # data 已經存在 device (cuda) 裡，所以再用 pin_memory 會出現 error
        # RuntimeError: cannot pin 'torch.cuda.LongTensor' only dense CPU tensors can be pinned        
        dataset = load_dataset('parquet', data_files=self.get_files_list, cache_dir=self.cache_data_path, split='train')
        dataset.set_format(type='torch', columns=self.encoding_columns + self.target_columns) # , device=self.device
        #dataset = dataset.rename_column(self.target_column, 'labels')
        if self.is_training:
            drop_last = True
        else: 
            drop_last = False
            
        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            pin_memory=True,
            shuffle=True,
            drop_last=drop_last,
            num_workers=multiprocessing.cpu_count())
        return dataloader
        

get_train_dataset = GenerateDatasets(
    data_path=training_data_path,
    batch_size=model_config['batch_size'],
    is_training=True,
    device=device,
    cache_data_path=cache_data_path)

get_valid_dataset = GenerateDatasets(
    data_path=training_data_path,
    batch_size=model_config['batch_size'],
    is_training=False,
    device=device,
    cache_data_path=cache_data_path)

train_dataloader = get_train_dataset()
#val_dataloader = get_valid_dataset()





In [13]:
get_train_dataset.get_files_list
#get_valid_dataset.get_files_list

['/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AC.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AU.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_BA.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AJ.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AN.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AR.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AQ.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_BD.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_AA.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data/train_all-maxseq512_BF.parquet',
 '/home/jupyter/gogolook/data/jp_data/train_pretraining_data

In [14]:
next(iter(train_dataloader))

{'input_ids': tensor([[    2,  7482, 28511,  ...,    31,     4,     3],
         [    2,   265,     5,  ...,     0,     0,     0],
         [    2,  5574,  1941,  ...,    10,     8,     3],
         ...,
         [    2,   106,     6,  ...,  5966,     9,     3],
         [    2, 11752,  5609,  ...,     0,     0,     0],
         [    2,  2413,   225,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'masked_lm_labels': tensor([[-100, -100, -100,  ..., -100,    8, -100],
         [   0, -100, -100,  ..., -100, -100, -100],
   

In [15]:
if model_config['use_warmup']:
    model_config['warmup_steps'] = int(len(train_dataloader) * model_config['epochs'] * 0.1)
    model_config['decay_steps'] = len(train_dataloader) * model_config['epochs']
else:
    model_config['warmup_steps'] = None 
    model_config['decay_steps'] = None
model_config['training_steps'] = len(train_dataloader)


torch.cuda.empty_cache()
albert_config = AlbertConfig.from_json_file(albert_zh_path / 'albert_config' / 'albert_config_tiny.json')
pretrained_model_name_or_path = 'voidful/albert_chinese_tiny'
albert_pretrain_model = AlbertForPreTraining.from_pretrained(
    pretrained_model_name_or_path, 
    config=albert_config,             
    cache_dir=cache_models_path)
albert_pretrain_model.resize_token_embeddings(corpus_size)


Some weights of AlbertForPreTraining were not initialized from the model checkpoint at voidful/albert_chinese_tiny and are newly initialized: ['sop_classifier.classifier.weight', 'sop_classifier.classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32000, 128)

In [16]:
albert_config

AlbertConfig {
  "_name_or_path": "voidful/albert_chinese_tiny",
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "directionality": "bidi",
  "embedding_size": 128,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 312,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 1248,
  "layer_norm_eps": 1e-12,
  "ln_type": "postln",
  "max_position_embeddings": 512,
  "model_type": "albert",
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0",
  "type_vocab_size": 2,
  "vocab_size": 32000
}

In [17]:
if model_config["use_multi_gpus"]:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    device_ids = [idx for idx in range(torch.cuda.device_count())]
    albert_pretrain_model = nn.DataParallel(albert_pretrain_model, device_ids=device_ids)
albert_pretrain_model.to(device)   


Let's use 4 GPUs!


DataParallel(
  (module): AlbertForPreTraining(
    (albert): AlbertModel(
      (embeddings): AlbertEmbeddings(
        (word_embeddings): Embedding(32000, 128)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (encoder): AlbertTransformer(
        (embedding_hidden_mapping_in): Linear(in_features=128, out_features=312, bias=True)
        (albert_layer_groups): ModuleList(
          (0): AlbertLayerGroup(
            (albert_layers): ModuleList(
              (0): AlbertLayer(
                (full_layer_layer_norm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
                (attention): AlbertAttention(
                  (query): Linear(in_features=312, out_features=312, bias=True)
                  (key): Linear(in_features=312, out_features=312, bias=True)
                  (value): L

# Define Optimizer

In [18]:
model_params = list(albert_pretrain_model.named_parameters())
optimizer_grounded_parameters_by_name = [
    {'params': [n for n, p in model_params if not any(nd in n for nd in ['bias', 'gamma', 'beta'])], 
     'weight_decay_rate': 1e-2  },
    {'params': [n for n, p in model_params if any(nd in n for nd in ['bias', 'gamma', 'beta'])], 
     'weight_decay_rate': 0.0 }
]

optimizer_grounded_parameters_by_name

[{'params': ['module.albert.embeddings.word_embeddings.weight',
   'module.albert.embeddings.position_embeddings.weight',
   'module.albert.embeddings.token_type_embeddings.weight',
   'module.albert.embeddings.LayerNorm.weight',
   'module.albert.encoder.embedding_hidden_mapping_in.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight',
   'module.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight',
   '

In [19]:
model_params = list(albert_pretrain_model.named_parameters())

optimizer_grounded_parameters = [
    {'params': [p for n, p in model_params if not any(nd in n for nd in ['bias', 'gamma', 'beta'])], 
     'weight_decay_rate': 1e-2  },
    {'params': [p for n, p in model_params if any(nd in n for nd in ['bias', 'gamma', 'beta'])], 
     'weight_decay_rate': 0.0 }
]

In [20]:
from torch.optim.lr_scheduler import _LRScheduler

class PolynomialDecay(_LRScheduler):
    def __init__(self, optimizer, decay_steps, end_learning_rate=0.0001, power=0.5, cycle=False, last_epoch=-1, verbose=False):
        if decay_steps <= 1.:
            raise ValueError('max_decay_steps should be greater than 1.')            
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle
        super(PolynomialDecay, self).__init__(optimizer, last_epoch, verbose)
    
    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.")
        #dtype = initial_learning_rate.dtype
        #end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
        #power = math_ops.cast(self.power, dtype)
        #global_step_recomp = math_ops.cast(step, dtype)
        #decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
        global_step_recomp = self.last_epoch
        decay_steps_recomp = self.decay_steps
        
        if self.cycle:
            if global_step_recomp == 0:
                multiplier = 1.0 
            else:
                multiplier = math.ceil(global_step_recomp / self.decay_steps)
            decay_steps_recomp = decay_steps_recomp * multiplier
        else:
            global_step_recomp = min(global_step_recomp, decay_steps_recomp)
            
        p = global_step_recomp / decay_steps_recomp
        ic(self.last_epoch, optimizer.param_groups[0]['lr'], p)
        return [((group['lr'] - self.end_learning_rate) * math.pow(1 - p, self.power) + self.end_learning_rate) for group in self.optimizer.param_groups]
    
    def _get_closed_form_lr(self):
        return [(base_lr - self.end_learning_rate) * math.pow(1 - p, self.power) + self.end_learning_rate for base_lr in self.base_lrs]


In [21]:
optimizer = torch.optim.Adam(
    params=optimizer_grounded_parameters,
    lr=model_config["init_learning_rate"],
    betas=(0.9, 0.98),
    weight_decay=0.0,
    eps=1e-6)

#optimizer = optim.SGD(sms_model.parameters(), lr=model_config['init_learning_rate'], weight_decay=1e-4)
#optimizer = optim.SGD(filter(lambda p: p.requires_grad, sms_model.parameters()), lr=model_config['init_learning_rate'], weight_decay=1e-4)

#scheduler = CyclicLR(
#    optimizer, 
#    base_lr=1e-5,
#    max_lr=model_config['init_learning_rate'],
#    step_size_up=model_config['training_steps'] * 1,
#    mode='triangular2',
#    scale_mode='cycle',
#    cycle_momentum=False
#)


#if model_config["use_multi_gpus"]:
#optimizer = nn.DataParallel(optimizer, device_ids=device_ids)

 
scheduler = LinearWarmupCosineAnnealingLR(
    optimizer, 
    warmup_epochs=model_config['warmup_steps'], 
    max_epochs=model_config['training_steps'] * model_config['epochs'], 
    eta_min=model_config["end_learning_rate"])



In [22]:
# Define Mertice
from torchmetrics import MetricCollection
metric_collection = MetricCollection([
    torchmetrics.Accuracy(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.Precision(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.Recall(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.F1(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device)
], prefix='Train_')

val_metric_collection = MetricCollection([
    torchmetrics.Accuracy(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.Precision(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.Recall(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
    torchmetrics.F1(num_classes=2, average='macro', multiclass=True, dist_sync_on_step=True, mdmc_average='global').to(device),
], prefix='Val_')


# Training model

### Init wandb

In [23]:
if model_config['use_warmup']:
    model_config['warmup_steps'] = int(len(train_dataloader) * model_config['epochs'] * 0.1)
    model_config['decay_steps'] = len(train_dataloader) * model_config['epochs']
else:
    model_config['warmup_steps'] = None 
    model_config['decay_steps'] = None
model_config['training_steps'] = len(train_dataloader)

wandb.init(
    project=project,
    group=group_tag,
    job_type=job_type,
    name=run_id,
    notes=method_tag,
    tags=addition_tag,
    sync_tensorboard=False,
    config={**model_config},
    reinit=True    
)

wandb_config = wandb.config


[34m[1mwandb[0m: Currently logged in as: [33myuyuliao20[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


### Simple test

In [24]:
'''
torch.cuda.empty_cache()
prefix = 'train'
for epoch in tqdm(range(10)): # model_config['epochs']
    start_time = time.time()    
    train_batch_loss = 0
    val_batch_loss = 0    
    
    # Training Step
    albert_pretrain_model = albert_pretrain_model.train()
    for step, train_batch in tqdm(enumerate(train_dataloader), 
                                  dynamic_ncols=False, 
                                  bar_format="{n_fmt}/{total_fmt}{bar} ETA: {remaining}s - {desc}", 
                                  total=len(train_dataloader),
                                  leave=True, 
                                  unit='steps'):        
        input_ids = train_batch['input_ids'].to(device)
        attention_mask = train_batch['attention_mask'].to(device)
        token_type_ids = train_batch['token_type_ids'].to(device)

        mlm_labels = train_batch['masked_lm_labels'].to(device)
        sop_labels = train_batch['next_sentence_labels'].to(device)
        
        outputs = albert_pretrain_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=mlm_labels,
            sentence_order_label=sop_labels
        )
                    
        loss = outputs.loss.mean()
        perplexity  = torch.exp(loss)
        
        optimizer.zero_grad()
        loss.backward()
        if model_config["use_multi_gpus"]:
            optimizer.module.step()
        else:
            optimizer.step()
            
        wandb.log({
            "loss": loss,
            "perplexity": perplexity,            
        }, step=step)
        
'''

'\ntorch.cuda.empty_cache()\nprefix = \'train\'\nfor epoch in tqdm(range(10)): # model_config[\'epochs\']\n    start_time = time.time()    \n    train_batch_loss = 0\n    val_batch_loss = 0    \n    \n    # Training Step\n    albert_pretrain_model = albert_pretrain_model.train()\n    for step, train_batch in tqdm(enumerate(train_dataloader), \n                                  dynamic_ncols=False, \n                                  bar_format="{n_fmt}/{total_fmt}{bar} ETA: {remaining}s - {desc}", \n                                  total=len(train_dataloader),\n                                  leave=True, \n                                  unit=\'steps\'):        \n        input_ids = train_batch[\'input_ids\'].to(device)\n        attention_mask = train_batch[\'attention_mask\'].to(device)\n        token_type_ids = train_batch[\'token_type_ids\'].to(device)\n\n        mlm_labels = train_batch[\'masked_lm_labels\'].to(device)\n        sop_labels = train_batch[\'next_sentence_labels\'

### Define training stepsm

In [25]:
def training_step(model, input_ids, attention_mask, token_type_ids, mlm_labels, sop_labels, scaler, use_multi_gpus=False):
    with autocast():
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=mlm_labels,
            sentence_order_label=sop_labels
        )
        assert outputs.prediction_logits.dtype is torch.float16
        
        loss = outputs.loss.mean()
        assert loss.dtype is torch.float32
    # Backward pass
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    scaler.scale(loss).backward()
    #if use_multi_gpus:
    #    scaler.step(optimizer.module)
    #else:
    scaler.step(optimizer)
    scaler.update()
    #torch.nn.utils.clip_grad_norm_(optimizer_grounded_parameters, max_norm=0.5)
    #if epoch > swa_start:
    #    swa_model.update_parameters(model)
    #    swa_scheduler.step()
    #else:
    #    scheduler.step()
    return loss

@torch.no_grad()
def validataion_step(model, input_ids, attention_mask, token_type_ids, mlm_labels, sop_labels):    
    with autocast():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=mlm_labels,
            sentence_order_label=sop_labels
        )
        loss = outputs.loss.mean()
    return loss

@torch.no_grad()
def testing_step(model, dataset_inputs):
    pass


### Training

In [None]:
print('[RUN ID]: {}'.format(run_id))
torch.cuda.empty_cache()
use_epoch_tracking = False
use_step_tracking = True
#wandb.watch(sms_model, log="all", log_freq=1000)        
def show_logs(loss, step, is_epoch=False, prefix='Train', **kwargs):
    loss = float(loss)
    if is_epoch:
        wandb.log({"epoch": step, f"{prefix}_loss": loss}, step=step)
    else:
        wandb.log({f"{prefix}_step_loss": loss}, step=step)
        #print(f"{prefix} loss after " + str(example_ct).zfill(5) + f" examples: {loss:.3f}")
    if "perplexity" in kwargs.keys():
        wandb.log({f"{prefix}_perplexity": kwargs["perplexity"]}, step=step)

# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in tqdm(range(model_config['epochs'])): # model_config['epochs']
    start_time = time.time()    
    train_batch_loss = 0
    valid_batch_loss = 0  
    
    train_perplexity = 0
    valid_perplexity = 0
    
    # Training Step
    albert_pretrain_model = albert_pretrain_model.train()
    for step, train_batch in tqdm(enumerate(train_dataloader), 
                                  dynamic_ncols=False, 
                                  bar_format="{n_fmt}/{total_fmt}{bar} ETA: {remaining}s - {desc}", 
                                  total=len(train_dataloader),
                                  leave=True, 
                                  unit='steps'):        
        input_ids = train_batch['input_ids'].to(device)
        attention_mask = train_batch['attention_mask'].to(device)
        token_type_ids = train_batch['token_type_ids'].to(device)        
        mlm_labels = train_batch['masked_lm_labels'].to(device)
        sop_labels = train_batch['next_sentence_labels'].to(device)
        
        train_loss = training_step(
            model=albert_pretrain_model, 
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids, 
            mlm_labels=mlm_labels, 
            sop_labels=sop_labels,
            scaler=scaler,
            use_multi_gpus=model_config["use_multi_gpus"]
        )
        scheduler.step()
        train_batch_loss += train_loss.item()
        train_perplexity += torch.exp(train_loss)
        
        #if model_config["use_multi_gpus"]:
        #    last_lr = optimizer.module.param_groups[0]['lr']
        #else:
        #last_lr = optimizer.param_groups[0]['lr']
        last_lr = scheduler.optimizer.param_groups[0]["lr"]
          
        if use_step_tracking:
            record_step = (step + 1) * (epoch + 1)
            wandb.log({'learning_rate': last_lr}, step=record_step)
            show_logs(
                train_batch_loss / record_step, 
                record_step, 
                perplexity=train_perplexity.item() / record_step)
    
    if use_epoch_tracking:
        train_epoch_loss = train_batch_loss / step
        wandb.log({'learning_rate': last_lr}, step=epoch)
        show_log(train_epoch_loss, epoch, is_epoch=True)  
        #train_metric_records = metric_collection.compute()
        #wandb.log(train_metric_records, step=epoch)
    
    # Validation Step
    albert_pretrain_model = albert_pretrain_model.eval()
    for step, valid_batch in tqdm(enumerate(val_dataloader), 
                                dynamic_ncols=False, 
                                bar_format="{n_fmt}/{total_fmt}{bar} ETA: {remaining}s - {desc}", 
                                total=len(val_dataloader),
                                leave=True, 
                                unit='steps'):            
        input_ids = valid_batch['input_ids'].to(device)
        attention_mask = valid_batch['attention_mask'].to(device)
        token_type_ids = valid_batch['token_type_ids'].to(device)
        mlm_labels = valid_batch['masked_lm_labels'].to(device)
        sop_labels = valid_batch['next_sentence_labels'].to(device)
            
        valid_loss = validataion_step(
            model=albert_pretrain_model, 
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids, 
            mlm_labels=mlm_labels,
            sop_labels=sop_labels
        )
        valid_batch_loss += valid_loss.item()
        valid_perplexity += torch.exp(valid_loss)
        
        if use_step_tracking:
            record_step = (step + 1) * (epoch + 1)
            wandb.log({'learning_rate': last_lr}, step=record_step)
            show_logs(
                valid_batch_loss / record_step, 
                record_step, 
                prefix='valid', 
                perplexity=valid_perplexity.item() / record_step)
            
        #sk_metrics = sklearn_metrics(val_outputs, labels, 'train')
        #ic(sk_metrics)
        #ic(val_metric_collection(outputs, labels).compute())
        #ic(val_metric_collection(outputs, labels))
        
    if use_epoch_tracking:        
        valid_epoch_loss = valid_batch_loss / step 
        show_logs(valid_epoch_loss, epoch, is_epoch=True, prefix='Val')
        #val_metric_records = val_metric_collection.compute()
        #wandb.log(val_metric_records, step=epoch)
    
    loss_template = ("Epoch {}/{} - {:.0f}s {:.0f}ms/step - lr:{:} - loss: {:.6f} - val_loss: {:.6f}")  
    #metrics_template = (
    #    """
    #    categorical_accuracy: {:.4f} - f1_score: {:.4f} - multi_precision: {:.4f} - multi_recall: {:.4f}
    #    val_categorical_accuracy: {:.4f} -  val_f1_score: {:.4f} - val_multi_precision: {:.4f} - val_multi_recall: {:.4f}
    #    """
    #)
    end_time = time.time()
    each_steps_compute_time = (end_time - start_time)
    print(loss_template.format(
        epoch,
        model_config['epochs'], 
        each_steps_compute_time,
        each_steps_compute_time * 1000 / model_config['training_steps'],
        last_lr,
        train_epoch_loss,
        val_epoch_loss)
    )

    #print(metrics_template.format(
    #    train_metric_records['Train_Accuracy'],
    #    train_metric_records['Train_F1'],
    #    train_metric_records['Train_Precision'],
    #    train_metric_records['Train_Recall'],
    #    val_metric_records['Val_Accuracy'],
    #    val_metric_records['Val_F1'],
    #    val_metric_records['Val_Precision'],
    #    val_metric_records['Val_Recall']
    #))
    
    if use_epoch_tracking:
        metric_collection.reset()
        val_metric_collection.reset()
     

[RUN ID]: jp-pretrain-model_baseline_20210902155112


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=131487.0), HTML(value='')))

In [None]:
wandb.finish()


In [None]:
assert 1 == 2

# Test

In [None]:
%matplotlib inline
import math
from torch.optim.lr_scheduler import _LRScheduler
from torch import nn
from torch import cuda
from torch import optim
from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR, CyclicLR
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

net = NeuralNetwork()

optimizer = optim.SGD(net.parameters(), lr = 1e-2)
lambda1 = lambda epoch: 0.2 if epoch % 5 == 0 else 1
lambda2 = lambda epoch: 0.2

#scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda = lambda2)
#scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5,10,15], gamma=0.1)
#scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.9)
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: PolynomialDecay(step))

class PolynomialDecay(_LRScheduler):
    def __init__(self, optimizer, decay_steps, end_learning_rate=0.0001, power=0.5, cycle=False, last_epoch=-1, verbose=False):
        if decay_steps <= 1.:
            raise ValueError('max_decay_steps should be greater than 1.')            
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle
        super(PolynomialDecay, self).__init__(optimizer, last_epoch, verbose)
    
    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.")
        #dtype = initial_learning_rate.dtype
        #end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
        #power = math_ops.cast(self.power, dtype)
        #global_step_recomp = math_ops.cast(step, dtype)
        #decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
        global_step_recomp = self.last_epoch
        decay_steps_recomp = self.decay_steps
        
        if self.cycle:
            if global_step_recomp == 0:
                multiplier = 1.0 
            else:
                multiplier = math.ceil(global_step_recomp / self.decay_steps)
            decay_steps_recomp = decay_steps_recomp * multiplier
        else:
            global_step_recomp = min(global_step_recomp, decay_steps_recomp)
            
        p = global_step_recomp / decay_steps_recomp
        #c(self.last_epoch, optimizer.param_groups[0]['lr'], p)
        return [((group['lr'] - self.end_learning_rate) * math.pow(1 - p, self.power) + self.end_learning_rate) for group in self.optimizer.param_groups]
    
    def _get_closed_form_lr(self):
        return [(base_lr - self.end_learning_rate) * math.pow(1 - p, self.power) + self.end_learning_rate for base_lr in self.base_lrs]


    
def polynomial_decay_scale_fun(global_steps, initial_learning_rate=1e-2, decay_steps=100, power=0.5, end_learning_rate=1e-5, cycle=False):
    if cycle:
        if global_steps == 0:
            multiplier = 1.0 
        else:
            multiplier = math.ceil(global_steps / decay_steps)
            decay_steps = decay_steps * multiplier
    else:
        global_steps = min(global_steps, decay_steps)
    p = global_steps / decay_steps
    #ic(global_steps, p)
    return (initial_learning_rate - end_learning_rate) * math.pow(1 - p, power) + end_learning_rate
    
    
#optimizer = optim.SGD(net.parameters(), lr=1e-2)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
#scheduler = PolynomialDecay(optimizer, decay_steps=1000, end_learning_rate=1e-5)
 
scheduler = LinearWarmupCosineAnnealingLR(
    optimizer, 
    warmup_epochs=model_config['warmup_steps'], 
    max_epochs=model_config['training_steps'] * model_config['epochs'], 
    eta_min=model_config["end_learning_rate"])

#scheduler = optim.lr_scheduler.CyclicLR(
#    optimizer, 
#    base_lr=1e-5,
#    max_lr=1e-2,
#    step_size_up=20,
#    scale_fn=polynomial_decay_scale_fun,
#    mode='triangular2',
#    scale_mode='cycle',
#    cycle_momentum=False)

iteration = model_config['epochs']
scheduler_lr_list = []
for epoch in range(1, iteration):
    scheduler.step()
    #print(epoch, scheduler.get_last_lr()[0])
    scheduler_lr_list.append(scheduler.get_last_lr()[0])

plt.xlabel('Training Iterations')
plt.ylabel('Learning Rate')
plt.title("CLR - 'triangular' Policy")
plt.plot(range(1, iteration), scheduler_lr_list)
