In [1]:
# Enable IPython's autoreload extension so that edits to imported local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.vision.all import *
import sklearn.metrics as skm
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text
from transformers import AutoModelForMaskedLM, AutoConfig, BertTokenizer, LineByLineTextDataset

from shopee_utils import *
from train_utils import *
import codecs
from torch.utils.data.dataset import Dataset

In [3]:
import os

# Directory of the pretrained checkpoint; both the model and the tokenizer
# are loaded from it.
BERT_CONFIG_PATH = './indobert-large-p2'
# Directory containing the competition data (train.csv is read from here).
# Set the SHOPEE_DATA_DIR environment variable to run on another machine;
# when it is unset the original location is used, so existing runs are unchanged.
PATH = Path(os.environ.get('SHOPEE_DATA_DIR', '/home/slex/data/shopee'))

In [4]:
# Read the training table; its `title` column is the text corpus used below.
train_csv = PATH / 'train.csv'
train_df = pd.read_csv(train_csv)

In [5]:
# Load the pretrained checkpoint as a masked-language model.
# The load warning printed by this cell lists the `cls.predictions.*` weights
# as newly initialized, i.e. the checkpoint does not provide them and they
# are learned during the fine-tuning below.
model = AutoModelForMaskedLM.from_pretrained(BERT_CONFIG_PATH)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at ./indobert-large-p2 and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = BertTokenizer.from_pretrained(BERT_CONFIG_PATH)

In [7]:
class TextDataset(Dataset):
    """In-memory dataset of tokenized text lines for language-model training.

    All lines are tokenized once, at construction time. Each item is a dict
    with a single ``"input_ids"`` key whose value is a ``torch.long`` tensor
    built from that line's token ids.

    Args:
        tokenizer: Callable invoked as ``tokenizer(lines, add_special_tokens=True,
            truncation=True, max_length=block_size)``. Its result must be
            indexable by ``"input_ids"`` and yield one id sequence per line.
        lines: The texts to tokenize.
        block_size: Maximum token count per example, forwarded as ``max_length``.
    """

    def __init__(self, tokenizer, lines, block_size):
        encoded = tokenizer(
            lines,
            add_special_tokens=True,
            truncation=True,
            max_length=block_size,
        )
        # No padding argument is passed, so each example is stored at whatever
        # length the tokenizer returned for it.
        self.examples = [
            {"input_ids": torch.tensor(token_ids, dtype=torch.long)}
            for token_ids in encoded["input_ids"]
        ]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict stored at position ``i``."""
        return self.examples[i]

In [8]:
# Tokenize every title up front; 128 is the per-title token cap (block_size).
titles = train_df.title.to_list()
dataset = TextDataset(tokenizer, titles, block_size=128)

In [9]:
# Sanity check: decode the first example back to text to inspect the result
# of tokenization, including the special tokens.
tokenizer.decode(dataset[0]['input_ids'])

'[CLS] paper bag victoria secret [SEP]'

In [10]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked-language-model training (mlm=True) with
# mlm_probability=0.15; it is handed to the Trainer to build each batch.
# NOTE(review): presumably this also pads the variable-length examples to a
# common length per batch — confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [11]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 20 epochs, 32 examples per device, with
# checkpoints written under ./LanguageModel.
training_args = TrainingArguments(
    output_dir="./LanguageModel",
    overwrite_output_dir=True,
    num_train_epochs=20,
    # Replaces the deprecated `per_gpu_train_batch_size`, which triggers a
    # deprecation warning and is slated for removal.
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Trainer wiring the masked-LM model, the MLM collator and the tokenized
# titles together. No evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [12]:
# Run the masked-LM fine-tuning. In the recorded run this was 21,420 steps
# over 20 epochs and took roughly 71 minutes.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
  2%|▏         | 501/21420 [01:39<1:04:03,  5.44it/s]

{'loss': 2.5538, 'learning_rate': 4.88328664799253e-05, 'epoch': 0.47}


  5%|▍         | 1001/21420 [03:15<55:51,  6.09it/s] 

{'loss': 1.8734, 'learning_rate': 4.7665732959850615e-05, 'epoch': 0.93}


  7%|▋         | 1500/21420 [04:52<1:10:44,  4.69it/s]

{'loss': 1.6234, 'learning_rate': 4.6498599439775914e-05, 'epoch': 1.4}


  9%|▉         | 2001/21420 [06:32<1:08:39,  4.71it/s]

{'loss': 1.4662, 'learning_rate': 4.5331465919701214e-05, 'epoch': 1.87}


 12%|█▏        | 2500/21420 [08:10<1:02:30,  5.05it/s]

{'loss': 1.4135, 'learning_rate': 4.416433239962652e-05, 'epoch': 2.33}


 14%|█▍        | 3001/21420 [09:51<1:00:49,  5.05it/s]

{'loss': 1.3284, 'learning_rate': 4.2997198879551826e-05, 'epoch': 2.8}


 16%|█▋        | 3500/21420 [11:29<1:03:26,  4.71it/s]

{'loss': 1.2774, 'learning_rate': 4.1830065359477126e-05, 'epoch': 3.27}


 19%|█▊        | 4000/21420 [13:10<58:33,  4.96it/s]  

{'loss': 1.1871, 'learning_rate': 4.066293183940243e-05, 'epoch': 3.73}


 21%|██        | 4501/21420 [14:47<51:58,  5.42it/s]  

{'loss': 1.1826, 'learning_rate': 3.949579831932773e-05, 'epoch': 4.2}


 23%|██▎       | 5001/21420 [16:26<51:08,  5.35it/s]  

{'loss': 1.1381, 'learning_rate': 3.832866479925304e-05, 'epoch': 4.67}


 26%|██▌       | 5500/21420 [18:03<51:49,  5.12it/s]  

{'loss': 1.127, 'learning_rate': 3.7161531279178344e-05, 'epoch': 5.14}


 28%|██▊       | 6000/21420 [19:41<48:12,  5.33it/s]  

{'loss': 1.0886, 'learning_rate': 3.5994397759103643e-05, 'epoch': 5.6}


 30%|███       | 6501/21420 [21:19<45:59,  5.41it/s]  

{'loss': 1.0573, 'learning_rate': 3.482726423902894e-05, 'epoch': 6.07}


 33%|███▎      | 7000/21420 [22:57<46:11,  5.20it/s]  

{'loss': 1.0322, 'learning_rate': 3.366013071895425e-05, 'epoch': 6.54}


 35%|███▌      | 7500/21420 [24:36<39:35,  5.86it/s]  

{'loss': 1.0098, 'learning_rate': 3.2492997198879555e-05, 'epoch': 7.0}


 37%|███▋      | 8001/21420 [26:15<40:34,  5.51it/s]  

{'loss': 0.9698, 'learning_rate': 3.1325863678804855e-05, 'epoch': 7.47}


 40%|███▉      | 8501/21420 [27:52<42:29,  5.07it/s]

{'loss': 0.9588, 'learning_rate': 3.0158730158730158e-05, 'epoch': 7.94}


 42%|████▏     | 9000/21420 [29:29<41:45,  4.96it/s]  

{'loss': 0.917, 'learning_rate': 2.8991596638655467e-05, 'epoch': 8.4}


 44%|████▍     | 9500/21420 [31:07<35:04,  5.66it/s]

{'loss': 0.9291, 'learning_rate': 2.7824463118580767e-05, 'epoch': 8.87}


 47%|████▋     | 10000/21420 [32:44<36:07,  5.27it/s]

{'loss': 0.8956, 'learning_rate': 2.665732959850607e-05, 'epoch': 9.34}


 49%|████▉     | 10501/21420 [35:31<34:25,  5.29it/s]   

{'loss': 0.8628, 'learning_rate': 2.5490196078431373e-05, 'epoch': 9.8}


 51%|█████▏    | 11000/21420 [37:07<36:07,  4.81it/s]

{'loss': 0.8696, 'learning_rate': 2.4323062558356675e-05, 'epoch': 10.27}


 54%|█████▎    | 11501/21420 [38:44<32:31,  5.08it/s]

{'loss': 0.847, 'learning_rate': 2.315592903828198e-05, 'epoch': 10.74}


 56%|█████▌    | 12001/21420 [40:25<32:43,  4.80it/s]

{'loss': 0.8432, 'learning_rate': 2.1988795518207285e-05, 'epoch': 11.2}


 58%|█████▊    | 12501/21420 [42:03<26:47,  5.55it/s]

{'loss': 0.823, 'learning_rate': 2.0821661998132587e-05, 'epoch': 11.67}


 61%|██████    | 13001/21420 [43:40<28:15,  4.96it/s]

{'loss': 0.8194, 'learning_rate': 1.965452847805789e-05, 'epoch': 12.14}


 63%|██████▎   | 13501/21420 [45:16<24:30,  5.38it/s]

{'loss': 0.7935, 'learning_rate': 1.8487394957983196e-05, 'epoch': 12.61}


 65%|██████▌   | 14001/21420 [46:58<32:09,  3.85it/s]

{'loss': 0.7628, 'learning_rate': 1.7320261437908496e-05, 'epoch': 13.07}


 68%|██████▊   | 14501/21420 [48:38<22:53,  5.04it/s]

{'loss': 0.7666, 'learning_rate': 1.6153127917833802e-05, 'epoch': 13.54}


 70%|███████   | 15001/21420 [50:19<26:24,  4.05it/s]

{'loss': 0.7547, 'learning_rate': 1.4985994397759103e-05, 'epoch': 14.01}


 72%|███████▏  | 15501/21420 [51:59<18:23,  5.36it/s]

{'loss': 0.7207, 'learning_rate': 1.3818860877684408e-05, 'epoch': 14.47}


 75%|███████▍  | 16001/21420 [53:36<15:45,  5.73it/s]

{'loss': 0.7504, 'learning_rate': 1.265172735760971e-05, 'epoch': 14.94}


 77%|███████▋  | 16501/21420 [55:12<15:38,  5.24it/s]

{'loss': 0.7381, 'learning_rate': 1.1484593837535014e-05, 'epoch': 15.41}


 79%|███████▉  | 17001/21420 [56:51<14:02,  5.25it/s]

{'loss': 0.707, 'learning_rate': 1.0317460317460318e-05, 'epoch': 15.87}


 82%|████████▏ | 17501/21420 [58:30<12:02,  5.43it/s]

{'loss': 0.7031, 'learning_rate': 9.150326797385621e-06, 'epoch': 16.34}


 84%|████████▍ | 18000/21420 [1:00:07<11:43,  4.86it/s]

{'loss': 0.7155, 'learning_rate': 7.983193277310924e-06, 'epoch': 16.81}


 86%|████████▋ | 18500/21420 [1:01:45<08:58,  5.42it/s]

{'loss': 0.6876, 'learning_rate': 6.816059757236228e-06, 'epoch': 17.27}


 89%|████████▊ | 19000/21420 [1:03:25<07:24,  5.45it/s]

{'loss': 0.6768, 'learning_rate': 5.648926237161531e-06, 'epoch': 17.74}


 91%|█████████ | 19501/21420 [1:05:01<05:59,  5.33it/s]

{'loss': 0.6984, 'learning_rate': 4.481792717086835e-06, 'epoch': 18.21}


 93%|█████████▎| 20000/21420 [1:06:39<04:21,  5.43it/s]

{'loss': 0.6791, 'learning_rate': 3.3146591970121383e-06, 'epoch': 18.67}


 96%|█████████▌| 20500/21420 [1:08:23<02:44,  5.58it/s]

{'loss': 0.6729, 'learning_rate': 2.1475256769374416e-06, 'epoch': 19.14}


 98%|█████████▊| 21001/21420 [1:10:01<01:14,  5.59it/s]

{'loss': 0.669, 'learning_rate': 9.80392156862745e-07, 'epoch': 19.61}


100%|██████████| 21420/21420 [1:11:24<00:00,  5.00it/s]

{'train_runtime': 4284.635, 'train_samples_per_second': 4.999, 'epoch': 20.0}





TrainOutput(global_step=21420, training_loss=0.9839220048777975, metrics={'train_runtime': 4284.635, 'train_samples_per_second': 4.999, 'epoch': 20.0})

In [14]:
# Persist the fine-tuned model to its own directory.
# NOTE(review): the Trainer was created without a tokenizer, so presumably
# only model files are written here — confirm that downstream code loads the
# tokenizer from the original checkpoint directory.
trainer.save_model('./finetuned_indobert-large-p2')