In [1]:
from importlib import reload
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

from transformers import (
    AutoConfig, AutoModelForCausalLM, AutoTokenizer
)

# Set the global seed before any data or model code runs
# (Lightning confirms with "Global seed set to 42").
pl.seed_everything(42)

Global seed set to 42


42

## tokenizer and dataset

In [2]:
# Load the tokenizer published with the pretrained checkpoint.
name = "HooshvareLab/gpt2-fa"
tokenizer = AutoTokenizer.from_pretrained(name)

# Special tokens to register on the tokenizer.
# Note that BOS and EOS are both mapped to '</s>'.
special_tokens = {
    "bos_token": '</s>',
    "eos_token": '</s>',
    "pad_token": '<pad>',
    "unk_token": '<unk>',
}

# Left as the last expression so the cell displays the call's return value.
tokenizer.add_special_tokens(special_tokens)

0

In [20]:
# Re-import the local data module so that edits to src.data are picked up
# without restarting the kernel.
import src.data
reload(src.data)
from src.data import PoemDataset, get_dataloaders

# Build the dataset from the poems JSON file.
# NOTE(review): `window=512` is presumably the maximum token window per
# sample — confirm in src.data.
dataset = PoemDataset(tokenizer, 'data/all_poems.json', window=512)
# Display the number of samples in the dataset.
len(dataset)

11643882

In [6]:
# Create the two loaders used for training (`tl`) and validation (`vl`);
# val_frac=0.1 is the validation fraction and each batch holds 4 samples.
# get_dataloaders prints the resulting train / val sample counts.
tl, vl = get_dataloaders(dataset, val_frac=0.1, batch_size=4)

train dataset has 10421668 samples and val dataset has 1157963 samples


In [7]:
# Sanity check: pull one batch from the training loader and print the shape
# of every field it contains.
b = next(iter(tl))
for field, tensor in b.items():
    print(field, tensor.shape)

input_ids torch.Size([4, 167])
attention_mask torch.Size([4, 167])


## model

In [5]:
# Re-import the local model module so that edits to src.model take effect
# without restarting the kernel.
import src.model
reload(src.model)
from src.model import PoetFormer

# From here on `name` is the run name (it previously held the tokenizer
# checkpoint id); the train section uses it for the logs/ and weights/ paths.
name = 'GPT2-fa-ganjoor-conditional'
print('model name:',name)

# Build the model around the HooshvareLab/gpt2-fa checkpoint.
# NOTE(review): load_pretrained() presumably loads the pretrained weights —
# confirm in src.model.
model = PoetFormer(pretrained_name="HooshvareLab/gpt2-fa")
model.load_pretrained()

# Alternative: restore a previously saved checkpoint instead.
# NOTE(review): this passes `pretrained=` while the constructor call above
# uses `pretrained_name=` — confirm which keyword PoetFormer expects before
# enabling this line.
# model = PoetFormer.load_from_checkpoint(f'weights/{name}/last.ckpt', pretrained="HooshvareLab/gpt2-fa")

model name: GPT2-fa-ganjoor-conditional


In [6]:
# Display the parameter count reported by the model's count_parameters().
model.count_parameters()

118099200

## train

In [10]:
# TensorBoard logs go to logs/<name>; the learning rate is recorded every step.
logger = TensorBoardLogger(save_dir='logs/', name=name)
lr_logger = LearningRateMonitor(logging_interval='step')

# Keep only the single best checkpoint under weights/<name>/, ranked by the
# monitored `val_loss` metric (assumes the model logs `val_loss` — confirm in
# src.model).
# The deprecated `period=1` argument is intentionally not passed: saving once
# per validation epoch is already the default, and passing `period` only
# triggers a deprecation warning.
checkpoint = ModelCheckpoint(
    dirpath=f'weights/{name}/',
    filename='{epoch}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=1,
)

# Single-GPU run for one epoch, with no gradient accumulation.
trainer = pl.Trainer(
    benchmark=True,
    gpus=1,
    accumulate_grad_batches=1,
    logger=logger,
    max_epochs=1,
    callbacks=[checkpoint, lr_logger],
)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [11]:
# Start training; `tl` and `vl` are the loaders returned by get_dataloaders.
trainer.fit(model, tl, vl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 118 M 
------------------------------------------
118 M     Trainable params
0         Non-trainable params
118 M     Total params
472.397   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: 0it [00:00, ?it/s]

Process Process-16:
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7cd82d0af0>
Traceback (most recent call last):
  File "/home/soroosh/projects/general_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/home/soroosh/projects/general_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 
Traceback (most recent call last):
  F

In [12]:
# Save the trainer's current state to weights/<name>/last.ckpt, the path the
# generate section loads the model from.
trainer.save_checkpoint(f'weights/{name}/last.ckpt')

## generate

In [None]:
from src.model import PoetFormer

name = 'GPT2-fa-ganjoor-conditional'
print('model name:',name)

# Restore the fine-tuned model from the checkpoint written by the train section.
# The keyword is `pretrained_name`, matching the constructor call used when the
# model was first built; Lightning either drops or rejects an unknown init
# keyword such as `pretrained`, so the override would never reach __init__.
# `.eval()` returns the module itself and switches it to inference mode so
# dropout is disabled while generating.
# model = PoetFormer(pretrained_name="HooshvareLab/gpt2-fa")
model = PoetFormer.load_from_checkpoint(
    f'weights/{name}/last.ckpt',
    pretrained_name="HooshvareLab/gpt2-fa",
).eval()

In [14]:
# Generate one sequence from an empty prompt (max_length=128, n_beam=1) and
# print every returned item.
res = model.generate(
    prompt='',
    num_return_sequences=1,
    max_length=128,
    n_beam=1,
)
for poem in res:
    print(poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


generating poem in "حافظ" style.
حافظ:
به دست می‌دهد این زلف و زلف و می‌گویم
که خوب‌تر شوم یا نه من و نه من مپرس
مرض ز مشک تو و مشک ز مشک تو و مشک تو
بی‌خبر ز تو و بی‌جهالستیم
مرا بباید اندر جهان همی‌شنیم
جفا به کشتن من ای ملامتگر
که کس ندانست قدر ترا که می‌باید
جفا به کشتن من نیز بی‌تو بر تنم
برآ به زاری من ای جفاگر و بی‌تو
مرا به زاری و زاری ز بهر درد تو
به عشق تو گرفتار بدن شدم
