## Train with PyTorch Lightning

In [1]:
import os
import os.path as osp
# When the notebook is started from the parent of the project checkout, move
# into the project directory so the relative paths used below ("train",
# "tmp/model", "tb_logs") resolve against it.
# isdir (rather than exists) avoids attempting to chdir into a regular file
# that merely happens to be named "imagecaption".
if osp.isdir("imagecaption"):
    os.chdir("imagecaption")

In [None]:
!pip install -r train/requirements.txt

In [2]:
import os.path as osp
import sys
# Put the project's "train" directory on the import path exactly once so the
# local modules imported below (pl_module, fine_tune, dataset) can be found.
train_dir = osp.abspath("train")
if not any(entry == train_dir for entry in sys.path):
    sys.path.append(train_dir)
    
import pl_module
import pytorch_lightning as pl
from huggingface_hub import HfApi, notebook_login
from fine_tune import get_input_model_name

# Fix the global random seed so runs are repeatable; the call returns the seed
# it applied, which is what the cell displays. Per the Lightning docs,
# workers=True also derives seeds for dataloader worker processes.
pl.seed_everything(42, workers=True)


Global seed set to 42


42

### Login to HuggingFace

In [3]:

# Show the Hugging Face login widget. The upload cell at the end of the
# notebook calls save_pretrained(..., push_to_hub=True), which presumably
# relies on the credentials stored here.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the dataset and processor

In [4]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from pl_module import ImageCaptioningModule
from dataset import ImageCaptioningDataset
from transformers import GitProcessor, GitForCausalLM

from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# A single Hub repo id is used both as the dataset to read from and as the
# model repo that is loaded here and pushed to at the end of the notebook.
out_model = dataset_desc_only = "soul11zz/image-caption-desc-only"

# NOTE(review): model_name is computed but not used by any visible cell; the
# processor (and later the model) are loaded from out_model instead. Confirm
# whether from_pretrained should receive model_name.
model_name = get_input_model_name(out_model)

# Fetch the training split first, then the validation split.
dt_train, dt_val = (
    load_dataset(dataset_desc_only, split=split_name)
    for split_name in ("train", "validation")
)

processor = GitProcessor.from_pretrained(out_model)

# Wrap each raw split together with the processor in the project's dataset class.
train_dataset, val_dataset = (
    ImageCaptioningDataset(split_data, processor)
    for split_data in (dt_train, dt_val)
)


Downloading readme:   0%|          | 0.00/554 [00:00<?, ?B/s]



Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/soul11zz___parquet/soul11zz--image-caption-desc-only-7581b595eec46650/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/263M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/446M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/409M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/430M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/428M [00:00<?, ?B/s]



Computing checksums:  55%|#####4    | 6/11 [00:05<00:04,  1.15it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/2713 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2306 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/13065 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/soul11zz___parquet/soul11zz--image-caption-desc-only-7581b595eec46650/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.




### Prelims for PL Module

In [17]:
# Training hyper-parameters and the directory that receives checkpoints.
batch_size = 16
model_dir = "tmp/model"
epochs = 10

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing for existence first.
os.makedirs(model_dir, exist_ok=True)

# No worker processes on Windows ("nt"); elsewhere one worker per CPU.
# os.cpu_count() returns None when the count cannot be determined, and
# DataLoader needs an integer, so fall back to 0 (load in the main process).
num_workers = (os.cpu_count() or 0) if os.name != "nt" else 0
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

model = GitForCausalLM.from_pretrained(out_model)

# The Lightning module is handed the processor, the model and both loaders,
# plus the initial learning rate.
pl_train_module = ImageCaptioningModule(processor, model, train_loader, val_loader, learning_rate=1e-4)

### Trainer
logger = TensorBoardLogger("tb_logs", name="image-captioning")

# Keep the two checkpoints with the lowest monitored "val_loss"; file names
# embed the epoch and the loss value.
checkpoint = ModelCheckpoint(dirpath=model_dir,
                             save_top_k=2, monitor="val_loss",
                             mode="min",
                             filename="imcap-{epoch:02d}-{val_loss:.2f}")

callbacks = [checkpoint]


### Create PL Trainer

In [18]:
# Main training driver: one CUDA device, 16-bit mixed precision, validation
# after every epoch, and a single sanity-check validation batch before training.
# accelerator/devices replace the deprecated `gpus=1` and match the arguments
# already used for the learning-rate tuner in this notebook.
trainer = pl.Trainer(
                      logger=logger,
                      accelerator="cuda",
                      devices=1,
                      callbacks=callbacks,
                      max_epochs=epochs,
                      check_val_every_n_epoch=1,
                      # val_check_interval=50,
                      precision=16,
                      num_sanity_val_steps=1,
                      )


Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### (optional) Tune Learning Rate

In [14]:
# Optional learning-rate search on a throwaway single-epoch Trainer.
# NOTE(review): the suggested rate is written onto pl_train_module, so
# re-running the cell that constructs pl_train_module afterwards resets it to
# the constructor's learning_rate and the tuned value is lost.
tuner = pl.Trainer(
    auto_lr_find=True,
    accelerator="cuda",
    devices=1,
    max_epochs=1,
)
tuner.tune(pl_train_module)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0009120108393559097
Restoring states from the checkpoint path at /notebooks/imagecaption/.lr_find_915fbacf-b635-4397-850a-07eaf1675ebe.ckpt


{'lr_find': <pytorch_lightning.tuner.lr_finder._LRFinder at 0x7f6222823040>}

In [19]:
# Display the learning rate currently stored on the module, to check which
# value training will start from.
pl_train_module.lr


0.0001

### Run Training

In [20]:
# Start training. No dataloaders are passed here: both loaders were given to
# the module's constructor, so presumably the module serves them to the Trainer.
trainer.fit(pl_train_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type           | Params
-----------------------------------------
0 | model | GitForCausalLM | 176 M 
-----------------------------------------
176 M     Trainable params
0         Non-trainable params
176 M     Total params
353.238   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f628efaf160>
Exception ignored in: Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py", line 1464, in _shutdown_workers
    <function _MultiProcessingDataLoaderIter.__del__ at 0x7f628efaf160>
if w.is_alive():
  File "/usr/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
Traceback (most recent call last):
      File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/dataloader.py", line 1481, in __del__
assert self._parent_pid == os.getpid(), 'can only test a child process'    
Exception ignored in: self._shutdown_workers()AssertionError: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f628efaf160>
can only test a child processException ignored in: 

<function _MultiProcessingDataLoaderIter.__

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


### Upload Best Model

In [24]:
# Restore the Lightning module from the best checkpoint recorded by the
# ModelCheckpoint callback, passing the same constructor inputs as for training.
pl_model_best = ImageCaptioningModule.load_from_checkpoint(
    checkpoint.best_model_path,
    processor=processor,
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
)

# Write the model weights, then the processor, to the same local directory and
# push each of them to the Hub repo named by out_model.
pl_model_best.model.save_pretrained(
    "tb_logs/image-captioning/best_model",
    push_to_hub=True,
    repo_id=out_model,
)
pl_model_best.processor.save_pretrained(
    "tb_logs/image-captioning/best_model",
    push_to_hub=True,
    repo_id=out_model,
)


pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]