In [1]:
! pip install git+https://github.com/huggingface/transformers.git

In [2]:
!wget https://raw.githubusercontent.com/rathiankit03/ImageCaptionHindi/master/Flickr8kHindiDataset/Flickr8k-Hindi.txt

In [3]:
# Report which transformers version ended up installed in this kernel.
import transformers
print(transformers.__version__)

In [4]:
import pandas as pd

# Directory with the image files (presumably the Kaggle Flickr8k layout — confirm).
base_path = '../input/flickr8k/Images/'

# Every line is parsed as "<image id> <caption>": the text before the first
# space is the image id, everything after it is the caption.
data = []
# Explicit UTF-8 so reading the Hindi text does not depend on the platform's
# default encoding.
with open('./Flickr8k-Hindi.txt', encoding='utf-8') as f:
    for line in f:
        # Remove the line terminator; otherwise it stays glued to the end of
        # the caption (or to the image id when a line has no caption).
        line = line.rstrip('\r\n')
        if not line:
            # A blank line would otherwise become a bogus '.jpg' row.
            continue
        image_id, _, caption = line.partition(' ')
        data.append([image_id + '.jpg', caption])

hindi = pd.DataFrame(data, columns=['images', 'text'])
hindi.head()

In [5]:
# Remove every caption row that points at this one image file.
# NOTE(review): the reason for the exclusion is not visible in the notebook —
# presumably the image is missing from the image folder; confirm.
excluded_image = '2258277193_586949ec62.jpg'
hindi = hindi.loc[hindi['images'] != excluded_image]
hindi

In [6]:
from sklearn.model_selection import train_test_split

# Split on unique image files instead of on caption rows. If the caption file
# lists an image on several lines (assumed for this dataset — check with
# hindi['images'].value_counts()), a row-level split would place the same
# image in both sets and the evaluation loss would be measured on images the
# model has already seen. The fixed seed makes the split reproducible across
# kernel restarts.
image_files = hindi['images'].unique()
train_images, test_images = train_test_split(
    image_files, test_size=0.2, random_state=42
)

# reset_index(drop=True) returns new frames indexed 0..n-1, so rows can be
# addressed by integer position; no in-place mutation of a slice of `hindi`.
train_df = hindi[hindi['images'].isin(train_images)].reset_index(drop=True)
test_df = hindi[hindi['images'].isin(test_images)].reset_index(drop=True)

In [7]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class Image_Caption_Dataset(Dataset):
    """Map-style dataset of (image, caption) pairs for an image-captioning model.

    Each item is a dict with two tensors:
      * "pixel_values": the feature extractor's ``pixel_values`` for the image,
        with the leading batch dimension removed;
      * "labels": the caption's token ids, padded or truncated to exactly
        ``max_target_length`` entries, with every padding id replaced by -100
        (the default ``ignore_index`` of PyTorch's cross-entropy loss).

    Args:
        root_dir: directory that contains the image files.
        df: DataFrame with an 'images' column (file names inside ``root_dir``)
            and a 'text' column (captions).
        feature_extractor: callable applied to the RGB image with
            ``return_tensors="pt"``; its result must expose ``pixel_values``.
        tokenizer: callable tokenizer exposing ``pad_token_id``; its result
            must expose ``input_ids``.
        max_target_length: length of every returned ``labels`` tensor.
    """

    def __init__(self,root_dir,df, feature_extractor,tokenizer,max_target_length=512):
        self.root_dir = root_dir
        self.df = df
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_length=max_target_length

    def __len__(self):
        """Return the number of rows (caption/image pairs) in the frame."""
        return len(self.df)

    def __getitem__(self,idx):
        """Load, preprocess and return the ``idx``-th example as a dict of tensors."""
        # Positional lookup: works whatever index labels the frame carries
        # (e.g. a filtered frame whose index was never reset).
        row = self.df.iloc[idx]
        image_path = row['images']
        text = row['text']

        # Open the image in a context manager so the file handle is released
        # as soon as the pixels have been converted to RGB.
        with Image.open(self.root_dir+'/'+image_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values

        # Pad AND truncate to a fixed length: without truncation a caption
        # longer than max_length yields a longer label list and batches can no
        # longer be stacked.
        token_ids = self.tokenizer(text,
                                   padding='max_length',
                                   truncation=True,
                                   max_length=self.max_length).input_ids

        # Mask padding positions so they do not contribute to the loss.
        # NOTE(review): if the tokenizer reuses its EOS token as the pad
        # token, EOS positions are masked as well — confirm for this tokenizer.
        pad_id = self.tokenizer.pad_token_id
        labels = [token if token != pad_id else -100 for token in token_ids]

        # squeeze(0) drops only the batch dimension added by return_tensors="pt".
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [8]:
from transformers import AutoTokenizer, ViTFeatureExtractor

# Checkpoint names: one for the image encoder, one for the text decoder.
encoder_checkpoint = 'google/vit-base-patch16-224'
decoder_checkpoint = 'surajp/gpt2-hindi'

# The tokenizer is loaded from the decoder checkpoint, the image
# preprocessing from the encoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)

In [9]:
root_dir = "../input/flickr8k/Images"

# The two datasets differ only in the caption frame they read from.
shared_dataset_args = dict(
    root_dir=root_dir,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
train_dataset = Image_Caption_Dataset(df=train_df, **shared_dataset_args)
val_dataset = Image_Caption_Dataset(df=test_df, **shared_dataset_args)

In [10]:
# Number of training examples (one per row of train_df).
len(train_dataset)

In [11]:
# Sanity check: fetch the first training item and print each tensor's shape.
encoding = train_dataset[0]
for field_name, field_tensor in encoding.items():
    print(field_name, field_tensor.shape)

In [12]:
# Decode the labels back to text to check that the caption round-trips.
# Work on a copy: writing into encoding['labels'] directly would overwrite the
# -100 markers of the item inspected in the previous cell.
labels = encoding['labels'].clone()
# -100 is not a valid token id, so restore the padding id before decoding.
labels[labels == -100] = tokenizer.pad_token_id
label_str = tokenizer.decode(labels, skip_special_tokens=True)
print(label_str)

In [13]:
from torch.utils.data import DataLoader

# Same batch size for both loaders; only the training loader shuffles.
batch_size = 8
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [14]:
from transformers import VisionEncoderDecoderModel
# Build the captioning model from the two pretrained checkpoints named earlier:
# encoder_checkpoint for the image encoder and decoder_checkpoint for the text
# decoder. Note that the cross-attention layers will be randomly initialized.
# The model is moved to the training device in a later cell.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_checkpoint, decoder_checkpoint)

In [15]:

# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# Generation stops when eos_token_id is produced. Use the tokenizer's SEP token
# when it defines one; otherwise fall back to its EOS token so the id is not
# left as None (which would make every generation run to max_length).
sep_token_id = tokenizer.sep_token_id
model.config.eos_token_id = (
    sep_token_id if sep_token_id is not None else tokenizer.eos_token_id
)

# set beam search parameters
model.config.max_length = 512
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [16]:
# Resize the decoder's token embeddings to the number of tokens in the tokenizer.
model.decoder.resize_token_embeddings(len(tokenizer))

In [17]:
# Freeze the image encoder: none of its weights will receive gradients.
for encoder_weight in model.encoder.parameters():
    encoder_weight.requires_grad = False

In [18]:
# Count every scalar weight in the model, frozen or trainable.
total_params = 0
for weight in model.parameters():
    total_params += weight.numel()
print(total_params)

In [19]:
# Count only the scalar weights that still require gradients.
train_total_params = 0
for weight in model.parameters():
    if weight.requires_grad:
        train_total_params += weight.numel()
print(train_total_params)

In [20]:
# Authenticate this session with Weights & Biases.
import wandb
wandb.login()

In [27]:
from transformers import get_scheduler
from tqdm.notebook import tqdm
import torch


n_epochs = 10

# torch.optim.AdamW is used instead of transformers' own AdamW, which
# transformers deprecated in favour of the PyTorch implementation. eps and
# weight_decay are spelled out to keep the values the transformers class used
# by default (1e-6 and 0.0); PyTorch's own defaults differ.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

# The scheduler is meant to be stepped once per batch. len(train_dataloader)
# is already the number of batches per epoch, so dividing it by the batch size
# again would make the linear decay finish after one eighth of training.
nb_train_steps = len(train_dataloader) * n_epochs
scheduler = get_scheduler(
    'linear',
    optimizer,
    num_warmup_steps=0,
    num_training_steps=nb_train_steps,
)

# Gradient scaler used by the mixed-precision training loops.
scaler = torch.cuda.amp.GradScaler()



In [31]:
# Use the first GPU when one is available; fall back to the CPU so the
# notebook can still be executed (slowly) on a machine without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Ending the cell on print() means model.to(...) is not the last expression,
# so the notebook does not echo the whole module tree.
print('.')

In [36]:
def train_step(model,train_dl,optimizer,scheduler,epoch):
    """Train ``model`` for one pass over ``train_dl`` and return the mean batch loss.

    Uses the notebook-level names ``device``, ``scaler`` (gradient scaler for
    mixed precision) and ``tqdm``.

    Args:
        model: module called as ``model(**batch)``; the result must expose ``loss``.
        train_dl: iterable of dict batches mapping argument names to tensors.
        optimizer: optimizer that updates the model's parameters.
        scheduler: learning-rate scheduler, stepped once per batch; may be None.
        epoch: index of the current epoch, shown in the progress-bar label.

    Returns:
        float: the loss averaged over all batches, or 0.0 if ``train_dl`` is empty.
    """
    model.train()
    train_loss = 0.0
    n_batches = 0
    for batch in tqdm(train_dl, desc=f"train epoch {epoch}"):
        # Build a new dict rather than overwriting the loader's batch in place.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Gradients accumulate across backward() calls; clear the previous batch's.
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        if scheduler is not None:
            scheduler.step()

        train_loss += loss.item()
        n_batches += 1

    # Return only once the whole loader has been consumed.
    return train_loss / n_batches if n_batches else 0.0
    
    
def valid_step(model,valid_dl,optimizer,scheduler,epoch):
    """Evaluate ``model`` on every batch of ``valid_dl`` and return the mean batch loss.

    Uses the notebook-level names ``device`` and ``tqdm``. ``optimizer`` and
    ``scheduler`` are accepted only so the signature mirrors ``train_step``;
    they are not used.

    Args:
        model: module called as ``model(**batch)``; the result must expose ``loss``.
        valid_dl: iterable of dict batches mapping argument names to tensors.
        optimizer: unused.
        scheduler: unused.
        epoch: index of the current epoch, shown in the progress-bar label.

    Returns:
        float: the loss averaged over all batches, or 0.0 if ``valid_dl`` is empty.
    """
    model.eval()
    valid_loss = 0.0
    n_batches = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(valid_dl, desc=f"valid epoch {epoch}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            valid_loss += outputs.loss.item()
            n_batches += 1

    # Return only once the whole loader has been consumed.
    return valid_loss / n_batches if n_batches else 0.0

In [37]:
# Train and validate once per epoch and report both losses.
for epoch in range(n_epochs):
    tr_loss = train_step(model,train_dataloader,optimizer,scheduler,epoch)
    vl_loss = valid_step(model,eval_dataloader,optimizer,scheduler,epoch)
    print(f"epoch {epoch}: train loss {tr_loss} | valid loss {vl_loss}")
    

In [None]:


# Self-contained alternative to the train_step/valid_step loop.
# NOTE(review): running this after that loop keeps training the same model with
# an optimizer/scheduler that have already been stepped — re-create them first
# if a fresh run is intended.

# Directory the trained model is written to once all epochs have finished.
checkpoints_dir = "./image_captioning_checkpoint"

for epoch in range(n_epochs):  # loop over the dataset multiple times
    # train
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader):
        # get the inputs
        batch = {k: v.to(device) for k, v in batch.items()}

        # clear the gradients left over from the previous batch
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # scaled backward pass + optimizer update (mixed precision)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()

    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))

    # evaluate
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # A forward pass with labels returns a loss; generate() only
            # returns token ids and has no .loss to read.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            valid_loss += outputs.loss.item()

    print("Validation LOSS:", valid_loss / len(eval_dataloader))

model.save_pretrained(checkpoints_dir)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # where the run is written and how it is named
    output_dir="./image_captioning_checkpoint",
    overwrite_output_dir=True,
    run_name="first_run",
    # batching and precision
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    # logging, evaluation and checkpointing share one step interval
    evaluation_strategy="steps",
    logging_steps=2000,
    eval_steps=2000,
    save_steps=2000,
    load_best_model_at_end=True,
    # evaluation behaviour
    predict_with_generate=True,
)

In [None]:
# Authenticate this session with Weights & Biases.
# NOTE(review): an identical login cell already exists earlier in the notebook.
import wandb
wandb.login()

In [None]:
# Set two Weights & Biases environment variables for this kernel
# (presumably read by the Trainer's wandb logging — confirm).
%env WANDB_PROJECT=HF_COMMUNITY_EVENT
%env WANDB_LOG_MODEL=true

In [None]:
from transformers import default_data_collator

# instantiate trainer
# Notes:
#  * push_to_hub is not passed here: it is not a Trainer constructor argument
#    (it is a TrainingArguments field and needs a Hugging Face Hub login), and
#    the original line was also missing its trailing comma.
#  * no compute_metrics is given, so evaluation reports the loss only.
#  * NOTE(review): the feature extractor is passed in the `tokenizer` slot,
#    presumably so it is saved next to the checkpoints — confirm against the
#    installed transformers version.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)
trainer.train()

In [None]:
# Debugging aid: set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): this is the last cell, after all GPU work; it presumably has to
# run before the first CUDA call to have any effect — confirm and move if so.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'