In [34]:
!pip install --no-index --no-deps /kaggle/input/lavis-pretrained/salesforce-lavis/transformers* 
!pip install --no-index --no-deps /kaggle/input/lavis-pretrained/salesforce-lavis/hugging*
import torch

Processing /kaggle/input/lavis-pretrained/salesforce-lavis/transformers-4.26.1-py3-none-any.whl
transformers is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0mProcessing /kaggle/input/lavis-pretrained/salesforce-lavis/huggingface_hub-0.12.0-py3-none-any.whl
huggingface-hub is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0m

In [50]:
TRAINING = False

In [36]:
EPOCHS = 10
SEED = 2023
T_MAX = 500
MIN_LR = 1e-6
N_ACCUMULATE = 1
WEIGHT_DECAY = 1e-6
LEARNING_RATE = 1e-4
VALID_BATCH_SIZE = 8
TRAIN_BATCH_SIZE = 4
SCHEDULER = 'CosineAnnealingLR'
DATASET = 'poloclub/diffusiondb'
MODEL_NAME = "/kaggle/input/image-caption-models/git-base"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [49]:
BATCH_SIZE = 8
EMBEDDING_LENGTH = 384
TRAINED_MODEL_PATH = '/kaggle/input/git-base-trained-epoch10/git_base_trained.pt'
OFFLINE_BACKBONE_PATH = "/kaggle/input/image-caption-models/git-base"
SENTENCE_TRANSFORMERS_MODEL = '/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2'

In [38]:
import os
import gc
import copy
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
import torch.optim as optim
from datasets import load_dataset
from collections import defaultdict
from torch.optim import lr_scheduler
from transformers import AutoProcessor, AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import GitVisionModel
import warnings; warnings.filterwarnings("ignore")
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TOKENIZERS_PARALLELISM'] = "False"

In [39]:
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False    
    os.environ['PYTHONHASHSEED'] = str(seed)    
set_seed(SEED)

In [40]:
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor
    def __len__(self): return len(self.dataset)
    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {k:v.squeeze() for k,v in self.processor(images=item["image"], text=item["prompt"], padding="max_length", return_tensors="pt").items()}

In [41]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()    
    dataset_size = 0
    running_loss = 0.0    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        input_ids = data['input_ids'].to(device)
        pixel_values = data['pixel_values'].to(device)        
        batch_size = input_ids.size(0)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)                
        loss = outputs.loss
        loss = loss / N_ACCUMULATE
        loss.backward()    
        if (step + 1) % N_ACCUMULATE == 0:
            optimizer.step()            
            optimizer.zero_grad()
            if scheduler is not None: scheduler.step()                
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size        
        epoch_loss = running_loss / dataset_size        
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]['lr'])
    gc.collect()    
    return epoch_loss

In [42]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()    
    dataset_size = 0
    running_loss = 0.0    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        input_ids = data['input_ids'].to(device)
        pixel_values = data['pixel_values'].to(device)        
        batch_size = input_ids.size(0)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)                
        loss = outputs.loss        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size        
        epoch_loss = running_loss / dataset_size        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss, LR=optimizer.param_groups[0]['lr'])    
    gc.collect()    
    return epoch_loss

In [43]:
def run_training(model, optimizer, scheduler, num_epochs):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf    
    for epoch in range(1, num_epochs + 1): 
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, dataloader=train_loader, device=DEVICE, epoch=epoch)
        val_epoch_loss = valid_one_epoch(model, valid_loader, device=DEVICE, epoch=epoch)
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss            
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f"BestLoss.bin")
    print("Best Loss: {:.4f}".format(best_epoch_loss))
    model.load_state_dict(best_model_wts)    
    return model

# Using 2m_first_1k instead of 2m_fisrt_5k due to less computational power :(((

In [44]:
if TRAINING:
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    dataset = load_dataset(DATASET, '2m_first_1k')
    dataset = dataset['train']
    dataset = dataset.filter(lambda example: example["step"] == 50)
    dataset = dataset.train_test_split(test_size=0.1)
    train_dataset = ImageCaptioningDataset(dataset['train'], processor)
    valid_dataset = ImageCaptioningDataset(dataset['test'], processor)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
    valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=VALID_BATCH_SIZE)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# **Loading model** (GiT)

In [45]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("/kaggle/input/image-caption-models/git-base")

**Training the model**

In [46]:
if TRAINING:
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=T_MAX, eta_min=MIN_LR)
    model = run_training(model, optimizer, scheduler, num_epochs=EPOCHS)
    del train_loader, valid_loader
    _ = gc.collect()
    torch.save(model.state_dict(), 'git_base_trained.pt')

100%|██████████| 225/225 [02:46<00:00,  1.35it/s, Epoch=1, LR=5.82e-5, Train_Loss=1.49]
100%|██████████| 13/13 [00:06<00:00,  1.86it/s, Epoch=1, LR=5.82e-5, Valid_Loss=0.233]


Validation Loss Improved (inf ---> 0.23304848074913026)


100%|██████████| 225/225 [02:49<00:00,  1.33it/s, Epoch=2, LR=3.42e-6, Train_Loss=0.198]
100%|██████████| 13/13 [00:06<00:00,  1.87it/s, Epoch=2, LR=3.42e-6, Valid_Loss=0.189]


Validation Loss Improved (0.23304848074913026 ---> 0.18868886590003967)


100%|██████████| 225/225 [02:50<00:00,  1.32it/s, Epoch=3, LR=2.8e-5, Train_Loss=0.168] 
100%|██████████| 13/13 [00:07<00:00,  1.81it/s, Epoch=3, LR=2.8e-5, Valid_Loss=0.182]


Validation Loss Improved (0.18868886590003967 ---> 0.18191296756267547)


100%|██████████| 225/225 [02:49<00:00,  1.33it/s, Epoch=4, LR=9.05e-5, Train_Loss=0.157]
100%|██████████| 13/13 [00:06<00:00,  1.86it/s, Epoch=4, LR=9.05e-5, Valid_Loss=0.16] 


Validation Loss Improved (0.18191296756267547 ---> 0.16041039705276489)


100%|██████████| 225/225 [02:49<00:00,  1.32it/s, Epoch=5, LR=8.55e-5, Train_Loss=0.118]
100%|██████████| 13/13 [00:06<00:00,  1.87it/s, Epoch=5, LR=8.55e-5, Valid_Loss=0.136]


Validation Loss Improved (0.16041039705276489 ---> 0.1356866678595543)


100%|██████████| 225/225 [02:49<00:00,  1.32it/s, Epoch=6, LR=2.14e-5, Train_Loss=0.0728]
100%|██████████| 13/13 [00:07<00:00,  1.79it/s, Epoch=6, LR=2.14e-5, Valid_Loss=0.118]


Validation Loss Improved (0.1356866678595543 ---> 0.11836093425750732)


100%|██████████| 225/225 [02:49<00:00,  1.33it/s, Epoch=7, LR=6.4e-6, Train_Loss=0.0493] 
100%|██████████| 13/13 [00:07<00:00,  1.81it/s, Epoch=7, LR=6.4e-6, Valid_Loss=0.116]


Validation Loss Improved (0.11836093425750732 ---> 0.11626948237419128)


100%|██████████| 225/225 [02:49<00:00,  1.33it/s, Epoch=8, LR=6.58e-5, Train_Loss=0.0497]
100%|██████████| 13/13 [00:06<00:00,  1.86it/s, Epoch=8, LR=6.58e-5, Valid_Loss=0.12] 
100%|██████████| 225/225 [02:49<00:00,  1.33it/s, Epoch=9, LR=9.94e-5, Train_Loss=0.0559]
100%|██████████| 13/13 [00:06<00:00,  1.87it/s, Epoch=9, LR=9.94e-5, Valid_Loss=0.124]
100%|██████████| 225/225 [02:49<00:00,  1.32it/s, Epoch=10, LR=5.05e-5, Train_Loss=0.0462]
100%|██████████| 13/13 [00:06<00:00,  1.88it/s, Epoch=10, LR=5.05e-5, Valid_Loss=0.114]


Validation Loss Improved (0.11626948237419128 ---> 0.1144895851612091)
Best Loss: 0.1145


# **Loading Trained Model**

In [110]:
# model.load_state_dict(torch.load('/kaggle/working/git_base_trained.pt'))
# model.to(DEVICE)

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (position_embedding): Embedding(197, 768)
        )
        (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0-11): 12 x GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
             

In [51]:
import sys; sys.path.append('../input/sentence-transformers-222/sentence-transformers')
import os
import sys
import cv2
import torch
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, models
from transformers import AutoProcessor, GitVisionModel,GitVisionModel

In [52]:
# from transformers import AutoModelForCausalLM

# model = AutoModelForCausalLM.from_pretrained("/kaggle/input/image-caption-models/git-base")

In [54]:
if not TRAINING:
    processor = AutoProcessor.from_pretrained(OFFLINE_BACKBONE_PATH)
    model.from_pretrained(OFFLINE_BACKBONE_PATH)
    model.load_state_dict(torch.load(TRAINED_MODEL_PATH))
    model.to(DEVICE)

In [55]:
import glob

In [56]:
if not TRAINING:
    data_directory = "/kaggle/input/stable-diffusion-image-to-prompts/images"
    data_pattern = os.path.sep.join([data_directory,"*.png"])
    image_path_list = list(glob.glob(data_pattern))
    raw_image = Image.open(image_path_list[5]).convert("RGB")
    pixel_values = processor(images=[raw_image], return_tensors="pt").pixel_values.to(DEVICE)
    out = model.generate(pixel_values=pixel_values, max_length=20, min_length=5)
    prompts = processor.batch_decode(out, skip_special_tokens=True)

In [57]:
prompts

['a gauguinesque, russet oil painting on canvas shows four red apples and two']

In [58]:
comp_path = '/kaggle/input/stable-diffusion-image-to-prompts'

In [59]:
if not TRAINING:
    st_model = SentenceTransformer(SENTENCE_TRANSFORMERS_MODEL)
    images = os.listdir(comp_path +"/images")
    image_ids = [i.split('.')[0] for i in images]
    eIds = list(range(EMBEDDING_LENGTH))
    imgId_eId = [
        '_'.join(map(str, i)) for i in zip(
            np.repeat(image_ids, EMBEDDING_LENGTH),
            np.tile(range(EMBEDDING_LENGTH), len(image_ids)))]

In [60]:
def make_batches(image_ids, batch_size=3):
    num_images = len(image_ids)
    num_batches = (num_images + batch_size - 1) // batch_size  # Ceiling division

    batches = []
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = min(start_index + batch_size, num_images)
        batch = image_ids[start_index:end_index]
        batches.append(batch)

    return batches


In [61]:
if not TRAINING:
    submissions = []
    ids_ = []
    prompts_=[]
    for batch in make_batches(images):
        images_batch = []
        for i, image in enumerate(batch): 
            img = Image.open(comp_path+"/images/"+image).convert("RGB")
            pixel_values = processor(images = img, return_tensors="pt").pixel_values.to(DEVICE)
            out = model.generate(pixel_values=pixel_values, max_length=20, min_length=5)
            prompts = processor.batch_decode(out, skip_special_tokens=True)
            prompts_.extend(prompts)
            embeddings = st_model.encode(prompts_).flatten()
            submissions.extend(embeddings)
            EMBEDDING_LENGTH = len(embeddings)
            ## 
            image_ids = image
            Ids = list(range(EMBEDDING_LENGTH))
            imgId_eId = [
                '_'.join(map(str, i)) for i in zip(
                    np.repeat(image_ids, EMBEDDING_LENGTH),
                    np.tile(range(EMBEDDING_LENGTH), len(image_ids)))]
    
            ids_.extend(imgId_eId)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [62]:
len(submissions)

10752

In [63]:
if not TRAINING:
    submission = pd.DataFrame({"imgId_eId":ids_, "val": submissions})
    submission.to_csv("submission.csv", index=False)
    submission.head()

In [64]:
len(submissions)

10752

In [65]:
len(imgId_eId)

2688