# Environment

In [1]:
# Wheels shipped in the `diffusion2023-package` input dataset; they are installed from
# local files below.
pkgs = [
    '/kaggle/input/diffusion2023-package/transformers-4.27.4-py3-none-any.whl',
    '/kaggle/input/diffusion2023-package/ftfy-6.1.1-py3-none-any.whl',
    '/kaggle/input/diffusion2023-package/timm-0.8.21.dev0-py3-none-any.whl',
    '/kaggle/input/diffusion2023-package/open_clip_torch-2.19.0-py3-none-any.whl',
]
# `--no-deps`: install exactly these wheels without pulling in their dependencies.
!pip install -qq --no-deps --no-python-version-warning --no-warn-conflicts --no-warn-script-location {' '.join(pkgs)} 

# sentence-transformers is provided as a source tree: copy it into ./pkgs and install from the copy.
!mkdir -p pkgs
!cp -r /kaggle/input/diffusion2023-package/sentence-transformers-2.2.2 ./pkgs
!cd ./pkgs/sentence-transformers-2.2.2; pip install -qq .

# BLIP-2 weights: copy the model folder, concatenate the files matching `*.bin.tar.gz*`
# into one archive, extract it, then delete the archive pieces.
!cp -r /kaggle/input/diffusion2023-pretrained/blip2-opt-2.7b ./
!cat /kaggle/working/blip2-opt-2.7b/pytorch_model-00001-of-00002.bin.tar.gz* > /kaggle/working/blip2-opt-2.7b/pytorch_model-00001-of-00002.bin.tar.gz
!cd /kaggle/working/blip2-opt-2.7b/; tar -xzvf pytorch_model-00001-of-00002.bin.tar.gz
!rm -rf /kaggle/working/blip2-opt-2.7b/pytorch_model-00001-of-00002.bin.tar.gz*

# ConvNeXt-XXLarge CLIP weights: same idea, but the tar pieces are piped straight into `tar`.
!cp -r /kaggle/input/diffusion2023-pretrained/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup ./
!cd /kaggle/working/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/;cat open_clip_pytorch_model.bin.tar* | tar -xv
!rm -rf /kaggle/working/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/open_clip_pytorch_model.bin.tar*


./pytorch_model-00001-of-00002.bin
open_clip_pytorch_model.bin


In [2]:
# Symlink the competition files into ./raw; they are read later as `{DATA_DIR}/raw/...`.
# NOTE(review): `ln -s` is used without `-f`, so re-running this cell presumably reports
# "File exists" for links that are already present — confirm if re-runs matter.
!mkdir -p raw
!ln -s /kaggle/input/stable-diffusion-image-to-prompts/* ./raw/

# Symlink the fine-tuned checkpoints into ./result/models, where `Predictor.load_model` looks for them.
!mkdir -p ./result/models/
!ln -s /kaggle/input/diffusion2023-model/* ./result/models/

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import argparse
import time
import gc
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import open_clip

from tqdm import tqdm
from argparse import Namespace
from PIL import Image
from scipy import spatial
from typing import Optional, Tuple
from multiprocessing import Process
from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel
from torch.cuda.amp import autocast
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from transformers import CLIPProcessor
from transformers import CLIPModel
from transformers.models.clip import CLIPConfig
from transformers import Blip2ForConditionalGeneration

opj = os.path.join
ope = os.path.exists

# Config

In [4]:
sample_submission = pd.read_csv('/kaggle/input/stable-diffusion-image-to-prompts/sample_submission.csv')
# True unless the sample submission has exactly 7 * 384 rows. When False, the cells below
# additionally print a cosine-similarity score against the sample submission's values.
IS_KAGGLE_SUBMIT = len(sample_submission) != 7 * 384
# Root folder for checkpoints (`models/`) and per-model predictions (`submissions/`).
RESULT_DIR = '/kaggle/working/result'
# Folder containing the `raw/` symlinks to the competition data.
DATA_DIR = '/kaggle/working'
# Registry of local weight folders, keyed by the model's hub-style identifier.
TRANSFORMERS = dict([
    (
        'openai/clip-vit-large-patch14-336',
        '/kaggle/input/diffusion2023-pretrained/clip-vit-large-patch14-336',
    ),
    (
        'Salesforce/blip2-opt-2.7b',
        '/kaggle/working/blip2-opt-2.7b',
    ),
    (
        'laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
        "/kaggle/working/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup",
    ),
])


def PRETRAINED_MODEL_NAME_OR_PATH(model_name):
    """Return the local folder registered for `model_name`, or `model_name` unchanged if none is registered."""
    return TRANSFORMERS.get(model_name, model_name)

# Number of values per image; flat value columns are reshaped to (-1, EMBEDDING_LENGTH).
EMBEDDING_LENGTH = 384
# Submission key column: image id and element index joined by "_".
ID = 'imgId_eId'
# Submission value column.
TARGET = 'val'
# Column holding the image id part of the key.
IMG_ID = 'imgId'
# Column created (empty) by `DiffusionDataset.load_data`.
PROMPT = 'prompt'

# Dataset

In [5]:
class DiffusionDataset(Dataset):
    """Images listed in the sample submission, paired with that file's value rows.

    Args:
        args: parsed options; stored on the instance but not read by this class.
        dataset: split name; stored on the instance but not read by this class.
        transform: callable applied to the opened PIL image. It must return a
            mapping, because the target row is added to it under ``'target'``.
    """

    def __init__(self, args, dataset='valid', transform=None):
        self.args = args
        self.dataset = dataset
        self.transform = transform
        self.df, self.targets = self.load_data()

    def load_data(self):
        """Read the sample submission and return ``(df, targets)``.

        ``df`` has one row per distinct image id (in first-seen order) and
        ``targets`` is the value column reshaped to ``(-1, EMBEDDING_LENGTH)``.
        Row ``i`` of ``targets`` only matches row ``i`` of ``df`` when the CSV lists
        each image's values contiguously — assumed here, not checked.
        """
        self.img_dir = f'{DATA_DIR}/raw/images'
        test_df = pd.read_csv(f'{DATA_DIR}/raw/sample_submission.csv')
        # Split on the LAST underscore only, so an image id that itself contains "_"
        # still yields exactly two columns.
        test_df[[IMG_ID, 'eId']] = test_df[ID].str.rsplit("_", n=1, expand=True)
        df = test_df[[IMG_ID]].drop_duplicates().reset_index(drop=True)
        targets = test_df[TARGET].values.reshape(-1, EMBEDDING_LENGTH)
        df[PROMPT] = ''
        return df, targets

    def __getitem__(self, index):
        """Return the transformed image at `index`, with its target row under ``'target'``."""
        row = self.df.iloc[index]
        img_id = row[IMG_ID]
        img_file = f'{self.img_dir}/{img_id}.png'
        # Run the transform inside the context manager: the file is read while it is
        # still open and the handle is released deterministically afterwards.
        with Image.open(img_file) as image:
            inputs = self.transform(image)
        inputs['target'] = torch.tensor(self.targets[index])
        return inputs

    def __len__(self):
        """Number of distinct images."""
        return len(self.df)

# Predictor

In [6]:
def state_dict_replace(load_state_dict, replace_key='transformer.'):
    """Return a new dict whose keys have every occurrence of `replace_key` removed.

    Values are carried over untouched and the input mapping is not modified.
    If two keys collapse to the same name, the later one wins.
    """
    # str.replace is a no-op for keys that do not contain `replace_key`.
    return {
        name.replace(replace_key, ''): tensor
        for name, tensor in load_state_dict.items()
    }


def load_pretrained(net, pretrained_file, strict=False, can_print=True):
    """Read the checkpoint at `pretrained_file` onto the CPU and copy it into `net`.

    Args:
        net: module to receive the weights.
        pretrained_file: path handed to `torch.load`.
        strict: forwarded to `load_pretrained_state_dict`.
        can_print: when true, log the file being loaded (also forwarded).

    Returns:
        Whatever `load_pretrained_state_dict` returns (the same `net`).
    """
    if can_print:
        print(f'load pretrained file: {pretrained_file}')
    cpu = torch.device('cpu')
    checkpoint = torch.load(pretrained_file, map_location=cpu)
    return load_pretrained_state_dict(net, checkpoint, strict=strict, can_print=can_print)


def load_pretrained_state_dict(net, load_state_dict, strict=False, can_print=True):
    """Copy the matching entries of a checkpoint into `net`.

    Args:
        net: target module, optionally wrapped in `DataParallel` /
            `DistributedDataParallel` (the wrapped `.module` is then loaded).
        load_state_dict: either a state dict, or a mapping holding one under
            ``'state_dict'`` (with an optional ``'epoch'`` entry that is only logged).
        strict: when true, an unknown key or a shape mismatch raises instead of
            being skipped.
        can_print: when true, log the epoch and every skipped key.

    Returns:
        The `net` that was passed in.

    Raises:
        Exception: in strict mode, for a key missing from `net` or a size mismatch.
    """
    if 'epoch' in load_state_dict and can_print:
        epoch = load_state_dict['epoch']
        print(f'load epoch:{epoch:.2f}')
    if 'state_dict' in load_state_dict:
        load_state_dict = load_state_dict['state_dict']

    # isinstance (rather than an exact type comparison) also unwraps subclasses of the
    # parallel wrappers; otherwise their 'module.'-prefixed keys would match nothing and,
    # with strict=False, no weights would be loaded at all.
    is_wrapped = isinstance(net, (DataParallel, DistributedDataParallel))
    target = net.module if is_wrapped else net
    state_dict = target.state_dict()

    # Drop the '_orig_mod.' fragment from checkpoint keys before matching them.
    load_state_dict = state_dict_replace(load_state_dict, '_orig_mod.')
    for key, value in load_state_dict.items():
        if key not in state_dict:
            if strict:
                raise Exception(f'not in {key}')
            if can_print:
                print('not in', key)
            continue
        if value.size() != state_dict[key].size():
            if strict:
                raise Exception(f'size not the same {key}')
            if can_print:
                print('size not the same', key)
            continue
        state_dict[key] = value

    # `state_dict` started as the module's own full state, so this load cannot fail on
    # missing keys; entries absent from the checkpoint simply keep their current values.
    target.load_state_dict(state_dict)
    return net

In [7]:
class Blip2ValidTransformer:
    """Inference-time preprocessing that wraps the BLIP-2 `AutoProcessor`.

    The processor's image size is overridden with a square of side `args.image_size`.
    """

    def __init__(self, args):
        from transformers import AutoProcessor
        self.args = args
        model_path = PRETRAINED_MODEL_NAME_OR_PATH(F"Salesforce/{args.model_name}")
        self.blip2_processor = AutoProcessor.from_pretrained(model_path)
        side = self.args.image_size
        self.blip2_processor.image_processor.size = {'height': side, 'width': side}

    def __call__(self, image, text=None, return_tensors="pt", truncation=True, max_length=None, padding='max_length'):
        """Run the processor on one sample and squeeze dim 0 of every returned entry."""
        processor_kwargs = dict(
            text=text,
            return_tensors=return_tensors,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
        )
        data = self.blip2_processor(image, **processor_kwargs)
        # Snapshot the keys first; only the values are replaced while looping.
        for key in list(data.keys()):
            data[key] = data[key].squeeze(0)
        return data


class CLIPValidTransformer:
    """Inference-time preprocessing that wraps `CLIPProcessor`.

    The shortest edge is set to `args.image_size` and centre cropping is disabled.
    """

    def __init__(self, args):
        self.args = args
        model_path = PRETRAINED_MODEL_NAME_OR_PATH(args.model_name)
        self.clip_processor = CLIPProcessor.from_pretrained(model_path)
        image_processor = self.clip_processor.image_processor
        image_processor.size['shortest_edge'] = args.image_size
        image_processor.do_center_crop = False

    def __call__(self, image):
        """Return ``{'image': <first entry of the processor's pixel_values>}``."""
        processed = self.clip_processor(images=image)
        first_image = processed['pixel_values'][0]
        return {'image': first_image}


In [8]:
class Predictor:
    """Builds the test dataloader, restores a checkpoint and computes prompt embeddings.

    Outputs are written under
    ``{RESULT_DIR}/submissions/{args.out_dir}/fold{args.fold}``, which is created on
    construction.
    """

    def __init__(self, args):
        self.args = args
        self.device = 'cuda'
        self.submit_dir = f'{RESULT_DIR}/submissions/{args.out_dir}/fold{args.fold}'
        os.makedirs(self.submit_dir, exist_ok=True)

    def get_dataloader(self):
        """Return a sequential, non-dropping `DataLoader` over `DiffusionDataset`.

        Raises:
            ValueError: if `args` selects neither a CLIP model nor a ``blip*`` model,
                since no preprocessing is defined for anything else.
        """
        args = self.args
        # set transform
        if args.is_clip_model:
            transform = CLIPValidTransformer(args)
        elif args.model_name.startswith('blip'):
            transform = Blip2ValidTransformer(args)
        else:
            # Previously `transform` stayed unbound here and the failure surfaced as an
            # UnboundLocalError a few lines further down.
            raise ValueError(f'unsupported model_name: {args.model_name}')
        # create dataset, dataloader
        test_dataset = DiffusionDataset(args, dataset=args.dataset, transform=transform)
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(
            test_dataset,
            sampler=test_sampler,
            drop_last=False,
            pin_memory=True,
            batch_size=args.test_batch_size,
            num_workers=args.num_workers,
        )
        return test_dataloader

    def load_model(self, epoch):
        """Build the model named by `args.model`, load checkpoint `epoch`, move it to CUDA in eval mode."""
        args = self.args
        # NOTE(review): `args.model` is evaluated as Python to look up a factory defined
        # in this notebook. That is acceptable for the hard-coded configs used here;
        # never feed it an untrusted string.
        model = eval(args.model)(args)
        model_path = f'{RESULT_DIR}/models/{args.out_dir}/fold{args.fold}/{epoch}.pth'
        load_pretrained(model, model_path, strict=False, can_print=True)
        model.cuda()
        model.eval()
        return model

    def do_predict(self, dataloader, model):
        """Run `model` over `dataloader` without gradients, under autocast.

        Returns:
            ``(prompt_embeddings, imgIds)``: the per-batch outputs stacked with
            `np.vstack`, and the image ids from the dataset's frame.
        """
        model.eval()
        tbar = tqdm(dataloader, file=sys.stdout)
        N = len(dataloader.dataset)
        print(f'dataset len: {N}')
        preds = []
        with torch.no_grad():
            with autocast():
                for data in tbar:
                    logits = get_outputs(data, model)
                    preds.append(logits.detach().cpu().numpy())
        prompt_embeddings = np.vstack(preds)
        imgIds = dataloader.dataset.df[IMG_ID].values
        return prompt_embeddings, imgIds

    def save_embeddings(self, embeddings, ids):
        """Write `embeddings` and `ids` to a compressed ``.npz`` file in the submission folder."""
        file = F"{self.submit_dir}/embeddings_{self.args.dataset}_epoch{self.args.epoch}.npz"
        np.savez_compressed(file, embeddings=embeddings, ids=ids)


def make_parser():
    """Build the argument parser whose defaults configure a prediction run."""
    parser = argparse.ArgumentParser(description='generate_submission')
    # (flags, keyword arguments) pairs, registered in order below.
    option_specs = [
        # Presumably present so that arguments injected by the notebook kernel parse
        # cleanly — verify against how the kernel is launched.
        (('-f',), dict(type=str, default=None)),
        (('--HistoryManager.hist_file',), dict(type=str, default='')),
        # model config
        (('--module', '-m'), dict(type=str, default='basic_net', help='module')),
        (('--model',), dict(type=str, default=None, help='model')),
        (('--model_name',), dict(type=str, default='vit_base_patch16_224', help='model_name')),
        (('--embedding_dim',), dict(default=384, type=int)),
        (('--out_dir',), dict(type=str, default=None, required=False)),
        (('--epoch',), dict(type=str, default='best_ema', required=False)),
        # dataset config
        (('--dataset',), dict(default='valid', type=str)),
        (('--split_type',), dict(default='random', type=str)),
        (('--folds_num',), dict(default=1, type=int)),
        (('--fold',), dict(default=0, type=int)),
        (('--image_size',), dict(default=224, type=int)),
        # predict param
        (('--test_batch_size',), dict(default=16, type=int)),
        (('--gpus',), dict(type=str, default="0", required=False)),
        (('--num_workers',), dict(default=2, type=int)),
        (('--save_embeddings',), dict(default=1, type=int, help='')),
        (('--update',), dict(default=1, type=int, help='')),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser


def get_outputs(data, model):
    """Move the known tensor entries of `data` to the GPU and return `model(data)`.

    `data` is modified in place: only the keys listed below are moved, and only when
    present; any other entry is passed through untouched.
    """
    for key in ('image', 'pixel_values', 'target', 'input_ids', 'attention_mask'):
        if key in data:
            # The former `torch.autograd.Variable(...)` wrapper is a deprecated no-op
            # in current PyTorch (it just returns the tensor), so it is dropped.
            data[key] = data[key].cuda()
    return model(data)


def generate_submission(embeddings, ids):
    """Flatten per-image embeddings into the long two-column submission frame.

    Each id is repeated `EMBEDDING_LENGTH` times and paired with the element indices
    ``0 .. EMBEDDING_LENGTH - 1``, giving keys of the form ``"<id>_<index>"`` aligned
    with ``embeddings.flatten()``.
    """
    repeated_ids = np.repeat(ids, EMBEDDING_LENGTH)
    element_ids = np.tile(range(EMBEDDING_LENGTH), len(ids))
    keys = []
    for img_id, element_id in zip(repeated_ids, element_ids):
        keys.append(str(img_id) + '_' + str(element_id))
    return pd.DataFrame({
        'imgId_eId': keys,
        'val': embeddings.flatten(),
    })


def cosine_similarity(y_trues, y_preds):
    """Mean cosine similarity between the paired rows of `y_trues` and `y_preds`."""
    scores = []
    for y_true, y_pred in zip(y_trues, y_preds):
        # scipy returns the cosine *distance*; similarity is its complement.
        distance = spatial.distance.cosine(y_true, y_pred)
        scores.append(1 - distance)
    return np.mean(scores)
        
def _do_predict(params):
    """Predict with one model, write its prediction CSV and, off-submission, print a score.

    `params` pre-populates the argparse namespace; options it does not set fall back
    to the `make_parser` defaults. Predictions are recomputed when the CSV is missing
    or `args.update == 1`, otherwise the existing CSV is loaded.
    """
    parser = make_parser()
    # No explicit argv is passed, so argparse also reads the process command line on
    # top of the pre-populated namespace.
    args = parser.parse_args(namespace=Namespace(**params))
    args.can_print = True
    args.is_clip_model = args.model_name.startswith('openai/clip') or args.model_name.startswith('laion/CLIP')
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    predictor = Predictor(args)
    pred_path_suffix = f'{args.dataset}_epoch{args.epoch}'
    pred_path = f'{predictor.submit_dir}/{pred_path_suffix}_pred.csv'

    # Built unconditionally: the scoring branch at the bottom needs the dataset's targets.
    dataloader = predictor.get_dataloader()
    if not ope(pred_path) or args.update == 1:
        model = predictor.load_model(args.epoch)
        if len(args.gpus.split(',')) > 1:
            model = DataParallel(model)
        embeddings, ids = predictor.do_predict(dataloader, model)
        pred_df = generate_submission(embeddings, ids)
        if args.save_embeddings:
            predictor.save_embeddings(embeddings, ids)
        pred_df.to_csv(pred_path, index=False)
        print(f'save pred to: {pred_path}')
    else:
        pred_df = pd.read_csv(pred_path)
        print(f'load pred from: {pred_path}')
    print(pred_df.head())

    # Sanity check: every image must carry element indices 0..EMBEDDING_LENGTH-1 in order.
    # NOTE(review): splitting on every "_" assumes image ids contain no underscore.
    check_df = pred_df['imgId_eId'].str.split("_", expand=True)
    check_df.columns = [IMG_ID, 'eId']
    indexs = check_df['eId'].values.reshape(-1, EMBEDDING_LENGTH).astype(np.int16)
    assert np.all(indexs == range(EMBEDDING_LENGTH))

    # Outside a real submission run, score the predictions against the values stored in
    # the sample submission.
    if not IS_KAGGLE_SUBMIT:
        pred_embeddings = pred_df['val'].values.reshape(-1, EMBEDDING_LENGTH)
        pred_ids = check_df[IMG_ID].drop_duplicates().values
        truth_embeddings = dataloader.dataset.targets
        truth_ids = dataloader.dataset.df[IMG_ID].values
        assert np.all(truth_ids == pred_ids)
        score = cosine_similarity(truth_embeddings, pred_embeddings)
        print(f'score: {score:.5f}')

def do_predict(params):
    """Run `_do_predict(params)` in a child process and wait for it to finish.

    Presumably a separate process is used so that everything the model allocated is
    released when it exits — confirm the intent before changing this.

    Raises:
        RuntimeError: if the child exits with a non-zero code. Without this check a
            crashed run went unnoticed and later cells would fail on, or silently
            reuse, whatever prediction files happened to be on disk.
    """
    p = Process(target=_do_predict, args=(params,))
    p.start()
    p.join()
    if p.exitcode != 0:
        raise RuntimeError(f'prediction subprocess failed with exit code {p.exitcode}')

In [9]:
class LoRALinearLayer(nn.Module):
    """Low-rank linear map: ``in_features -> rank -> out_features``, both without bias.

    The up projection starts at zero, so a freshly built layer outputs zeros.

    Raises:
        ValueError: if `rank` exceeds ``min(in_features, out_features)``.
    """

    def __init__(self, in_features, out_features, rank=4):
        super().__init__()

        max_rank = min(in_features, out_features)
        if rank > max_rank:
            raise ValueError(f"LoRA rank {rank} must be less or equal than {max_rank}")

        self.down = nn.Linear(in_features, rank, bias=False)
        self.up = nn.Linear(rank, out_features, bias=False)

        nn.init.normal_(self.down.weight, std=1 / rank)
        nn.init.zeros_(self.up.weight)

    def forward(self, hidden_states):
        """Apply down then up in the weights' dtype; return the result in the input's dtype."""
        input_dtype = hidden_states.dtype
        weight_dtype = self.down.weight.dtype
        projected = self.down(hidden_states.to(weight_dtype))
        return self.up(projected).to(input_dtype)


class LoRALinear(nn.Module):
    """Wraps a linear layer and adds a scaled low-rank correction to its output.

    Note that `rank` acts as a divisor here: the inner LoRA rank is
    ``min(in_features // rank, out_features // rank)``.
    """

    def __init__(self, attn, rank=4, lora_scale=1):
        super().__init__()
        self.attn = attn
        self.rank = rank
        self.lora_scale = lora_scale

        in_features, out_features = attn.in_features, attn.out_features
        lora_rank = min(in_features // rank, out_features // rank)
        self.fc_lora = LoRALinearLayer(in_features, out_features, lora_rank)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``attn(x) + lora_scale * fc_lora(x)``."""
        base_output = self.attn(hidden_states)
        lora_output = self.fc_lora(hidden_states)
        return base_output + self.lora_scale * lora_output

# BLIP-2

In [10]:
class Blip2LoRAAttnProcessor(nn.Module):
    """Self-attention wrapper that adds LoRA terms to the wrapped module's projections.

    The wrapped `attn` supplies `qkv`, `projection`, `num_heads`, `scale`, `dropout`,
    `embed_dim` and `config.hidden_size`. As in `LoRALinear`, `rank` is a divisor: the
    LoRA layers are built with rank ``embed_dim // rank``.
    NOTE(review): the forward pass looks adapted from the upstream attention
    implementation it replaces — confirm before diverging from it.
    """

    def __init__(self, attn, rank=4, lora_scale=1):
        super().__init__()

        self.attn = attn
        config = attn.config
        embed_dim = config.hidden_size

        # small tweak here compared to CLIP, no bias here
        self.qkv_lora = LoRALinearLayer(embed_dim, 3 * embed_dim, embed_dim // rank)
        self.projection_lora = LoRALinearLayer(embed_dim, embed_dim, embed_dim // rank)
        self.rank = rank
        self.lora_scale = lora_scale

    def forward(
            self,
            hidden_states: torch.Tensor,
            head_mask: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # Fused q/k/v projection plus its scaled low-rank correction.
        mixed_qkv = self.attn.qkv(hidden_states) + self.lora_scale * self.qkv_lora(hidden_states)

        # Reshape to (3, batch, heads, time, head_dim) so q, k, v can be indexed on dim 0.
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.attn.num_heads, embed_dim // self.attn.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = (
            mixed_qkv[0],
            mixed_qkv[1],
            mixed_qkv[2],
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        attention_scores = attention_scores * self.attn.scale

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attn.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Back to (batch, time, heads, head_dim), then merge the heads below.
        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)

        new_context_layer_shape = context_layer.size()[:-2] + (self.attn.embed_dim,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        # Output projection plus its scaled low-rank correction.
        output = self.attn.projection(context_layer) + self.lora_scale * self.projection_lora(context_layer)

        # Always a 2-tuple; the second entry is None unless attentions were requested.
        outputs = (output, attention_probs) if output_attentions else (output, None)

        return outputs

In [11]:
def lora_blip2_model(model, lora_scale, lora_rate):
    """Wrap the last ``int(layer_num * lora_rate)`` vision encoder layers with LoRA, in place.

    For each selected layer the self-attention module and both MLP linears
    (`fc1`, `fc2`) are replaced by their LoRA wrappers. Returns None.
    """
    layers = model.vision_model.encoder.layers
    layer_num = len(layers)
    lora_num = int(layer_num * lora_rate)
    for layer_idx in range(layer_num - lora_num, layer_num):
        layer = layers[layer_idx]
        layer.self_attn = Blip2LoRAAttnProcessor(layer.self_attn, rank=4, lora_scale=lora_scale)
        layer.mlp.fc1 = LoRALinear(layer.mlp.fc1, rank=4, lora_scale=lora_scale)
        layer.mlp.fc2 = LoRALinear(layer.mlp.fc2, rank=4, lora_scale=lora_scale)

In [12]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class BasicNet(torch.nn.Module):
    """BLIP-2 vision encoder + Q-Former with a linear head that predicts the embedding.

    The language model and its projection are deleted after loading, the vision model
    is frozen, and LoRA wrappers are applied to the last `lora_rate` share of vision
    layers. The head maps the flattened Q-Former output to `args.embedding_dim`.
    """

    def __init__(self, args, lora_scale=1, lora_rate=0.4):
        super().__init__()
        self.args = args
        # Resolve through the shared helper (as the BLIP-2 transformer does) so that an
        # unregistered name falls back to the identifier itself rather than `None`.
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            PRETRAINED_MODEL_NAME_OR_PATH(F"Salesforce/{args.model_name}"),
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )
        del self.model.language_model
        del self.model.language_projection
        self.model.vision_model.requires_grad_(False)
        # forward() flattens the Q-Former output of shape (batch, num_queries, hidden),
        # so the head's input width is num_queries * hidden. The query count is read
        # from the learned query tokens instead of being hard-coded.
        num_queries = self.model.query_tokens.shape[1]
        self.model.feat_proj = torch.nn.Linear(
            self.model.qformer.config.hidden_size * num_queries, self.args.embedding_dim)
        lora_blip2_model(self.model, lora_scale, lora_rate)

    def forward(self, samples):
        """Return the predicted embedding for ``samples['pixel_values']``."""
        pixel_values = samples['pixel_values']
        # The backbone was loaded in float16.
        pixel_values = pixel_values.half()
        image_embeds = self.model.vision_model(pixel_values, return_dict=True).last_hidden_state
        # Every image token is visible to the Q-Former.
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.model.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.model.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs.last_hidden_state

        feat_pred = self.model.feat_proj(query_output.view(len(query_output), -1))
        return feat_pred


def blip2_model_lora04(args):
    """Factory: `BasicNet` with LoRA scale 1 on the last 40% of vision layers."""
    return BasicNet(args, lora_scale=1, lora_rate=0.4)


In [13]:
# BLIP-2 model: runs in a child process and writes `test_epochen3_ema_pred.csv` under
# this model's submissions folder, which the ensemble cell reads back.
do_predict({
    'module': 'blip2_net', 
    'model': 'blip2_model_lora04', 
    'model_name': 'blip2-opt-2.7b',  
    'out_dir': 'train2m_pretrain6-6m_lora04_blip2_opt_2_7b_224x224_1folds', 
    'epoch': 'en3_ema', 
    'fold': 0, 
    'test_batch_size': 256, 
    'dataset': 'test', 
    'gpus': '0',
})

load pretrained file: /kaggle/working/result/models/train2m_pretrain6-6m_lora04_blip2_opt_2_7b_224x224_1folds/fold0/en3_ema.pth
  0%|          | 0/1 [00:00<?, ?it/s]dataset len: 7
100%|██████████| 1/1 [00:05<00:00,  5.38s/it]
save pred to: /kaggle/working/result/submissions/train2m_pretrain6-6m_lora04_blip2_opt_2_7b_224x224_1folds/fold0/test_epochen3_ema_pred.csv
     imgId_eId       val
0  20057f34d_0  4.132812
1  20057f34d_1  6.898438
2  20057f34d_2  1.640625
3  20057f34d_3  0.455566
4  20057f34d_4 -0.048065
score: 0.68943


# ConvNeXt-XXLarge

In [14]:
def lora_convnext_model(model, args, lora_scale, lora_rate, nfz_rate=0.4, rank=4):
    """Freeze the stem and stages 0-2 of a ConvNeXt trunk, then adapt the tail of stage 2.

    Counting from the end of stage 2: the last ``int(layer_num * nfz_rate)`` blocks
    are made fully trainable again, and the ``int(layer_num * lora_rate)`` blocks
    before them get LoRA wrappers around `mlp.fc1` and `mlp.fc2`. Modifies `model`
    in place and returns None.
    """
    trunk = model.trunk
    trunk.stem.requires_grad_(False)
    for stage_idx in range(3):
        trunk.stages[stage_idx].requires_grad_(False)

    blocks = trunk.stages[2].blocks
    layer_num = len(blocks)
    lora_num = int(layer_num * lora_rate)
    nfz_num = int(layer_num * nfz_rate)

    # Blocks at or beyond this index are unfrozen instead of LoRA-wrapped.
    first_unfrozen = layer_num - nfz_num
    for block_idx in range(first_unfrozen - lora_num, layer_num):
        block = blocks[block_idx]
        if block_idx >= first_unfrozen:
            block.requires_grad_(True)
        else:
            block.mlp.fc1 = LoRALinear(block.mlp.fc1, rank=rank, lora_scale=lora_scale)
            block.mlp.fc2 = LoRALinear(block.mlp.fc2, rank=rank, lora_scale=lora_scale)
    if args.can_print:
        print(f'lora clip_model, scale:{lora_scale} rate:{lora_rate} num:{lora_num}')


In [15]:
class ConvNextClipNet(torch.nn.Module):
    """open_clip visual tower followed by an MLP head that predicts the embedding.

    This class has its own name because a later cell defines a different
    `BaseClipNet` (the ViT variant). With a shared name, re-running the factory below
    after that cell would silently build the wrong class.
    """

    def __init__(self, args, lora_rate=0.0, nfz_rate=0.0, lora_scale=1.0, rank=4, model_type='ViT-L-14', ebd_dim=768):
        super().__init__()
        self.args = args
        # Only the model is needed; the two returned transforms are unused here.
        model, _, _ = open_clip.create_model_and_transforms(
            model_type,
            pretrained=f'{PRETRAINED_MODEL_NAME_OR_PATH(args.model_name)}/open_clip_pytorch_model.bin',
        )
        self.vision_model = model.visual
        fc_dim = 16 * 1024
        self.head = nn.Sequential(
            nn.Linear(ebd_dim, fc_dim),
            nn.BatchNorm1d(fc_dim),
            nn.ReLU(),
            nn.Linear(fc_dim, args.embedding_dim),
        )
        lora_convnext_model(self.vision_model, args, lora_scale, lora_rate, nfz_rate, rank)

    def forward(self, data):
        """Return the predicted embedding for ``data['image']``."""
        out = self.vision_model(data['image'])
        logits = self.head(out)
        return logits


# Backward-compatible alias for code that still refers to the original class name.
BaseClipNet = ConvNextClipNet


def clipnet_convnext_xxlarge_256_lora04_nfz02_rank8(args):
    """Factory: ConvNeXt-XXLarge with lora_rate 0.4, nfz_rate 0.2, rank 8 and 1024-d features."""
    model = ConvNextClipNet(args, lora_rate=0.4, nfz_rate=0.2, model_type='convnext_xxlarge', rank=8, ebd_dim=1024)
    return model


In [16]:
# ConvNeXt-XXLarge CLIP model at 256px; `test_batch_size` is not set here, so the parser
# default (16) applies. Writes this model's prediction CSV for the ensemble cell.
do_predict({
    'out_dir': 'train2m_pretrain6-6m_lora04_laion_CLIP_convnext_xxlarge_laion2B_s34B_b82K_augreg_soup_256x256_1folds',
    'module': 'clip_convnext_net',
    'model': 'clipnet_convnext_xxlarge_256_lora04_nfz02_rank8',
    'model_name': 'laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
    'image_size': 256,
    'gpus': '0',
    'epoch': 'en3_ema',
    'dataset': 'test',
})

lora clip_model, scale:1.0 rate:0.4 num:12
load pretrained file: /kaggle/working/result/models/train2m_pretrain6-6m_lora04_laion_CLIP_convnext_xxlarge_laion2B_s34B_b82K_augreg_soup_256x256_1folds/fold0/en3_ema.pth
  0%|          | 0/1 [00:00<?, ?it/s]dataset len: 7
100%|██████████| 1/1 [00:05<00:00,  5.04s/it]
save pred to: /kaggle/working/result/submissions/train2m_pretrain6-6m_lora04_laion_CLIP_convnext_xxlarge_laion2B_s34B_b82K_augreg_soup_256x256_1folds/fold0/test_epochen3_ema_pred.csv
     imgId_eId        val
0  20057f34d_0   6.277344
1  20057f34d_1  20.250000
2  20057f34d_2   2.728516
3  20057f34d_3  -0.554199
4  20057f34d_4  -2.425781
score: 0.68653


# ViT-Large

In [17]:
class LoRACLIPAttention(nn.Module):
    """CLIP self-attention wrapper that adds LoRA terms to the q/k/v/out projections.

    The wrapped `attn` supplies the base projections, `scale`, `num_heads`, `head_dim`,
    `dropout`, `_shape` and `config.hidden_size`. `rank` is a divisor: each LoRA layer
    is built with rank ``embed_dim // rank``.
    NOTE(review): the forward pass looks adapted from the upstream attention
    implementation it replaces — confirm before diverging from it.
    """

    def __init__(self, attn, rank=4, lora_scale=1):
        super().__init__()
        self.attn = attn
        self.rank = rank
        self.lora_scale = lora_scale

        config = attn.config
        embed_dim = config.hidden_size
        self.k_proj_lora = LoRALinearLayer(embed_dim, embed_dim, embed_dim // rank)
        self.v_proj_lora = LoRALinearLayer(embed_dim, embed_dim, embed_dim // rank)
        self.q_proj_lora = LoRALinearLayer(embed_dim, embed_dim, embed_dim // rank)
        self.out_proj_lora = LoRALinearLayer(embed_dim, embed_dim, embed_dim // rank)

    def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            causal_attention_mask: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        # Each projection is the base linear output plus its scaled LoRA correction;
        # the query is additionally pre-multiplied by the attention scale.
        query_states = (self.attn.q_proj(hidden_states) + self.lora_scale * self.q_proj_lora(
            hidden_states)) * self.attn.scale
        key_states = self.attn._shape(
            self.attn.k_proj(hidden_states) + self.lora_scale * self.k_proj_lora(hidden_states), -1, bsz)
        value_states = self.attn._shape(
            self.attn.v_proj(hidden_states) + self.lora_scale * self.v_proj_lora(hidden_states), -1, bsz)

        # Fold the heads into the batch dimension so a single bmm covers all of them.
        proj_shape = (bsz * self.attn.num_heads, -1, self.attn.head_dim)
        query_states = self.attn._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.attn.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.attn.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.attn.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.attn.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.attn.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.attn.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.attn.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.attn.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        # Dropout follows this wrapper's own training flag, with the wrapped module's rate.
        attn_probs = nn.functional.dropout(attn_weights, p=self.attn.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.attn.num_heads, tgt_len, self.attn.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.attn.num_heads, tgt_len, self.attn.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Unfold the heads and merge them back into the channel dimension.
        attn_output = attn_output.view(bsz, self.attn.num_heads, tgt_len, self.attn.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.attn.out_proj(attn_output) + self.lora_scale * self.out_proj_lora(attn_output)

        # Second entry is None unless `output_attentions` was requested.
        return attn_output, attn_weights_reshaped
    
def lora_clip_model(model, args, lora_scale, lora_rate, nfz_rate=0.4, rank=4):
    """Freeze a CLIP vision model, then adapt the tail of its encoder in place.

    Counting from the end: the last ``int(layer_num * nfz_rate)`` layers are made
    fully trainable again, and the ``int(layer_num * lora_rate)`` layers before them
    get their self-attention replaced by `LoRACLIPAttention`. Returns None.
    """
    for frozen_part in (model.embeddings, model.pre_layrnorm, model.encoder.layers):
        frozen_part.requires_grad_(False)

    layers = model.encoder.layers
    layer_num = len(layers)
    lora_num = int(layer_num * lora_rate)
    nfz_num = int(layer_num * nfz_rate)

    # Layers at or beyond this index are unfrozen instead of LoRA-wrapped.
    first_unfrozen = layer_num - nfz_num
    for layer_idx in range(first_unfrozen - lora_num, layer_num):
        layer = layers[layer_idx]
        if layer_idx >= first_unfrozen:
            layer.requires_grad_(True)
        else:
            layer.self_attn = LoRACLIPAttention(layer.self_attn, rank=rank, lora_scale=lora_scale)
    if args.can_print:
        print(f'lora clip_model, scale:{lora_scale} rate:{lora_rate} num:{lora_num}')


In [18]:
class BaseClipNet(torch.nn.Module):
    """CLIP ViT vision tower followed by an MLP head that predicts the embedding.

    The CLIP model is built from its config with the image size overridden by
    `args.image_size`, loaded non-strictly from ``pytorch_model.pt``, and only its
    vision model is kept.
    """

    def __init__(self, args, lora_rate=0.0, nfz_rate=0.0, lora_scale=1.0, rank=4):
        super().__init__()
        self.args = args

        model_dir = PRETRAINED_MODEL_NAME_OR_PATH(args.model_name)
        config = CLIPConfig.from_pretrained(model_dir)
        config.vision_config.image_size = args.image_size
        clip = CLIPModel(config)
        load_pretrained(clip, f'{model_dir}/pytorch_model.pt', strict=False, can_print=True)
        self.vision_model = clip.vision_model

        # The head's input width follows the vision tower's embedding width.
        ebd_dim = self.vision_model.embeddings.position_embedding.embedding_dim
        fc_dim = 16 * 1024
        head_layers = [
            nn.Linear(ebd_dim, fc_dim),
            nn.BatchNorm1d(fc_dim),
            nn.ReLU(),
            nn.Linear(fc_dim, args.embedding_dim),
        ]
        self.head = nn.Sequential(*head_layers)
        lora_clip_model(self.vision_model, args, lora_scale, lora_rate, nfz_rate, rank)

    def forward(self, data):
        """Return the predicted embedding for ``data['image']``, computed from the pooled output."""
        vision_out = self.vision_model(data['image'])
        return self.head(vision_out['pooler_output'])

def clipnet_lora06(args):
    """Factory: `BaseClipNet` with lora_rate 0.6 and nfz_rate 0.4."""
    return BaseClipNet(args, lora_rate=0.6, nfz_rate=0.4)


In [19]:
# CLIP ViT-L/14 model at 336px; `test_batch_size` is not set here, so the parser default
# (16) applies. Writes this model's prediction CSV for the ensemble cell.
do_predict({
    'out_dir': 'train2m_pretrain6-6m_lora06_hd_openai_clip_vit_large_patch14_336_336x336_1folds',
    'model': 'clipnet_lora06',
    'module': 'clip_net',
    'model_name': 'openai/clip-vit-large-patch14-336',
    'image_size': 336,
    'epoch': 'en3_ema',
    'dataset': 'test',
    'gpus': '0',
})

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


load pretrained file: /kaggle/input/diffusion2023-pretrained/clip-vit-large-patch14-336/pytorch_model.pt
lora clip_model, scale:1.0 rate:0.6 num:14
load pretrained file: /kaggle/working/result/models/train2m_pretrain6-6m_lora06_hd_openai_clip_vit_large_patch14_336_336x336_1folds/fold0/en3_ema.pth
  0%|          | 0/1 [00:00<?, ?it/s]dataset len: 7
100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
save pred to: /kaggle/working/result/submissions/train2m_pretrain6-6m_lora06_hd_openai_clip_vit_large_patch14_336_336x336_1folds/fold0/test_epochen3_ema_pred.csv
     imgId_eId        val
0  20057f34d_0   9.328125
1  20057f34d_1  22.015625
2  20057f34d_2   2.335938
3  20057f34d_3   0.904297
4  20057f34d_4   9.601562
score: 0.69406


# Ensemble

In [20]:
# Ensemble members: `out_dir` selects the model's submissions folder and `weight` is its
# share in the weighted average (the ensemble cell divides by the weight sum).
en_config = [
    {
        'out_dir': 'train2m_pretrain6-6m_lora06_hd_openai_clip_vit_large_patch14_336_336x336_1folds', 
        'weight': 0.25
    }, 
    {
        'out_dir': 'train2m_pretrain6-6m_lora04_blip2_opt_2_7b_224x224_1folds', 
        'weight': 0.35
    }, 
    {
        'out_dir': 'train2m_pretrain6-6m_lora04_laion_CLIP_convnext_xxlarge_laion2B_s34B_b82K_augreg_soup_256x256_1folds', 
        'weight': 0.4
    }
]

In [21]:
# Weighted ensemble: each model's predictions are aligned to the sample submission's row
# order, standardised per image (zero mean, unit std over its EMBEDDING_LENGTH values)
# and averaged with the weights from `en_config`.
dataset = 'test'
meta_df = pd.read_csv(f'{DATA_DIR}/raw/sample_submission.csv')
values_sum = 0.
weight_sum = 0.
for cfg in en_config:
    out_dir = cfg['out_dir']
    weight = cfg['weight']
    fold = 0
    epoch = 'en3_ema'
    pred_path_suffix = f'{dataset}_epoch{epoch}'
    pred_df = pd.read_csv(f"{RESULT_DIR}/submissions/{out_dir}/fold{fold}/{pred_path_suffix}_pred.csv")
    values = meta_df[[ID]].merge(pred_df, on=ID, how='left')[TARGET].values
    # Duplicate ids in a prediction file would add rows; ids missing from it leave NaN
    # behind the left join. Either would corrupt the submission silently, so stop here.
    assert len(values) == len(meta_df), f'unexpected row count after merging {out_dir}'
    assert not np.isnan(values).any(), f'missing or NaN predictions in {out_dir}'
    values = values.reshape(-1, EMBEDDING_LENGTH)
    mean = values.mean(axis=1, keepdims=True)
    std = values.std(axis=1, keepdims=True)
    # The epsilon guards against a zero std for a constant row.
    values = (values - mean) / (std + 1e-8)
    values = values.reshape(-1)
    values_sum += values * weight
    weight_sum += weight
values = values_sum / weight_sum
pred_df = pd.DataFrame({
    ID: meta_df[ID].values,
    TARGET: values,
})
# Outside a real submission run, score the ensemble against the sample submission's values.
if not IS_KAGGLE_SUBMIT:
    pred_embeddings = pred_df[TARGET].values.reshape(-1, EMBEDDING_LENGTH)
    truth_embeddings = meta_df[TARGET].values.reshape(-1, EMBEDDING_LENGTH)
    score = cosine_similarity(truth_embeddings, pred_embeddings)
    print(f'score: {score:.5f}')
pred_df.to_csv('submission.csv', index=False)

score: 0.70161


In [22]:
# Remove everything from the working directory except the final submission file.
# `os.listdir` returns bare names, so each one is resolved against the listed directory
# instead of being interpreted relative to the current working directory.
working_dir = '/kaggle/working'
for name in os.listdir(working_dir):
    path = os.path.join(working_dir, name)
    # `shutil.rmtree` refuses symlinks, so a symlinked directory is unlinked via
    # `os.remove` in the other branch.
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    elif name != 'submission.csv':
        os.remove(path)

In [23]:
# Final check: re-read the written submission and show its shape and first rows.
submit_df = pd.read_csv('./submission.csv')
print(f'submission: {submit_df.shape}')
print(submit_df.head())

submission: (2688, 2)
     imgId_eId       val
0  20057f34d_0  0.544083
1  20057f34d_1  1.333054
2  20057f34d_2  0.204143
3  20057f34d_3  0.021722
4  20057f34d_4  0.083885
