In [2]:
from embedding_reader import EmbeddingReader
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

# Load LAION CLIP Data

In [3]:
# emebeddings are .npy
# metadata is .paraquet
embedding_reader = EmbeddingReader(
    embeddings_folder="https://mystic.the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/",
    metadata_folder="https://mystic.the-eye.eu/public/AI/cah/laion5b/metadata/laion2B-en/",
    meta_columns=['SAMPLE_ID', 'TEXT'],
    file_format="parquet_npy"
)
print("embedding count", embedding_reader.count)
print("dimension", embedding_reader.dimension)
print("total size", embedding_reader.total_size)
print("byte per item", embedding_reader.byte_per_item)

100%|██████████████████████████████████████████████████████████████| 4611/4611 [03:10<00:00, 24.17it/s]

embedding count 116341562
dimension 768
total size 178700639232
byte per item 1536





In [4]:
for emb, meta in embedding_reader(batch_size=10 ** 4, start=0, end=10 ** 4, show_progress=True):
    print(emb.shape)
    print(meta.size)

100%|████████████████████████████████████████████████████████████████████| 1/1 [01:27<00:00, 87.03s/it]

(10000, 768)
30000





In [5]:
meta

Unnamed: 0,SAMPLE_ID,TEXT,i
0,2641080021034,"Blue Beach Umbrellas, Point Of Rocks, Crescent...",0
1,1069682003121,BMW-M2-M-Performance-Dekor-Long-Beach-Blue-05,1
2,748078005989,Becoming More Than a Good Bible Study Girl: Li...,2
3,3203431012473,"""Dynabrade 52632 4-1/2"""" Dia. Right Angle Depr...",3
4,1517065001181,MANETTE XBOX ONE,4
...,...,...,...
9995,524204001683,Sandbags at the flood closeup photo - stock photo,9995
9996,1021435003676,"Jeff Bezos to step down as Amazon CEO, Andy Ja...",9996
9997,3951388000808,Custom Birthday Message Cookies,9997
9998,3930703006714,Swedish alphabet with pictures - Learn swedish...,9998


In [6]:
train_idx = int(len(meta) * 0.8)
x_train, y_train = meta['TEXT'][:train_idx], torch.Tensor(emb[:train_idx])
x_test, y_test = meta['TEXT'][train_idx:], torch.Tensor(emb[train_idx:])

In [7]:
x_train.shape, y_train.size(), x_test.shape, y_test.size()

((8000,), torch.Size([8000, 768]), (2000,), torch.Size([2000, 768]))

# Fine-tune LM -> Predict CLIP image embeddings

In [8]:
from transformers import AutoModel, AutoTokenizer

In [9]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [10]:
epochs = 10
batch_size = 64

## BERT

In [37]:
class CLIPEmbBERT(nn.Module):
    def __init__(self):
        super(CLIPEmbBERT, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-cased")
          ### New layers: None

    def forward(self, tokens, mask):
        cls_emebdding = self.model(tokens, attention_mask=mask).pooler_output

        return cls_emebdding 

In [38]:
bert_model = CLIPEmbBERT()
bert_model.to(device)

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(bert_model.parameters(), lr=0.0001, weight_decay=1e-6)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
x_train_bert = bert_tokenizer.batch_encode_plus(list(x_train), return_tensors='pt', padding=True, add_special_tokens=True)
x_test_bert = bert_tokenizer.batch_encode_plus(list(x_test), return_tensors='pt', padding=True, add_special_tokens=True)

In [40]:
x_train_bert

{'input_ids': tensor([[  101,  2770,  3808,  ...,     0,     0,     0],
        [  101, 13439,   118,  ...,     0,     0,     0],
        [  101,  4108,  9331,  ...,     0,     0,     0],
        ...,
        [  101, 14763, 17704,  ...,     0,     0,     0],
        [  101,   107, 11336,  ...,     0,     0,     0],
        [  101, 20452, 10308,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
bert_model.train()
for epoch in range(epochs):
    
    rand_ids = torch.randperm(x_train_bert['input_ids'].size()[0])
    X = x_train_bert['input_ids'][rand_ids]
    masks = x_train_bert['attention_mask'][rand_ids]
    
    for i in tqdm(range(0, X.size()[0], batch_size)):
        optimizer.zero_grad()
        
        outputs = bert_model(X[i:i+batch_size], mask=masks[i:i+batch_size])
        
        loss = criterion(outputs, y_train[i:i+batch_size])

        loss.backward()
        optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Loss Epoch {epoch}: {loss}")

100%|█████████████████████████████████████████| 125/125 [1:18:42<00:00, 37.78s/it]


Loss Epoch 0: 0.002549578435719013


 18%|███████▋                                  | 23/125 [14:20<1:00:06, 35.35s/it]

In [None]:
bert_model.eval()
bert_model(x_test_bert['input_ids'], mask=x_test_bert['attention_mask'])
    
test_loss = criterion(outputs, y_test_bert)
test_loss

## DistilBERT

In [1]:
class CLIPEmbDistilBERT(nn.Module):
    def __init__(self):
        super(CLIPEmbDistilBERT, self).__init__()
        self.model = AutoModel.from_pretrained("distilbert-base-cased")
          ### New layers: None

    def forward(self, tokens, mask):
        cls_emebdding = self.model(tokens, attention_mask=mask).pooler_output

        return cls_emebdding

NameError: name 'nn' is not defined

In [None]:
distilbert_model = CLIPEmbDistilBERT()
distilbert_model.to(device)

distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(distilbert_model.parameters(), lr=0.0001, weight_decay=1e-6)

In [None]:
x_train_distilbert = distilbert_tokenizer.batch_encode_plus(list(x_train), return_tensors='pt', padding=True, add_special_tokens=True)
x_test_distilbert = distilbert_tokenizer.batch_encode_plus(list(x_test), return_tensors='pt', padding=True, add_special_tokens=True)

In [None]:
distilbert_model.train()
for epoch in range(epochs):
    
    rand_ids = torch.randperm(x_train_distilbert['input_ids'].size()[0])
    X = x_train_distilbert['input_ids'][rand_ids]
    masks = x_train_distilbert['attention_mask'][rand_ids]
    
    for i in tqdm(range(0, X.size()[0], batch_size)):
        optimizer.zero_grad()
        
        outputs = distilbert_model(X[i:i+batch_size], mask=masks[i:i+batch_size])
        
        loss = criterion(outputs, y_train[i:i+batch_size])

        loss.backward()
        optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Loss Epoch {epoch}: {loss}")