Dataset preprocess (COCO)

In [6]:
import os
import json
import random
import shutil
from pathlib import Path
from skimage import io as skio
from PIL import Image

def load_json(path: Path):
    """Load captions JSON file into memory."""
    with path.open("r") as fp:
        captions = json.load(fp)
    print(f"{len(captions)} captions loaded from json")
    return captions

def sample_dataset(captions, fraction=0.1):
    """Randomly sample a fraction of the dataset."""
    sample_size = int(len(captions) * fraction)
    sampled_captions = random.sample(captions, sample_size)
    print(f"Sampled {sample_size} captions from the dataset.")
    return sampled_captions

def create_new_dirs(base_path: Path):
    """Create directories for the new dataset."""
    os.makedirs(base_path / "train2014", exist_ok=True)
    os.makedirs(base_path / "val2014", exist_ok=True)
    os.makedirs(base_path / "annotations", exist_ok=True)
    print(f"Created new directories at {base_path}")

def find_image(img_id: int, train_path: Path, val_path: Path) -> Path:
    """Find image in train2014 or val2014 based on image_id."""
    try:
        img_id = int(img_id)
    except ValueError:
        raise ValueError(f"Image id {img_id} is not a valid integer.")

    filename = f"COCO_train2014_{img_id:012d}.jpg"
    train_image_path = train_path / filename
    if train_image_path.is_file():
        return train_image_path

    filename = f"COCO_val2014_{img_id:012d}.jpg"
    val_image_path = val_path / filename
    if val_image_path.is_file():
        return val_image_path

    raise FileNotFoundError(f"Image {img_id} not found in train/val splits.")

def copy_image(img_path: Path, new_dir: Path):
    """Copy image to the new directory."""
    shutil.copy(img_path, new_dir)

def preprocess_and_copy_dataset(captions, output_dir: Path, train_path: Path, val_path: Path):
    """Preprocess the dataset: sample 1/10th, copy images, and store captions."""
    sampled_captions = sample_dataset(captions, fraction=0.1)
    create_new_dirs(output_dir)

    new_train_dir = output_dir / "train2014"
    new_val_dir = output_dir / "val2014"

    new_captions = []

    for item in sampled_captions:
        img_id = item["image_id"]
        try:
            img_path = find_image(img_id, train_path, val_path)
        except FileNotFoundError:
            continue

        if "train" in str(img_path):
            new_img_dir = new_train_dir
        else:
            new_img_dir = new_val_dir

        copy_image(img_path, new_img_dir)
        new_captions.append(item)

    annotations_path = output_dir / "annotations" / "train_caption.json"
    with annotations_path.open("w") as fp:
        json.dump(new_captions, fp)
    print(f"Saved {len(new_captions)} sampled captions to {annotations_path}")

def main():
    original_train_path = Path("./data/coco/train2014")
    original_val_path = Path("./data/coco/val2014")
    output_dir = Path("./data/coco_sampled")

    caption_file = Path("train_caption.json")
    captions = load_json(caption_file)

    preprocess_and_copy_dataset(captions, output_dir, original_train_path, original_val_path)
    print("Preprocessing and dataset reduction complete!")

if __name__ == "__main__":
    main()


56674 captions loaded from json
Sampled 5667 captions from the dataset.
Created new directories at data/coco_sampled
Saved 0 sampled captions to data/coco_sampled/annotations/train_caption.json
Preprocessing and dataset reduction complete!


Clip preprocess using HugginFace

In [None]:
import os
import json
import pickle
import argparse
from pathlib import Path

import tensorflow as tf
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, TFCLIPModel

import numpy as np

def load_json(path: Path):
    """Read caption json to memory."""
    with path.open("r") as fp:
        captions = json.load(fp)
    print(f"{len(captions)} captions loaded from json")
    return captions

def find_image(img_id: int) -> Path:
    """
    Given a COCO image id, locate the JPG file in either train2014 or val2014 folder.
    """
    try:
        img_id = int(img_id)
    except ValueError:
        raise ValueError(f"Image id {img_id} is not a valid integer.")

    filename = f"COCO_train2014_{img_id:012d}.jpg"
    train_path = Path("./data/coco_sampled/train2014") / filename
    if train_path.is_file():
        return train_path

    filename = f"COCO_val2014_{img_id:012d}.jpg"
    val_path = Path("./data/coco_sampled/val2014") / filename
    if val_path.is_file():
        return val_path

    #raise FileNotFoundError(f"Image {img_id} not found in train/val splits.")

def extract_embeddings(model, processor, captions) -> tuple[list[np.ndarray], list[dict]]:
    """
    Iterate over captions, load corresponding images,
    and compute CLIP image embeddings.
    """
    features, meta = [], []

    for idx in tqdm(range(len(captions))):
        item = captions[idx]
        img_path = find_image(item["image_id"])
        if img_path is None:
            tqdm.write(f"Skipping missing image {item['image_id']}")
            continue

        image = Image.open(img_path).convert("RGB")
        inputs = processor(images=image, return_tensors="tf")

        outputs = model.get_image_features(**inputs)
        emb = outputs.numpy()

        item["clip_embedding"] = idx
        features.append(emb)
        meta.append(item)

        # checkpoint every 10k samples
        if (idx + 1) % 10_000 == 0:
            yield idx + 1, features, meta
            features, meta = [], []

    # final remainder
    yield len(captions), features, meta

def dump_checkpoint(out_path: Path, feats: list[np.ndarray], infos: list[dict]):
    """Serialize current batch to a pkl file."""
    tensor_cat = np.concatenate(feats, axis=0)
    payload = {"clip_embedding": tensor_cat, "captions": infos}
    with out_path.open("wb") as fp:
        pickle.dump(payload, fp)

def run(model_name: str):
    # Use GPU if available
    print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

    from transformers import TFAutoModel, AutoTokenizer

    model_name = "openai/clip-vit-base-patch32"  # Correct Hugging Face model ID

    model = TFAutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    processor = CLIPProcessor.from_pretrained(model_name)

    output_pkl = Path(f"./data/coco/oscar_split_{model_name.replace('/', '_')}_train.pkl")

    caption_file = Path("train_caption.json")
    captions = load_json(caption_file)

    progress = 0
    for progress, feats, infos in extract_embeddings(model, processor, captions):
        dump_checkpoint(output_pkl, feats, infos)

    print("Done")
    print(f"{progress} embeddings saved to {output_pkl}")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--clip_model_name', type=str, default="ViT-B/32")
    args = parser.parse_known_args()[0]

    # call your main function here with args
    clip_model_name = args.clip_model_name
    run(args.clip_model_name)

predict.py:

In [5]:
import tensorflow as tf
from transformers import CLIPProcessor, TFAutoModelForImageClassification, GPT2Tokenizer
import numpy as np
import skimage.io as io
import PIL.Image
!pip install cog
!pip install --upgrade cog
#import cog

# Model weights and paths
WEIGHTS_PATHS = {
    "coco": "coco_weights.h5",
    "conceptual-captions": "conceptual_weights.h5",
}

class Predictor:
    def __init__(self):
        """Load the model into memory to make running multiple predictions efficient"""
        self.device = "GPU"  # or "CPU" if needed
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model = TFAutoModelForImageClassification.from_pretrained("openai/clip-vit-base-patch32")
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        self.models = {}
        self.prefix_length = 10

    def predict(self, image_path: str):
        """Run inference on an image."""
        # Load and preprocess the image
        image = io.imread(image_path)
        inputs = self.processor(images=image, return_tensors="pt", padding=True)

        # Run the image through the model
        with tf.device(self.device):
            outputs = self.clip_model(**inputs)

        return outputs



class ClipCaptionModel(tf.keras.Model):
    def __init__(self, prefix_length: int, prefix_size: int = 512):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained("gpt2")
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        self.clip_project = tf.keras.layers.Dense(self.gpt_embedding_size * prefix_length)

    def call(self, tokens, prefix, mask=None, labels=None):
        embedding_text = self.gpt.transformer.wte(tokens)
        prefix_projections = self.clip_project(prefix)
        embedding_cat = tf.concat([prefix_projections, embedding_text], axis=1)

        if labels is not None:
            dummy_token = tf.zeros_like(tokens)
            labels = tf.concat([dummy_token, tokens], axis=1)

        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out


def generate_beam(
    model,
    tokenizer,
    beam_size=5,
    prompt=None,
    embed=None,
    entry_length=67,
    temperature=1.0,
    stop_token=".",
):
    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]
    tokens = None
    scores = None
    device = next(model.parameters()).device
    seq_lengths = tf.ones(beam_size)
    is_stopped = tf.zeros(beam_size, dtype=tf.bool)

    if embed is not None:
        generated = embed
    else:
        tokens = tokenizer.encode(prompt)
        tokens = tf.convert_to_tensor(tokens)
        generated = model.gpt.transformer.wte(tokens)

    for i in range(entry_length):
        outputs = model.gpt(inputs_embeds=generated)
        logits = outputs.logits
        logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
        logits = tf.nn.log_softmax(logits, axis=-1)

        if scores is None:
            scores, next_tokens = tf.math.top_k(logits, k=beam_size)
            generated = tf.expand_dims(generated, axis=0)
            next_tokens, scores = tf.transpose(next_tokens), scores[0]

            tokens = next_tokens
        else:
            logits[is_stopped] = -float("inf")
            logits[is_stopped, 0] = 0
            scores_sum = scores[:, None] + logits
            seq_lengths[~is_stopped] += 1
            scores_sum_average = scores_sum / seq_lengths[:, None]
            scores_sum_average, next_tokens = tf.math.top_k(scores_sum_average, k=beam_size)

            next_tokens_source = next_tokens // scores_sum.shape[1]
            seq_lengths = seq_lengths[next_tokens_source]
            next_tokens = next_tokens % scores_sum.shape[1]
            tokens = next_tokens

            generated = tf.concat([generated, next_token_embed], axis=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break

    output_texts = [tokenizer.decode(output) for output in tokens]
    return output_texts


def generate2(
    model,
    tokenizer,
    tokens=None,
    prompt=None,
    embed=None,
    entry_count=1,
    entry_length=67,
    top_p=0.8,
    temperature=1.0,
    stop_token=".",
):
    model.eval()
    generated_list = []
    stop_token_index = tokenizer.encode(stop_token)[0]

    if embed is not None:
        generated = embed
    else:
        tokens = tokenizer.encode(prompt)
        tokens = tf.convert_to_tensor(tokens)
        generated = model.gpt.transformer.wte(tokens)

    for i in range(entry_length):
        outputs = model.gpt(inputs_embeds=generated)
        logits = outputs.logits
        logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

        sorted_logits, sorted_indices = tf.math.top_k(logits, k=beam_size)
        cumulative_probs = tf.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1]
        sorted_indices_to_remove[:, 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[:, indices_to_remove] = -float("Inf")

        next_token = tf.argmax(logits, axis=-1)
        next_token_embed = model.gpt.transformer.wte(next_token)

        tokens = tf.concat([tokens, next_token], axis=1)
        generated = tf.concat([generated, next_token_embed], axis=1)

        if next_token == stop_token_index:
            break

    output_text = tokenizer.decode(tokens)
    generated_list.append(output_text)

    return generated_list[0]




train.py:

In [37]:
import tensorflow as tf
from transformers import GPT2Model, GPT2Tokenizer, CLIPModel, CLIPProcessor
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import pickle
import os
import argparse
import json
from typing import Tuple, Optional
from typing import Union
from enum import Enum
from tensorflow.keras import layers, Model
from tensorflow.keras.activations import tanh, relu
from tensorflow.keras.optimizers import Adam
import sys
from tqdm import tqdm

class MappingType(Enum):
    MLP = 'mlp'
    Transformer = 'transformer'


class ClipCocoDataset:
    def __init__(self, data_path: str, prefix_length: int, normalize_prefix=False):
        # Initialize tokenizer and dataset parameters
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        self.prefix_length = prefix_length
        self.normalize_prefix = normalize_prefix

        # Load dataset
        with open(data_path, 'rb') as f:
            all_data = pickle.load(f)
        print("Data size is %0d" % len(all_data["clip_embedding"]))

        self.prefixes = all_data["clip_embedding"]
        captions_raw = all_data["captions"]
        self.image_ids = [caption["image_id"] for caption in captions_raw]
        self.captions = [caption['caption'] for caption in captions_raw]

        if os.path.isfile(f"{data_path[:-4]}_tokens.pkl"):
            with open(f"{data_path[:-4]}_tokens.pkl", 'rb') as f:
                self.captions_tokens, self.caption2embedding, self.max_seq_len = pickle.load(f)
        else:
            self.captions_tokens = []
            self.caption2embedding = []
            max_seq_len = 0
            for caption in captions_raw:
                # Tokenize the captions using Hugging Face tokenizer
                tokens = self.processor.tokenizer.encode(caption['caption'], truncation=True, padding='max_length')
                self.captions_tokens.append(tokens)
                self.caption2embedding.append(caption["clip_embedding"])
                max_seq_len = max(max_seq_len, len(tokens))

            with open(f"{data_path[:-4]}_tokens.pkl", 'wb') as f:
                pickle.dump([self.captions_tokens, self.caption2embedding, max_seq_len], f)

        # Calculate the maximum sequence length
        all_len = np.array([len(self.captions_tokens[i]) for i in range(len(self))], dtype=float)
        self.max_seq_len = min(int(all_len.mean() + all_len.std() * 10), int(all_len.max()))
        self.max_length = max(len(caption) for caption in self.captions_tokens)

    def __len__(self):
        return len(self.captions_tokens)

    def pad_tokens(self, tokens):
        # Padding logic to handle sequences
        padding = self.max_length - len(tokens)  # Calculate padding
        # Convert the padding list to a Tensor
        padding_tensor = tf.zeros(padding, dtype=tf.int32)

        # Concatenate the original tokens with the padding
        tokens = tf.concat([tokens, padding_tensor], axis=0)

        return tokens


    def __getitem__(self, item):
        tokens, mask = self.pad_tokens(self.captions_tokens[item])
        prefix = self.prefixes[self.caption2embedding[item]]

        # Normalize the prefix if required
        if self.normalize_prefix:
            prefix = prefix / np.linalg.norm(prefix)  # Normalize prefix

        return {
            'tokens': tf.convert_to_tensor(tokens, dtype=tf.int32),
            'mask': tf.convert_to_tensor(mask, dtype=tf.float32),
            'prefix': tf.convert_to_tensor(prefix, dtype=tf.float32)
        }

    def create_tf_dataset(self, batch_size=32):
        # Create a TensorFlow dataset from the data
        def generator():
            for i in range(len(self)):
                yield self[i]

        dataset = tf.data.Dataset.from_generator(generator,
                                                 output_signature={
                                                     'tokens': tf.TensorSpec(shape=(self.max_seq_len,), dtype=tf.int32),
                                                     'mask': tf.TensorSpec(shape=(self.max_seq_len,), dtype=tf.float32),
                                                     'prefix': tf.TensorSpec(shape=(512,), dtype=tf.float32)  # Adjust the prefix dimension
                                                 })

        # Batching, shuffling, and prefetching
        dataset = dataset.batch(batch_size).shuffle(1000).prefetch(tf.data.experimental.AUTOTUNE)
        return dataset

    @property
    def element_spec(self):
        # Define the shape and dtype of each element in the dataset
        return {
            'tokens': tf.TensorSpec(shape=(self.max_seq_len,), dtype=tf.int32),
            'mask': tf.TensorSpec(shape=(self.max_seq_len,), dtype=tf.float32),
            'prefix': tf.TensorSpec(shape=(512,), dtype=tf.float32)  # Adjust the prefix dimension
        }


class MLP(Model):
    def __init__(self, sizes, bias=True, act=tanh):
        super(MLP, self).__init__()
        layers_list = []
        for i in range(len(sizes) - 1):
            layers_list.append(layers.Dense(sizes[i + 1], use_bias=bias))
            if i < len(sizes) - 2:
                layers_list.append(act)
        self.model = tf.keras.Sequential(layers_list)

    def call(self, x):
        return self.model(x)


# MlpTransformer Model in TensorFlow
class MlpTransformer(Model):
    def __init__(self, in_dim, h_dim, out_d=None, act=relu, dropout=0.):
        super(MlpTransformer, self).__init__()
        out_d = out_d if out_d is not None else in_dim
        self.fc1 = layers.Dense(h_dim)
        self.act = act
        self.fc2 = layers.Dense(out_d)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


# MultiHeadAttention in TensorFlow
class MultiHeadAttention(Model):
    def __init__(self, dim_self, dim_ref, num_heads, bias=True, dropout=0.):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        head_dim = dim_self // num_heads
        self.scale = head_dim ** -0.5

        self.to_queries = layers.Dense(dim_self, use_bias=bias)
        self.to_keys_values = layers.Dense(dim_ref * 2, use_bias=bias)
        self.project = layers.Dense(dim_self)
        self.dropout = layers.Dropout(dropout)

    def call(self, x, y=None, mask=None):
        y = y if y is not None else x
        b, n, c = x.shape
        _, m, d = y.shape
        # Calculate queries and keys-values
        queries = self.to_queries(x)
        queries = tf.reshape(queries, (b, n, self.num_heads, c // self.num_heads))
        keys_values = self.to_keys_values(y)
        keys_values = tf.reshape(keys_values, (b, m, 2, self.num_heads, c // self.num_heads))
        keys, values = keys_values[:, :, 0], keys_values[:, :, 1]

        # Calculate attention
        attention = tf.einsum('bnhd,bmhd->bnmh', queries, keys) * self.scale
        if mask is not None:
            if len(mask.shape) == 2:
                mask = tf.expand_dims(mask, 1)
            attention = attention + (mask * -1e9)

        attention = tf.nn.softmax(attention, axis=2)

        # Apply attention to values
        out = tf.einsum('bnmh,bmhd->bnhd', attention, values)
        out = tf.reshape(out, (b, n, c))
        out = self.project(out)
        out = self.dropout(out)
        return out, attention


class ClipCaptionModel(Model):
    def get_dummy_token(self, batch_size: int) -> tf.Tensor:
        return tf.zeros((batch_size, self.prefix_length), dtype=tf.int64)

    def call(self, tokens: tf.Tensor, prefix: tf.Tensor, mask: Optional[tf.Tensor] = None,
             labels: Optional[tf.Tensor] = None):
        # Embedding for tokens
        embedding_text = self.gpt.transformer.wte(tokens)

        # Prefix projection
        prefix_projections = self.clip_project(prefix)
        prefix_projections = tf.reshape(prefix_projections, (-1, self.prefix_length, self.gpt_embedding_size))

        # Concatenate prefix projections with the token embeddings
        embedding_cat = tf.concat([prefix_projections, embedding_text], axis=1)

        # Prepare labels by adding dummy token if labels are provided
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0])
            labels = tf.concat([dummy_token, tokens], axis=1)

        # Forward pass through the GPT model
        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out

    def __init__(self, prefix_length: int, clip_length: Optional[int] = None, prefix_size: int = 512,
                 num_layers: int = 8, mapping_type='MLP'):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = TFGPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.embeddings.shape[1]


        # Mapping type condition for the prefix projection
        if mapping_type == 'MLP':
            self.clip_project = self.build_mlp(prefix_size)
        else:
            self.clip_project = self.build_transformer_mapper(prefix_size, self.gpt_embedding_size, prefix_length,
                                                                clip_length, num_layers)

    def build_mlp(self, prefix_size: int):
        model = tf.keras.Sequential([
            layers.Dense((self.gpt_embedding_size * self.prefix_length) // 2, activation='relu'),
            layers.Dense(self.gpt_embedding_size * self.prefix_length)
        ])
        return model

    def build_transformer_mapper(self, prefix_size: int, gpt_embedding_size: int, prefix_length: int,
                                  clip_length: Optional[int], num_layers: int):
        # Custom transformer mapper (you may want to define it or use an equivalent)
        # This is just a placeholder, you can adjust it as needed
        return tf.keras.Sequential([
            layers.Dense(prefix_size, activation='relu'),
            layers.LayerNormalization(),
            layers.MultiHeadAttention(num_heads=4, key_dim=gpt_embedding_size),
            layers.Dropout(0.1),
            layers.Dense(prefix_size)
        ])


class ClipCaptionPrefix(ClipCaptionModel):

    def train(self, mode: bool = True):
        super(ClipCaptionPrefix, self).train(mode)
        self.gpt.trainable = False  # Freeze GPT layers
        return self



def save_config(args: argparse.Namespace):
    config = {}
    for key, item in args._get_kwargs():
        config[key] = item
    out_path = os.path.join(args.out_dir, f"{args.prefix}.json")
    with open(out_path, 'w') as outfile:
        json.dump(config, outfile)


def load_model(config_path: str, epoch_or_latest: Union[str, int] = '_latest'):
    with open(config_path) as f:
        config = json.load(f)
    parser = argparse.ArgumentParser()
    parser.set_defaults(**config)
    args = parser.parse_args()
    if type(epoch_or_latest) is int:
        epoch_or_latest = f"-{epoch_or_latest:03d}"
    model_path = os.path.join(args.out_dir, f"{args.prefix}{epoch_or_latest}.pt")
    model = ClipCaptionGPT2Model(args.prefix_length)
    if os.path.isfile(model_path):
        print(f"loading model from {model_path}")
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    else:
        print(f"{model_path} does not exist")
    return model, parser


def train(dataset, model, args, lr: float = 2e-5, warmup_steps: int = 5000, output_dir: str = ".", output_prefix: str = ""):
    batch_size = args.bs
    epochs = args.epochs

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Initialize optimizer
    optimizer = Adam(learning_rate=lr)

    # Assuming dataset is already in tf.data.Dataset format
    train_dataset = dataset.create_tf_dataset(batch_size=batch_size)

    # Define the number of steps per epoch
    steps_per_epoch = args.epochs  # If you have a fixed number of steps per epoch, specify this value

    # Training loop
    for epoch in range(epochs):
        print(f">>> Training epoch {epoch}")
        sys.stdout.flush()
        progress = tqdm(total=steps_per_epoch, desc=output_prefix)

        for idx, (images, text_inputs, labels) in enumerate(train_dataset):
            with tf.GradientTape() as tape:
                # Model forward pass
                outputs = model(input_ids=text_inputs, pixel_values=images, labels=labels, return_dict=True)
                logits_per_image = outputs.logits_per_image  # Image-text similarity
                logits_per_text = outputs.logits_per_text  # Text-image similarity
                loss = outputs.loss  # Contrastive loss

                # Compute gradients and update weights
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            progress.set_postfix({"loss": loss.numpy()})
            progress.update()

            if (idx + 1) % 100 == 0:  # Adjust checkpoint frequency
                model.save_pretrained(os.path.join(output_dir, f"{output_prefix}_latest"))

        progress.close()

        # Save the model every 'save_every' epochs
        if epoch % args.save_every == 0 or epoch == epochs - 1:
            model.save_pretrained(os.path.join(output_dir, f"{output_prefix}-{epoch:03d}"))

    return model



def main():
    class Args:
        def __init__(self):
            self.data = 'oscar_split_ViT-B_32_train.pkl'
            self.out_dir = './checkpoints'
            self.prefix = 'coco_prefix'
            self.epochs = 10
            self.save_every = 1
            self.prefix_length = 10
            self.prefix_length_clip = 10
            self.bs = 40
            self.only_prefix = False
            self.mapping_type = 'mlp'
            self.num_layers = 8
            self.is_rn = False
            self.normalize_prefix = False

    args = Args()

    # Dataset and model setup
    dataset = ClipCocoDataset(args.data, args.prefix_length, normalize_prefix=args.normalize_prefix)
    prefix_dim = 640 if args.is_rn else 512
    args.mapping_type = {'mlp': MappingType.MLP, 'transformer': MappingType.Transformer}[args.mapping_type]

    if args.only_prefix:
        model = ClipCaptionPrefix(args.prefix_length, clip_length=args.prefix_length_clip,
                                  prefix_size=prefix_dim, num_layers=args.num_layers,
                                  mapping_type=args.mapping_type)
        print("Train only prefix")
    else:
        model = ClipCaptionModel(args.prefix_length, clip_length=args.prefix_length_clip,
                                 prefix_size=prefix_dim, num_layers=args.num_layers,
                                 mapping_type=args.mapping_type)
        print("Train both prefix and GPT")

    # Start training
    train(dataset, model, args, output_dir=args.out_dir, output_prefix=args.prefix)

if __name__ == '__main__':
    main()


Data size is 56674


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Train both prefix and GPT
>>> Training epoch 0




coco_prefix:   0%|          | 0/10 [00:00<?, ?it/s][A[A

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} ValueError: too many values to unpack (expected 2)
Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "<ipython-input-37-618bae5c5c12>", line 94, in generator
    yield self[i]
          ~~~~^^^

  File "<ipython-input-37-618bae5c5c12>", line 77, in __getitem__
    tokens, mask = self.pad_tokens(self.captions_tokens[item])
    ^^^^^^^^^^^^

ValueError: too many values to unpack (expected 2)


	 [[{{node PyFunc}}]] [Op:IteratorGetNext] name: 