In [None]:
import os
import json
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import logging
from torchvision.datasets import CIFAR100
import numpy as np

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Logger shared by the dataset-preparation helpers defined below in this cell.
logger = logging.getLogger(__name__)

def prepare_cifar100_dataset(output_dir: str, num_classes: int = 5, samples_per_class: int = 10):
    """
    Prepares a subset of CIFAR100 dataset for LLAVA fine-tuning.

    The first ``num_classes`` entries of ``dataset.classes`` are used. For each
    of them up to ``samples_per_class`` training images are drawn without
    replacement via ``np.random.choice`` (seed NumPy beforehand for a
    reproducible subset), written as PNG files to ``<output_dir>/images`` and
    described in ``<output_dir>/annotations.json``.

    Args:
        output_dir: Directory to save the processed dataset
        num_classes: Number of classes to include
        samples_per_class: Number of samples per class

    Returns:
        Tuple ``(image_dir, annotations_path)`` with the image directory and
        the path of the written annotations JSON file.
    """
    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(image_dir, exist_ok=True)

    # Download CIFAR100
    logger.info("Downloading CIFAR100 dataset...")
    dataset = CIFAR100(root='./data', train=True, download=True)

    # Get class names
    class_names = dataset.classes[:num_classes]

    # Maps image filename -> category and conversation template.
    annotations = {}

    # Track indices for each class. The labels are read from `dataset.targets`
    # so that no image has to be decoded just to look at its label.
    class_indices = {i: [] for i in range(num_classes)}
    for idx, label in enumerate(dataset.targets):
        if label < num_classes:
            class_indices[label].append(idx)

    # Process each class
    for class_idx, class_name in enumerate(class_names):
        candidates = class_indices[class_idx]

        # Get random samples for this class (fewer if the class is too small).
        selected_indices = np.random.choice(
            candidates,
            size=min(samples_per_class, len(candidates)),
            replace=False
        )

        for sample_idx, idx in enumerate(selected_indices):
            # np.random.choice yields NumPy scalars; index with a plain int.
            image, _ = dataset[int(idx)]

            # Create filename
            image_filename = f"{class_name}_{sample_idx}.png"
            image_path = os.path.join(image_dir, image_filename)

            # Save image
            image.save(image_path)

            # Create annotation
            annotations[image_filename] = {
                "category": class_name,
                "conversations": [
                    {
                        "from": "human",
                        "value": "What category does this image belong to?"
                    },
                    {
                        "from": "assistant",
                        "value": f"This image belongs to category {class_name}."
                    }
                ]
            }

        # Report what was actually written, which can be less than requested.
        logger.info(f"Processed {len(selected_indices)} images for class {class_name}")

    # Save annotations
    annotations_path = os.path.join(output_dir, "annotations.json")
    with open(annotations_path, 'w') as f:
        json.dump(annotations, f, indent=2)

    logger.info(f"Dataset created at {output_dir}")
    logger.info(f"Total classes: {len(class_names)}")
    logger.info(f"Classes included: {', '.join(class_names)}")
    logger.info(f"Total images: {len(annotations)}")

    return image_dir, annotations_path

def verify_dataset(image_dir: str, annotations_path: str):
    """
    Check every image referenced by the annotations file.

    Each key of the annotations JSON is treated as a filename inside
    ``image_dir``. A missing file, or a file for which ``Image.open`` /
    ``verify()`` raises, is logged as an error.

    Returns:
        True when no problem was found, False otherwise.
    """
    with open(annotations_path, 'r') as handle:
        expected = json.load(handle)

    problems = 0
    for name in expected:
        candidate = os.path.join(image_dir, name)

        if os.path.exists(candidate):
            try:
                with Image.open(candidate) as picture:
                    picture.verify()
            except Exception as exc:
                logger.error(f"Invalid image {name}: {str(exc)}")
                problems += 1
        else:
            logger.error(f"Missing image: {name}")
            problems += 1

    return problems == 0

if __name__ == "__main__":
    # Seed both RNG sources up front so the sampled subset is repeatable.
    np.random.seed(42)
    torch.manual_seed(42)

    # Build a small subset: 5 classes with 10 images each.
    output_dir = "cifar100_llava_dataset"
    image_dir, annotations_path = prepare_cifar100_dataset(
        output_dir, num_classes=5, samples_per_class=10
    )

    # Re-open everything that was just written and report the outcome.
    if not verify_dataset(image_dir, annotations_path):
        logger.error("Dataset verification failed!")
    else:
        logger.info("Dataset verified successfully!")

Files already downloaded and verified


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import os
import json
import torch
import logging
import requests
from PIL import Image
from transformers import AutoProcessor
from llama_cpp import Llama
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score
import urllib.request

# Configure the root logger for INFO-level output (basicConfig leaves an
# already-configured root logger untouched, e.g. when an earlier cell ran it).
logging.basicConfig(level=logging.INFO)
# Logger shared by the download and evaluation helpers defined in this cell.
logger = logging.getLogger(__name__)

def download_file(url, filepath):
    """
    Download a file with progress bar

    The payload is streamed into ``<filepath>.part`` and only moved to
    ``filepath`` once the transfer finished, so an interrupted download never
    leaves a truncated file at the final location.

    Args:
        url: Address to fetch; passed to ``urllib.request.urlopen``.
        filepath: Destination path of the downloaded file.

    Raises:
        Whatever ``urlopen``, reading the response or writing the file raises;
        the temporary ``.part`` file is removed before the error propagates.
    """
    block_size = 1024 * 1024  # 1MB chunks
    partial_path = f"{filepath}.part"

    # The context manager closes the HTTP response on every exit path.
    with urllib.request.urlopen(url) as response:
        total_size = int(response.headers.get('content-length', 0))

        try:
            with open(partial_path, 'wb') as f, tqdm(
                total=total_size or None,  # unknown size -> plain byte counter
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,  # binary prefixes to match the 'iB' unit
                desc=f"Downloading {os.path.basename(filepath)}"
            ) as pbar:
                while True:
                    buffer = response.read(block_size)
                    if not buffer:
                        break
                    f.write(buffer)
                    pbar.update(len(buffer))
        except BaseException:
            # Drop the incomplete file so later existence checks are not fooled.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Publish the finished download under its final name.
    os.replace(partial_path, filepath)

def ensure_model_downloaded(model_dir="models"):
    """
    Ensure the GGUF model is downloaded

    Args:
        model_dir: Directory that holds (or will hold) the model file; it is
            created when missing.

    Returns:
        Path of the local model file ``<model_dir>/llava-v1.5-7b-q4_k.gguf``.

    Raises:
        Re-raises any exception from ``download_file`` after logging it
        together with a manual-download hint.
    """
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "llava-v1.5-7b-q4_k.gguf")

    if not os.path.exists(model_path):
        logger.info("Model not found. Downloading...")
        # Raw file contents are served from the Hub's `/resolve/<revision>/`
        # route; `/tree/<revision>/` is the HTML browsing view and answers a
        # file request with 404.
        model_url = "https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/ggml-model-q4_k.gguf"
        try:
            download_file(model_url, model_path)
            logger.info(f"Model downloaded successfully to {model_path}")
        except Exception as e:
            logger.error(f"Error downloading model: {str(e)}")
            logger.error("""
            Please download manually from:
            https://huggingface.co/mys/ggml_llava-v1.5-7b/tree/main
            """)
            raise
    else:
        logger.info(f"Model already exists at {model_path}")

    return model_path

class LlavaGGUFEvaluator:
    """Evaluate a llama.cpp (GGUF) LLaVA model on an image-category dataset.

    The dataset is an annotations JSON mapping image filenames to entries with
    a ``category`` field, plus the directory holding those images.
    """

    def __init__(self, model_path, n_gpu_layers=20, n_ctx=2048, clip_model_path=None):
        """
        Initialize LLAVA GGUF model

        Args:
            model_path: Path of the GGUF language-model file.
            n_gpu_layers: Number of layers to offload, forwarded to ``Llama``.
            n_ctx: Context size, forwarded to ``Llama``.
            clip_model_path: Optional path of the CLIP projector ("mmproj")
                file. When given, a ``Llava15ChatHandler`` is attached so
                image content in chat messages is processed. With the default
                ``None`` the model is loaded exactly as before, without a
                multimodal chat handler.
        """
        logger.info(f"Loading model from {model_path}")

        chat_handler = None
        if clip_model_path is not None:
            # Imported lazily so that text-only use keeps working unchanged.
            from llama_cpp.llama_chat_format import Llava15ChatHandler
            chat_handler = Llava15ChatHandler(clip_model_path=clip_model_path)
        else:
            logger.warning(
                "No clip_model_path given: the model is loaded without a "
                "multimodal chat handler, so image inputs may be ignored."
            )

        self.chat_handler = chat_handler
        self.model = Llama(
            model_path=model_path,
            chat_handler=chat_handler,
            n_gpu_layers=n_gpu_layers,
            n_ctx=n_ctx,
            verbose=False
        )

        # Kept for backward compatibility with code that reads this attribute;
        # predictions no longer run it because its output was never consumed.
        self.processor = AutoProcessor.from_pretrained("liuhaotian/llava-v1.5-7b")

    @staticmethod
    def _image_to_data_uri(image_path):
        """Return the file at ``image_path`` encoded as a base64 ``data:`` URI."""
        import base64
        import mimetypes

        mime_type = mimetypes.guess_type(str(image_path))[0] or "image/png"
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
        return f"data:{mime_type};base64,{encoded}"

    @staticmethod
    def _parse_category(prediction):
        """Extract the category name from a model answer.

        Takes the text after the last ``"category "`` (or the whole answer when
        that marker is absent), then removes surrounding whitespace and
        trailing periods. Non-string or empty answers yield ``"unknown"``.
        """
        if not isinstance(prediction, str):
            return "unknown"
        category = prediction.split("category ")[-1].strip().rstrip(".").strip()
        return category or "unknown"

    def predict_single(self, image_path):
        """Predict category for a single image

        Returns the raw text of the model's first chat-completion choice.
        """
        prompt = "What category does this image belong to?"

        if getattr(self, "chat_handler", None) is not None:
            # Local files are handed to the multimodal handler as data URIs.
            image_url = self._image_to_data_uri(image_path)
        else:
            # Legacy behaviour without a multimodal handler: pass the path on.
            image_url = image_path

        response = self.model.create_chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": image_url}}
                    ]
                }
            ],
            max_tokens=100,
            temperature=0.0
        )

        return response["choices"][0]["message"]["content"]

    def evaluate_dataset(self, image_dir, annotations_path):
        """Evaluate the model on entire dataset

        Args:
            image_dir: Directory containing the annotated images.
            annotations_path: JSON file mapping image filename to an entry
                with a ``category`` field.

        Returns:
            Dict with ``accuracy``, ``classification_report`` and
            ``predictions`` (list of ``(true, predicted)`` pairs).
        """
        with open(annotations_path, 'r') as f:
            annotations = json.load(f)

        true_labels = []
        pred_labels = []

        logger.info("Starting evaluation...")
        for image_file, anno in tqdm(annotations.items()):
            image_path = os.path.join(image_dir, image_file)
            true_category = anno['category']

            prediction = self.predict_single(image_path)
            pred_category = self._parse_category(prediction)

            true_labels.append(true_category)
            pred_labels.append(pred_category)

            logger.debug(f"True: {true_category}, Predicted: {pred_category}")

        accuracy = accuracy_score(true_labels, pred_labels)
        report = classification_report(true_labels, pred_labels)

        return {
            'accuracy': accuracy,
            'classification_report': report,
            'predictions': list(zip(true_labels, pred_labels))
        }

def main():
    """Run the GGUF LLaVA evaluation on the prepared CIFAR-100 subset.

    Makes sure the model file is available, evaluates every annotated image,
    logs accuracy and the classification report, and writes accuracy plus the
    (true, predicted) pairs to ``evaluation_results.json``. Any exception is
    logged and then re-raised.
    """
    # Locations produced by the dataset-preparation step.
    dataset_dir = "cifar100_llava_dataset"
    annotations_path = os.path.join(dataset_dir, "annotations.json")
    image_dir = os.path.join(dataset_dir, "images")

    try:
        # Fetch the weights if needed, then load them.
        # Lower n_gpu_layers if memory becomes a problem.
        evaluator = LlavaGGUFEvaluator(
            model_path=ensure_model_downloaded(),
            n_gpu_layers=20
        )

        logger.info("Starting evaluation...")
        results = evaluator.evaluate_dataset(image_dir, annotations_path)

        # Report the metrics.
        logger.info("\nEvaluation Results:")
        logger.info(f"Accuracy: {results['accuracy']:.4f}")
        logger.info("\nClassification Report:")
        logger.info(results['classification_report'])

        # Persist only the JSON-friendly parts of the results.
        with open('evaluation_results.json', 'w') as f:
            summary = {key: results[key] for key in ('accuracy', 'predictions')}
            json.dump(summary, f, indent=2)

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

# Run the evaluation only when this code is executed as the main program.
if __name__ == "__main__":
    main()

ERROR:__main__:Error downloading model: HTTP Error 404: Not Found
ERROR:__main__:
            Please download manually from:
            https://huggingface.co/mys/ggml_llava-v1.5-7b/tree/main
            
ERROR:__main__:An error occurred: HTTP Error 404: Not Found


HTTPError: HTTP Error 404: Not Found

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, Dataset
from torchvision import models
from transformers import AutoTokenizer

###############################################
# 1. Define a Simple Multimodal LLaVA‑like Model
###############################################
class SimpleLLaVA(nn.Module):
    """Minimal LLaVA-like model: one image token followed by text tokens.

    A ResNet-18 feature vector is projected to ``embed_dim`` and prepended to
    the embedded text. A two-layer transformer decoder with a causal mask
    processes that sequence (the image embedding also serves as cross-attention
    memory), and a linear head maps the outputs to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim):
        """
        Args:
            vocab_size: Number of token ids handled by the embedding and head.
            embed_dim: Width of the shared embedding space (the decoder uses
                8 attention heads).
        """
        super(SimpleLLaVA, self).__init__()
        # Use a pre-trained ResNet-18 as the vision encoder.
        self.vision_encoder = models.resnet18(pretrained=True)
        # Remove the final classification layer.
        self.vision_encoder.fc = nn.Identity()
        # Project the ResNet output (512-dim) to our embedding dimension.
        self.vision_projection = nn.Linear(512, embed_dim)

        # Text embedding layer.
        self.text_embedding = nn.Embedding(vocab_size, embed_dim)

        # A small transformer decoder: 2 layers with 8 heads each.
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=8)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=2)

        # Language modeling head that maps transformer outputs to vocabulary logits.
        self.lm_head = nn.Linear(embed_dim, vocab_size)
        self.embed_dim = embed_dim

    def forward(self, input_ids, attention_mask, images, labels=None):
        """Compute next-token logits (and optionally the LM loss).

        Args:
            input_ids: Token ids, ``[batch, seq_len]``.
            attention_mask: ``[batch, seq_len]`` with non-zero for real tokens
                and 0 for padding, or ``None`` to treat every token as real.
            images: Image batch accepted by the vision encoder.
            labels: Optional ``[batch, seq_len]`` target ids aligned with
                ``input_ids``; positions set to -100 are ignored.

        Returns:
            Dict with ``loss`` (``None`` without labels) and ``logits`` of
            shape ``[batch, seq_len, vocab_size]``, where ``logits[:, i]`` is
            the prediction for token ``i`` given the image and tokens ``< i``.
        """
        # Encode the image.
        vision_features = self.vision_encoder(images)  # [batch, 512]
        vision_embeds = self.vision_projection(vision_features).unsqueeze(1)  # [batch, 1, embed_dim]

        # Embed the combined text (prompt + target).
        text_embeds = self.text_embedding(input_ids)  # [batch, seq_len, embed_dim]

        # Prepend the vision embedding to the text embeddings.
        decoder_input = torch.cat([vision_embeds, text_embeds], dim=1)  # [batch, 1+seq_len, embed_dim]

        # Use the vision embedding (transposed) as memory for the decoder.
        memory = vision_embeds.transpose(0, 1)  # [1, batch, embed_dim]

        # Transformer expects (seq_len, batch, embed_dim)
        decoder_input = decoder_input.transpose(0, 1)  # [1+seq_len, batch, embed_dim]
        seq_len_total = decoder_input.size(0)

        # Boolean causal mask: True marks (query, key) pairs that must not attend.
        tgt_mask = torch.triu(
            torch.ones(seq_len_total, seq_len_total, dtype=torch.bool, device=decoder_input.device),
            diagonal=1,
        )

        # Hide padded text tokens from attention; the image token is always kept.
        tgt_key_padding_mask = None
        if attention_mask is not None:
            keep_text = attention_mask.to(device=decoder_input.device, dtype=torch.bool)
            keep_image = torch.ones(keep_text.size(0), 1, dtype=torch.bool, device=decoder_input.device)
            tgt_key_padding_mask = ~torch.cat([keep_image, keep_text], dim=1)  # [batch, 1+seq_len]

        # Transformer decoding.
        decoder_output = self.transformer_decoder(
            decoder_input,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
        )
        decoder_output = decoder_output.transpose(0, 1)  # [batch, 1+seq_len, embed_dim]

        # Next-token alignment: the output at sequence position j (the image
        # token for j = 0, otherwise text token j-1) predicts text token j, so
        # the last position is dropped. Using the outputs at the text positions
        # themselves would let every prediction see the token it has to predict.
        lm_logits = self.lm_head(decoder_output[:, :-1, :])  # [batch, seq_len, vocab_size]

        loss = None
        if labels is not None:
            # Compute cross-entropy loss over the text tokens.
            # Flatten inputs: shape becomes [batch * seq_len, vocab_size] and labels [batch * seq_len].
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.reshape(-1, lm_logits.size(-1)), labels.reshape(-1))
        return {"loss": loss, "logits": lm_logits}


###############################################
# 2. Setup Tokenizer and Prepare CIFAR‑100 Data
###############################################
# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The GPT-2 tokenizer is used for demonstration purposes. It ships without a
# padding token, so its end-of-sequence token doubles as padding.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Image pipeline: upscale to 224x224 for the ResNet encoder, convert to a
# tensor, then normalise each channel (presumably with the ImageNet statistics
# the pretrained encoder expects - confirm).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Fetch the CIFAR-100 training split first, then the test split.
train_cifar, test_cifar = (
    datasets.CIFAR100(root="./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Custom dataset: for each sample, we build a prompt and a target answer.
class CIFAR100LLAVADataset(Dataset):
    """Wrap a CIFAR-style dataset so each item carries a prompt and an answer.

    The wrapped dataset must yield ``(image, label)`` pairs and expose a
    ``classes`` sequence that maps a label to its class name.
    """

    def __init__(self, cifar_dataset, tokenizer):
        # The tokenizer is only stored here; it is not used by this class.
        self.tokenizer = tokenizer
        self.cifar_dataset = cifar_dataset

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.cifar_dataset)

    def __getitem__(self, idx):
        """Return the image together with the fixed prompt and its answer text."""
        image, class_id = self.cifar_dataset[idx]
        label_text = self.cifar_dataset.classes[class_id]
        return dict(
            image=image,
            prompt="Please classify the following image:",
            target=" Answer: {}".format(label_text),
        )

# Collate function to combine image and text data.
def collate_fn(batch):
    """Combine dataset items into one batch of image and text tensors.

    Prompt and target are tokenized separately (each padded to its longest
    entry in the batch) and concatenated along the sequence dimension.

    Args:
        batch: List of dicts with keys ``image``, ``prompt`` and ``target``.

    Returns:
        Dict with ``images``, ``input_ids``, ``attention_mask`` and ``labels``.
        ``labels`` is -100 on every prompt position and on every padded
        target position, so only real answer tokens can contribute to a loss
        that ignores -100.
    """
    images = torch.stack([item["image"] for item in batch])
    prompts = [item["prompt"] for item in batch]
    targets = [item["target"] for item in batch]

    # Tokenize the prompt (same for all samples).
    prompt_encodings = tokenizer(prompts, padding=True, return_tensors="pt")
    # Tokenize the target answer.
    target_encodings = tokenizer(targets, padding=True, return_tensors="pt")

    # Create combined input_ids by concatenating prompt and target tokens.
    input_ids = torch.cat([prompt_encodings.input_ids, target_encodings.input_ids], dim=1)
    # Create attention_mask for the combined input.
    attention_mask = torch.cat([prompt_encodings.attention_mask, target_encodings.attention_mask], dim=1)

    # Prepare labels: ignore the prompt tokens by setting them to -100.
    labels_prompt = torch.full_like(prompt_encodings.input_ids, -100)
    # Padding must be ignored as well. It is located through the attention
    # mask rather than the token id, because the pad token may be a regular
    # vocabulary token (here it is set to the EOS token).
    labels_target = target_encodings.input_ids.masked_fill(
        target_encodings.attention_mask == 0, -100
    )
    labels = torch.cat([labels_prompt, labels_target], dim=1)

    return {
        "images": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Create dataset instances.
# Wrap both CIFAR splits so that every sample carries its prompt and answer.
train_dataset, test_dataset = (
    CIFAR100LLAVADataset(split, tokenizer) for split in (train_cifar, test_cifar)
)

# Batches of 16; only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16, collate_fn=collate_fn)

###############################################
# 3. Initialize Model and Optimizer
###############################################
embed_dim = 256  # Width of the shared embedding space.
vocab_size = tokenizer.vocab_size
model = SimpleLLaVA(vocab_size, embed_dim)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 3

###############################################
# 4. Fine‑Tuning Loop
###############################################
# Switch to training mode once, before the first epoch.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        # Move every tensor of the batch to the training device.
        images, input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("images", "input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        # The labels already have the prompt tokens masked by the collate step.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            images=images,
            labels=labels,
        )
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean batch loss over the epoch.
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

###############################################
# 5. Save the Fine‑Tuned Model and Tokenizer
###############################################
import os

model_path = "./llava_finetuned_cifar100"
# Nothing before this point creates the output directory, and torch.save does
# not create missing parent directories, so make sure it exists first.
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, "pytorch_model.bin"))
tokenizer.save_pretrained(model_path)
print("Model and tokenizer saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:02<00:00, 70.0MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 129MB/s]


Epoch 1/3, Loss: 1.7707
