In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import json
import pandas as pd
import os
from PIL import Image, ImageFile  # Added ImageFile
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, GPT2LMHeadModel, GPT2Tokenizer
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

# Enable truncated image loading
ImageFile.LOAD_TRUNCATED_IMAGES = True  # Skip corrupt images

# Configuration (update paths as needed)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "/kaggle/input/finetuned_vqa2/pytorch/default/1/earth_vqa_finetuned_model"
test_csv = "/kaggle/input/val-dataset-vqa2/filtered_dataset_test (1).csv"
image_folder = "/kaggle/input/val-dataset/dataset/Val/Val/images_png"
output_json = "/kaggle/working/evaluation_results.json"

# Load test dataset
test_df = pd.read_csv(test_csv)
test_df["image_path"] = test_df["image_name"].apply(
    lambda name: os.path.join(image_folder, name)
)
test_df = test_df[test_df["image_path"].apply(os.path.exists)]
test_df = test_df.reset_index(drop=True)
print(f"Loaded {len(test_df)} test samples")

# Define model with fixed projection output size (768)
class FullVisionTextModel(nn.Module):
    def __init__(self, vision_encoder, llm, projection, num_image_tokens=32):
        super().__init__()
        self.vision_encoder = vision_encoder
        self.llm = llm
        self.projection = projection
        self.num_image_tokens = num_image_tokens
        # Ensure projection output matches GPT-2 hidden size
        self.proj_out_dim = llm.config.hidden_size  # 768 for GPT-2

    def forward(self, images, input_ids, attention_mask=None, labels=None):
        # Process images
        vision_output = self.vision_encoder(images)
        image_features = vision_output.last_hidden_state.mean(dim=1)
        
        # Project features to [batch, num_tokens * 768]
        projected = self.projection(image_features)
        # Reshape to [batch, num_tokens, 768]
        projected = projected.view(-1, self.num_image_tokens, self.proj_out_dim)
        projected = projected * 0.1  # Stabilize training
        
        
        # Process text embeddings
        text_embeds = self.llm.get_input_embeddings()(input_ids)
        
        # Combine embeddings
        inputs_embeds = torch.cat([projected, text_embeds], dim=1)
        
        # Adjust attention mask
        if attention_mask is not None:
            image_mask = torch.ones(
                attention_mask.shape[0], 
                self.num_image_tokens, 
                device=device, 
                dtype=attention_mask.dtype
            )
            attention_mask = torch.cat([image_mask, attention_mask], dim=1)
        
        # Adjust labels if provided
        if labels is not None:
            image_labels = torch.full(
                (labels.shape[0], self.num_image_tokens), 
                -100, 
                device=device,
                dtype=labels.dtype
            )
            labels = torch.cat([image_labels, labels], dim=1)

        # Forward through LLM
        outputs = self.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs

# Load model components
clip_processor = CLIPImageProcessor.from_pretrained(model_path)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Load config
with open(os.path.join(model_path, "model_config.json"), "r") as f:
    config = json.load(f)
    config["num_image_tokens"] = 32

# Initialize models
clip_vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Create projection layer with OUTPUT SIZE 768 (GPT-2 hidden size)
projection = nn.Sequential(
    nn.Linear(config["projection_in_dim"], config["num_image_tokens"] * gpt2_model.config.hidden_size),  # Fixed!
    nn.LayerNorm(config["num_image_tokens"] * gpt2_model.config.hidden_size)
)

# Create full model
full_model = FullVisionTextModel(
    vision_encoder=clip_vision_model,
    llm=gpt2_model,
    projection=projection,
    num_image_tokens=config["num_image_tokens"]
)

# Load weights
state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=device)
if all(k.startswith('module.') for k in state_dict.keys()):
    state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
full_model.load_state_dict(state_dict)
full_model = full_model.to(device).eval()
print("Model loaded")



2025-07-31 15:25:06.233385: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753975506.447428      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753975506.508145      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loaded 6676 test samples


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded


In [3]:
# Optimized generation function (precompute image features)
def generate_answer(image, question, max_new_tokens=100, temperature=0.7):
    # Process image
    pixel_values = clip_processor(image, return_tensors="pt")["pixel_values"].to(device)
    
    # Precompute image features
    with torch.no_grad():
        vision_output = full_model.vision_encoder(pixel_values)
        image_features = vision_output.last_hidden_state.mean(dim=1)
        projected = full_model.projection(image_features)
        projected = projected.view(-1, full_model.num_image_tokens, full_model.proj_out_dim)
        projected = projected * 0.1
    
    # Prepare prompt WITHOUT padding
    prompt = f"User: {question}\nAssistant:"
    inputs = gpt2_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,  # No max_length, no padding
        add_special_tokens=True
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    # Initialize generation
    generated_ids = input_ids.clone()
    current_attention_mask = attention_mask.clone()
    
    # Generate tokens
    for i in range(max_new_tokens):
        # Check sequence length
        seq_len = generated_ids.shape[1]
        if seq_len >= 1024:  # GPT-2 max context
            break
            
        # Get text embeddings
        text_embeds = full_model.llm.get_input_embeddings()(generated_ids)
        
        # Combine with image features
        inputs_embeds = torch.cat([projected, text_embeds], dim=1)
        
        # Create full attention mask
        img_mask = torch.ones(1, full_model.num_image_tokens, device=device)
        full_attention_mask = torch.cat([img_mask, current_attention_mask], dim=1)
        
        # Forward pass
        outputs = full_model.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=full_attention_mask
        )
        
        # Get last token logits
        next_token_logits = outputs.logits[:, -1, :]
        
        # Sample next token
        probs = torch.nn.functional.softmax(next_token_logits / temperature, dim=-1)
        next_token = torch.multinomial(probs, 1)
        
        # Stop if EOS
        # if next_token.item() == gpt2_tokenizer.eos_token_id:
        #     break
        
        # Update sequence
        generated_ids = torch.cat([generated_ids, next_token], dim=1)
        current_attention_mask = torch.cat([
            current_attention_mask, 
            torch.ones(1, 1, device=device)
        ], dim=1)
        
        # # Print progress every 5 tokens
        # if i % 5 == 0:
        #     current_text = gpt2_tokenizer.decode(
        #         generated_ids[0, input_ids.shape[1]:], 
        #         skip_special_tokens=True
        #     )
        #     print(f"Step {i}: {current_text}")
    
    # Extract only the generated response
    full_output = gpt2_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return full_output.split("Assistant:")[-1].strip()

# Updated evaluation function
def evaluate_model(test_df, num_samples=None, display_examples=3):
    if num_samples is not None:
        test_df = test_df.sample(min(num_samples, len(test_df)))
    
    results = []
    correct = 0
    pbar = tqdm(total=len(test_df), desc="Evaluating")
    
    for idx, row in test_df.iterrows():
        try:
            image = Image.open(row["image_path"]).convert("RGB")
            question = row["question"]
            ground_truth = row["answer"]
            
            generated_answer = generate_answer(image, question)
            
            # Simple accuracy check
            gt_keywords = set(ground_truth.lower().split())
            gen_keywords = set(generated_answer.lower().split())
            
            results.append({
                "image_name": row["image_name"],
                "question": question,
                "ground_truth": ground_truth,
                "generated_answer": generated_answer,
            })
            
            # Display examples
            if idx < display_examples:
                plt.figure(figsize=(10, 5))
                plt.imshow(image)
                plt.title(f"Q: {question}\nGT: {ground_truth}\nGen: {generated_answer}")
                plt.axis('off')
                plt.show()
                
        except Exception as e:
            print(f"Skipped {row['image_name']} due to error: {str(e)}")
        pbar.update(1)
    
    pbar.close()
    
    if len(results) == 0:
        print("WARNING: No samples processed successfully")
    
    # Save results
    with open(output_json, "w") as f:
        json.dump({
            "total_samples": len(results),
            "results": results
        }, f, indent=2)
    

# Run evaluation
evaluate_model(test_df, num_samples=100)

Evaluating: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
