In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/json-data/dataset_rsicd.json
/kaggle/input/image-data/RSICD_images/port_26.jpg
/kaggle/input/image-data/RSICD_images/industrial_169.jpg
/kaggle/input/image-data/RSICD_images/center_68.jpg
/kaggle/input/image-data/RSICD_images/storagetanks_212.jpg
/kaggle/input/image-data/RSICD_images/00767.jpg
/kaggle/input/image-data/RSICD_images/playground_264.jpg
/kaggle/input/image-data/RSICD_images/forest_205.jpg
/kaggle/input/image-data/RSICD_images/viaduct_317.jpg
/kaggle/input/image-data/RSICD_images/storagetanks_323.jpg
/kaggle/input/image-data/RSICD_images/mediumresidential_77.jpg
/kaggle/input/image-data/RSICD_images/denseresidential_223.jpg
/kaggle/input/image-data/RSICD_images/farmland_318.jpg
/kaggle/input/image-data/RSICD_images/00266.jpg
/kaggle/input/image-data/RSICD_images/center_102.jpg
/kaggle/input/image-data/RSICD_images/center_189.jpg
/kaggle/input/image-data/RSICD_images/storagetanks_197.jpg
/kaggle/input/image-data/RSICD_images/parking_332.jpg
/kaggle/input/image-

In [2]:
# Install missing packages
!pip install rouge-score

# Download NLTK data
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)




True

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, ViTFeatureExtractor, ViTModel
import json
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# Configuration
CONFIG = {
    'json_path': '/kaggle/input/json-data/dataset_rsicd.json',  # UPDATE THIS PATH
    'images_path': '/kaggle/input/image-data/RSICD_images',                # UPDATE THIS PATH
    'output_folder': 'results',
    'batch_size': 8,
    'learning_rate': 5e-5,  # Optimal learning rate
    'epochs': 15,
    'max_length': 50,
    'patience': 5
}

# Create output directory
os.makedirs(CONFIG['output_folder'], exist_ok=True)

# Dataset class
class CaptionDataset(Dataset):
    def __init__(self, data, img_dir, tokenizer, feature_extractor, max_len=50):
        self.data = data
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        img_path = os.path.join(self.img_dir, item['filename'])
        
        # Load and process image
        try:
            image = Image.open(img_path).convert('RGB')
            img_features = self.feature_extractor(image, return_tensors="pt")['pixel_values'].squeeze(0)
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            # Create dummy image if file not found
            img_features = torch.zeros(3, 224, 224)
        
        # Random caption selection
        caption = np.random.choice(item['sentences'])['raw']
        
        # Tokenize caption
        encoding = self.tokenizer(
            caption, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        return {
            'image': img_features,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'caption': caption
        }

# ViT + GPT2 Model
class ViTGPT2CaptionModel(nn.Module):
    def __init__(self, gpt_model_name='gpt2', vit_model_name='google/vit-base-patch16-224'):
        super().__init__()
        print("Loading ViT model...")
        self.vit = ViTModel.from_pretrained(vit_model_name)
        print("Loading GPT2 model...")
        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt_model_name)
        
        # Projection layer to map ViT features to GPT2 embedding space
        self.projection = nn.Linear(self.vit.config.hidden_size, self.gpt2.config.n_embd)
        self.dropout = nn.Dropout(0.1)
        
        print(f"Model initialized: ViT hidden size: {self.vit.config.hidden_size}, GPT2 embedding size: {self.gpt2.config.n_embd}")
        
    def forward(self, images, input_ids, attention_mask):
        # Get image features from ViT
        img_outputs = self.vit(pixel_values=images)
        img_features = img_outputs.last_hidden_state.mean(dim=1)  # Global average pooling
        img_features = self.projection(img_features)
        img_features = self.dropout(img_features)
        
        # Get text embeddings from GPT2
        text_embeds = self.gpt2.transformer.wte(input_ids)
        
        # Combine image and text features
        img_features = img_features.unsqueeze(1)
        combined_embeds = torch.cat([img_features, text_embeds], dim=1)
        
        # Adjust attention mask for combined input
        img_mask = torch.ones(images.size(0), 1, device=attention_mask.device)
        combined_mask = torch.cat([img_mask, attention_mask], dim=1)
        
        # Create labels with padding for image token
        # Add -100 (ignore index) for the image token position
        img_labels = torch.full((images.size(0), 1), -100, device=input_ids.device)
        combined_labels = torch.cat([img_labels, input_ids], dim=1)
        
        # Forward through GPT2
        outputs = self.gpt2(inputs_embeds=combined_embeds, attention_mask=combined_mask, labels=combined_labels)
        return outputs

# Evaluation Metrics
def calculate_bleu_scores(references, hypotheses):
    """Calculate BLEU-1, BLEU-2, BLEU-3, BLEU-4"""
    smoothie = SmoothingFunction().method4
    bleu_scores = {}
    
    for n in [1, 2, 3, 4]:
        total_score = 0
        valid_pairs = 0
        for ref, hyp in zip(references, hypotheses):
            # Clean and validate text
            ref = ref.strip()
            hyp = hyp.strip()
            
            if len(ref) == 0 or len(hyp) == 0:
                continue
                
            ref_tokens = ref.split()
            hyp_tokens = hyp.split()
            
            if len(ref_tokens) == 0 or len(hyp_tokens) == 0:
                continue
                
            # BLEU weights for n-grams
            weights = [1.0/n] * n + [0.0] * (4-n)
            
            try:
                score = sentence_bleu([ref_tokens], hyp_tokens, weights=weights, smoothing_function=smoothie)
                total_score += score
                valid_pairs += 1
            except:
                continue
                
        bleu_scores[f'BLEU-{n}'] = total_score / max(valid_pairs, 1)
    
    return bleu_scores

def calculate_meteor_score(references, hypotheses):
    """Calculate METEOR score"""
    total_score = 0
    valid_pairs = 0
    
    for ref, hyp in zip(references, hypotheses):
        # Clean and validate text
        ref = ref.strip()
        hyp = hyp.strip()
        
        if len(ref) == 0 or len(hyp) == 0:
            continue
            
        ref_tokens = ref.split()
        hyp_tokens = hyp.split()
        
        if len(ref_tokens) == 0 or len(hyp_tokens) == 0:
            continue
            
        try:
            score = meteor_score([ref_tokens], hyp_tokens)
            total_score += score
            valid_pairs += 1
        except:
            continue
            
    return total_score / max(valid_pairs, 1)

def calculate_rouge_l(references, hypotheses):
    """Calculate ROUGE-L score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    total_score = 0
    valid_pairs = 0
    
    for ref, hyp in zip(references, hypotheses):
        # Clean and validate text
        ref = ref.strip()
        hyp = hyp.strip()
        
        if len(ref) == 0 or len(hyp) == 0:
            continue
            
        try:
            score = scorer.score(ref, hyp)['rougeL'].fmeasure
            total_score += score
            valid_pairs += 1
        except:
            continue
            
    return total_score / max(valid_pairs, 1)

def calculate_cider_score(references, hypotheses):
    """Simplified CIDEr score"""
    # Using a combination of BLEU-4 and ROUGE-L as CIDEr approximation
    bleu_4_score = 0
    rouge_l_score = 0
    valid_pairs = 0
    
    smoothie = SmoothingFunction().method4
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    
    for ref, hyp in zip(references, hypotheses):
        ref = ref.strip()
        hyp = hyp.strip()
        
        if len(ref) == 0 or len(hyp) == 0:
            continue
            
        ref_tokens = ref.split()
        hyp_tokens = hyp.split()
        
        if len(ref_tokens) == 0 or len(hyp_tokens) == 0:
            continue
            
        try:
            # BLEU-4 component
            bleu_score = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
            bleu_4_score += bleu_score
            
            # ROUGE-L component
            rouge_score = scorer.score(ref, hyp)['rougeL'].fmeasure
            rouge_l_score += rouge_score
            
            valid_pairs += 1
        except:
            continue
    
    if valid_pairs == 0:
        return 0.0
    
    # Combine BLEU-4 and ROUGE-L for CIDEr approximation
    return (bleu_4_score + rouge_l_score) / (2 * valid_pairs)

def calculate_spice_score(references, hypotheses):
    """Simplified SPICE score"""
    # Using METEOR as SPICE approximation (both focus on semantic similarity)
    return calculate_meteor_score(references, hypotheses)

def evaluate_all_metrics(references, hypotheses):
    """Calculate all evaluation metrics"""
    metrics = {}
    
    # BLEU scores
    bleu_scores = calculate_bleu_scores(references, hypotheses)
    metrics.update(bleu_scores)
    
    # Other metrics
    metrics['METEOR'] = calculate_meteor_score(references, hypotheses)
    metrics['ROUGE-L'] = calculate_rouge_l(references, hypotheses)
    metrics['CIDEr'] = calculate_cider_score(references, hypotheses)
    metrics['SPICE'] = calculate_spice_score(references, hypotheses)
    
    return metrics

def save_results(metrics_history, train_losses, val_losses, output_folder):
    """Save all results and plots"""
    
    # Save metrics to file
    with open(os.path.join(output_folder, 'metrics_history.json'), 'w') as f:
        json.dump(metrics_history, f, indent=2)
    
    # Save loss history
    loss_data = {
        'train_losses': train_losses,
        'val_losses': val_losses
    }
    with open(os.path.join(output_folder, 'loss_history.json'), 'w') as f:
        json.dump(loss_data, f, indent=2)
    
    # Plot and save loss curves
    plt.figure(figsize=(15, 10))
    
    # Loss curves
    plt.subplot(2, 3, 1)
    plt.plot(train_losses, label='Train Loss', color='blue')
    plt.plot(val_losses, label='Validation Loss', color='red')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    
    # Accuracy curves (inverse of loss)
    plt.subplot(2, 3, 2)
    train_acc = [1/loss for loss in train_losses]
    val_acc = [1/loss for loss in val_losses]
    plt.plot(train_acc, label='Train Accuracy (1/Loss)', color='blue')
    plt.plot(val_acc, label='Val Accuracy (1/Loss)', color='red')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.legend()
    plt.grid(True)
    
    # BLEU scores over time
    if metrics_history:
        epochs = list(range(1, len(metrics_history) + 1))
        plt.subplot(2, 3, 3)
        for bleu_n in ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']:
            scores = [epoch_metrics[bleu_n] for epoch_metrics in metrics_history]
            plt.plot(epochs, scores, label=bleu_n)
        plt.xlabel('Epoch')
        plt.ylabel('BLEU Score')
        plt.title('BLEU Scores Over Time')
        plt.legend()
        plt.grid(True)
        
        # Other metrics
        plt.subplot(2, 3, 4)
        for metric in ['METEOR', 'ROUGE-L']:
            scores = [epoch_metrics[metric] for epoch_metrics in metrics_history]
            plt.plot(epochs, scores, label=metric)
        plt.xlabel('Epoch')
        plt.ylabel('Score')
        plt.title('METEOR and ROUGE-L Scores')
        plt.legend()
        plt.grid(True)
        
        plt.subplot(2, 3, 5)
        for metric in ['CIDEr', 'SPICE']:
            scores = [epoch_metrics[metric] for epoch_metrics in metrics_history]
            plt.plot(epochs, scores, label=metric)
        plt.xlabel('Epoch')
        plt.ylabel('Score')
        plt.title('CIDEr and SPICE Scores')
        plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, 'training_results.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"All results saved in '{output_folder}' folder")

# Main Training Function
def train_model():
    print("Starting Image Captioning Training with ViT + GPT2")
    print(f"Configuration: {CONFIG}")
    
    # Load dataset from JSON
    print(f"Loading dataset from: {CONFIG['json_path']}")
    with open(CONFIG['json_path'], 'r') as f:
        data = json.load(f)
    
    print(f"Total images in dataset: {len(data['images'])}")
    
    # Split dataset
    train_data, val_data = train_test_split(data['images'], test_size=0.2, random_state=42)
    print(f"Train samples: {len(train_data)}, Validation samples: {len(val_data)}")
    
    # Initialize tokenizer and feature extractor
    print("Loading tokenizer and feature extractor...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
    
    # Create datasets
    train_dataset = CaptionDataset(train_data, CONFIG['images_path'], tokenizer, feature_extractor, CONFIG['max_length'])
    val_dataset = CaptionDataset(val_data, CONFIG['images_path'], tokenizer, feature_extractor, CONFIG['max_length'])
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)
    
    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    model = ViTGPT2CaptionModel().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])
    
    # Training tracking
    train_losses, val_losses = [], []
    metrics_history = []
    best_val_loss = float('inf')
    patience_counter = 0
    
    print("Starting training...")
    
    # Training loop
    for epoch in range(CONFIG['epochs']):
        print(f"\nEpoch {epoch+1}/{CONFIG['epochs']}")
        
        # Training phase
        model.train()
        train_loss = 0
        train_pbar = tqdm(train_loader, desc='Training')
        
        for batch in train_pbar:
            optimizer.zero_grad()
            
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(images, input_ids, attention_mask)
            loss = outputs.loss
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            train_pbar.set_postfix({'loss': loss.item()})
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        # Validation phase
        model.eval()
        val_loss = 0
        references, hypotheses = [], []
        
        val_pbar = tqdm(val_loader, desc='Validation')
        with torch.no_grad():
            for batch in val_pbar:
                images = batch['image'].to(device)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                outputs = model(images, input_ids, attention_mask)
                val_loss += outputs.loss.item()
                
                # Generate captions for evaluation
                img_features = model.vit(pixel_values=images).last_hidden_state.mean(dim=1)
                img_embeds = model.projection(img_features).unsqueeze(1)
                
                # Create attention mask for generation
                batch_size = images.size(0)
                attention_mask_gen = torch.ones(batch_size, 1, device=device)
                
                generated = model.gpt2.generate(
                    inputs_embeds=img_embeds,
                    attention_mask=attention_mask_gen,
                    max_length=30,  # Shorter for better quality
                    min_length=5,   # Minimum length
                    num_beams=5,    # More beams for better search
                    early_stopping=True,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    do_sample=False,
                    repetition_penalty=1.2,  # Reduce repetition
                    length_penalty=1.0
                )
                
                for i, caption in enumerate(batch['caption']):
                    references.append(caption)
                    
                    # Decode and clean generated text
                    generated_text = tokenizer.decode(generated[i], skip_special_tokens=True).strip()
                    
                    # Handle empty generations
                    if len(generated_text) == 0:
                        generated_text = "empty"
                    
                    # Remove repetitive patterns
                    words = generated_text.split()
                    if len(words) > 3:
                        # Remove excessive repetition
                        cleaned_words = []
                        for word in words:
                            if len(cleaned_words) < 2 or word != cleaned_words[-1] or word != cleaned_words[-2]:
                                cleaned_words.append(word)
                        generated_text = " ".join(cleaned_words[:15])  # Max 15 words
                    
                    hypotheses.append(generated_text)
                
                val_pbar.set_postfix({'loss': outputs.loss.item()})
        
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        
        # Debug: Print some example generations
        print("\n--- Sample Generations ---")
        for i in range(min(3, len(references))):
            print(f"Reference: {references[i]}")
            print(f"Generated: {hypotheses[i]}")
            print("-" * 50)
        
        # Calculate evaluation metrics
        metrics = evaluate_all_metrics(references, hypotheses)
        metrics_history.append(metrics)
        
        # Print results
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        print(f'BLEU-1: {metrics["BLEU-1"]:.4f}, BLEU-2: {metrics["BLEU-2"]:.4f}, BLEU-3: {metrics["BLEU-3"]:.4f}, BLEU-4: {metrics["BLEU-4"]:.4f}')
        print(f'METEOR: {metrics["METEOR"]:.4f}, ROUGE-L: {metrics["ROUGE-L"]:.4f}, CIDEr: {metrics["CIDEr"]:.4f}, SPICE: {metrics["SPICE"]:.4f}')
        print(f'Total references: {len(references)}, Total hypotheses: {len(hypotheses)}')
        
        # Early stopping and model saving
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model_path = os.path.join(CONFIG['output_folder'], 'best_model.pth')
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'epoch': epoch,
                'metrics': metrics
            }, model_path)
            print(f'Best model saved with validation loss: {val_loss:.4f}')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= CONFIG['patience']:
                print(f'Early stopping triggered at epoch {epoch+1}')
                break
    
    # Save all results
    save_results(metrics_history, train_losses, val_losses, CONFIG['output_folder'])
    
    print(f"\nTraining completed!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"All results saved in '{CONFIG['output_folder']}' folder")

# Run training
if __name__ == "__main__":
    train_model()

Starting Image Captioning Training with ViT + GPT2
Configuration: {'json_path': '/kaggle/input/json-data/dataset_rsicd.json', 'images_path': '/kaggle/input/image-data/RSICD_images', 'output_folder': 'results', 'batch_size': 8, 'learning_rate': 5e-05, 'epochs': 15, 'max_length': 50, 'patience': 5}
Loading dataset from: /kaggle/input/json-data/dataset_rsicd.json
Total images in dataset: 10921
Train samples: 8736, Validation samples: 2185
Loading tokenizer and feature extractor...
Using device: cuda
Loading ViT model...


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading GPT2 model...
Model initialized: ViT hidden size: 768, GPT2 embedding size: 768
Starting training...

Epoch 1/15


Training: 100%|██████████| 1092/1092 [05:17<00:00,  3.44it/s, loss=0.481]
Validation: 100%|██████████| 274/274 [02:20<00:00,  1.95it/s, loss=0.561]



--- Sample Generations ---
Reference: yellow beach is near a piece of green ocean with white waves .
Generated: athewhiteyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellowyellow
--------------------------------------------------
Reference: five white storage tanks are near railways .
Generated: amanyseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseverseversever
--------------------------------------------------
Reference: this is a lake in the city .
Generated: amanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymanymany
--------------------------------------------------
Train Loss: 0.5591, Val Loss: 0.4304
BLEU-1: 0.0000, BLEU-2: 0.0000, BLEU-3: 0.0000, BLEU-4: 0.0000
METEOR: 0.0000, ROUGE-L: 0.0000, CIDEr: 0.0000, SPICE: 0.0000
Total references: 2185, Total hyp

Training:  24%|██▍       | 265/1092 [01:16<03:57,  3.48it/s, loss=0.401]


KeyboardInterrupt: 