In [8]:
# Cell 1: Install Java 17
# language_tool_python requires Java >= 17. Colab often defaults to Java 11.
!apt-get update
!apt-get install -y openjdk-17-jdk-headless
!update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java
!java -version

print("Java 17 installed and set as default.")

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
            Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 3,632 B/3,632 B 1000% [Waiting for headers] [Waiting for headers] [1 InRelease 3,632 B/3,632 B 1000% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [3 InRelease 14.2 kB/129 kB 11%] [Connected to r2u.sta                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,853 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRele

In [9]:
# Cell 2: Install Required Python Libraries

!pip install language_tool_python
!pip install transformers # For potential VLM use (e.g., BLIP/GIT if running locally)
!pip install openai       # For GPT-4 API (if using)
!pip install nltk         # For tokenization, though simple split() is used for this example
!pip install scikit-learn # For TF-IDF and cosine similarity
!pip install tqdm         # For progress bar

print("Required Python libraries installed.")

Required Python libraries installed.


In [10]:
# Cell 3: Import Libraries and Mount Google Drive

import json
import re
import language_tool_python
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm # For progress bar in Colab

# Optional: For LLM/VLM integration later, if running locally (e.g., BLIP/GIT)
# from transformers import pipeline, set_seed
# from PIL import Image

# For OpenAI API (if using)
# import openai
# import os

# Mount Google Drive (Colab only) and choose the input/output locations.
# Outside Colab the import fails and plain relative paths are used instead.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
    # >>>>>> IMPORTANT: ADJUST THESE PATHS so they match where your files actually live <<<<<<
    json_file_path, output_file_path = (
        '/content/dataset_rsicd.json',                           # <--- ADJUST THIS!
        '/content/drive/MyDrive/captions_train_augmented.json',  # <--- ADJUST THIS!
    )
except ImportError:
    print("Not in Google Colab environment. Please adjust file paths.")
    # Example locations for local execution
    json_file_path, output_file_path = (
        'dataset_rsicd.json',
        'captions_train_augmented.json',
    )

# Echo the effective configuration so a wrong path is spotted before processing starts.
print(
    f"Input JSON path: {json_file_path}",
    f"Output JSON path: {output_file_path}",
    sep="\n",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
Input JSON path: /content/dataset_rsicd.json
Output JSON path: /content/drive/MyDrive/captions_train_augmented.json


In [11]:
# Cell 4: Load Dataset and Initialize LanguageTool

# Load the Dataset.
# Any load problem falls back to an empty dataset so that the later cells skip
# processing instead of crashing.
try:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"Error: Input JSON file '{json_file_path}' not found. Please double-check the path in Cell 3.")
    data = {"images": []} # Fallback to prevent immediate crash, but main processing will skip
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{json_file_path}'. Check file format for validity.")
    data = {"images": []}
else:
    # A syntactically valid JSON file may still lack the expected top-level
    # 'images' list; treat that like any other unusable input.
    if isinstance(data, dict) and isinstance(data.get('images'), list):
        print(f"Successfully loaded {len(data['images'])} images from {json_file_path}")
    else:
        print(f"Error: '{json_file_path}' does not contain a top-level 'images' list. Check file format for validity.")
        data = {"images": []}

# Initialize LanguageTool (this may still take a moment, but now with correct Java)
print("\nInitializing LanguageTool for grammar correction (this may take a moment)...")
lang_tool = language_tool_python.LanguageTool('en-US') # You can change to 'en-GB' if preferred
print("LanguageTool initialized.")

# Find the maximum existing sentid to ensure uniqueness for new captions.
# New IDs start one past the largest existing ID, and never below 0.
existing_sentids = [
    sent['sentid']
    for img_data in data['images']
    for sent in img_data.get('sentences', [])
    if 'sentid' in sent
]
next_sentid = max(max(existing_sentids, default=-1) + 1, 0)
print(f"Starting new sentence ID (sentid) assignment from: {next_sentid}")

Successfully loaded 10921 images from /content/dataset_rsicd.json

Initializing LanguageTool for grammar correction (this may take a moment)...


Downloading LanguageTool latest: 100%|██████████| 252M/252M [00:08<00:00, 30.9MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp4zkg3dg_.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


LanguageTool initialized.
Starting new sentence ID (sentid) assignment from: 54605


In [12]:
# Cell 5: Define Helper Functions (Text Similarity & Duplicate Filtering)

def get_sentence_embeddings(sentences):
    """Build TF-IDF vectors for a list of sentences for similarity comparison.

    Args:
        sentences (list): Raw sentence strings to vectorize.

    Returns:
        tuple: ``(tfidf_matrix, vectorizer)`` where ``tfidf_matrix`` has one row
        per sentence, or ``(None, None)`` when ``sentences`` is empty or yields
        no usable terms (e.g. every sentence consists only of English stop words).

    Raises:
        ValueError: Propagated from scikit-learn for any problem other than an
            empty vocabulary (for example, a bare string passed instead of a list).
    """
    if not sentences:
        return None, None
    # Adjust min_df to ignore terms that appear too infrequently
    vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    try:
        # fit_transform learns the vocabulary and vectorizes in a single pass.
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError as exc:
        # scikit-learn signals "no features" by raising instead of returning an
        # empty matrix; translate that into the (None, None) result callers check for.
        if "empty vocabulary" not in str(exc):
            raise
        return None, None
    return tfidf_matrix, vectorizer

def filter_duplicates_by_similarity(new_captions_to_check, existing_captions_baseline, threshold=0.9):
    """
    Drop new captions that are near-duplicates of existing captions or of
    new captions that were already accepted.

    Args:
        new_captions_to_check (list): Candidate captions, examined in order.
        existing_captions_baseline (list): Captions already present; always treated as kept.
        threshold (float): Cosine similarity at or above which a candidate is
            considered a duplicate.

    Returns:
        list: The candidates that survived filtering, in their original order.
    """
    if not new_captions_to_check:
        return []

    # Baseline rows come first in the similarity matrix, candidates after them.
    corpus = existing_captions_baseline + new_captions_to_check
    baseline_count = len(existing_captions_baseline)

    if len(corpus) < 2:
        return new_captions_to_check # Nothing to compare against

    tfidf_matrix, _ = get_sentence_embeddings(corpus)
    if tfidf_matrix is None: # No features found
        return new_captions_to_check

    # Pairwise cosine similarity between every caption in the corpus
    similarity = cosine_similarity(tfidf_matrix)

    # Rows a candidate must differ from: all baseline rows, plus each accepted candidate.
    accepted_rows = list(range(baseline_count))
    survivors = []

    for offset, caption in enumerate(new_captions_to_check):
        row = baseline_count + offset
        if any(similarity[row, kept_row] >= threshold for kept_row in accepted_rows):
            continue # Too close to something already kept
        survivors.append(caption)
        accepted_rows.append(row)

    return survivors

print("Helper functions for text similarity and duplicate filtering defined.")

Helper functions for text similarity and duplicate filtering defined.


In [13]:
# Cell 6: Main Processing Loop (Cleaning, Grammar, Enrichment, Normalization)

processed_images = []
normalization_map = {
    "residential area": "neighborhood",
    "railway": "train tracks",
    "road": "street",
    "cars": "vehicles",
    "plane": "aircraft",
    "planes": "aircraft",
    "building": "structure",
    "buildings": "structures",
    "court": "field", # Example based on project description "An old court is surrounded by white houses."
    "football field": "sports field",
    "basketball court": "sports field",
    "tennis court": "sports field"
    # Add more mappings as you analyze your dataset's vocabulary
}

# Placeholder enrichment only runs for every N-th image; adjust as needed.
DEMO_GENERATION_STRIDE = 100

# Whole-word, case-insensitive patterns, compiled once instead of once per caption.
# Rules are applied longest phrase first so that multi-word entries such as
# "basketball court" are rewritten before the single word "court" they contain;
# in plain dictionary order "court" -> "field" would fire first and the longer
# rules could never match.
_normalization_rules = [
    (re.compile(r'\b' + re.escape(old_term) + r'\b', flags=re.IGNORECASE), new_term)
    for old_term, new_term in sorted(
        normalization_map.items(), key=lambda item: len(item[0]), reverse=True
    )
]


def _clean_caption(raw_caption):
    """Lower-case and trim a caption, collapse whitespace, squeeze repeated
    '.', '!' or '?' into one, and make sure it ends with punctuation."""
    cleaned = raw_caption.lower().strip()
    cleaned = re.sub(r'\s+', ' ', cleaned) # Remove multiple spaces
    cleaned = re.sub(r'([.!?])\1+', r'\1', cleaned) # Fix duplicate punctuation
    if not cleaned.endswith(('.', '!', '?')):
        cleaned += '.' # Ensure ends with punctuation
    return cleaned


def _normalize_caption(caption_text):
    """Apply every normalization rule to the caption, longest phrase first."""
    normalized = caption_text
    for pattern, replacement in _normalization_rules:
        normalized = pattern.sub(replacement, normalized)
    return normalized


def _infer_scene_label(filename):
    """Return the lower-cased text before the first '_' in the filename
    (e.g. "airport_1.jpg" -> "airport"), or "unknown" if there is no '_'.

    This is a basic example; verify that it is accurate for your dataset, or
    replace it with image classification / text-based scene inference.
    """
    if "_" in filename:
        return filename.split('_')[0].lower()
    return "unknown"


def _generate_demo_captions(filename, seed_captions):
    """Placeholder caption generator: returns up to two synthetic captions
    derived from the filename, and nothing when there is no seed caption.

    This is where the real LLM/VLM call belongs; replace the body with your
    actual code for GPT-4 or BLIP/GIT.
    """
    generated = []
    # --- START OF LLM/VLM INTEGRATION ---
    for original_cap in seed_captions[:1]: # Take first original caption for example
        # Replace with your actual API call or model inference.
        # For GPT-4 example (read the key from the environment, never hard-code it):
        # try:
        #     openai.api_key = os.getenv("OPENAI_API_KEY")
        #     response = openai.chat.completions.create(
        #         model="gpt-4",
        #         messages=[
        #             {"role": "system", "content": "You are a creative assistant providing diverse descriptions for remote sensing images."},
        #             {"role": "user", "content": f"Generate 2 new, diverse, and concise paraphrases for this remote sensing image caption: '{original_cap}'"}
        #         ],
        #         n=2, # Request 2 paraphrases
        #         temperature=0.8 # Higher temperature for more diversity
        #     )
        #     for choice in response.choices:
        #         gen_text = choice.message.content.strip()
        #         # Apply basic cleaning to generated text
        #         gen_text = re.sub(r'\s+', ' ', re.sub(r'([.!?])\1+', r'\1', gen_text.lower().strip()))
        #         generated.append(gen_text)
        # except Exception as e:
        #     print(f"Warning: Error generating captions for '{filename}': {e}")
        #     # Handle API rate limits, connection errors etc.

        # Simple synthetic generation for demonstration if no LLM/VLM is set up.
        # The len() guards keep the fallback from piling on once real captions exist.
        scene_prefix = filename.split('_')[0]
        filename_lower = filename.lower()
        if "airport" in filename_lower and len(generated) < 2:
            generated.append(f"an aerial view of a bustling {scene_prefix} with numerous runways.")
            generated.append(f"the scene depicts an active {scene_prefix} facility from above.")
        elif "playground" in filename_lower and len(generated) < 2:
            generated.append("a recreational ground with various play structures visible from the air.")
            generated.append("overhead shot of a park area featuring playground equipment.")
        elif len(generated) < 2:
            generated.append(f"a remote sensing image showing a {scene_prefix} area.")
            generated.append(f"the image captures the {scene_prefix} from an aerial perspective.")
    # --- END OF LLM/VLM INTEGRATION ---
    return generated


# --- Main Processing Loop ---
# Using tqdm for a progress bar in Colab
if 'images' in data and len(data['images']) > 0:
    for i, img_data in tqdm(enumerate(data['images']), total=len(data['images']), desc="Processing Images"):
        current_filename = img_data['filename']
        current_imgid = img_data['imgid']
        original_sentences = img_data.get('sentences', [])

        # --- 1. Clean Captions & 2. Grammar Fix ---
        processed_sentences_initial = [] # Cleaned and grammar-fixed original captions
        for sent in original_sentences:
            cleaned_raw = _clean_caption(sent['raw'])

            # Apply every correction LanguageTool suggests.
            # NOTE(review): the suggestions may re-capitalize text that was
            # lower-cased above - confirm whether captions should be lower-cased again.
            matches = lang_tool.check(cleaned_raw)
            grammatically_fixed_caption = language_tool_python.utils.correct(cleaned_raw, matches)

            processed_sentences_initial.append({
                'original_sentid': sent['sentid'], # Keep original ID to link back
                'raw': grammatically_fixed_caption
            })

        # Baseline for similarity filtering against new captions
        existing_raw_captions_for_sim = [s['raw'] for s in processed_sentences_initial]

        # --- 3. Caption Enrichment ---
        newly_generated_captions = []
        if i % DEMO_GENERATION_STRIDE == 0: # Only generate for a few images to demonstrate
            newly_generated_captions = _generate_demo_captions(current_filename, existing_raw_captions_for_sim)

        # Filter newly generated captions for diversity and remove duplicates against existing ones
        filtered_new_captions = filter_duplicates_by_similarity(
            new_captions_to_check=newly_generated_captions,
            existing_captions_baseline=existing_raw_captions_for_sim,
            threshold=0.9 # ADJUST THIS THRESHOLD based on desired similarity
        )

        # --- 4. Token Normalization & re-assembly ---
        final_processed_sentences_for_image = []
        current_img_sentids = []

        # Original captions keep their original sentids.
        for sent_data in processed_sentences_initial:
            normalized_raw = _normalize_caption(sent_data['raw'])
            final_processed_sentences_for_image.append({
                'tokens': normalized_raw.split(), # Whitespace split; punctuation stays attached to words
                'raw': normalized_raw,
                'imgid': current_imgid,
                'sentid': sent_data['original_sentid']
            })
            current_img_sentids.append(sent_data['original_sentid'])

        # Augmented captions receive fresh, globally unique sentids.
        for new_caption in filtered_new_captions:
            normalized_raw = _normalize_caption(new_caption)
            final_processed_sentences_for_image.append({
                'tokens': normalized_raw.split(),
                'raw': normalized_raw,
                'imgid': current_imgid,
                'sentid': next_sentid
            })
            current_img_sentids.append(next_sentid)
            next_sentid += 1 # Increment for the next new sentence

        # --- 6. Optional: Scene Class Metadata ---
        scene_label = _infer_scene_label(current_filename)

        # Prepare the image entry for the new JSON structure
        processed_images.append({
            'filename': current_filename,
            'imgid': current_imgid,
            'sentences': final_processed_sentences_for_image,
            'split': img_data.get('split', 'unknown'), # Preserve original split, default if missing
            'sentids': current_img_sentids,
            'scene': scene_label # Add optional scene metadata
        })

    print("\nFinished processing all images.")
else:
    print("No images found in the dataset to process. Please check the JSON file and its path.")

Processing Images:   0%|          | 0/10921 [00:00<?, ?it/s]


Finished processing all images.


In [14]:
# Cell 7: Merge & Save as COCO-like JSON

output_data = {'images': processed_images}

if not processed_images:
    # Nothing was processed (for example the input file failed to load).
    # Writing now would replace any previously saved dataset with an empty one.
    print(f"No processed images to save; skipping write to: {output_file_path}")
else:
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII caption text readable in the file.
            json.dump(output_data, f, ensure_ascii=False, indent=4)
        print(f"\nSuccessfully saved augmented dataset to: {output_file_path}")
        print(f"Total images processed: {len(processed_images)}")
        total_sentences = sum(len(img['sentences']) for img in processed_images)
        print(f"Total sentences in augmented dataset: {total_sentences}")
    except Exception as e:
        # Best effort: report the failure and let the notebook continue.
        print(f"Error saving output file: {e}")

print("\n--- Script Execution Complete ---")
print("Please review the generated captions and adjust parameters (like normalization_map, similarity threshold, and LLM prompts) as needed.")


Successfully saved augmented dataset to: /content/drive/MyDrive/captions_train_augmented.json
Total images processed: 10921
Total sentences in augmented dataset: 54825

--- Script Execution Complete ---
Please review the generated captions and adjust parameters (like normalization_map, similarity threshold, and LLM prompts) as needed.
