# Data Validation and Analysis

This notebook validates and analyzes the preprocessed Flickr8k and Flickr30k datasets.


In [None]:
# Setup
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/GTech\ OMSCS/CS\ 7643/group\ project/CS7643_project

import sys
import os
sys.path.append('/content/drive/MyDrive/GTech OMSCS/CS 7643/group project/CS7643_project')

%pip install nltk matplotlib seaborn -q
import nltk
nltk.download('punkt', quiet=True)

print("✅ Setup complete!")
print(f"Current directory: {os.getcwd()}")

try:
    from data.multi_dataset_loader import create_multi_dataset
    from robustness import create_training_augmentation_pipeline
    from analysis import validate_dataset, analyze_dataset, compare_datasets
    from analysis import create_analysis_report, create_comparison_report, print_validation_results
    print("✅ All modules imported successfully!")
except Exception as e:
    print(f"❌ Error: {e}")


## Step 1: Validate Data Quality
Check for missing images, corrupted files, invalid captions, etc.



In [None]:
# Validate Flickr8k: each split is given the same two candidate image folders
# (the dataset folder and its `_resized` sibling).
banner = "=" * 60
print(banner, "Validating Flickr8k Dataset", banner, sep="\n")

flickr8k_image_dirs = {
    split_name: [
        './Flickr8k_Data/Flicker8k_Dataset',
        './Flickr8k_Data/Flicker8k_Dataset_resized',
    ]
    for split_name in ('train', 'dev', 'test')
}

flickr8k_validation = validate_dataset(
    image_dirs=flickr8k_image_dirs,
    captions_dir='./',
    splits=['train', 'dev', 'test'],
    dataset_name='flickr8k',
)

# Print the validation outcome returned above.
print_validation_results(flickr8k_validation)


In [None]:
# Validate Flickr30k: every split points at the image folder plus its
# `_resized` counterpart.
header_rule = "=" * 60
print(header_rule)
print("Validating Flickr30k Dataset")
print(header_rule)

flickr30k_images_root = './Flickr30k_Data/flickr30k_images'

flickr30k_image_dirs = {}
for split_name in ['train', 'dev', 'test']:
    # A fresh list per split, so the splits never share one list object.
    flickr30k_image_dirs[split_name] = [
        flickr30k_images_root,
        flickr30k_images_root + '_resized',
    ]

flickr30k_validation = validate_dataset(
    dataset_name='flickr30k',
    image_dirs=flickr30k_image_dirs,
    splits=['train', 'dev', 'test'],
    captions_dir='./',
)

# Print the validation outcome returned above.
print_validation_results(flickr30k_validation)


## Step 2: Analyze Dataset Statistics
Get detailed statistics about captions, images, vocabulary, etc.



In [None]:
# Analyze Flickr8k: all three splits read images from the same folder.
from analysis import print_analysis

rule = "=" * 60
print(f"{rule}\nAnalyzing Flickr8k Dataset\n{rule}")

# One directory string per split (unlike validation, which passes lists).
flickr8k_analysis_dirs = dict.fromkeys(
    ['train', 'dev', 'test'],
    './Flickr8k_Data/Flicker8k_Dataset',
)

flickr8k_analysis = analyze_dataset(
    image_dirs=flickr8k_analysis_dirs,
    captions_dir='./',
    splits=['train', 'dev', 'test'],
    dataset_name='flickr8k',
)

print_analysis(flickr8k_analysis, detailed=True)


In [None]:
# Analyze Flickr30k: all three splits read images from the same folder.
# `print_analysis` is expected to be in scope from an earlier cell.
for banner_line in ["=" * 60, "Analyzing Flickr30k Dataset", "=" * 60]:
    print(banner_line)

flickr30k_analysis_dirs = {
    split_name: './Flickr30k_Data/flickr30k_images'
    for split_name in ('train', 'dev', 'test')
}

flickr30k_analysis = analyze_dataset(
    dataset_name='flickr30k',
    image_dirs=flickr30k_analysis_dirs,
    splits=['train', 'dev', 'test'],
    captions_dir='./',
)

print_analysis(flickr30k_analysis, detailed=True)


## Step 3: Compare Datasets
Compare Flickr8k and Flickr30k side by side.



In [None]:
# Compare the two datasets over the same three splits.
from analysis import print_comparison

divider = "=" * 60
print(divider, "Comparing Flickr8k and Flickr30k", divider, sep="\n")

datasets_to_compare = ['flickr8k', 'flickr30k']
splits_to_compare = ['train', 'dev', 'test']

comparison = compare_datasets(
    captions_dir='./',
    splits=splits_to_compare,
    dataset_names=datasets_to_compare,
)

print_comparison(comparison)


## Step 4: Generate Reports and Visualizations
Create visualizations and save the reports to disk.



In [None]:
# Write the per-dataset reports and the comparison report to disk.
# Everything lands under `reports_root`; each dataset gets its own sub-folder.
reports_root = './analysis_reports'
dataset_reports = [
    # (display label, analysis result, output folder)
    ('Flickr8k', flickr8k_analysis, f'{reports_root}/flickr8k'),
    ('Flickr30k', flickr30k_analysis, f'{reports_root}/flickr30k'),
]

os.makedirs(reports_root, exist_ok=True)

for report_label, analysis_result, report_dir in dataset_reports:
    # Create the sub-folder up front: nothing visible here guarantees that
    # create_analysis_report creates a missing output_dir itself.
    os.makedirs(report_dir, exist_ok=True)
    print(f"Generating {report_label} analysis report...")
    create_analysis_report(analysis_result, output_dir=report_dir)

print("Generating comparison report...")
create_comparison_report(comparison, output_dir=reports_root)

# The summary is driven by the same table as the calls above, so the listed
# paths cannot drift from where the reports were written.
print(f"\n✅ All reports generated in {reports_root}/")
print("\nFiles created:")
for report_label, _, report_dir in dataset_reports:
    print(f"  - {report_dir}/ ({report_label} analysis)")
print(f"  - {reports_root}/ (Comparison report)")
