In [1]:
# Import required libraries
import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone.brain import compute_exact_duplicates
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import os
import gc
import pandas as pd

In [2]:

# Define dataset paths and classes
TRAIN_DIR = "/Users/rohanreddy/Documents/fifty/intel-dataset/seg_train/seg_train"
TEST_DIR = "/Users/rohanreddy/Documents/fifty/intel-dataset/seg_test/seg_test"
DATASET_NAME = "intel_testfinal"
CLASSES = ["buildings", "forest", "glacier", "mountain", "sea", "street"]

# Simple label mapping for ImageNet to Intel classes (approximate)
# Extend this mapping based on your needs
LABEL_MAPPING = {
    "castle": "buildings", "church": "buildings", "monastery": "buildings", 
    "palace": "buildings", "cinema": "buildings", "library": "buildings",
    "barn": "buildings", "boathouse": "buildings", "greenhouse": "buildings",
    "lakeside": "sea", "seashore": "sea", "liner": "sea", "yawl": "sea",
    "dock": "sea", "pier": "sea", "fireboat": "sea", "lifeboat": "sea",
    "container ship": "sea", "aircraft carrier": "sea", "trimaran": "sea",
    "snowplow": "glacier", "drilling platform": "glacier",
    "stupa": "mountain", "dome": "mountain", "obelisk": "mountain",
    "street sign": "street", "streetcar": "street", "traffic light": "street",
    "trolleybus": "street", "cab": "street", "passenger car": "street"
}


In [3]:
# Load dataset from both train and test directories
try:
    train_dataset = fo.Dataset.from_dir(
        dataset_dir=TRAIN_DIR,
        dataset_type=fo.types.ImageClassificationDirectoryTree,
        name=f"{DATASET_NAME}_train",
        persistent=True
    )
    test_dataset = fo.Dataset.from_dir(
        dataset_dir=TEST_DIR,
        dataset_type=fo.types.ImageClassificationDirectoryTree,
        name=f"{DATASET_NAME}_test",
        persistent=True
    )
    # Merge datasets
    dataset = fo.Dataset(DATASET_NAME, persistent=True)
    dataset.add_collection(train_dataset)
    dataset.add_collection(test_dataset)
    print(f"Loaded dataset with {len(dataset)} samples")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise SystemExit(1)

 100% |█████████████| 14034/14034 [2.9s elapsed, 0s remaining, 4.6K samples/s]      
 100% |███████████████| 3000/3000 [489.8ms elapsed, 0s remaining, 6.1K samples/s]      
Loaded dataset with 17034 samples


In [4]:
!fiftyone plugins download https://github.com/jacobmarks/image-deduplication-plugin

Downloading jacobmarks/image-deduplication-plugin...
  102.1Kb [453.3us elapsed, ? remaining, 220.0Mb/s] 
Skipping existing plugin '@jacobmarks/image_deduplication'


In [5]:

# Find exact duplicates and tag for review
try:
    dups = compute_exact_duplicates(dataset)
    duplicate_count = sum(len(dup_ids) for _, dup_ids in dups.items())
    print(f"Found {duplicate_count} duplicate samples")
    duplicate_data = []
    for sample_id, dup_ids in dups.items():
        for dup_id in dup_ids:
            try:
                sample = dataset[dup_id]  # Strong reference
                sample.tags.append("duplicate")
                sample.save()
                duplicate_data.append({"id": dup_id, "filepath": sample.filepath, "ground_truth": sample.ground_truth.label})
            except Exception as e:
                print(f"Error tagging sample {dup_id}: {e}")
        # To delete duplicates (keeping one), uncomment:
        # dataset.delete_samples(dup_ids[1:])  # Keep first sample, delete rest
    # Export duplicates to CSV
    if duplicate_data:
        pd.DataFrame(duplicate_data).to_csv("duplicates.csv", index=False)
        print("Exported duplicates to duplicates.csv")
    gc.collect()  # Free memory
except Exception as e:
    print(f"Error computing duplicates: {e}")
    raise SystemExit(1)

Computing filehashes...
 100% |█████████████| 17034/17034 [2.3s elapsed, 0s remaining, 7.1K samples/s]      
Found 18 duplicate samples
Exported duplicates to duplicates.csv


In [6]:
# Detect blurry images and tag for review
def compute_blurriness(image_path):
    try:
        img = Image.open(image_path).convert("L")
        img_tensor = transforms.ToTensor()(img)
        laplacian = torch.nn.functional.conv2d(
            img_tensor.unsqueeze(0).unsqueeze(0),
            torch.tensor([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=torch.float32).reshape(1, 1, 3, 3),
            padding=1
        )
        return float(laplacian.var())
    except:
        return float('inf')  # Skip corrupted images

blurry_count = 0
blurry_data = []
for sample in dataset:
    try:
        blurriness = compute_blurriness(sample.filepath)
        sample["blurriness"] = blurriness
        if blurriness < 100:
            sample.tags.append("blurry")
            blurry_count += 1
            blurry_data.append({"id": sample.id, "filepath": sample.filepath, "ground_truth": sample.ground_truth.label, "blurriness": blurriness})
        sample.save()
    except Exception as e:
        print(f"Error processing sample {sample.id}: {e}")
print(f"Tagged {blurry_count} blurry samples")
# To delete blurry images, uncomment:
# dataset.delete_samples(dataset.match_tags("blurry"))
# Export blurry images to CSV
if blurry_data:
    pd.DataFrame(blurry_data).to_csv("blurry_images.csv", index=False)
    print("Exported blurry images to blurry_images.csv")

Tagged 0 blurry samples


In [7]:
# Identify annotation inconsistencies and wrong predictions
try:
    # Apply model
    model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
    dataset.apply_model(model, label_field="predictions", batch_size=None)  # Disable batching

    # Map ImageNet predictions to Intel classes
    for sample in dataset:
        try:
            pred_label = sample.predictions.label
            mapped_label = LABEL_MAPPING.get(pred_label, pred_label)  # Keep original if not mapped
            sample.predictions.label = mapped_label
            sample.save()
        except Exception as e:
            print(f"Error mapping prediction for sample {sample.id}: {e}")

# Evaluate classifications
# dataset.evaluate_classifications(
#     pred_field="predictions",
#     gt_field="ground_truth",
#     method="top-k",
#     k=5,
#     classes=CLASSES,  # Use the six Intel classes
#     eval_key="eval_correct"
# )

# Compute mistakenness
# try:
#     compute_mistakenness(dataset, pred_field="predictions", label_field="ground_truth", use_logits=False)
#     if "mistakenness" not in dataset.get_field_schema():
#         raise ValueError("Mistakenness field was not added to the dataset")
    
#     # Tag high mistakenness samples and create view
#     error_count = 0
#     mistakes_data = []
#     for sample in dataset.sort_by("mistakenness", reverse=True)[:50]:
#         sample.tags.append("potential_error")
#         sample.save()
#         error_count += 1
#         mistakes_data.append({
#             "id": sample.id,
#             "filepath": sample.filepath,
#             "ground_truth": sample.ground_truth.label,
#             "predicted": sample.predictions.label,
#             "mistakenness": sample.mistakenness
#         })
#     print(f"Tagged {error_count} samples with potential annotation errors")
    
#     # Create and save view for annotation mistakes
#     mistakes_view = dataset.match_tags("potential_error")
#     dataset.save_view("mistakes_view", mistakes_view)
#     print(f"Saved view 'mistakes_view' with {len(mistakes_view)} samples")
    
#     # Export annotation mistakes to CSV
#     if mistakes_data:
#         pd.DataFrame(mistakes_data).to_csv("annotation_mistakes.csv", index=False)
#         print("Exported annotation mistakes to annotation_mistakes.csv")
# except Exception as e:
#     print(f"Error in mistakenness computation: {e}. Skipping mistakenness-based tagging.")

    # Detect wrong predictions, create and save view
    wrong_pred_count = 0
    wrong_pred_data = []
    for sample in dataset:
        try:
            pred_label = sample.predictions.label
            gt_label = sample.ground_truth.label
            if pred_label not in CLASSES:  # Skip unmapped predictions
                continue
            if pred_label != gt_label:
                sample.tags.append("wrong_prediction")
                sample.save()
                wrong_pred_count += 1
                wrong_pred_data.append({
                    "id": sample.id,
                    "filepath": sample.filepath,
                    "ground_truth": sample.ground_truth.label,
                    "predicted": sample.predictions.label
                })
        except Exception as e:
            print(f"Error checking prediction for sample {sample.id}: {e}")
    print(f"Tagged {wrong_pred_count} samples with wrong predictions")
    
    # Create and save view for wrong predictions
    wrong_predictions_view = dataset.match_tags("wrong_prediction")
    dataset.save_view("wrong_predictions_view", wrong_predictions_view, overwrite=True)
    print(f"Saved view 'wrong_predictions_view' with {len(wrong_predictions_view)} samples")
    
    # Export wrong predictions to CSV
    if wrong_pred_data:
        pd.DataFrame(wrong_pred_data).to_csv("wrong_predictions.csv", index=False)
        print("Exported wrong predictions to wrong_predictions.csv")

except Exception as e:
    print(f"Error computing wrong predictions: {e}")
    raise SystemExit(1)



 100% |█████████████| 17034/17034 [7.5m elapsed, 0s remaining, 40.1 samples/s]      
Tagged 1770 samples with wrong predictions
Saved view 'wrong_predictions_view' with 1770 samples
Exported wrong predictions to wrong_predictions.csv


In [8]:
# Launch FiftyOne App for visual review
fo.close_app()  # Ensure clean session
try:
    session = fo.launch_app(dataset, address="127.0.0.1", port=5151, auto=True)
except Exception as e:
    print(f"Port 5151 failed: {e}. Trying port 5152.")
    session = fo.launch_app(dataset, address="127.0.0.1", port=5152, auto=True)

Connected to FiftyOne on port 5151 at 127.0.0.1.
If you are not connecting to a remote session, you may need to start a new session and specify a port
Server version (1.7.0) does not match client version (1.7.1)


In [9]:
# Set view to wrong predictions
session.view = dataset.match_tags("wrong_prediction")


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 s