# Import Dataset

In [1]:
import pandas as pd
import json

# 1. Load and Explore the Excel File
def load_excel(file_path):
    """Read an Excel workbook into a DataFrame and print a short preview.

    Args:
        file_path: Location of the Excel file, passed straight to
            ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or None when loading or previewing fails. The
        failure is reported on stdout instead of being raised.
    """
    print(f"Loading Excel file: {file_path}")
    try:
        sheet_df = pd.read_excel(file_path)
        print("Excel File Preview:")
        print(sheet_df.head())
    except Exception as err:
        # Best-effort loader: report the problem and let the caller check for None.
        print(f"Error loading Excel file: {err}")
        return None
    else:
        return sheet_df

# 2. Load and Explore COCO JSON Files
def load_coco_json(file_path):
    """Load a COCO JSON file, print a short summary, and return its content.

    The summary consists of the top-level keys and, when an "annotations"
    entry is present, its first two items.

    Args:
        file_path: Path to the COCO-format JSON file.

    Returns:
        The parsed JSON content, or None when the file cannot be read, parsed,
        or summarised. The failure is reported on stdout instead of being raised.
    """
    print(f"\nLoading COCO JSON file: {file_path}")
    try:
        # JSON text is UTF-8 by specification. Without an explicit encoding,
        # open() uses the locale default (e.g. cp1252 on Windows), which can
        # fail on or corrupt non-ASCII characters.
        with open(file_path, 'r', encoding="utf-8") as file:
            data = json.load(file)
        print("COCO JSON Keys:")
        print(list(data.keys()))
        if "annotations" in data:
            # Only announce the preview when there is something to preview.
            print("Example data (annotations):")
            print(data["annotations"][:2])  # Show a preview of annotations
        return data
    except Exception as e:
        # Best-effort loader: report the problem and let the caller check for None.
        print(f"Error loading JSON file: {e}")
        return None

# 3. Dataset file locations (adjust these if the CarDD release lives elsewhere)
excel_file = "CarDD_release/CarDD_COCO/annotations/image_info.xlsx"
coco_train_json = "CarDD_release/CarDD_COCO/annotations/instances_train2017.json"
coco_val_json = "CarDD_release/CarDD_COCO/annotations/instances_val2017.json"
coco_test_json = "CarDD_release/CarDD_COCO/annotations/instances_test2017.json"

# 4. Load the spreadsheet and the three COCO splits.
# Each loader prints a preview and returns None when its file cannot be loaded.
excel_data = load_excel(excel_file)
train_data, val_data, test_data = [
    load_coco_json(json_path)
    for json_path in (coco_train_json, coco_val_json, coco_test_json)
]

# 5. Report how many images and annotations the training split contains
if train_data:
    num_images, num_annotations = [
        len(train_data.get(section, [])) for section in ("images", "annotations")
    ]
    print(f"\nTraining Dataset: {num_images} images, {num_annotations} annotations")

Loading Excel file: CarDD_release/CarDD_COCO/annotations/image_info.xlsx
Excel File Preview:
   id   file_name  width  height  file_size (KB)  #instances  #categories  \
0   1  000001.jpg   1000     750            1114           2            2   
1   2  000002.jpg   1000     667             637           1            1   
2   3  000003.jpg   1000     667             966           1            1   
3   4  000004.jpg   1000     667             806           1            1   
4   5  000005.jpg   1000     667             959           1            1   

  shooting angle complete or partial   color  
0           side              partial    red  
1           side              partial  white  
2           side              partial  white  
3           side              partial   gray  
4           side              partial   gray  

Loading COCO JSON file: CarDD_release/CarDD_COCO/annotations/instances_train2017.json
COCO JSON Keys:
['licenses', 'info', 'categories', 'images', 'annotations']

# Generating Multi-Label Vectors

In [2]:
from collections import defaultdict
import pandas as pd

# Step 6: Generate Multi-Label Vectors
def generate_multi_label_vectors(data, num_categories):
    """Generate multi-label vectors for images based on COCO annotations.

    Every image that has at least one annotation gets a list of
    ``num_categories`` 0/1 flags. The flag at position ``category_id - 1`` is 1
    when that category occurs in any of the image's annotations. Images
    without annotations do not appear in the result.

    Args:
        data: Parsed COCO dictionary. Only ``data["annotations"]`` is read, and
            each annotation must provide "image_id" and "category_id".
        num_categories: Length of every label vector. Category IDs are assumed
            to be the integers 1..num_categories.

    Returns:
        pandas.DataFrame with the columns "image_id" and "multi_label_vector",
        one row per annotated image in order of first appearance. Both columns
        exist even when there are no annotations.

    Raises:
        IndexError: If a category ID lies outside 1..num_categories.
    """
    print("\nGenerating multi-label vectors for images...")

    # Map image_id to the set of category_ids annotated on it
    image_to_categories = defaultdict(set)
    for ann in data["annotations"]:
        image_to_categories[ann["image_id"]].add(ann["category_id"])

    # Create multi-label vectors
    image_multi_labels = []
    for image_id, categories_present in image_to_categories.items():
        multi_label_vector = [0] * num_categories  # Initialize all labels to 0
        for category in categories_present:
            # Check the range explicitly: for IDs below 1, `category - 1` is a
            # negative index that would silently set a flag counted from the
            # end of the vector instead of failing.
            if not 1 <= category <= num_categories:
                raise IndexError(
                    f"category_id {category} for image_id {image_id} is outside "
                    f"the expected range 1..{num_categories}"
                )
            multi_label_vector[category - 1] = 1  # Category IDs are 1-indexed
        image_multi_labels.append({"image_id": image_id, "multi_label_vector": multi_label_vector})

    # Name the columns explicitly so an empty result still has the same schema
    return pd.DataFrame(image_multi_labels, columns=["image_id", "multi_label_vector"])

def process_dataset(data, split_name, num_categories):
    """Build, preview, and save the multi-label vectors of one dataset split.

    Args:
        data: Parsed COCO dictionary for the split. When it is falsy (for
            example None or an empty dict), only the initial "Processing"
            message is printed and nothing else happens.
        split_name: Name of the split. It is used in the printed messages and
            in the output file name ``multi_label_vectors_<split_name>.csv``.
        num_categories: Length of each label vector, forwarded to
            ``generate_multi_label_vectors``.

    Returns:
        None. The result is written to a CSV file in the current working
        directory.
    """
    print(f"\nProcessing {split_name} dataset...")
    if not data:
        return

    vectors_df = generate_multi_label_vectors(data, num_categories)

    # Show the first few rows
    print(f"\n{split_name.capitalize()} Multi-Label DataFrame Preview:")
    print(vectors_df.head())

    # Persist the vectors without the DataFrame index
    csv_path = f"multi_label_vectors_{split_name}.csv"
    vectors_df.to_csv(csv_path, index=False)
    print(f"\nMulti-label vectors for {split_name} saved to '{csv_path}'")

# The category count is taken from the train split and reused for every split
# (the categories are expected to be the same across splits).
if train_data:
    categories = train_data.get("categories", [])
    num_categories = len(categories)
    print(f"\nNumber of categories: {num_categories}")

    # Build and save the label vectors for the train, validation, and test splits
    process_dataset(data=train_data, split_name="train", num_categories=num_categories)
    process_dataset(data=val_data, split_name="val", num_categories=num_categories)
    process_dataset(data=test_data, split_name="test", num_categories=num_categories)


Number of categories: 6

Processing train dataset...

Generating multi-label vectors for images...

Train Multi-Label DataFrame Preview:
   image_id  multi_label_vector
0         1  [0, 1, 0, 0, 0, 1]
1         2  [0, 0, 0, 0, 0, 1]
2         3  [0, 0, 0, 0, 0, 1]
3         4  [0, 0, 0, 0, 0, 1]
4         5  [0, 0, 0, 0, 0, 1]

Multi-label vectors for train saved to 'multi_label_vectors_train.csv'

Processing val dataset...

Generating multi-label vectors for images...

Val Multi-Label DataFrame Preview:
   image_id  multi_label_vector
0        13  [0, 0, 1, 1, 0, 1]
1        16  [1, 1, 0, 0, 0, 0]
2        17  [0, 1, 0, 0, 0, 0]
3        24  [0, 1, 0, 0, 0, 0]
4        25  [0, 1, 0, 0, 0, 0]

Multi-label vectors for val saved to 'multi_label_vectors_val.csv'

Processing test dataset...

Generating multi-label vectors for images...

Test Multi-Label DataFrame Preview:
   image_id  multi_label_vector
0        12  [0, 0, 0, 0, 0, 1]
1        15  [0, 1, 0, 0, 0, 0]
2        23  [1, 1, 0,

# Label Mapping

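The following table maps each numerical label (the annotation `category_id`) to its description. Because the vectors above are built with index `category_id - 1`, position `i` (0-based) of a multi-label vector corresponds to label number `i + 1`: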

| Label Number | Description        |
|--------------|--------------------|
| 1            | Dent              |
| 2            | Scratch           |
| 3            | Crack             |
| 4            | Glass Shatter     |
| 5            | Lamp Broken       |
| 6            | Tire Flat         |