# Dataset

In [2]:
import os
import re
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

def analyze_dataset_quality(dataset_dir, dataset_name="Dataset"):
    """Print identity/camera statistics for one Market1501-style split.

    File names are parsed with two patterns, tried in this order:

    * ``PPPP_CC_SSSSSS.<ext>`` -- 4-digit person ID, 2-digit camera ID and
      6-digit sequence number;
    * ``<person>_c<camera>_...`` -- the custom layout.

    Files with an image extension whose name matches neither pattern are
    excluded from the statistics; how many were excluded is printed so they
    are not lost silently.

    Args:
        dataset_dir: Directory containing the images of one split.
        dataset_name: Label used in the header of the printed report.

    Returns:
        Tuple ``(id_counts, cam_counts, id_cam_pairs)``: images per person
        ID, images per camera ID, and the set of camera IDs seen for each
        person ID.
    """
    image_exts = ('.jpg', '.png')

    # The look-behind keeps an ID longer than four digits from being read as
    # its last four digits (e.g. "12345_01_..." must not become person 2345).
    market_pattern = re.compile(
        r'(?<!\d)(\d{4})_(\d{2})_(\d{6})\.(?:jpg|png)', re.IGNORECASE
    )
    pattern = re.compile(r'(\d+)_c(\d+)_')  # custom "<person>_c<camera>_" layout

    id_counts = defaultdict(int)
    cam_counts = defaultdict(int)
    id_cam_pairs = defaultdict(set)
    cam_id_pairs = defaultdict(set)

    # Case-insensitive so that ".JPG" / ".PNG" files are not dropped.
    images = [f for f in os.listdir(dataset_dir) if f.lower().endswith(image_exts)]

    skipped = 0
    for img in images:
        # In both patterns group 1 is the person ID and group 2 the camera ID.
        m = market_pattern.search(img) or pattern.search(img)
        if m is None:
            skipped += 1
            continue

        person_id = int(m.group(1))
        camera_id = int(m.group(2))

        id_counts[person_id] += 1
        cam_counts[camera_id] += 1
        id_cam_pairs[person_id].add(camera_id)
        cam_id_pairs[camera_id].add(person_id)

    print(f"\n{'='*50}")
    print(f"{dataset_name} Analysis")
    print(f"{'='*50}")
    print(f"Total images: {len(images)}")
    if skipped:
        print(f"Skipped (unrecognized file name): {skipped}")
    print(f"Unique person IDs: {len(id_counts)}")
    print(f"Unique cameras: {len(cam_counts)}")

    if id_counts:
        per_id = list(id_counts.values())
        n_ids = len(id_counts)

        print("\nImages per person ID:")
        print(f"  Average: {np.mean(per_id):.1f}")
        print(f"  Median: {np.median(per_id):.1f}")
        print(f"  Min: {min(per_id)}")
        print(f"  Max: {max(per_id)}")
        print(f"  Std: {np.std(per_id):.1f}")

        # Bucket the identities by how many images they have.
        n_low = sum(1 for count in per_id if count < 5)
        n_medium = sum(1 for count in per_id if 5 <= count < 20)
        n_high = sum(1 for count in per_id if count >= 20)

        print("\nDistribution:")
        print(f"  IDs with < 5 images: {n_low} ({n_low/n_ids*100:.1f}%)")
        print(f"  IDs with 5-19 images: {n_medium} ({n_medium/n_ids*100:.1f}%)")
        print(f"  IDs with >= 20 images: {n_high} ({n_high/n_ids*100:.1f}%)")

        print("\nCamera distribution:")
        for cam_id in sorted(cam_counts):
            print(f"  Camera {cam_id}: {cam_counts[cam_id]} images, {len(cam_id_pairs[cam_id])} unique persons")

        # Number of cameras for every identity seen by more than one camera.
        cams_per_multi_cam_id = [
            len(cams) for cams in id_cam_pairs.values() if len(cams) > 1
        ]
        n_multi = len(cams_per_multi_cam_id)
        print("\nCross-camera statistics:")
        print(f"  IDs appearing in multiple cameras: {n_multi} ({n_multi/n_ids*100:.1f}%)")

        if cams_per_multi_cam_id:
            print(f"  Average cameras per multi-cam ID: {np.mean(cams_per_multi_cam_id):.1f}")

    return id_counts, cam_counts, id_cam_pairs

def plot_distribution(id_counts, title="Images per Person Distribution"):
    """Show a 50-bin histogram of the values of ``id_counts``.

    The values are plotted on the x axis ("Number of Images") and the number
    of entries per bin on the y axis ("Number of Persons"); the keys are
    ignored. ``title`` becomes the figure title.
    """
    images_per_person = list(id_counts.values())

    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(images_per_person, bins=50, edgecolor='black')
    ax.set_xlabel('Number of Images')
    ax.set_ylabel('Number of Persons')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

    plt.show()

def check_data_consistency(train_dir, query_dir, gallery_dir):
    """Report how the person IDs of the three splits overlap.

    A person ID is the first run of digits followed by ``_`` in a file name;
    entries without such a run are ignored.

    Expected overlaps differ per pair: under the standard ReID protocol the
    training identities are disjoint from the test (query/gallery)
    identities, while every query identity must also appear in the gallery,
    otherwise that query has no correct match.

    Args:
        train_dir: Directory of the training split.
        query_dir: Directory of the query split.
        gallery_dir: Directory of the gallery (test) split.

    Returns:
        Tuple ``(train_ids, query_ids, gallery_ids)`` of sets of integer IDs.
    """
    id_pattern = re.compile(r'(\d+)_')

    def _collect_person_ids(split_dir):
        """Return the set of person IDs parsed from the names in split_dir."""
        found = set()
        for name in os.listdir(split_dir):
            m = id_pattern.search(name)
            if m:
                found.add(int(m.group(1)))
        return found

    print(f"\n{'='*50}")
    print("Data Consistency Check")
    print(f"{'='*50}")

    train_ids = _collect_person_ids(train_dir)
    query_ids = _collect_person_ids(query_dir)
    gallery_ids = _collect_person_ids(gallery_dir)

    print(f"Train IDs: {len(train_ids)}")
    print(f"Query IDs: {len(query_ids)}")
    print(f"Gallery IDs: {len(gallery_ids)}")

    print("\nOverlaps:")
    print(f"  Train ∩ Query: {len(train_ids & query_ids)} (should be 0)")
    print(f"  Train ∩ Gallery: {len(train_ids & gallery_ids)} (should be 0)")
    print(f"  Query ∩ Gallery: {len(query_ids & gallery_ids)} (should equal Query IDs)")

    # Every query identity needs at least one gallery image to be retrievable.
    missing = query_ids - gallery_ids
    if not missing:
        print("✓ All query IDs exist in gallery (correct)")
    else:
        # Sorted so the sample is reproducible; ellipsis only when truncated.
        sample = sorted(missing)[:5]
        suffix = "..." if len(missing) > len(sample) else ""
        print(f"✗ {len(missing)} query IDs missing from gallery: {sample}{suffix}")

    return train_ids, query_ids, gallery_ids

def generate_reid_statistics(base_dir):
    """Analyze all three splits under ``base_dir`` and write a summary file.

    Runs ``analyze_dataset_quality`` on ``bounding_box_train``, ``query`` and
    ``bounding_box_test``, then ``check_data_consistency`` across them, and
    finally writes the image counts to ``<base_dir>/dataset_statistics.txt``.

    Only ``.jpg`` / ``.png`` files (any letter case) are counted as images,
    so stray entries in a split directory do not inflate the totals.

    Args:
        base_dir: Dataset root containing the three split directories.
    """
    train_dir = os.path.join(base_dir, "bounding_box_train")
    query_dir = os.path.join(base_dir, "query")
    gallery_dir = os.path.join(base_dir, "bounding_box_test")

    def _count_images(split_dir):
        """Return the number of image files directly inside split_dir."""
        return sum(
            1 for name in os.listdir(split_dir)
            if name.lower().endswith(('.jpg', '.png'))
        )

    # Analyze each split (the reports are printed by the callee).
    analyze_dataset_quality(train_dir, "TRAIN SET")
    analyze_dataset_quality(query_dir, "QUERY SET")
    analyze_dataset_quality(gallery_dir, "GALLERY SET")

    # Check consistency
    check_data_consistency(train_dir, query_dir, gallery_dir)

    # Generate summary report
    print(f"\n{'='*50}")
    print("SUMMARY REPORT")
    print(f"{'='*50}")

    # List each directory once and reuse the counts for screen and file.
    n_train = _count_images(train_dir)
    n_query = _count_images(query_dir)
    n_gallery = _count_images(gallery_dir)
    total_images = n_train + n_query + n_gallery
    print(f"Total images in dataset: {total_images}")

    # Save statistics to file
    stats_file = os.path.join(base_dir, "dataset_statistics.txt")
    with open(stats_file, 'w', encoding='utf-8') as f:
        f.write("CCVID to Market1501 Conversion Statistics\n")
        f.write("="*50 + "\n")
        f.write(f"Total images: {total_images}\n")
        f.write(f"Train images: {n_train}\n")
        f.write(f"Query images: {n_query}\n")
        f.write(f"Gallery images: {n_gallery}\n")

    print(f"\nStatistics saved to: {stats_file}")

# Main execution
if __name__ == "__main__":
    # Dataset root; generate_reid_statistics() joins it with
    # "bounding_box_train", "query" and "bounding_box_test".
    base_dir = "/home/ika/yzlm/TwinProject/ReID_Experiments/LTCC_ReID/data"

    # Run the analysis of all three splits, the consistency check, and write
    # dataset_statistics.txt into base_dir.
    generate_reid_statistics(base_dir)

    # Optional: Plot distributions
    # (the first value returned by analyze_dataset_quality is the
    # images-per-person-ID mapping that plot_distribution expects)
    # train_dir = os.path.join(base_dir, "bounding_box_train")
    # train_stats, _, _ = analyze_dataset_quality(train_dir, "TRAIN SET")
    # plot_distribution(train_stats, "Train Set: Images per Person")


TRAIN SET Analysis
Total images: 9576
Unique person IDs: 14
Unique cameras: 12

Images per person ID:
  Average: 684.0
  Median: 204.0
  Min: 20
  Max: 3615
  Std: 993.1

Distribution:
  IDs with < 5 images: 0 (0.0%)
  IDs with 5-19 images: 0 (0.0%)
  IDs with >= 20 images: 14 (100.0%)

Camera distribution:
  Camera 1: 521 images, 7 unique persons
  Camera 2: 941 images, 5 unique persons
  Camera 3: 436 images, 4 unique persons
  Camera 4: 1078 images, 10 unique persons
  Camera 5: 798 images, 8 unique persons
  Camera 6: 1276 images, 13 unique persons
  Camera 7: 229 images, 5 unique persons
  Camera 8: 500 images, 5 unique persons
  Camera 9: 1674 images, 13 unique persons
  Camera 10: 619 images, 5 unique persons
  Camera 11: 923 images, 9 unique persons
  Camera 12: 581 images, 6 unique persons

Cross-camera statistics:
  IDs appearing in multiple cameras: 13 (92.9%)
  Average cameras per multi-cam ID: 6.8

QUERY SET Analysis
Total images: 493
Unique person IDs: 13
Unique cameras:

# Train 

In [9]:
!tao model re_identification train -e /home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/combined.yaml

2025-08-05 05:59:03,146 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2025-08-05 05:59:03,191 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:6.0.0-pyt
Docker will run the commands as root. If you would like to retain your
local host permissions, please add the "user":"UID:GID" in the
DockerOptions portion of the "/home/ika/.tao_mounts.json" file. You can obtain your
users UID and GID by using the "id -u" and "id -g" commands on the
terminal.
2025-08-05 05:59:03,200 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 308: Printing tty value True
2025-08-05 02:59:05,733 [TAO Toolkit] [INFO] matplotlib.font_manager 1639: generated new fontManager
'combined.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/automatic_schema_

# Evaluate


In [5]:
!tao model re_identification evaluate \
    -e /home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/combined.yaml \
    evaluate.checkpoint=/home/ika/yzlm/TwinProject/ReID_Experiments/resnet50_market1501_aicity156.tlt \
    evaluate.query_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/data/query \
    evaluate.test_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/data/bounding_box_test  \
    re_ranking.re_ranking=False


2025-08-06 19:59:29,469 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2025-08-06 19:59:29,517 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:6.0.0-pyt
Docker will run the commands as root. If you would like to retain your
local host permissions, please add the "user":"UID:GID" in the
DockerOptions portion of the "/home/ika/.tao_mounts.json" file. You can obtain your
users UID and GID by using the "id -u" and "id -g" commands on the
terminal.
2025-08-06 19:59:29,527 [TAO Toolkit] [INFO] nvidia_tao_cli.components.docker_handler.docker_handler 308: Printing tty value True
2025-08-06 16:59:32,168 [TAO Toolkit] [INFO] matplotlib.font_manager 1639: generated new fontManager
'combined.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/automatic_schema_

In [11]:
!tao model re_identification evaluate \
    -e /home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/combined.yaml \
    evaluate.checkpoint=/home/ika/yzlm/TwinProject/ReID_Experiments/resnet50_market1501_aicity156.tlt \
    evaluate.query_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/data/query \
    evaluate.test_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTTC+PRCC+ULIRI/data/bounding_box_test \
    re_ranking.re_ranking=False

2025-08-06 20:05:08,861 [TAO Toolkit] [INFO] root 160: Registry: ['nvcr.io']
2025-08-06 20:05:08,909 [TAO Toolkit] [INFO] nvidia_tao_cli.components.instance_handler.local_instance 360: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:6.0.0-pyt
Traceback (most recent call last):
  File "/home/ika/miniconda3/bin/tao", line 8, in <module>
    sys.exit(main())
             ~~~~^^
  File "/home/ika/miniconda3/lib/python3.13/site-packages/nvidia_tao_cli/entrypoint/tao_launcher.py", line 134, in main
    instance.launch_command(
    ~~~~~~~~~~~~~~~~~~~~~~~^
        task_group,
        ^^^^^^^^^^^
        task,
        ^^^^^
        args[2:]
        ^^^^^^^^
    )
    ^
  File "/home/ika/miniconda3/lib/python3.13/site-packages/nvidia_tao_cli/components/instan

In [24]:
!docker run --gpus all \
    --shm-size=16G \
    -v /home/ika:/home/ika \
    --rm \
    -it \
    nvcr.io/nvidia/tao/tao-toolkit:5.0.0-pyt \
    re_identification evaluate \
    -e /home/ika/yzlm/TwinProject/ReID_Experiments/pretrained.yaml \
    evaluate.checkpoint=/home/ika/yzlm/TwinProject/ReID_Experiments/resnet50_market1501_aicity156.tlt \
    evaluate.query_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTCC_ReID/data/query \
    evaluate.test_dataset=/home/ika/yzlm/TwinProject/ReID_Experiments/LTCC_ReID/data/bounding_box_test \
    re_ranking.re_ranking=False


=== TAO Toolkit PyTorch ===

NVIDIA Release 5.0.0-PyT (build 53420872)
TAO Toolkit Version 5.0.0

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

This container image and its contents are governed by the TAO Toolkit End User License Agreement.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/tao-toolkit-software-license-agreement
ERROR: No supported GPU(s) detected to run this container

'pretrained.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/automatic_schema_matching for migration instructions.
'pretrained.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/automatic_schema_matching for