<a href="https://colab.research.google.com/github/raz0208/City-Person-Dataset-EDA/blob/main/CityPerson_LabelFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Augmentation Process

In [27]:
# Import required libraries
import json
import os
import shutil
import zipfile
from collections import Counter

import pandas as pd

In [7]:
# Every dataset has two locations: the zip archive on Google Drive and the
# local directory the archive is extracted into.

# gtFine
gtFine = '/content/drive/MyDrive/CityPersonDataset/gtFine_trainvaltest.zip'
gtFine_ExtPath = '/content/CityPersonDataset/gtFine_trainvaltest'

# gtFinePanopticParts
gtFinePanopticParts = '/content/drive/MyDrive/CityPersonDataset/gtFinePanopticParts_trainval.zip'
gtFinePano_ExtPath = '/content/CityPersonDataset/gtFinePanopticParts_trainval'

# gtBbox (CityPersons)
gtBbox = '/content/drive/MyDrive/CityPersonDataset/gtBbox_cityPersons_trainval.zip'
gtBbox_ExtPath = '/content/CityPersonDataset/gtBbox_cityPersons_trainval'

In [9]:
# Extracting files function
def extract_zip(file_path, extract_path):
    """Extract every member of a zip archive into a directory.

    Args:
        file_path: Path of the zip archive to read.
        extract_path: Directory to extract into. It is created, together with
            any missing parent directories, when it does not exist yet.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    # exist_ok=True replaces the separate exists() check, so there is no gap
    # between testing for the directory and creating it.
    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Unpack each archive into its own extraction directory
for archive_path, target_dir in (
    (gtFine, gtFine_ExtPath),
    (gtFinePanopticParts, gtFinePano_ExtPath),
    (gtBbox, gtBbox_ExtPath),
):
    extract_zip(archive_path, target_dir)

In [10]:
# Top-level entries of the three extraction directories, in the order
# gtFine, gtFinePanopticParts, gtBbox
gtFine_Files, gtFinepano_Files, gtBbox_Files = map(
    os.listdir,
    (gtFine_ExtPath, gtFinePano_ExtPath, gtBbox_ExtPath),
)

# Show the three listings as the cell output
(gtFine_Files, gtFinepano_Files, gtBbox_Files)

(['README', 'license.txt', 'gtFine'],
 ['README_panopticParts.md', 'license.txt', 'gtFinePanopticParts'],
 ['README_cityPersons', 'gtBboxCityPersons', 'license.txt'])

In [11]:
# Core folder of each dataset: the extraction directory joined with the
# dataset's own folder name
gtFine_CorePath, gtFinePano_CorePath, gtBbox_CorePath = (
    os.path.join(extraction_root, core_name)
    for extraction_root, core_name in (
        (gtFine_ExtPath, 'gtFine'),
        (gtFinePano_ExtPath, 'gtFinePanopticParts'),
        (gtBbox_ExtPath, 'gtBboxCityPersons'),
    )
)

# List directories inside the core folders; a core folder that does not exist
# yields an empty list instead of raising
gtFine_Dirs, gtFinePano_Dirs, gtBbox_Dirs = (
    os.listdir(core_path) if os.path.exists(core_path) else []
    for core_path in (gtFine_CorePath, gtFinePano_CorePath, gtBbox_CorePath)
)

# Show the three listings as the cell output
(gtFine_Dirs, gtFinePano_Dirs, gtBbox_Dirs)

(['train', 'test', 'val'], ['train', 'val'], ['train', 'val'])

In [12]:
# The entries of the gtFine core folder are the subdirectories that get
# compared across all three datasets
subdirs = gtFine_Dirs  # e.g. ["train", "val", "test"]

# For each subdirectory, list its entries in every dataset; a dataset whose
# core folder lacks that subdirectory gets an empty list
gtFine_CityFolders = {
    split: os.listdir(os.path.join(gtFine_CorePath, split)) if split in gtFine_Dirs else []
    for split in subdirs
}
gtFinePano_CityFolders = {
    split: os.listdir(os.path.join(gtFinePano_CorePath, split)) if split in gtFinePano_Dirs else []
    for split in subdirs
}
gtBbox_CityFolders = {
    split: os.listdir(os.path.join(gtBbox_CorePath, split)) if split in gtBbox_Dirs else []
    for split in subdirs
}

# Copies of the complete listings (nothing is truncated), kept under separate
# names for display
gtFine_CityFolders_Preview = {split: list(cities) for split, cities in gtFine_CityFolders.items()}
gtFinePano_CityFolders_Preview = {split: list(cities) for split, cities in gtFinePano_CityFolders.items()}
gtBbox_CityFolders_Preview = {split: list(cities) for split, cities in gtBbox_CityFolders.items()}

# Show the three mappings as the cell output
(gtFine_CityFolders_Preview, gtFinePano_CityFolders_Preview, gtBbox_CityFolders_Preview)

({'train': ['zurich',
   'bochum',
   'erfurt',
   'bremen',
   'krefeld',
   'ulm',
   'tubingen',
   'strasbourg',
   'monchengladbach',
   'darmstadt',
   'jena',
   'dusseldorf',
   'cologne',
   'aachen',
   'stuttgart',
   'hanover',
   'weimar',
   'hamburg'],
  'test': ['berlin', 'munich', 'bonn', 'leverkusen', 'mainz', 'bielefeld'],
  'val': ['munster', 'lindau', 'frankfurt']},
 {'train': ['zurich',
   'bochum',
   'erfurt',
   'bremen',
   'krefeld',
   'ulm',
   'tubingen',
   'strasbourg',
   'monchengladbach',
   'darmstadt',
   'jena',
   'dusseldorf',
   'cologne',
   'aachen',
   'stuttgart',
   'hanover',
   'weimar',
   'hamburg'],
  'test': [],
  'val': ['munster', 'lindau', 'frankfurt']},
 {'train': ['zurich',
   'bochum',
   'erfurt',
   'bremen',
   'krefeld',
   'ulm',
   'tubingen',
   'strasbourg',
   'monchengladbach',
   'darmstadt',
   'jena',
   'dusseldorf',
   'cologne',
   'aachen',
   'stuttgart',
   'hanover',
   'weimar',
   'hamburg'],
  'test': [],


In [30]:
# Labels that must all occur in an annotation file for it to be kept
required_labels = {"pedestrian", "ignore", "rider", "sitting person", "person (other)", "person group"}

# Gather candidate annotation files level by level (subdirectory -> city ->
# file). Entries that are not directories are skipped at the first two levels.
split_dirs = [os.path.join(gtBbox_CorePath, entry) for entry in os.listdir(gtBbox_CorePath)]
city_dirs = [
    os.path.join(split_dir, entry)
    for split_dir in split_dirs if os.path.isdir(split_dir)
    for entry in os.listdir(split_dir)
]
annotation_paths = [
    os.path.join(city_dir, entry)
    for city_dir in city_dirs if os.path.isdir(city_dir)
    for entry in os.listdir(city_dir)
    if entry.endswith("_gtBboxCityPersons.json")  # Adjust this if needed
]

# Keep the path of every file whose set of labels covers all required labels
filtered_json_files = []
for annotation_path in annotation_paths:
    with open(annotation_path, "r") as handle:
        annotation = json.load(handle)

    labels_in_file = {obj["label"] for obj in annotation.get("objects", [])}
    if required_labels.issubset(labels_in_file):
        filtered_json_files.append(annotation_path)

# Show the selected paths as the cell output
filtered_json_files

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000070_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000065_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/erfurt/erfurt_000068_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/bremen/bremen_000217_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/strasbourg/strasbourg_000000_016436_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/strasbourg/strasbourg_000000_028822_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/strasbourg/strasbourg_000000_029281_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_ci

In [31]:
def analyze_class_distribution(json_files):
    """Count how many annotated objects carry each label across JSON files.

    Args:
        json_files: Iterable of paths to JSON files. Each file is read as a
            mapping whose optional ``"objects"`` entry is a list of mappings
            with a ``"label"`` key.

    Returns:
        collections.Counter mapping each label to the number of objects that
        carry it, summed over all files.
    """
    class_counts = Counter()
    for file in json_files:
        with open(file, 'r') as f:
            data = json.load(f)
        # A file without an "objects" entry contributes nothing instead of
        # raising KeyError, in line with the data.get("objects", []) lookups
        # used by the filtering cells.
        class_counts.update(obj['label'] for obj in data.get('objects', []))
    return class_counts

# Label counts over the files that contain every required label, printed as a
# two-column table
gtBbox_class_distribution = analyze_class_distribution(filtered_json_files)
distribution_table = pd.DataFrame(
    gtBbox_class_distribution.items(),
    columns=["Class", "Count"],
)
print(distribution_table)

            Class  Count
0      pedestrian    454
1  person (other)     34
2  sitting person     81
3           rider     36
4          ignore    100
5    person group     88


In [33]:
# Gather candidate annotation files level by level (subdirectory -> city ->
# file). Entries that are not directories are skipped at the first two levels.
split_dirs = [os.path.join(gtBbox_CorePath, entry) for entry in os.listdir(gtBbox_CorePath)]
city_dirs = [
    os.path.join(split_dir, entry)
    for split_dir in split_dirs if os.path.isdir(split_dir)
    for entry in os.listdir(split_dir)
]
annotation_paths = [
    os.path.join(city_dir, entry)
    for city_dir in city_dirs if os.path.isdir(city_dir)
    for entry in os.listdir(city_dir)
    if entry.endswith("_gtBboxCityPersons.json")  # Adjust this if needed
]

# Keep the path of every file that holds at least two distinct labels
filterd_json_files_MoreThanOneLabels = []
for annotation_path in annotation_paths:
    with open(annotation_path, "r") as handle:
        annotation = json.load(handle)

    distinct_labels = {obj["label"] for obj in annotation.get("objects", [])}
    if len(distinct_labels) > 1:
        filterd_json_files_MoreThanOneLabels.append(annotation_path)

# Show the selected paths as the cell output
filterd_json_files_MoreThanOneLabels

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000111_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000029_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000031_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000035_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000066_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000045_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/zurich/zurich_000002_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBbo

In [34]:
# Label counts over the files that hold more than one distinct label, printed
# as a two-column table
gtBbox_class_distribution = analyze_class_distribution(filterd_json_files_MoreThanOneLabels)
distribution_table = pd.DataFrame(
    gtBbox_class_distribution.items(),
    columns=["Class", "Count"],
)
print(distribution_table)

            Class  Count
0      pedestrian  18586
1          ignore   7637
2           rider   2111
3  sitting person   1209
4  person (other)    502
5    person group   1569


In [None]:
# Copy the original CityPersons annotations and filter the copy, so that the
# extracted originals stay intact. The copy is made only when it does not exist
# yet; re-running the cell keeps working on the same copy.
Filtered_gtBbox_CorePath = gtBbox_CorePath + '_filtered'
if not os.path.exists(Filtered_gtBbox_CorePath):
    shutil.copytree(gtBbox_CorePath, Filtered_gtBbox_CorePath)

# Define the specific labels we want to check for removal
target_labels = {"pedestrian", "ignore"}

# Iterate through the subdirectories of the copy. "pedestrian" and "ignore" are
# labels of the *_gtBboxCityPersons.json annotations, so those files are the
# ones scanned here.
# NOTE(review): the previous version scanned the gtFine *_polygons.json files
# and deleted from the original tree - confirm that filtering a copy of the
# gtBbox annotations is the intended behaviour.
for subdir in os.listdir(Filtered_gtBbox_CorePath):
    subdir_path = os.path.join(Filtered_gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if not file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                continue

            file_path = os.path.join(city_path, file)

            with open(file_path, "r") as json_file:
                data = json.load(json_file)

            # Extract unique labels in the file
            file_labels = {obj["label"] for obj in data.get("objects", [])}

            # Remove the file when its label set is exactly {"pedestrian", "ignore"}.
            # The removal happens after the with-block has closed the file,
            # because deleting a file that is still open fails on Windows.
            if file_labels == target_labels:
                os.remove(file_path)
                print(f"Removed: {file_path}")
