<a href="https://colab.research.google.com/github/raz0208/City-Person-Dataset-EDA/blob/main/CityPerson_LabelFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Augmentation Procsess

In [1]:
# Instal Required Libraries
import os
import zipfile
import json
import pandas as pd
from collections import Counter
import shutil

In [2]:
# Dataset zip files path from Google Drive
gtFine = '/content/drive/MyDrive/CityPersonDataset/gtFine_trainvaltest.zip'
gtFinePanopticParts = '/content/drive/MyDrive/CityPersonDataset/gtFinePanopticParts_trainval.zip'
gtBbox = '/content/drive/MyDrive/CityPersonDataset/gtBbox_cityPersons_trainval.zip'

gtFine_ExtPath = '/content/CityPersonDataset/gtFine_trainvaltest'
gtFinePano_ExtPath = '/content/CityPersonDataset/gtFinePanopticParts_trainval'
gtBbox_ExtPath = '/content/CityPersonDataset/gtBbox_cityPersons_trainval'

In [3]:
# Extracting files function
def extract_zip(file_path, extract_path):
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Extract both datasets zip files
extract_zip(gtFine, gtFine_ExtPath)
extract_zip(gtFinePanopticParts, gtFinePano_ExtPath)
extract_zip(gtBbox, gtBbox_ExtPath)

In [4]:
# List the extracted content from both datasets
gtFine_Files = os.listdir(gtFine_ExtPath)
gtFinepano_Files = os.listdir(gtFinePano_ExtPath)
gtBbox_Files = os.listdir(gtBbox_ExtPath)

gtFine_Files, gtFinepano_Files, gtBbox_Files

(['gtFine', 'README', 'license.txt'],
 ['README_panopticParts.md', 'license.txt', 'gtFinePanopticParts'],
 ['gtBboxCityPersons', 'README_cityPersons', 'license.txt'])

In [5]:
# Path to core folder
gtFine_CorePath = os.path.join(gtFine_ExtPath, 'gtFine')
gtFinePano_CorePath = os.path.join(gtFinePano_ExtPath, 'gtFinePanopticParts')
gtBbox_CorePath = os.path.join(gtBbox_ExtPath, 'gtBboxCityPersons')

# List driectories inside core folders
gtFine_Dirs = os.listdir(gtFine_CorePath) if os.path.exists(gtFine_CorePath) else []
gtFinePano_Dirs = os.listdir(gtFinePano_CorePath) if os.path.exists(gtFinePano_CorePath) else []
gtBbox_Dirs = os.listdir(gtBbox_CorePath) if os.path.exists(gtBbox_CorePath) else []

gtFine_Dirs, gtFinePano_Dirs, gtBbox_Dirs

(['train', 'val', 'test'], ['train', 'val'], ['train', 'val'])

In [6]:
# Define the subdirectories
subdirs = gtFine_Dirs #["train", "val", "test"]

# Initialize dictionaries to store samples from each subdirectory
gtFine_CityFolders = {}
gtFinePano_CityFolders = {}
gtBbox_CityFolders = {}

# Process each subdirectory
for subdir in subdirs:
    gtFine_CityFolders[subdir] = os.listdir(os.path.join(gtFine_CorePath, subdir)) if subdir in gtFine_Dirs else []
    gtFinePano_CityFolders[subdir] = os.listdir(os.path.join(gtFinePano_CorePath, subdir)) if subdir in gtFinePano_Dirs else []
    gtBbox_CityFolders[subdir] = os.listdir(os.path.join(gtBbox_CorePath, subdir)) if subdir in gtBbox_Dirs else []

# Output the first few files for each subdirectory
gtFine_CityFolders_Preview = {key: value[:] for key, value in gtFine_CityFolders.items()}
gtFinePano_CityFolders_Preview = {key: value[:] for key, value in gtFinePano_CityFolders.items()}
gtBbox_CityFolders_Preview = {key: value[:] for key, value in gtBbox_CityFolders.items()}

gtFine_CityFolders_Preview, gtFinePano_CityFolders_Preview, gtBbox_CityFolders_Preview

({'train': ['weimar',
   'tubingen',
   'cologne',
   'erfurt',
   'jena',
   'stuttgart',
   'hanover',
   'darmstadt',
   'krefeld',
   'bremen',
   'ulm',
   'bochum',
   'zurich',
   'dusseldorf',
   'strasbourg',
   'monchengladbach',
   'hamburg',
   'aachen'],
  'val': ['frankfurt', 'lindau', 'munster'],
  'test': ['leverkusen', 'munich', 'bonn', 'mainz', 'berlin', 'bielefeld']},
 {'train': ['weimar',
   'tubingen',
   'cologne',
   'erfurt',
   'jena',
   'stuttgart',
   'hanover',
   'darmstadt',
   'krefeld',
   'bremen',
   'ulm',
   'bochum',
   'zurich',
   'dusseldorf',
   'strasbourg',
   'monchengladbach',
   'hamburg',
   'aachen'],
  'val': ['frankfurt', 'lindau', 'munster'],
  'test': []},
 {'train': ['weimar',
   'tubingen',
   'cologne',
   'erfurt',
   'jena',
   'stuttgart',
   'hanover',
   'darmstadt',
   'krefeld',
   'bremen',
   'ulm',
   'bochum',
   'zurich',
   'dusseldorf',
   'strasbourg',
   'monchengladbach',
   'hamburg',
   'aachen'],
  'val': ['fra

#### All json files which contain all labels (classes)

In [7]:
# Define the required labels to filter them
required_labels = {"pedestrian", "ignore", "rider", "sitting person", "person (other)", "person group"}

# Initialize a list to store filtered file names
filtered_json_files = []

# Process each subdirectory (train, val, test)
for subdir in os.listdir(gtBbox_CorePath):
    subdir_path = os.path.join(gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if it's not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                file_path = os.path.join(city_path, file)

                with open(file_path, "r") as json_file:
                    data = json.load(json_file)

                    # Extract all labels in the current file
                    file_labels = {obj["label"] for obj in data.get("objects", [])}

                    # Check if all required labels are present
                    if required_labels.issubset(file_labels):
                        filtered_json_files.append(file_path)

# Output filtered JSON files
filtered_json_files

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/cologne/cologne_000123_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/erfurt/erfurt_000068_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/hanover/hanover_000000_027390_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/hanover/hanover_000000_039021_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/hanover/hanover_000000_040294_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/hanover/hanover_000000_027998_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/hanover/hanover_000000_043822_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_tr

In [8]:
def analyze_class_distribution(json_files):
    class_counts = Counter()
    for file in json_files:
        with open(file, 'r') as f:
            data = json.load(f)
        for obj in data['objects']:
            class_counts[obj['label']] += 1
    return class_counts

gtBbox_class_distribution = analyze_class_distribution(filtered_json_files)
print(pd.DataFrame(gtBbox_class_distribution.items(), columns=["Class", "Count"]))

            Class  Count
0  person (other)     34
1      pedestrian    454
2  sitting person     81
3           rider     36
4          ignore    100
5    person group     88


#### All json files which contains more than one unique label (class)

In [9]:
filterd_json_files_MoreThanOneLabels = []

# Process each subdirectory (train, val, test)
for subdir in os.listdir(gtBbox_CorePath):
    subdir_path = os.path.join(gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                file_path = os.path.join(city_path, file)

                with open(file_path, "r") as json_file:
                    data = json.load(json_file)

                    # Extract unique labels in the file
                    file_labels = {obj["label"] for obj in data.get("objects", [])}

                    # Check if more than one unique label exists
                    if len(file_labels) > 1:
                        filterd_json_files_MoreThanOneLabels.append(file_path)

# Output filtered JSON files
filterd_json_files_MoreThanOneLabels

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000085_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000055_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000049_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000092_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000084_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000106_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000002_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBbo

In [10]:
gtBbox_class_distribution = analyze_class_distribution(filterd_json_files_MoreThanOneLabels)
print(pd.DataFrame(gtBbox_class_distribution.items(), columns=["Class", "Count"]))

            Class  Count
0  sitting person   1209
1      pedestrian  18586
2          ignore   7637
3           rider   2111
4    person group   1569
5  person (other)    502


#### Remove json files which contain just "pedestrian", "ignore" labels (classes)

In [11]:
# Copy orginal dataset
removedirs_gtBbox_CorePath = gtBbox_CorePath

# Define the specific labels we want to check for removal
target_labels = {"pedestrian", "ignore"}

# Iterate through subdirectories (train, val, test)
for subdir in os.listdir(removedirs_gtBbox_CorePath):
    subdir_path = os.path.join(removedirs_gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                file_path = os.path.join(city_path, file)

                with open(file_path, "r") as json_file:
                    data = json.load(json_file)

                    # Extract unique labels in the file
                    file_labels = {obj["label"] for obj in data.get("objects", [])}

                    # Check if the file contains only "pedestrian" and "ignore"
                    if file_labels == target_labels:
                        os.remove(file_path)  # Remove the JSON file
                        print(f"Removed: {file_path}")

Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000055_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000084_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000002_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000041_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000057_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000105_000019_gtBboxCityPersons.json
Removed: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000003_000019_gtBboxCityPersons.json
Removed: /content/CityPerso

In [12]:
filterd_json_files_MoreThanOneLabels = []

# Process each subdirectory (train, val, test)
for subdir in os.listdir(removedirs_gtBbox_CorePath):
    subdir_path = os.path.join(removedirs_gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                file_path = os.path.join(city_path, file)

                with open(file_path, "r") as json_file:
                    data = json.load(json_file)

                    # Extract unique labels in the file
                    file_labels = {obj["label"] for obj in data.get("objects", [])}

                    # Check if more than one unique label exists
                    if len(file_labels) > 1:
                        filterd_json_files_MoreThanOneLabels.append(file_path)

# Output filtered JSON files
filterd_json_files_MoreThanOneLabels

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000085_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000049_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000092_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000106_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000037_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000098_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000111_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBbo

In [13]:
gtBbox_class_distribution = analyze_class_distribution(filterd_json_files_MoreThanOneLabels)
print(pd.DataFrame(gtBbox_class_distribution.items(), columns=["Class", "Count"]))

            Class  Count
0  sitting person   1209
1      pedestrian  15257
2           rider   2111
3          ignore   5278
4    person group   1569
5  person (other)    502


#### Replace all "pedestrian" labels with "ignore" in all json files

In [17]:
# Iterate through subdirectories (train, val, test)
for subdir in os.listdir(removedirs_gtBbox_CorePath):
    subdir_path = os.path.join(removedirs_gtBbox_CorePath, subdir)

    if not os.path.isdir(subdir_path):
        continue  # Skip if not a directory

    for city in os.listdir(subdir_path):
        city_path = os.path.join(subdir_path, city)

        if not os.path.isdir(city_path):
            continue

        for file in os.listdir(city_path):
            if file.endswith("_gtBboxCityPersons.json"):  # Adjust this if needed
                file_path = os.path.join(city_path, file)

                # Read JSON file
                with open(file_path, "r") as json_file:
                    data = json.load(json_file)

                # Modify labels in-place
                modified = False
                for obj in data.get("objects", []):
                    if obj["label"] == "pedestrian":
                        obj["label"] = "ignore"
                        modified = True

                # Write the updated JSON file if any modification was made
                if modified:
                    with open(file_path, "w") as json_file:
                        json.dump(data, json_file, indent=4)
                    print(f"Updated: {file_path}")

Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000085_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000126_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000049_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000092_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000106_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000037_000019_gtBboxCityPersons.json
Updated: /content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000138_000019_gtBboxCityPersons.json
Updated: /content/CityPerso

In [20]:
# Create a list of all JSON files from gtFine
def get_json_files(directory):
    json_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))
    return json_files

filterd_json_files_NoPedestrain = get_json_files(removedirs_gtBbox_CorePath)
filterd_json_files_MoreThanOneLabels

['/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000085_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000049_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000092_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000106_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000037_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000098_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBboxCityPersons/train/weimar/weimar_000111_000019_gtBboxCityPersons.json',
 '/content/CityPersonDataset/gtBbox_cityPersons_trainval/gtBbo

In [22]:
gtBbox_class_distribution = analyze_class_distribution(filterd_json_files_NoPedestrain)
print(pd.DataFrame(gtBbox_class_distribution.items(), columns=["Class", "Count"]))

            Class  Count
0  sitting person   1217
1          ignore  22394
2           rider   2189
3    person group   1573
4  person (other)    504


In [14]:
# # Define output ZIP file path
# output_zip_path = "/content/filtered_cityperson_dataset.zip"

# # Function to zip the dataset
# def zip_dataset(folder_path, output_path):
#     shutil.make_archive(output_path.replace(".zip", ""), 'zip', folder_path)
#     print(f"Dataset zipped successfully: {output_path}")

# # Zip the dataset
# zip_dataset(removedirs_gtBbox_CorePath, output_zip_path)

# # If running in Colab, provide a download link
# try:
#     from google.colab import files
#     files.download(output_zip_path)
#     print("Download started...")
# except ImportError:
#     print(f"Dataset saved at: {output_zip_path}")