<a href="https://colab.research.google.com/github/raz0208/City-Person-Dataset-EDA/blob/main/CityPersonDatasetEDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CityPerson Dataset EDA (Exploratory Data Analysis)**
A complete EDA (Exploratory Data Analysis) of the CityPerson dataset.

## Datasets Used:


*   gtFine_trainvaltest
*   gtFinePanopticParts_trainval

## Dataset Structure:

1.   gtFine_trainvaltest
  *   Contains PNG and JSON files.
  *   Organized by:
       * Train, validation, and test folders.
  *   File types:
      * *_color.png: Color-coded images for segmentation.
      * *_instanceIds.png: Encoded image masks where each pedestrian is represented with a unique ID.
      * *_labelIds.png: Encoded image masks for class labels.
      * *_polygons.json: Contains polygonal annotations (labelled object outlines) for semantic and instance segmentation.
2. gtFinePanopticParts_trainval:
    * Contains TIF files.
    * Organized similarly to gtFine_trainvaltest.
    * File type:
      * *_gtFinePanopticParts.tif: Panoptic segmentation with part-level annotations (e.g., parts of a pedestrian like arms or legs).


In [1]:
# Import required libraries
import zipfile
import os

In [5]:
# Locations on Google Drive, grouped per dataset: the zip archive first,
# then the directory it gets extracted into.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.

# gtFine_trainvaltest
gtFine = '/content/drive/MyDrive/CityPersonDataset/gtFine_trainvaltest.zip'
gtFine_ExtPath = '/content/drive/MyDrive/CityPersonDataset/gtFine_trainvaltest'

# gtFinePanopticParts_trainval
gtFinePanopticParts = '/content/drive/MyDrive/CityPersonDataset/gtFinePanopticParts_trainval.zip'
gtFinePano_ExtPath = '/content/drive/MyDrive/CityPersonDataset/gtFinePanopticParts_trainval'

In [8]:
# Extracting files function
def extract_zip(zip_path, ext_path):
  """Extract every member of a zip archive into a target directory.

  Args:
    zip_path: Path of the zip archive to read.
    ext_path: Directory to extract into; it is created (including any
      missing parent directories) when it does not exist yet.

  Raises:
    FileNotFoundError: If ``zip_path`` does not exist.
    zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
  """
  # exist_ok=True replaces a separate exists() check, which could race with
  # another process creating the directory between the check and makedirs().
  os.makedirs(ext_path, exist_ok=True)
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(ext_path)

# Unpack each archive into its matching destination directory
for archive, destination in (
    (gtFine, gtFine_ExtPath),
    (gtFinePanopticParts, gtFinePano_ExtPath),
):
  extract_zip(archive, destination)

In [10]:
# Top-level entries of each extracted archive (gtFine first, panoptic parts second)
gtFine_Files, gtFinepano_Files = (
    os.listdir(root) for root in (gtFine_ExtPath, gtFinePano_ExtPath)
)

gtFine_Files, gtFinepano_Files

(['README', 'license.txt', 'gtFine'],
 ['README_panopticParts.md', 'license.txt', 'gtFinePanopticParts'])

In [13]:
# Path to the core folder inside each extracted archive
gtFine_CorePath = os.path.join(gtFine_ExtPath, 'gtFine')
gtFinePano_CorePath = os.path.join(gtFinePano_ExtPath, 'gtFinePanopticParts')

# List the directories inside the core folders, falling back to an empty list
# when a core folder is missing. isdir() is used instead of exists() so that a
# regular file with the same name also yields [] rather than making
# os.listdir() raise NotADirectoryError.
gtFine_Dirs = os.listdir(gtFine_CorePath) if os.path.isdir(gtFine_CorePath) else []
gtFinePano_Dirs = os.listdir(gtFinePano_CorePath) if os.path.isdir(gtFinePano_CorePath) else []

gtFine_Dirs, gtFinePano_Dirs

(['train', 'val', 'test'], ['train', 'val'])

In [23]:
# Dataset splits to inspect
subdirs = ["train", "val", "test"]

# Every split starts with an empty listing, so a split that is absent from a
# dataset stays [].
gtFine_Samples = {split: [] for split in subdirs}
gtFinePano_Samples = {split: [] for split in subdirs}

# Fill in the directory listing for each split that exists in the dataset
for subdir in subdirs:
    if subdir in gtFine_Dirs:
        gtFine_Samples[subdir] = os.listdir(os.path.join(gtFine_CorePath, subdir))
    if subdir in gtFinePano_Dirs:
        gtFinePano_Samples[subdir] = os.listdir(os.path.join(gtFinePano_CorePath, subdir))

# Shallow copies of the complete listings (not truncated), used for display
gtFine_Samples_Preview = {split: list(entries) for split, entries in gtFine_Samples.items()}
gtFinePano_Samples_Preview = {split: list(entries) for split, entries in gtFinePano_Samples.items()}

gtFine_Samples_Preview, gtFinePano_Samples_Preview

({'train': ['jena',
   'stuttgart',
   'ulm',
   'dusseldorf',
   'darmstadt',
   'zurich',
   'bremen',
   'bochum',
   'hamburg',
   'tubingen',
   'aachen',
   'krefeld',
   'hanover',
   'weimar',
   'strasbourg',
   'cologne',
   'erfurt',
   'monchengladbach'],
  'val': ['frankfurt', 'munster', 'lindau'],
  'test': ['berlin', 'mainz', 'bielefeld', 'leverkusen', 'bonn', 'munich']},
 {'train': ['zurich',
   'strasbourg',
   'weimar',
   'aachen',
   'tubingen',
   'jena',
   'bochum',
   'darmstadt',
   'dusseldorf',
   'hamburg',
   'cologne',
   'monchengladbach',
   'krefeld',
   'ulm',
   'hanover',
   'stuttgart',
   'erfurt',
   'bremen'],
  'val': ['munster', 'lindau', 'frankfurt'],
  'test': []})