In [1]:
# Import necessary modules
import os
import numpy as np

# Function to process data and save to a new folder with the same filename
def process_and_save_data(file_path, mapping, new_directory):
    """
    Collapse columns 3-6 of every row of a label file into one mapped value.

    Each non-blank line is split on whitespace. Columns 3-6 (indices 2-5) are
    parsed as integers and looked up as a 4-tuple in ``mapping``; the mapped
    value replaces column 3 and columns 4-6 are removed. All other columns
    are kept unchanged. The rows are written, newline-separated and without a
    trailing newline, to ``new_directory`` under the original file name.

    Args:
        file_path: Path of the text file to read.
        mapping: Dict from 4-tuples of ints to the replacement value.
        new_directory: Output directory; created if it does not exist.

    Returns:
        Path of the written file.

    Raises:
        ValueError: If a non-blank row has fewer than 6 columns, if columns
            3-6 cannot be parsed as integers, or if their combination is not
            a key of ``mapping``.
    """
    processed_data = []

    with open(file_path, 'r') as file:
        for line in file:
            row = line.split()

            if not row:
                # Blank lines (e.g. a trailing newline) carry no label; skipping
                # them avoids rejecting an otherwise valid file.
                continue

            if len(row) < 6:
                raise ValueError(f"Error: Row has insufficient data: {row}")

            key = tuple(map(int, row[2:6]))
            if key not in mapping:
                raise ValueError(f"Error: Unexpected combination {key} in row {row}")

            # Replace the four source columns with the single mapped value.
            row[2:6] = [str(mapping[key])]
            processed_data.append(" ".join(row))

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(new_directory, exist_ok=True)

    # Nothing is written unless every row was processed successfully.
    new_filepath = os.path.join(new_directory, os.path.basename(file_path))
    with open(new_filepath, 'w') as file:
        file.write("\n".join(processed_data))

    return new_filepath

# Function to process all text files in a directory
def process_all_files_in_directory(directory, new_directory, mapping):
    """
    Run ``process_and_save_data`` on every ``.txt`` file directly inside ``directory``.

    Each file is handled independently: a failure is reported on stdout and
    the remaining files are still processed.

    Args:
        directory: Folder whose ``.txt`` files are processed.
        new_directory: Folder passed through as the output location.
        mapping: Column-combination mapping passed through unchanged.
    """
    text_files = (name for name in os.listdir(directory) if name.endswith(".txt"))

    for filename in text_files:
        source_path = os.path.join(directory, filename)
        try:
            saved_path = process_and_save_data(source_path, mapping, new_directory)
            print(f"Processed file saved at: {saved_path}")
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f"Failed to process {filename}: {e}")

# Mapping for the combinations of columns 3, 4, 5, 6
# Each 4-tuple is one allowed combination of columns 3-6; its position in the
# sequence below is the single value that replaces those columns.
mapping = {
    combination: class_id
    for class_id, combination in enumerate((
        (0, 0, 0, 0),
        (0, 1, 0, 0),
        (0, 0, 1, 0),
        (0, 0, 0, 1),
        (1, 0, 0, 0),
        (1, 1, 0, 0),
        (1, 0, 1, 0),
        (1, 0, 0, 1),
        (0, 1, 0, 1),
    ))
}

# Example usage: read the original label files from `input_directory` and
# write the converted files (same file names) to `output_directory`.
# Both paths are relative to the notebook's working directory.
input_directory = 'Compete_COCO/labels/train'
output_directory = 'Compete_COCO/new_labels/train'

# Process all text files in the input directory; files that fail are
# reported on stdout and skipped rather than aborting the run.
process_all_files_in_directory(input_directory, output_directory, mapping)


Processed file saved at: Compete_COCO/new_labels/train/CounterClock_Noisy_00013015.txt
Processed file saved at: Compete_COCO/new_labels/train/OiSam02_Day_00057561.txt
Processed file saved at: Compete_COCO/new_labels/train/Yeoksam_001887.txt
Processed file saved at: Compete_COCO/new_labels/train/OiSam02_Day_00051549.txt
Processed file saved at: Compete_COCO/new_labels/train/KCITY03_Snow_00011248.txt
Processed file saved at: Compete_COCO/new_labels/train/Seorin_005874.txt
Processed file saved at: Compete_COCO/new_labels/train/Jamsil_003525.txt
Processed file saved at: Compete_COCO/new_labels/train/Banseok01_Snow_00059249.txt
Processed file saved at: Compete_COCO/new_labels/train/IllegalParking05_Rain_00021538.txt
Processed file saved at: Compete_COCO/new_labels/train/TailLight52_000034.txt
Processed file saved at: Compete_COCO/new_labels/train/ETRItoBanseok_Snow_00009205.txt
Processed file saved at: Compete_COCO/new_labels/train/Banseok01_Snow_00052749.txt
Processed file saved at: Compet

In [2]:
import os
import csv
from collections import Counter

def parse_labels(label_path):
    """
    Read the first three integer columns of every row of a label file.

    Rows are split on whitespace; rows with fewer than three fields are
    ignored. The three values are returned as
    ``(car_bus, location, rear_light)`` tuples in file order.
    """
    with open(label_path, 'r') as handle:
        split_rows = (raw_line.strip().split() for raw_line in handle)
        return [
            (int(fields[0]), int(fields[1]), int(fields[2]))
            for fields in split_rows
            if len(fields) >= 3
        ]

def assign_label_by_values(values):
    """
    Encode the set of distinct values as one integer label.

    The distinct values are sorted in descending order and their decimal
    representations are concatenated, e.g. ``[0, 2, 1, 2]`` -> ``210``.

    Args:
        values: Iterable of integers (for location or rear light).

    Returns:
        The concatenated label as an int, or ``-1`` when ``values`` is empty
        (for example a label file without any parsable row). Previously an
        empty input failed with an opaque ``int('')`` ValueError.

    Note:
        The encoding is only unambiguous for single-digit values: ``[10]``
        and ``[1, 0]`` both yield ``10``.
    """
    unique_values = sorted(set(values), reverse=True)  # Descending order
    if not unique_values:
        # Sentinel for "no values"; cannot collide with a label built from
        # non-negative values.
        return -1
    return int(''.join(map(str, unique_values)))

def generate_labels(label_dir, output_csv_file):
    """
    Summarise every ``.txt`` label file in ``label_dir`` as one CSV row.

    For each file (processed in sorted name order) the row holds: the file
    name, whether any row has 1 in its first column (bus present), the
    combined location label, the combined rear light label, and a weather
    tag taken from the file name. The tag is the first of 'Fog', 'Snow',
    'Noisy', 'Rain', 'Day' found in the name, otherwise 'Normal'.
    """
    # Checked in this order; the first keyword contained in the name wins.
    weather_keywords = ('Fog', 'Snow', 'Noisy', 'Rain', 'Day')

    label_files = sorted(name for name in os.listdir(label_dir) if name.endswith(".txt"))

    with open(output_csv_file, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Label Filename', 'Bus Present', 'Location Label', 'Rear Light Label', 'Weather Condition'])

        for label_file in label_files:
            label_path = os.path.join(label_dir, label_file)
            if not os.path.exists(label_path):
                continue

            parsed_rows = parse_labels(label_path)

            # A bus is present when any row carries 1 in the first column.
            bus_present = any(entry[0] == 1 for entry in parsed_rows)

            location_label = assign_label_by_values([entry[1] for entry in parsed_rows])
            rear_light_label = assign_label_by_values([entry[2] for entry in parsed_rows])

            weather_condition = next(
                (keyword for keyword in weather_keywords if keyword in label_file),
                'Normal',
            )

            csvwriter.writerow([label_file, bus_present, location_label, rear_light_label, weather_condition])

# Example usage: summarise the converted label files (one CSV row per file).
# Both paths are relative to the notebook's working directory.
label_directory = "Compete_COCO/new_labels/train"
output_csv_file = "combined_labels.csv"

# Generate the combined labels and write them to the CSV file
# (an existing file of that name is overwritten).
generate_labels(label_directory, output_csv_file)


In [None]:
import pandas as pd

def distribution(df):
    """
    Print the number of unique values and the value counts of each column.

    The first column of ``df`` is skipped. Counts are listed in descending
    order and include missing values.
    """
    for column in df.columns[1:]:
        series = df[column]
        print(f"Number of unique classes in {column}: {series.nunique()}")

        print(f"Distribution for {column}:")
        counts = series.value_counts(dropna=False, ascending=False)
        print(counts, "\n")

# Example usage: load the per-file summary CSV and print, for every column
# except the first (the file name), its unique-value count and distribution.
csv_file = "combined_labels.csv"
df = pd.read_csv(csv_file)
distribution(df)


Number of unique classes in Bus Present: 2
Distribution for Bus Present:
False    22940
True     10245
Name: Bus Present, dtype: int64 

Number of unique classes in Location Label: 31
Distribution for Location Label:
210      6861
10       3244
3210     3123
21       2966
1        1694
321      1414
4210     1407
310      1202
2        1097
3        1064
43210     953
4         826
32        749
20        734
421       613
31        571
0         484
410       481
420       474
4321      438
42        393
40        385
41        294
43        276
4320      233
320       221
30        211
4310      208
432       199
431       195
430       175
Name: Location Label, dtype: int64 

Number of unique classes in Rear Light Label: 73
Distribution for Rear Light Label:
0       17541
40       9268
10       1010
20        915
420       688
        ...  
431         1
530         1
5           1
6310        1
7540        1
Name: Rear Light Label, Length: 73, dtype: int64 

Number of unique classe

In [4]:
# Show column dtypes and non-null counts of `df`, the DataFrame read from
# combined_labels.csv in the previous cell (that cell must have been run first).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33185 entries, 0 to 33184
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Label Filename     33185 non-null  object
 1   Bus Present        33185 non-null  bool  
 2   Location Label     33185 non-null  int64 
 3   Rear Light Label   33185 non-null  int64 
 4   Weather Condition  33185 non-null  object
dtypes: bool(1), int64(2), object(2)
memory usage: 1.0+ MB


In [5]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_split_and_move(csv_file, train_image_dir, train_label_dir, base_output_dir, test_size=0.1, val_size=0.1):
    """
    Stratified train/val/test split; moves the val/test files out of the train folders.

    The rows of ``csv_file`` are stratified on the combination of the
    'Bus Present', 'Location Label', 'Rear Light Label' and
    'Weather Condition' columns. Combinations with a single sample stay in
    the train set, as do combinations that end up with a single sample in
    the held-out (val + test) part. Files selected for val/test are moved
    from the train folders to ``<base_output_dir>/images/<split>`` and
    ``<base_output_dir>/labels/<split>``; train files are left in place.

    Args:
        csv_file: CSV with a 'Label Filename' column plus the four
            stratification columns.
        train_image_dir: Folder holding the train images. Image names are
            derived from the label name by swapping the extension for '.png'.
        train_label_dir: Folder holding the train label ``.txt`` files.
        base_output_dir: Root under which the images/ and labels/ val and
            test folders are created.
        test_size: Fraction of the stratifiable samples targeted for test.
        val_size: Fraction of the stratifiable samples targeted for val.

    Notes:
        Both splits use ``random_state=42``, so the selection is
        deterministic for a given CSV. Files missing from the source
        folders are skipped silently.
    """
    df = pd.read_csv(csv_file)

    # One stratification key per row, built from all relevant columns.
    strat_columns = ['Bus Present', 'Location Label', 'Rear Light Label', 'Weather Condition']
    df['stratify_col'] = df[strat_columns].astype(str).agg('_'.join, axis=1)

    # Keys with fewer than 2 samples cannot be stratified; they stay in train.
    class_counts = df['stratify_col'].value_counts()
    single_class_data = df[df['stratify_col'].isin(class_counts[class_counts < 2].index)]
    df_filtered = df[df['stratify_col'].isin(class_counts[class_counts >= 2].index)]

    # First split: train vs. held-out (val + test); 80% / 20% with the defaults.
    holdout_size = test_size + val_size
    sss = StratifiedShuffleSplit(n_splits=1, test_size=holdout_size, random_state=42)
    train_idx, test_val_idx = next(sss.split(df_filtered, df_filtered['stratify_col']))
    train_df = df_filtered.iloc[train_idx]
    test_val_df = df_filtered.iloc[test_val_idx]

    # Keys left with a single held-out sample cannot be split again; send them back to train.
    test_val_class_counts = test_val_df['stratify_col'].value_counts()
    single_class_in_test_val = test_val_df[
        test_val_df['stratify_col'].isin(test_val_class_counts[test_val_class_counts < 2].index)
    ]
    train_df = pd.concat([train_df, single_class_in_test_val])
    test_val_df = test_val_df[~test_val_df.index.isin(single_class_in_test_val.index)]

    # Second split: held-out -> val / test. The splitter's "test" portion becomes
    # test_df, so its share must be test_size / (test_size + val_size). The previous
    # code used val_size here, which swapped the val and test sizes whenever the two
    # differed; with the equal defaults (0.5) the result is unchanged.
    sss_val = StratifiedShuffleSplit(n_splits=1, test_size=test_size / holdout_size, random_state=42)
    val_idx, test_idx = next(sss_val.split(test_val_df, test_val_df['stratify_col']))
    val_df = test_val_df.iloc[val_idx]
    test_df = test_val_df.iloc[test_idx]

    # Single-sample keys join the train set; train_df is bookkeeping only
    # (train files are never moved).
    train_df = pd.concat([train_df, single_class_data])

    # Sanity check: every CSV row is assigned to exactly one split.
    assert len(train_df) + len(val_df) + len(test_df) == len(df)

    # Create directories for val and test sets (train files are already in place).
    for folder in ['val', 'test']:
        os.makedirs(os.path.join(base_output_dir, 'images', folder), exist_ok=True)
        os.makedirs(os.path.join(base_output_dir, 'labels', folder), exist_ok=True)

    def move_files(split_df, split_type):
        """Move the image and label file of every row in ``split_df`` into the ``split_type`` folders."""
        image_dest = os.path.join(base_output_dir, 'images', split_type)
        label_dest = os.path.join(base_output_dir, 'labels', split_type)

        for label_file in split_df['Label Filename']:
            # Swap only the extension; str.replace would also rewrite a
            # '.txt' occurring in the middle of the name.
            image_file = os.path.splitext(label_file)[0] + '.png'
            image_src = os.path.join(train_image_dir, image_file)
            if os.path.exists(image_src):
                shutil.move(image_src, os.path.join(image_dest, image_file))

            label_src = os.path.join(train_label_dir, label_file)
            if os.path.exists(label_src):
                shutil.move(label_src, os.path.join(label_dest, label_file))

    # Move files from train to val and test folders.
    move_files(val_df, 'val')
    move_files(test_df, 'test')

# Example usage: split according to combined_labels.csv and move the selected
# val/test images and labels out of the train folders (this changes files on disk).
# NOTE(review): the CSV was generated from new_labels/train, but only files under
# labels/train and images/train are moved here; new_labels/train is left
# untouched by this cell — confirm that is intended.
csv_file = "combined_labels.csv"
train_image_directory = "Compete_COCO/images/train"
train_label_directory = "Compete_COCO/labels/train"
base_output_directory = "Compete_COCO"

# Perform stratified split and move the files
stratified_split_and_move(csv_file, train_image_directory, train_label_directory, base_output_directory)


In [6]:
# NOTE: this whole cell is disabled (commented out). When enabled, for every
# .txt file name found in folder1 it moves the file of the same name from
# folder2 into folder3 — here: mirror the labels/val and labels/test selection
# into new_labels/val and new_labels/test.
# NOTE(review): these paths carry a '02_baseline_code_and_model/' prefix that the
# other cells do not use, and folder3 must presumably exist beforehand — confirm
# before re-enabling.

# import os
# import shutil

# new_label_train_folder = '02_baseline_code_and_model/Compete_COCO/new_labels/train'

# val_folder = '02_baseline_code_and_model/Compete_COCO/labels/val'
# new_val_folder= '02_baseline_code_and_model/Compete_COCO/new_labels/val'

# test_folder = '02_baseline_code_and_model/Compete_COCO/labels/test'
# new_test_folder= '02_baseline_code_and_model/Compete_COCO/new_labels/test'

# def move_new_labels(folder1, folder2, folder3):
#     # Get the list of file names from folder1
#     file_names = [f for f in os.listdir(folder1) if f.endswith('.txt')]

#     # For each file name, look the file up in folder2 and move it to folder3
#     for file_name in file_names:
#         file_path = os.path.join(folder2, file_name)
#         if os.path.exists(file_path):
#             shutil.move(file_path, folder3)
#             print(f'{file_name} 파일을 폴더3으로 이동했습니다.')
#         else:
#             print(f'{file_name} 파일을 폴더2에서 찾을 수 없습니다.')

# move_new_labels(val_folder, new_label_train_folder, new_val_folder)
# move_new_labels(test_folder, new_label_train_folder, new_test_folder)


In [7]:
import os
from collections import defaultdict

def count_classes_in_labels_by_folder(label_dir):
    """
    Count label values and weather tags separately for train, val and test.

    For every ``.txt`` file in ``<label_dir>/<folder>`` the first six
    whitespace-separated columns of each non-blank row are tallied:
    column 1 as ``Vehicle_<value>``, column 2 as ``Location_<value>`` and
    columns 3-6 as ``Binary_<column>_<value>``. In addition, each file whose
    name contains 'Fog', 'Snow', 'Noisy', 'Rain' or 'Day' is counted once per
    matching keyword.

    Args:
        label_dir: Folder containing the 'train', 'val' and 'test' subfolders
            (all three must exist).

    Returns:
        Dict mapping folder name to
        ``{'class_counts': {...}, 'weather_counts': {...}}``.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    folder_counts = {}
    weather_keywords = ['Fog', 'Snow', 'Noisy', 'Rain', 'Day']  # Weather conditions to look for

    for folder in ['train', 'val', 'test']:
        class_counts = defaultdict(int)
        weather_class_counts = defaultdict(int)
        folder_path = os.path.join(label_dir, folder)

        for label_file in os.listdir(folder_path):
            if not label_file.endswith('.txt'):
                continue

            with open(os.path.join(folder_path, label_file), 'r') as f:
                for line in f:
                    data = line.split()[:6]  # Only consider the first 6 columns
                    if not data:
                        # Blank lines (e.g. a trailing newline) previously
                        # crashed with IndexError; they hold nothing to count.
                        continue

                    class_counts[f'Vehicle_{data[0]}'] += 1
                    class_counts[f'Location_{data[1]}'] += 1
                    # Columns 3-6 are tallied per column number and value.
                    for i, val in enumerate(data[2:], start=3):
                        class_counts[f'Binary_{i}_{val}'] += 1

            # Weather tags are counted per file, not per row.
            for weather in weather_keywords:
                if weather in label_file:
                    weather_class_counts[weather] += 1

        folder_counts[folder] = {
            'class_counts': dict(class_counts),
            'weather_counts': dict(weather_class_counts)
        }

    return folder_counts

def print_class_counts(class_counts_by_folder):
    """
    Print per-folder counts in a fixed order: Vehicle, Location, Binary, then weather.

    ``class_counts_by_folder`` maps a folder name to a dict with the keys
    'class_counts' and 'weather_counts'. Labels that are absent from a
    folder's counts are printed with a count of 0.
    """
    vehicle_labels = [f'Vehicle_{vehicle}' for vehicle in range(2)]
    location_labels = [f'Location_{location}' for location in range(5)]
    binary_labels = [f'Binary_{column}_{bit}' for column in range(3, 7) for bit in range(2)]
    print_order = vehicle_labels + location_labels + binary_labels
    weather_order = ('Fog', 'Snow', 'Noisy', 'Rain', 'Day')

    for folder, data in class_counts_by_folder.items():
        class_counts = data['class_counts']
        weather_counts = data['weather_counts']

        print(f"\nClass counts for {folder} folder:")
        for class_label in print_order:
            print(f"{class_label}: {class_counts.get(class_label, 0)}")

        print("\nWeather counts:")
        for weather in weather_order:
            print(f"{weather}: {weather_counts.get(weather, 0)}")

# Example usage: tally the label files of the train/val/test subfolders
# under `label_directory` (all three subfolders must exist at this point).
label_directory = "Compete_COCO/labels"
class_counts_by_folder = count_classes_in_labels_by_folder(label_directory)

# Print the results in a specific order
print_class_counts(class_counts_by_folder)



Class counts for train folder:
Vehicle_0: 187019
Vehicle_1: 12305
Location_0: 21598
Location_1: 78162
Location_2: 59868
Location_3: 21314
Location_4: 18382
Binary_3_0: 173476
Binary_3_1: 25848
Binary_4_0: 196727
Binary_4_1: 2597
Binary_5_0: 196726
Binary_5_1: 2598
Binary_6_0: 198258
Binary_6_1: 1066

Weather counts:
Fog: 2364
Snow: 6474
Noisy: 2457
Rain: 3000
Day: 2715

Class counts for val folder:
Vehicle_0: 20667
Vehicle_1: 1305
Location_0: 2425
Location_1: 8611
Location_2: 6740
Location_3: 2196
Location_4: 2000
Binary_3_0: 19178
Binary_3_1: 2794
Binary_4_0: 21788
Binary_4_1: 184
Binary_5_0: 21807
Binary_5_1: 165
Binary_6_0: 21916
Binary_6_1: 56

Weather counts:
Fog: 243
Snow: 750
Noisy: 278
Rain: 313
Day: 316

Class counts for test folder:
Vehicle_0: 20710
Vehicle_1: 1396
Location_0: 2433
Location_1: 8633
Location_2: 6790
Location_3: 2266
Location_4: 1984
Binary_3_0: 19364
Binary_3_1: 2742
Binary_4_0: 21924
Binary_4_1: 182
Binary_5_0: 21936
Binary_5_1: 170
Binary_6_0: 22052
Binary_