In [1]:
import os
import random
import shutil
import cv2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
Project_path ='/content/drive/MyDrive/Fashion Search Engine Project'
dataset_folder = Project_path + '/Data/Dataset_Extracted/Full_Shot_Images_Original'  # Path to the main dataset folder
annotation_folder = Project_path + '/Data/Dataset_Extracted/Images_annotations'
output_folder = Project_path + '/Data/Dataset_Extracted/FashionDatasetSampledForYolo'  # Path to the output sampled data folder

In [4]:
sample_size = 10000  # Number of samples to include in the representative sample
train_ratio = 0.8  # Ratio of samples to keep for training
val_ratio = 0.15  # Ratio of samples to keep for validation
test_ratio = 0.05  # Ratio of samples to keep for testing
image_size = (416, 416)  # Target size for the YOLO model

# Create the train, validation, and test folders
train_folder = os.path.join(output_folder, 'train')
val_folder = os.path.join(output_folder, 'val')
test_folder = os.path.join(output_folder, 'test')

os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [5]:
# Create subfolders within train, validation, and test folders for images and annotations

train_img_folder = os.path.join(train_folder, 'images')
train_ann_folder = os.path.join(train_folder, 'labels')

val_img_folder = os.path.join(val_folder, 'images')
val_ann_folder = os.path.join(val_folder, 'labels')

test_img_folder = os.path.join(test_folder, 'images')
test_ann_folder = os.path.join(test_folder, 'labels')


os.makedirs(train_img_folder, exist_ok=True)
os.makedirs(train_ann_folder, exist_ok=True)

os.makedirs(val_img_folder, exist_ok=True)
os.makedirs(val_ann_folder, exist_ok=True)

os.makedirs(test_img_folder, exist_ok=True)
os.makedirs(test_ann_folder, exist_ok=True)

In [7]:
# Get a list of all image files in the dataset folder
image_files = [f for f in os.listdir(dataset_folder) if f.endswith('.jpg')]

In [8]:
# Randomly select the representative sample
random_sample = random.sample(image_files, sample_size)

# Shuffle the representative sample
random.shuffle(random_sample)

In [9]:
print(len(image_files))
print(len(random_sample))

44860
10000


In [10]:
# Calculate the number of samples for each set
num_samples = len(random_sample)
num_train = int(train_ratio * num_samples)
num_val = int(val_ratio * num_samples)
num_test = num_samples - num_train - num_val

# Split the representative sample into train, validation, and test sets
train_files = random_sample[:num_train]
val_files = random_sample[num_train:num_train + num_val]
test_files = random_sample[num_train + num_val:]

In [11]:
def Fill_Folders(type_img_folder,type_ann_folder,files_type):
    # Copy the selected samples to the appropriate folders
    for file in files_type:
        img_path = os.path.join(dataset_folder, file)
        ann_path = os.path.join(annotation_folder, file.replace('.jpg', '.txt'))
        shutil.copy(img_path, type_img_folder)
        shutil.copy(ann_path, type_ann_folder)

In [12]:
Fill_Folders(train_img_folder,train_ann_folder,train_files)

In [13]:
print(len(os.listdir(train_img_folder)))
print(len(os.listdir(train_ann_folder)))

8000
8000


In [14]:
Fill_Folders(val_img_folder,val_ann_folder,val_files)

In [15]:
print(len(os.listdir(val_img_folder)))
print(len(os.listdir(val_ann_folder)))

1500
1500


In [16]:
Fill_Folders(test_img_folder,test_ann_folder,test_files)

In [17]:
print(len(os.listdir(test_img_folder)))
print(len(os.listdir(test_ann_folder)))

500
500


In [18]:
import os
import glob

image_folder = train_img_folder  # Replace with the path to your image folder
label_folder = train_ann_folder  # Replace with the path to your label folder

# Get the list of image files and label files
image_files = glob.glob(os.path.join(image_folder, '*.jpg'))  # Replace '*.jpg' with your image file extension
label_files = glob.glob(os.path.join(label_folder, '*.txt'))  # Replace '*.txt' with your label file extension

# Sort the lists to ensure the order is consistent
image_files.sort()
label_files.sort()

# Iterate over the image and label files simultaneously
for image_file, label_file in zip(image_files, label_files):
    image_name = os.path.basename(image_file)  # Get the image file name
    label_name = os.path.basename(label_file)  # Get the label file name

    # Extract the file name without extension
    image_name_without_ext = os.path.splitext(image_name)[0]
    label_name_without_ext = os.path.splitext(label_name)[0]

    if image_name_without_ext == label_name_without_ext:
        # The image and label have the same name, they are associated
        print(f"Image: {image_name}, Label: {label_name}")
    else:
        # The image and label do not have the same name, they are not associated
        print(f"Error: Image and Label names do not match for {image_name}, Label: {label_name}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image: 62f8c778e04a1c77c122c8e176aed2d1076dd9ad_0.jpg, Label: 62f8c778e04a1c77c122c8e176aed2d1076dd9ad_0.txt
Image: 62ffbed5dbe775e2ad56df5fb1484e62d1fac418_0.jpg, Label: 62ffbed5dbe775e2ad56df5fb1484e62d1fac418_0.txt
Image: 6307c19eaf639b8fbdb407d6c539566f17701a0e_1.jpg, Label: 6307c19eaf639b8fbdb407d6c539566f17701a0e_1.txt
Image: 630c3b05248244a0b432f847b0585e5dbad5642a_1.jpg, Label: 630c3b05248244a0b432f847b0585e5dbad5642a_1.txt
Image: 631ea076d7d8411b9bc62978ca9c1344ddbf6893_0.jpg, Label: 631ea076d7d8411b9bc62978ca9c1344ddbf6893_0.txt
Image: 631f03f916b6c2d1a3ee409c876379521386d116_0.jpg, Label: 631f03f916b6c2d1a3ee409c876379521386d116_0.txt
Image: 6322568dbaad4b440f02b1056c1fbafeb7291ab0_0.jpg, Label: 6322568dbaad4b440f02b1056c1fbafeb7291ab0_0.txt
Image: 63244d4bcd501ae4ff39d239b8f99f8f1ca74e6d_0.jpg, Label: 63244d4bcd501ae4ff39d239b8f99f8f1ca74e6d_0.txt
Image: 6326dedc05cf8a78606fbb392c140f51ad329621_0.jpg, Label: 6

### **Trying To Stratified samples**

In [None]:
Project_path ='/content/drive/MyDrive/Fashion Search Engine Project'

# Directory containing images
images_directory = Project_path + "/Data/Dataset_Extracted/Full_Shot_Images_Original/"

# Directory containing annotation files
annotations_directory = Project_path + "/Data/Dataset_Extracted/Images_annotations/"

# Output directory for the balanced dataset
output_directory = Project_path +  "/Data/Dataset_Extracted/balanced_dataset/"

In [None]:
# Create the output directories if they don't exist
sampled_images_directory = os.path.join(output_directory, "sampled_images")
os.makedirs(sampled_images_directory, exist_ok=True)

sampled_annotations_directory = os.path.join(output_directory, "sampled_annotations")
os.makedirs(sampled_annotations_directory, exist_ok=True)

In [None]:
# Dictionary to store label frequencies
label_frequencies = {}

In [None]:
def extract_label_from_annotation(annotation):
    label = annotation.split()[0]  # Extract the first number from the annotation
    return label

# Check if all labels in the annotation file have counts equal to or below the minimum frequency count
def is_below_min_frequency(annotation):
    counts = [label_frequencies[extract_label_from_annotation(a)] for a in annotation]
    return all(count <= min_frequency_count for count in counts)

In [None]:
# Load the list of image filenames
image_filenames = [filename for filename in os.listdir(images_directory) if filename.endswith(".jpg")]

In [None]:
len(image_filenames)

44860

In [None]:
# Iterate through each image
cnt =0
for image_filename in image_filenames:
    cnt +=1
    image_path = os.path.join(images_directory, image_filename)
    annotation_path = os.path.join(annotations_directory, os.path.splitext(image_filename)[0] + ".txt")
    print(cnt)
    # Load the annotation file
    with open(annotation_path, 'r') as file:
        annotations = file.read().strip().split('\n')

    # Iterate through each annotation in the file
    for annotation in annotations:
        label = extract_label_from_annotation(annotation)  # Implement your own method to extract the label

        # Update the label frequencies
        if label not in label_frequencies:
            label_frequencies[label] = 0
        label_frequencies[label] += 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
39861
39862
39863
39864
39865
39866
39867
39868
39869
39870
39871
39872
39873
39874
39875
39876
39877
39878
39879
39880
39881
39882
39883
39884
39885
39886
39887
39888
39889
39890
39891
39892
39893
39894
39895
39896
39897
39898
39899
39900
39901
39902
39903
39904
39905
39906
39907
39908
39909
39910
39911
39912
39913
39914
39915
39916
39917
39918
39919
39920
39921
39922
39923
39924
39925
39926
39927
39928
39929
39930
39931
39932
39933
39934
39935
39936
39937
39938
39939
39940
39941
39942
39943
39944
39945
39946
39947
39948
39949
39950
39951
39952
39953
39954
39955
39956
39957
39958
39959
39960
39961
39962
39963
39964
39965
39966
39967
39968
39969
39970
39971
39972
39973
39974
39975
39976
39977
39978
39979
39980
39981
39982
39983
39984
39985
39986
39987
39988
39989
39990
39991
39992
39993
39994
39995
39996
39997
39998
39999
40000
40001
40002
40003
40004
40005
40006
40007
40008
40009
40010
40011
40012
40013
40014
40015
40016

In [None]:
label_frequencies

{'16': 8969,
 '4': 23978,
 '15': 14546,
 '18': 2564,
 '17': 1816,
 '39': 3136,
 '36': 2651,
 '12': 3197,
 '6': 3471,
 '10': 5851,
 '7': 7412,
 '28': 1405,
 '11': 1426,
 '5': 2676,
 '38': 399,
 '21': 4038,
 '19': 2903,
 '42': 2650,
 '35': 9662,
 '3': 5542,
 '9': 2182,
 '40': 1027,
 '1': 733,
 '31': 573,
 '8': 1439,
 '22': 409,
 '23': 132,
 '37': 28,
 '2': 59,
 '24': 27,
 '41': 27,
 '25': 259,
 '43': 14,
 '29': 37,
 '14': 32,
 '26': 9,
 '32': 56,
 '20': 17,
 '33': 1,
 '34': 18,
 '13': 2,
 '27': 6,
 '0': 1,
 '30': 1}

In [None]:
# Find the minimum frequency count among all labels
min_frequency_count = min(label_frequencies.values())


In [None]:
# Filter the image filenames based on the condition
filtered_image_filenames = [image_filename for image_filename in image_filenames if is_below_min_frequency(annotations)]

In [None]:
# Copy the filtered images and their annotation files to the output directories
for image_filename in filtered_image_filenames:
    image_path = os.path.join(images_directory, image_filename)
    annotation_path = os.path.join(annotations_directory, os.path.splitext(image_filename)[0] + ".txt")
    shutil.copy2(image_path, os.path.join(sampled_images_directory, image_filename))
    shutil.copy2(annotation_path, os.path.join(sampled_annotations_directory, os.path.splitext(image_filename)[0] + ".txt"))

print("Balanced dataset created successfully.")