### Dataset Preprocessing


In [7]:
import os
import shutil
import pandas as pd

# Paths
dataset_folder = 'datasetFull'
csv_file = 'label_train.csv'

# Read CSV
df = pd.read_csv(csv_file)

# Create folders for each label
labels = df['race'].unique()
for label in labels:
    os.makedirs(os.path.join(dataset_folder, label), exist_ok=True)

# Move files to corresponding folders
for index, row in df.iterrows():
    file_name = row['file']
    label = row['race']
    source = os.path.join(dataset_folder, file_name)
    destination = os.path.join(dataset_folder, label, str(index+1))
    shutil.move(source, destination)

print("Files segregated successfully.")


Files segregated successfully.


In [None]:
import os
from PIL import Image

def lowercase_extensions(directory):
    for root, _, files in os.walk(directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            try:
                with Image.open(file_path) as img:
                    # Get the format from the image metadata
                    format = img.format.lower()
                    correct_extension = f".{format}"

                    # Get the current file extension
                    _, ext = os.path.splitext(filename)
                    ext = ext.lower()

                    if ext != correct_extension:
                        # Construct the new file name with the correct extension
                        new_file_path = os.path.join(root, os.path.splitext(filename)[0] + correct_extension)
                        os.rename(file_path, new_file_path)
                        print(f'Renamed: {file_path} to {new_file_path}')
            except Exception as e:
                print(f"Could not process file {file_path}: {e}")

# Directory to be processed
directory_path = 'datasetFull'

# Lowercase the extensions and fix any incorrect ones
lowercase_extensions(directory_path)


Could not process file datasetFull/.DS_Store: cannot identify image file 'datasetFull/.DS_Store'
Renamed: datasetFull/Indian/86062 to datasetFull/Indian/86062.jpeg
Renamed: datasetFull/Indian/71409 to datasetFull/Indian/71409.jpeg
Renamed: datasetFull/Indian/62822 to datasetFull/Indian/62822.jpeg
Renamed: datasetFull/Indian/9204 to datasetFull/Indian/9204.jpeg
Renamed: datasetFull/Indian/12369 to datasetFull/Indian/12369.jpeg
Renamed: datasetFull/Indian/19781 to datasetFull/Indian/19781.jpeg
Renamed: datasetFull/Indian/81075 to datasetFull/Indian/81075.jpeg
Renamed: datasetFull/Indian/13089 to datasetFull/Indian/13089.jpeg
Renamed: datasetFull/Indian/39988 to datasetFull/Indian/39988.jpeg
Renamed: datasetFull/Indian/76846 to datasetFull/Indian/76846.jpeg
Renamed: datasetFull/Indian/2886 to datasetFull/Indian/2886.jpeg
Renamed: datasetFull/Indian/40518 to datasetFull/Indian/40518.jpeg
Renamed: datasetFull/Indian/45856 to datasetFull/Indian/45856.jpeg
Renamed: datasetFull/Indian/32138 to

In [14]:
import os

def clean_folders(main_folder):
    # Check if the main folder exists
    if not os.path.exists(main_folder):
        print(f"Main folder does not exist: {main_folder}")
        return
    
    # Iterate through each subfolder in the main dataset folder
    for subdir, _, files in os.walk(main_folder):
        if subdir == main_folder:  # Skip the main folder itself
            continue
        
        # Filter out only image files (assuming jpg and png, modify as needed)
        image_files = [f for f in files if f.endswith(('.jpg', '.jpeg', '.png'))]
        
        # Skip if no image files found in the subfolder
        if not image_files:
            print(f"No image files found in folder: {subdir}")
            continue

        # Sort the image files to ensure consistency in keeping the first 200
        image_files.sort()

        # Keep the first 200 images, delete the rest
        for file in image_files[200:]:
            file_path = os.path.join(subdir, file)
            try:
                os.remove(file_path)
                print(f'Deleted: {file_path}')
            except Exception as e:
                print(f'Error deleting file {file_path}: {e}')

        print(f"Processed folder: {subdir}, Total images found: {len(image_files)}, Images kept: {min(200, len(image_files))}")

main_folder = 'dataset'
clean_folders(main_folder)


Processed folder: dataset/Indian, Total images found: 191, Images kept: 191
Processed folder: dataset/Latino_Hispanic, Total images found: 197, Images kept: 197
Processed folder: dataset/East Asian, Total images found: 193, Images kept: 193
Processed folder: dataset/Middle Eastern, Total images found: 189, Images kept: 189
Processed folder: dataset/White, Total images found: 200, Images kept: 200
Processed folder: dataset/Southeast Asian, Total images found: 193, Images kept: 193
Processed folder: dataset/Black, Total images found: 193, Images kept: 193


In [4]:
# Cleanup file name starting from 1

import os

def clean_and_rename_folders(main_folder):
    # Check if the main folder exists
    if not os.path.exists(main_folder):
        print(f"Main folder does not exist: {main_folder}")
        return
    
    # Iterate through each subfolder in the main dataset folder
    for subdir, _, files in os.walk(main_folder):
        if subdir == main_folder:  # Skip the main folder itself
            continue
        
        # Filter out only image files (assuming jpg, jpeg, and png, modify as needed)
        image_files = [f for f in files if f.endswith(('.jpg', '.jpeg', '.png'))]
        
        # Skip if no image files found in the subfolder
        if not image_files:
            print(f"No image files found in folder: {subdir}")
            continue

        # Sort the image files to ensure consistency in keeping the first 200
        image_files.sort()

        # Keep the first 200 images, delete the rest
        for file in image_files[200:]:
            file_path = os.path.join(subdir, file)
            try:
                os.remove(file_path)
                print(f'Deleted: {file_path}')
            except Exception as e:
                print(f'Error deleting file {file_path}: {e}')
        
        # Rename the remaining files to start from 1 to 200
        for index, file in enumerate(image_files[:200]):
            old_file_path = os.path.join(subdir, file)
            new_file_name = f"{index + 1}{os.path.splitext(file)[1]}"
            new_file_path = os.path.join(subdir, new_file_name)
            try:
                os.rename(old_file_path, new_file_path)
                print(f'Renamed: {old_file_path} to {new_file_path}')
            except Exception as e:
                print(f'Error renaming file {old_file_path} to {new_file_path}: {e}')

        print(f"Processed folder: {subdir}, Total images found: {len(image_files)}, Images kept and renamed: {min(200, len(image_files))}")

main_folder = 'dataset'
clean_and_rename_folders(main_folder)


Deleted: dataset/Indian/4436.jpeg
Deleted: dataset/Indian/45.jpeg
Deleted: dataset/Indian/4519.jpeg
Deleted: dataset/Indian/4522.jpeg
Deleted: dataset/Indian/46.jpeg
Deleted: dataset/Indian/47.jpeg
Deleted: dataset/Indian/48.jpeg
Deleted: dataset/Indian/4802.jpeg
Deleted: dataset/Indian/48606423547_d8978432cb_z.jpg
Deleted: dataset/Indian/4883.jpeg
Deleted: dataset/Indian/4898.jpeg
Deleted: dataset/Indian/49.jpeg
Deleted: dataset/Indian/4970.jpeg
Deleted: dataset/Indian/4980.jpeg
Deleted: dataset/Indian/4998.jpeg
Deleted: dataset/Indian/5.jpeg
Deleted: dataset/Indian/50.jpeg
Deleted: dataset/Indian/5094.jpeg
Deleted: dataset/Indian/51.jpeg
Deleted: dataset/Indian/5174.jpeg
Deleted: dataset/Indian/5179.jpeg
Deleted: dataset/Indian/5322.jpeg
Deleted: dataset/Indian/5326.jpeg
Deleted: dataset/Indian/54.jpeg
Deleted: dataset/Indian/5404.jpeg
Deleted: dataset/Indian/5442.jpeg
Deleted: dataset/Indian/55.jpeg
Deleted: dataset/Indian/5556.jpeg
Deleted: dataset/Indian/5580.jpeg
Deleted: dataset

In [6]:
import os
from PIL import Image

def resize_images_in_folder(root_dir, target_size=(256, 256)):
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            try:
                with Image.open(file_path) as img:
                    img = img.resize(target_size, Image.LANCZOS)
                    img.save(file_path)
                    print(f"Resized and saved image: {file_path}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

# Path to the root directory containing all subfolders
root_directory = 'dataset'  # Replace with your dataset directory

# Resize all images to 256x256
resize_images_in_folder(root_directory)


Error processing file dataset/.DS_Store: cannot identify image file 'dataset/.DS_Store'
Resized and saved image: dataset/Indian/47.jpeg
Resized and saved image: dataset/Indian/148.jpeg
Resized and saved image: dataset/Indian/10.jpeg
Resized and saved image: dataset/Indian/109.jpeg
Resized and saved image: dataset/Indian/51.jpeg
Resized and saved image: dataset/Indian/172.jpeg
Resized and saved image: dataset/Indian/92.jpeg
Resized and saved image: dataset/Indian/125.jpeg
Resized and saved image: dataset/Indian/84.jpeg
Resized and saved image: dataset/Indian/89.jpg
Resized and saved image: dataset/Indian/164.jpeg
Resized and saved image: dataset/Indian/113.jpeg
Resized and saved image: dataset/Indian/144.jpeg
Resized and saved image: dataset/Indian/152.jpeg
Resized and saved image: dataset/Indian/105.jpeg
Resized and saved image: dataset/Indian/191.jpeg
Resized and saved image: dataset/Indian/129.jpeg
Resized and saved image: dataset/Indian/71.jpeg
Resized and saved image: dataset/India