In [3]:
import os
import shutil
from tqdm import tqdm  # Import the tqdm library

def process_images(input_txt, search_folder, target_folder, output_txt):
    """
    Process the input text file, search for images, copy found images, and update the text file.

    Args:
    - input_txt (str): Path to the input text file.
    - search_folder (str): Root directory to search for images.
    - target_folder (str): Destination folder to copy images.
    - output_txt (str): Path to save the updated text file.
    """
    with open(input_txt, 'r') as file:
        lines = file.readlines()

    updated_lines = []
    os.makedirs(target_folder, exist_ok=True)

    # Initialize tqdm progress bar for the number of lines
    with tqdm(total=len(lines), desc="Processing lines", unit="line") as pbar:
        for line in lines:
            parts = line.strip().split()
            updated_parts = [parts[0]]  # Keep the initial number (e.g., "1")

            for image_name in parts[1:]:
                image_found = False
                for root, _, files in os.walk(search_folder):
                    if f"{image_name}.jpg" in files:
                        image_found = True
                        source_path = os.path.join(root, f"{image_name}.jpg")
                        destination_path = os.path.join(target_folder, f"{image_name}.jpg")
                        shutil.copy2(source_path, destination_path)  # Copy image to target folder
                        break

                if image_found:
                    updated_parts.append(image_name)  # Keep image name if found

            if len(updated_parts) > 1:  # Keep line only if there are images left
                updated_lines.append(" ".join(updated_parts) + "\n")
            
            pbar.update(1)  # Update the progress bar

    # Write the updated lines back to the output text file
    with open(output_txt, 'w') as file:
        file.writelines(updated_lines)

    print("Processing completed.")

In [4]:
# Example usage
input_txt = "polyvore/fashion_compatibility_prediction.txt"  # Path to the input text file
search_folder = "Sorted"  # Root folder containing subdirectories with images
target_folder = "Cleaned"  # Destination folder for found images
output_txt = "Cleaned/output.txt"  # Path to save the updated text file
num_threads = 8  # Number of threads to use
process_images(input_txt, search_folder, target_folder, output_txt)

Processing lines: 100%|████████████████████████████████████████████████████████| 7076/7076 [1:07:06<00:00,  1.76line/s]

Processing completed.





In [7]:
with open(output_txt, 'r') as file:
    lines = file.readlines()

    # Filter out lines where there is only one item after the initial number
    lines = [line for line in lines if len(line.strip().split()) > 2]

    # Write the filtered lines back to the output file
    with open(output_txt, 'w') as file:
        file.writelines(lines)

    print("Processing completed.")

Processing completed.


In [11]:
from pathlib import Path
from rembg import remove, new_session
import os
from tqdm import tqdm  # Import tqdm for the progress bar

print("Started")
providers = ['CUDAExecutionProvider']
session = new_session(providers=providers)

# Ensure the output directory exists
os.makedirs("BackgroundRemoved", exist_ok=True)

# Get all JPG files from the directory
image_files = list(Path('Cleaned').glob('*.jpg'))

# Create a tqdm iterator to show progress
for file in tqdm(image_files, desc="Processing images", unit="image"):
    input_path = str(file)
    output_path = os.path.join("BackgroundRemoved", file.name)  # Fix output path

    try:
        with open(input_path, 'rb') as i:
            with open(output_path, 'wb') as o:
                input_data = i.read()  # Read image in binary mode
                output_data = remove(input_data, session=session)  # Process the image
                o.write(output_data)  # Write the output to the new file
    except Exception as e:
        print(f"Error processing {input_path}: {e}")


Started


Processing images: 100%|██████████████████████████████████████████████████████| 8850/8850 [1:02:58<00:00,  2.34image/s]
