In [2]:
import http.client
import shutil
import urllib.request
from multiprocessing.pool import ThreadPool
from pathlib import Path
from urllib.error import URLError

import pandas as pd

# Locations used by this notebook; the relative ones resolve against the
# kernel's current working directory.
dataset_folder = Path('../dataset2')
photos_download_path = dataset_folder.joinpath("unsplash")
features_path = Path("../features")
result_path = Path("../result")

# Make sure the download target is present before anything writes into it
photos_download_path.mkdir(parents=True, exist_ok=True)

# Load the tab-separated photo metadata table (first row holds the column names).
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory.
photos = pd.read_csv(
    "/home/sajid/Work/ResilientSage/FaceNet/photos.tsv000",
    sep='\t',
    header=0,
)

# Collect one [photo_id, photo_image_url] pair per table row
photo_urls = photos.loc[:, ['photo_id', 'photo_image_url']].to_numpy().tolist()

# Report the size of the dataset
print(f'Photos in the dataset: {len(photo_urls)}')

# Function that downloads a single photo
def download_photo(photo, download_dir=None, timeout=30):
    """Download one photo at 640px width unless it is already on disk.

    The image is streamed into a ``<photo_id>.jpg.part`` file and renamed to
    ``<photo_id>.jpg`` only after the transfer completes, so an interrupted
    download never leaves a truncated ``.jpg`` that later runs would skip.

    Args:
        photo: A ``(photo_id, photo_url)`` pair.
        download_dir: Directory to store the image in. Defaults to the
            notebook-level ``photos_download_path``.
        timeout: Socket timeout in seconds for the HTTP request, so a stalled
            connection cannot block a worker thread forever.

    Returns:
        None. Progress and failures are reported with ``print``; failures are
        never raised, so one bad row cannot abort a ``pool.map`` over the
        whole dataset.
    """
    photo_id, photo_url = photo
    target_dir = photos_download_path if download_dir is None else Path(download_dir)
    photo_path = target_dir / f"{photo_id}.jpg"

    # Only download a photo if it doesn't exist
    if photo_path.exists():
        return

    # An empty cell in the table is read by pandas as NaN (a float), not a string
    if not isinstance(photo_url, str) or not photo_url:
        print(f"Cannot download {photo_id}: missing URL")
        return

    # Set the width to 640 pixels, extending an existing query string if present
    separator = "&" if "?" in photo_url else "?"
    photo_url = f"{photo_url}{separator}w=640"

    partial_path = target_dir / f"{photo_id}.jpg.part"
    try:
        with urllib.request.urlopen(photo_url, timeout=timeout) as response, \
                open(partial_path, "wb") as sink:
            shutil.copyfileobj(response, sink)
        # Publish the finished file under its final name in one step
        partial_path.replace(photo_path)
        print(f"Downloaded: {photo_id}")
    except (OSError, http.client.HTTPException, ValueError) as e:
        # OSError covers URLError/HTTPError, timeouts and connection resets;
        # HTTPException covers truncated responses; ValueError covers malformed URLs.
        if partial_path.exists():
            partial_path.unlink()
        print(f"Cannot download {photo_url}: {e}")

# Fan the downloads out over a fixed number of worker threads;
# map() blocks until every photo has been attempted.
threads_count = 16
with ThreadPool(processes=threads_count) as pool:
    pool.map(download_photo, photo_urls)

# Report how many JPEG files are now present in the download folder
downloaded_total = sum(1 for _ in photos_download_path.glob("*.jpg"))
print(f'Photos downloaded: {downloaded_total}')


Photos in the dataset: 25000
Downloaded: ZJ65mFzCGLA
Downloaded: SfgJnbqpUWU
Downloaded: XDy5I86-V78
Downloaded: 4HJiA2TWe2I
Downloaded: bygTaBey1Xk
Downloaded: tlbUJKfHhEw
Downloaded: s3dGFU-dHXw
Downloaded: Fgp8p6KD_Ks
Downloaded: 4-XMUn95FZU
Downloaded: 3tdvWoNxvbw
Downloaded: rSOrj871OwU
Downloaded: 9ogmgks2y-U
Downloaded: evrHojTLBKE
Downloaded: Sjx2iAwBVnM
Downloaded: o_-gToPk62c
Downloaded: dxZHS55WnlM
Downloaded: nRDjLxvexjA
Downloaded: HLHaSGAuF0A
Downloaded: 6cbl8B6vMLw
Downloaded: UN7teWR3BdQ
Downloaded: HPcBMVZG4rA
Downloaded: gXSFnk2a9V4
Downloaded: hRM9nzuKsh4
Downloaded: iU4fhAg9DYQ
Downloaded: C6ushDNv4qI
Downloaded: c0Bk7ftj2xM
Downloaded: eS7HrvG0mcA
Downloaded: 52P44moXNAQ
Downloaded: YnT7Q0OcJ0U
Downloaded: eK2YZq4a0fU
Downloaded: IzCdKnK42rg
Downloaded: kTJ8MnqQozg
Downloaded: _Jke7VT5XBQ
Downloaded: iGANt1N2ge8
Downloaded: RdoyDRR17cA
Downloaded: gTyyHANr974
Downloaded: grg6-DNJuaU
Downloaded: AACGBoNy59Q
Downloaded: 2BTgB4I4eSM
Downloaded: UPXeaJI3yG8
Downloaded:

In [16]:
import os
import shutil
from PIL import Image
from facenet_pytorch import MTCNN
from tqdm import tqdm

# Define the image directory (source photos to scan) and the destination
# directory (receives copies of the photos that pass the face filter).
# NOTE(review): both are absolute, machine-specific paths. image_dir looks like
# the same folder the download cell writes to via a relative path — confirm,
# and consider deriving both from one configurable base directory.
image_dir = "/home/sajid/Work/ResilientSage/dataset2/unsplash/"
dest_dir = "/home/sajid/Work/ResilientSage/FaceNet/unsplash_face/"

# Create the destination directory if it doesn't exist
# (exist_ok=True makes re-running this cell harmless)
os.makedirs(dest_dir, exist_ok=True)

# List the regular files that sit directly inside a directory
def get_all_files(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are left out and are not descended into. Names are
    returned without the directory prefix, in the order ``os.listdir``
    yields them.
    """
    regular_files = []
    for entry_name in os.listdir(directory):
        full_path = os.path.join(directory, entry_name)
        if os.path.isfile(full_path):
            regular_files.append(entry_name)
    return regular_files

# Get the names of all files in the source image directory
files = get_all_files(image_dir)

# Initialize MTCNN for face detection (keep every detected face, ignore
# faces smaller than 30 pixels)
mtcnn = MTCNN(keep_all=True, min_face_size=30)

# A photo is kept only if at least one detection is more probable than this
FACE_CONFIDENCE_THRESHOLD = 0.95

# Process each image file with a progress bar
for file in tqdm(files, desc="Processing Images"):
    image_path = os.path.join(image_dir, file)
    try:
        # The context manager releases the file handle as soon as the pixels
        # have been converted to RGB.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Detect faces directly from the PIL image: bounding boxes plus one
        # probability per box
        faces, prob = mtcnn.detect(image)

        # Continue if no faces are detected
        if faces is None:
            continue

        # Copy the photo once if any detected face is confident enough
        if any(score > FACE_CONFIDENCE_THRESHOLD for score in prob):
            dest = os.path.join(dest_dir, file)
            shutil.copy(image_path, dest)
    except Exception as e:
        # Best effort: report unreadable or failing images and keep going
        print(f"Error processing {image_path}: {e}")


Processing Images: 100%|██████████████████████████| 24977/24977 [3:02:43<00:00,  2.28it/s]


In [19]:
# Count the entries in the destination directory, i.e. the photos that were
# copied by the face filter (plus anything else already stored there).
images_face = os.listdir(dest_dir)
len(images_face)

797