
## Scraping images of Julia Roberts

Below is code that scrapes images of Julia Roberts from the internet. Several search terms are used because requesting too many images
for a single search term causes problems, so each search term is capped at 300 images. This appears to be easier with Bing than with Google, hence I use Bing. Install the dependencies with *!pip install icrawler* and *!pip install Pillow==10.2.0*.
 




In [None]:

import os
import shutil
from icrawler.builtin import BingImageCrawler
from PIL import Image

In [None]:



# Several search terms for the same person: each query is capped (max_num
# below), so varying the keyword is how a larger set of images is collected.
celebrities = [
    "Julia Roberts",
    "young Julia Roberts",
    "Julia Roberts red carpet",
    "Julia Roberts 2024",
    "Julia Roberts 2023",
]
output_folder = "roberts_images/raw"
os.makedirs(output_folder, exist_ok=True)

# Clear previous downloads. A run that was interrupted mid-crawl leaves a
# "tmp_*" directory behind; os.remove() raises on directories, so those are
# deleted with shutil.rmtree() instead.
for f in os.listdir(output_folder):
    stale_path = os.path.join(output_folder, f)
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        shutil.rmtree(stale_path)
    else:
        os.remove(stale_path)

# Download up to 300 images per search term
for celeb in celebrities:
    slug = celeb.lower().replace(" ", "_")
    # Crawl into a per-keyword temp folder so the crawler's own file names
    # cannot collide with images from another keyword.
    temp_folder = os.path.join(output_folder, "tmp_" + slug)
    os.makedirs(temp_folder, exist_ok=True)

    crawler = BingImageCrawler(storage={"root_dir": temp_folder})
    crawler.crawl(keyword=celeb, max_num=300)

    # Move and rename images. Sorting makes the numbering reproducible
    # (os.listdir() order is arbitrary) and only regular files are counted,
    # so the numbering has no gaps.
    downloaded = sorted(
        fname
        for fname in os.listdir(temp_folder)
        if os.path.isfile(os.path.join(temp_folder, fname))
    )
    for i, fname in enumerate(downloaded, start=1):
        src = os.path.join(temp_folder, fname)
        # Every file gets a ".jpg" suffix regardless of its real format.
        # NOTE(review): the later conversion cell opens these with PIL, which
        # presumably detects the format from the file content - confirm.
        new_name = f"{slug}_{i:02d}.jpg"
        dst = os.path.join(output_folder, new_name)
        shutil.move(src, dst)

    shutil.rmtree(temp_folder)



## Converting images

Below, all the images are resized to the same size and converted to the same file type (JPEG).



In [None]:

# Source and destination folders, and the target size every image is
# resized to.
input_folder = "roberts_images/raw"
output_folder = "roberts_images/mobilenet_ready"
mobilenet_size = (224, 224)

# File suffixes (compared case-insensitively) that are treated as images.
image_suffixes = ('.jpg', '.jpeg', '.png', '.webp', '.bmp')

os.makedirs(output_folder, exist_ok=True)

for entry in os.listdir(input_folder):
    source_path = os.path.join(input_folder, entry)

    # Only regular files with a recognised image suffix are converted.
    if not (os.path.isfile(source_path) and entry.lower().endswith(image_suffixes)):
        continue

    # The output keeps the original base name but always ends in ".jpg".
    target_path = os.path.join(output_folder, os.path.splitext(entry)[0] + ".jpg")

    try:
        with Image.open(source_path) as picture:
            resized = picture.convert("RGB").resize(mobilenet_size, Image.LANCZOS)
            resized.save(target_path, format="JPEG", quality=85)
    except Exception as err:
        # Best effort: report the file that failed and carry on with the rest.
        print(f"Failed to process {entry}: {err}")


## Faceboxer 

We will improve the quality of the image set by hand. To simplify this, we use the MTCNN face detector, which extracts the bounding box of the face and saves the cropped face to a separate folder. You can then easily scroll through this folder of face images to find images that are problematic from a classification point of view and remove them. Install it with *!pip install facenet-pytorch*.


In [None]:
from facenet_pytorch import MTCNN
from PIL import Image
import os
import torch

# Initialize MTCNN face detector.
# post_process=False makes the detector return the cropped face with raw
# pixel values (0-255). With the default (post_process=True) the tensor is
# standardized to roughly [-1, 1]; scaling that by 255 and clamping to 0-255
# turns every darker-than-mid-gray pixel black and corrupts the saved crop.
mtcnn = MTCNN(
    image_size=160,
    margin=20,
    keep_all=False,
    post_process=False,
    device='cpu',
)

input_dir = 'roberts_images/mobilenet_ready/'
output_dir = 'roberts_images/crop/'

os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    img_path = os.path.join(input_dir, filename)

    try:
        # The context manager releases the file handle; convert() returns an
        # independent in-memory copy that stays usable afterwards.
        with Image.open(img_path) as opened:
            img = opened.convert('RGB')
    except Exception as e:
        # Anything that cannot be opened as an image (including
        # sub-directories) is reported and skipped.
        print(f"Skipping {filename}: {e}")
        continue

    # Detect one face (or None)
    face = mtcnn(img)

    if face is not None and face.shape[1] >= 20 and face.shape[2] >= 20:
        # Convert to image format: [C, H, W] -> [H, W, C]; values are already
        # on the 0-255 scale, so only clamp and convert to uint8.
        face_img = face.permute(1, 2, 0).clamp(0, 255).byte().cpu().numpy()

        save_path = os.path.join(output_dir, filename)
        Image.fromarray(face_img).save(save_path)
        print(f"Saved: {save_path}")
    else:
        print(f"No face detected or face too small in: {filename}")



## Extracting images of celebrities who are not Julia Roberts

To do binary classification, we also need images of celebrities who are not Julia Roberts; these are scraped below. I only scrape 20 images per celebrity, so I can still scrape with Google.


In [None]:
# Extract 20 images for each of 50 celebrities


import os
import shutil
from icrawler.builtin import GoogleImageCrawler

# 50 celebrities (modify freely)
celebrities = [
    "Tom Hanks", "Emma Watson", "Leonardo DiCaprio", "Scarlett Johansson",
    "Denzel Washington", "Natalie Portman", "Morgan Freeman", "Keira Knightley",
    "Ryan Gosling", "Jennifer Lawrence",

    "Brad Pitt", "Angelina Jolie", "Chris Hemsworth", "Anne Hathaway",
    "Samuel L. Jackson", "Zendaya", "Matt Damon", "Cate Blanchett",
    "Will Smith", "Charlize Theron",

    "Robert Downey Jr.", "Gal Gadot", "Jake Gyllenhaal", "Emma Stone",
    "Mark Ruffalo", "Meryl Streep", "Tom Holland", "Salma Hayek",
    "Christian Bale", "Sandra Bullock",

    # "Nicolas Cage" is the correct spelling of the actor's name.
    "Hugh Jackman", "Nicolas Cage", "Idris Elba", "Reese Witherspoon",
    "Benedict Cumberbatch", "Jessica Chastain", "Chadwick Boseman", "Kate Winslet",
    "Joaquin Phoenix", "Emily Blunt",

    "Timothée Chalamet", "Viola Davis", "Michael B. Jordan", "Kristen Stewart",
    "Daniel Radcliffe", "Rachel McAdams", "Chris Evans", "Amanda Seyfried",
    "Jason Momoa", "Florence Pugh",
]

output_folder = "not_roberts_images/raw"
os.makedirs(output_folder, exist_ok=True)

# Clear previous downloads. A run that was interrupted mid-crawl leaves a
# "tmp_*" directory behind; os.remove() raises on directories, so those are
# deleted with shutil.rmtree() instead.
for f in os.listdir(output_folder):
    stale_path = os.path.join(output_folder, f)
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        shutil.rmtree(stale_path)
    else:
        os.remove(stale_path)

# Download up to 20 images per celebrity
for celeb in celebrities:
    slug = celeb.lower().replace(" ", "_")
    # Crawl into a per-celebrity temp folder so the crawler's own file names
    # cannot collide with images of another celebrity.
    temp_folder = os.path.join(output_folder, "tmp_" + slug)
    os.makedirs(temp_folder, exist_ok=True)

    crawler = GoogleImageCrawler(storage={"root_dir": temp_folder})
    crawler.crawl(keyword=celeb, max_num=20)

    # Move and rename images. Sorting makes the numbering reproducible
    # (os.listdir() order is arbitrary) and only regular files are counted,
    # so the numbering has no gaps.
    downloaded = sorted(
        fname
        for fname in os.listdir(temp_folder)
        if os.path.isfile(os.path.join(temp_folder, fname))
    )
    for i, fname in enumerate(downloaded, start=1):
        src = os.path.join(temp_folder, fname)
        # Every file gets a ".jpg" suffix regardless of its real format.
        # NOTE(review): the later conversion cell opens these with PIL, which
        # presumably detects the format from the file content - confirm.
        new_name = f"{slug}_{i:02d}.jpg"
        dst = os.path.join(output_folder, new_name)
        shutil.move(src, dst)

    shutil.rmtree(temp_folder)



## Converting images of celebrities who are not Julia Roberts



In [None]:



# Source and destination folders, and the target size every image is
# resized to.
input_folder = "not_roberts_images/raw"
output_folder = "not_roberts_images/mobilenet_ready"
mobilenet_size = (224, 224)

# File suffixes (compared case-insensitively) that are treated as images.
image_suffixes = ('.jpg', '.jpeg', '.png', '.webp', '.bmp')

os.makedirs(output_folder, exist_ok=True)

for entry in os.listdir(input_folder):
    source_path = os.path.join(input_folder, entry)

    # Skip sub-directories and anything without a recognised image suffix.
    if not os.path.isfile(source_path):
        continue
    if not entry.lower().endswith(image_suffixes):
        continue

    # The output keeps the original base name but always ends in ".jpg".
    base_name = os.path.splitext(entry)[0]
    target_path = os.path.join(output_folder, base_name + ".jpg")

    try:
        with Image.open(source_path) as picture:
            rgb_picture = picture.convert("RGB")
            resized = rgb_picture.resize(mobilenet_size, Image.LANCZOS)
            resized.save(target_path, format="JPEG", quality=85)
    except Exception as err:
        # Best effort: report the file that failed and carry on with the rest.
        print(f"Failed to process {entry}: {err}")
