In [2]:
#importing necessary libraries
import os
import requests
from bs4 import BeautifulSoup

#function to scrape
#characters that must not appear inside a single file/folder name
_UNSAFE_NAME_CHARS = '<>:"/\\|?*'


def _safe_path_component(text, fallback="Unknown"):
    """Return *text* made safe for use as one file or folder name.

    Path separators, reserved characters and control characters are replaced
    with "_"; surrounding whitespace and dots are removed so values such as
    ".." cannot climb out of the output folder. *fallback* is returned when
    nothing usable is left.
    """
    cleaned = "".join(
        "_" if (ch in _UNSAFE_NAME_CHARS or ord(ch) < 32) else ch
        for ch in (text or "").strip()
    )
    cleaned = cleaned.strip(" .")
    return cleaned or fallback


def _download_image(session, img_url, img_path, timeout):
    """Stream *img_url* into *img_path*; return True only on a complete 200 download."""
    try:
        #the context manager releases the pooled connection even on early exit
        with session.get(img_url, stream=True, timeout=timeout) as img_response:
            if img_response.status_code != 200:
                return False
            with open(img_path, "wb") as img_file:
                for chunk in img_response.iter_content(1024):
                    img_file.write(chunk)
    except requests.RequestException:
        return False
    return True


#function to scrape
def scrape_model_images_across_pages(login_url, base_gallery_url, user_id, password, output_folder, stop_after=300):
    """Log in, walk the paginated gallery and download the images on each model page.

    Args:
        login_url: URL that receives the login form POST.
        base_gallery_url: Gallery listing URL; "?page=N" is appended to it.
        user_id: Value sent as the "username" form field.
        password: Value sent as the "password" form field.
        output_folder: Directory under which one sub-folder per model is created,
            named "<model name>_<hair color>_<bra size>".
        stop_after: Maximum number of model pages to process.

    Returns:
        None. Progress and failures are reported with print().

    Network errors on a single page or image are reported and skipped instead
    of aborting the whole run.
    """
    from urllib.parse import urljoin

    #seconds to wait on any single request before treating it as failed
    request_timeout = 30

    session = requests.Session()

    #login with username and password
    login_payload = {"username": user_id, "password": password}
    try:
        login_response = session.post(login_url, data=login_payload, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Failed to log in! ({exc})")
        return
    if login_response.status_code != 200:
        print("Failed to log in!")
        return
    print("Login successful!")

    processed_models = 0
    #starting from page 1
    page = 1

    #skip unnecessary images
    skip_list = ("burger.svg", "logo.svg", "no_gallery_cover.jpg", "no_profile_avatar.jpg", "no_video_cover.jpg")

    while processed_models < stop_after:
        #access the gallery page
        gallery_url = f"{base_gallery_url}?page={page}"
        try:
            gallery_response = session.get(gallery_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to access gallery page {page}! ({exc})")
            break
        if gallery_response.status_code != 200:
            print(f"Failed to access gallery page {page}!")
            break

        gallery_soup = BeautifulSoup(gallery_response.content, "html.parser")
        models = gallery_soup.select("div.post_item a[href^='/models/']")

        if not models:
            print(f"No more models found on page {page}.")
            break

        for model in models:
            if processed_models >= stop_after:
                print(f"Reached limit of {stop_after} models. Stopping...")
                break

            model_url = model.get("href")
            #fall back to the last URL segment when the link has no visible text
            model_name = model.get_text(strip=True) or model_url.rstrip("/").rsplit("/", 1)[-1]

            #access the model page (resolved against the gallery URL, not a fixed host)
            model_page_url = urljoin(gallery_url, model_url)
            try:
                model_response = session.get(model_page_url, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"Failed to access model page: {model_url} ({exc})")
                continue
            if model_response.status_code != 200:
                print(f"Failed to access model page: {model_url}")
                continue

            model_soup = BeautifulSoup(model_response.content, "html.parser")
            hair_color = model_soup.find("span", string="Hair color")
            hair_color = hair_color.find_previous("b").text if hair_color else "Unknown"
            bra_size = model_soup.find("span", string="Bra size")
            bra_size = bra_size.find_previous("b").text if bra_size else "Unknown"

            #scraped text is untrusted: sanitise each part before building a path
            folder_name = "_".join(
                _safe_path_component(part) for part in (model_name, hair_color, bra_size)
            )
            model_folder = os.path.join(output_folder, folder_name)
            os.makedirs(model_folder, exist_ok=True)

            #scraping images from the model's profile page
            image_tags = model_soup.select("img[data-src], img[src]")

            if not image_tags:
                print(f"No images found on model page: {model_url}")
                continue

            for img_tag in image_tags:
                raw_src = img_tag.get("data-src") or img_tag.get("src")
                if not raw_src:
                    continue

                #relative and protocol-relative sources need the page URL as base
                img_url = urljoin(model_page_url, raw_src)
                img_name = os.path.basename(img_url.split("?")[0])

                #skip unnecessary images
                if any(skip_file in img_name for skip_file in skip_list):
                    print(f"Skipped unnecessary image: {img_name}")
                    continue

                #inline data: URIs and other schemes cannot be fetched over HTTP
                if not img_url.startswith(("http://", "https://")):
                    print(f"Skipped non-HTTP image source on model page: {model_url}")
                    continue

                #a URL ending in "/" has no file name to save under
                img_name = _safe_path_component(img_name, fallback="")
                if not img_name:
                    print(f"Failed to download image: {img_url}")
                    continue

                img_path = os.path.join(model_folder, img_name)

                if _download_image(session, img_url, img_path, request_timeout):
                    print(f"Downloaded {img_name} to {model_folder}")
                else:
                    print(f"Failed to download image: {img_url}")

            processed_models += 1

        page += 1

    print(f"Scraping complete. Processed {processed_models} models.")


#defining parameters
#SECURITY: credentials are read from the environment (or prompted for) instead
#of being stored in the source. Any credentials that were previously committed
#in this file should be considered leaked and rotated.
if __name__ == "__main__":
    import getpass

    login_url = "https://femjoy.com/login"
    base_gallery_url = "https://femjoy.com/photos"
    user_id = os.environ.get("FEMJOY_USER_ID") or input("User ID: ")
    password = os.environ.get("FEMJOY_PASSWORD") or getpass.getpass("Password: ")
    output_folder = "/content/drive/MyDrive/femjoy_model_images"

    #run the scraper function
    scrape_model_images_across_pages(login_url, base_gallery_url, user_id, password, output_folder, stop_after=300)


Streaming output truncated to the last 5000 lines.
Skipped unnecessary image: burger.svg
Skipped unnecessary image: logo.svg
Downloaded 7377252e4674ec6b7b47869aca1215fc.jpeg to /content/drive/MyDrive/femjoy_model_images/Sophie Lix_brown_B
Skipped unnecessary image: no_gallery_cover.jpg
Downloaded b4eea935e0524e1cc9bcd98959e70a79.jpeg to /content/drive/MyDrive/femjoy_model_images/Sophie Lix_brown_B
Skipped unnecessary image: no_gallery_cover.jpg
Downloaded 1091420e05c22274513f437edde3a38a.jpeg to /content/drive/MyDrive/femjoy_model_images/Sophie Lix_brown_B
Skipped unnecessary image: no_gallery_cover.jpg
Downloaded 493b057bf1fd466c566a48e0f77b4de3.jpeg to /content/drive/MyDrive/femjoy_model_images/Sophie Lix_brown_B
Skipped unnecessary image: logo.svg
Skipped unnecessary image: burger.svg
Skipped unnecessary image: logo.svg
Downloaded 80716310b143b171ae5059b3cc0a14ac.jpeg to /content/drive/MyDrive/femjoy_model_images/Lilly A_~_B
Skipped unnecessary image: no_gallery_cover.