In [1]:
import os
import time
import csv
import requests
from urllib.parse import unquote

# Wikimedia Commons API endpoint. Both the search request and the imageinfo
# request made by this script are sent to this URL.
API_ENDPOINT = "https://commons.wikimedia.org/w/api.php"

# Custom User-Agent header per Wikimedia's policy; passed as `headers=` on
# every `requests.get` call in this script (API calls and image downloads).
# NOTE(review): the project URL and e-mail below look like placeholders
# ("yourusername", "your-email@example.com") -- presumably they should be
# replaced with real contact details before running; confirm.
HEADERS = {
    "User-Agent": "ArchitectureScraper/1.0 (https://github.com/yourusername/architecture-scraper; your-email@example.com)"
}

# Full list of architects.
# Each name is used twice: as the leading part of the search query, and
# (with spaces replaced by underscores) as a folder name under "Data".
architects_list = [
    "Frank Lloyd Wright",
    "Le Corbusier",
    "Ludwig Mies van der Rohe",
    "Walter Gropius",
    "Zaha Hadid",
    "Renzo Piano",
    "I.M. Pei",
    "Frank Gehry",
    "Norman Foster",
    "Rem Koolhaas",
    "Oscar Niemeyer",
    "Tadao Ando",
    "Herzog & de Meuron",
    "Santiago Calatrava",
    "Bjarke Ingels",
    "Shigeru Ban",
    "Daniel Libeskind",
    "Arata Isozaki",
    "Toyo Ito",
    "David Chipperfield",
    "Philip Johnson",
    "Louis Kahn",
    "Eero Saarinen",
    "Richard Rogers",
    "Charles Correa",
    "Moshe Safdie",
    "Cesar Pelli",
    "Mario Botta",
    "Kazuyo Sejima",
    "Kengo Kuma",
    "Alejandro Aravena",
    "Steven Holl",
    "Fumihiko Maki",
    "Enric Miralles",
    "Álvaro Siza Vieira",
    "Odile Decq",
    "Bernard Tschumi",
    "Jeanne Gang",
    "Glenn Murcutt",
    "Richard Meier",
    "Jean Nouvel",
    "Ken Yeang",
    "Michael Graves",
    "Thom Mayne",
    "David Adjaye",
    "Sou Fujimoto",
    "Peter Zumthor",
    "Rafael Viñoly",
    "Luis Barragán",
    "Paul Rudolph",
    "Marcel Breuer",
    "Kenzo Tange"
]

# The two image types we want to retrieve. Each value is inserted into the
# search query and is also the name of the per-architect sub-folder.
image_types = ["interior", "exterior"]

# CSV file to store metadata. It is opened in "w" mode, so a file left over
# from an earlier run is overwritten.
metadata_file = "architecture_metadata.csv"

# Valid image file extensions, in lower case. Kept as a tuple because it is
# passed directly to `str.endswith` on the lower-cased file title.
VALID_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

def search_commons(query, limit=10, timeout=30):
    """
    Search Wikimedia Commons for file results based on the query.
    The search is restricted to the File namespace (namespace=6).

    Args:
        query: Free-text search string, sent as the ``srsearch`` parameter.
        limit: Maximum number of results to request (``srlimit``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole scrape.

    Returns:
        The list found under ``query -> search`` in the JSON response, or an
        empty list when either key is missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other transport failures, including
            a timeout.
    """
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": query,
        "srnamespace": 6,  # file namespace
        "srlimit": limit,
    }
    response = requests.get(
        API_ENDPOINT, params=params, headers=HEADERS, timeout=timeout
    )
    response.raise_for_status()
    data = response.json()
    return data.get("query", {}).get("search", [])

def get_image_info(file_title, timeout=30):
    """
    Retrieve imageinfo metadata for the given file (e.g., "File:Example.jpg").

    Args:
        file_title: Page title of the file, including the "File:" prefix.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The first ``imageinfo`` entry of the first page in the response that
        has one (the request asks for the ``url`` and ``comment``
        properties), or ``None`` when no page carries a non-empty
        ``imageinfo`` list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other transport failures, including
            a timeout.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": file_title,
        "prop": "imageinfo",
        "iiprop": "url|comment",
    }
    response = requests.get(
        API_ENDPOINT, params=params, headers=HEADERS, timeout=timeout
    )
    response.raise_for_status()
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    for page in pages.values():
        # An "imageinfo" key that is present but empty is treated the same
        # as a missing one, instead of raising IndexError on [0].
        image_info = page.get("imageinfo")
        if image_info:
            return image_info[0]
    return None

def download_image(url, folder, timeout=30):
    """
    Download the image from the provided URL and save it to the given folder.

    The body is streamed to a temporary ``.part`` file and moved to its final
    name only after the transfer completes, so a failed download never leaves
    a truncated image behind and never clobbers an existing good file.

    Args:
        url: Direct URL of the image.
        folder: Existing directory in which the file is stored.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up.

    Returns:
        The local file path if successful, otherwise ``None`` (the error is
        printed, not raised).
    """
    # Local import: only `unquote` is imported at the top of the file.
    from urllib.parse import urlsplit

    # Use only the path component so a query string or fragment cannot end up
    # in the file name; decode first, then take the basename, so an encoded
    # "/" cannot smuggle extra path segments into the target path.
    filename = os.path.basename(unquote(urlsplit(url).path))
    if not filename:
        print(f"Error downloading image {url}: no file name in URL")
        return None

    local_path = os.path.join(folder, filename)
    partial_path = local_path + ".part"
    try:
        with requests.get(
            url, headers=HEADERS, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                # Stream in chunks instead of holding the whole image in memory.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, local_path)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading image {url}: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None
    return local_path

def create_folder_structure(architect, image_type):
    """
    Make sure the output directory for one architect / image type exists.

    The directory is Data/<architect>/<image_type>, relative to the current
    working directory, where every space in the architect's name has been
    turned into an underscore. Missing parent directories are created and an
    already existing directory is left as it is.

    Returns the relative path of that directory.
    """
    safe_name = "_".join(architect.split(" "))
    destination = os.path.join("Data", safe_name, image_type)
    os.makedirs(destination, exist_ok=True)
    return destination

def extract_building_name(file_title):
    """
    Turn a Commons file title into a rough building name.

    A leading 'File:' is dropped, then everything from the last '.' onwards
    (the file extension) is cut off. A title without any '.' is returned
    unchanged apart from the prefix removal. Adapt this function if more
    careful parsing is needed.
    """
    prefix = "File:"
    name = file_title[len(prefix):] if file_title.startswith(prefix) else file_title
    stem, dot, _extension = name.rpartition(".")
    # rpartition yields an empty separator when the name contains no dot.
    return stem if dot else name

def _scrape_query(writer, architect, image_type, max_images=5, search_limit=10):
    """
    Search, download and record images for one architect / image type pair.

    Network failures of the search or of a single metadata lookup are printed
    and skipped, so one bad request does not abort the whole scrape.

    Args:
        writer: ``csv.DictWriter`` that receives one row per saved image.
        architect: Architect name used in the query and the folder name.
        image_type: Image type used in the query and the folder name.
        max_images: Stop after this many images have been saved.
        search_limit: Number of search results to request; larger than
            ``max_images`` because some results are not usable images.

    Returns:
        The number of images that were downloaded and recorded.
    """
    # Form the search query.
    query = f'{architect} {image_type} building'
    print(f"Searching Wikimedia Commons for: {query}")
    try:
        search_results = search_commons(query, limit=search_limit)
    except (requests.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body on older `requests`.
        print(f"Search failed for '{query}': {e}")
        return 0

    # Create the folder structure under the "Data" directory.
    target_folder = create_folder_structure(architect, image_type)

    valid_count = 0
    for result in search_results:
        if valid_count >= max_images:
            break

        file_title = result.get("title")
        if not file_title:
            # A result without a title cannot be looked up; skip it rather
            # than crash on `None.lower()`.
            continue

        # Filter out non-image files by extension.
        if not file_title.lower().endswith(VALID_EXTENSIONS):
            print(f"Skipping non-image file: {file_title}")
            continue

        print(f"Processing file: {file_title}")
        try:
            info = get_image_info(file_title)
        except (requests.RequestException, ValueError) as e:
            print(f"Failed to fetch image info for {file_title}: {e}")
            continue
        if not info or "url" not in info:
            continue

        file_url = info["url"]

        # Download and save the image.
        local_path = download_image(file_url, target_folder)
        if local_path:
            writer.writerow({
                "architect": architect,
                "image_type": image_type,
                "building_name": extract_building_name(file_title),
                "file_title": file_title,
                "file_url": file_url,
                "local_file_path": local_path,
            })
            print(f"Saved image to: {local_path}")
            valid_count += 1
        else:
            print(f"Failed to download image from {file_url}")
        time.sleep(1)  # pause between image downloads

    return valid_count


def main():
    """
    Scrape up to five interior and five exterior images per architect.

    For every architect in ``architects_list`` and every entry of
    ``image_types`` this searches Wikimedia Commons, downloads the matching
    images into ``Data/<architect>/<image_type>`` and writes one metadata row
    per saved image to ``metadata_file`` (which is overwritten on each run).
    Pauses between requests keep the request rate low.
    """
    # Open a CSV file to record metadata.
    with open(metadata_file, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["architect", "image_type", "building_name", "file_title", "file_url", "local_file_path"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Loop through each architect and each image type.
        for architect in architects_list:
            for image_type in image_types:
                _scrape_query(writer, architect, image_type)
                time.sleep(2)  # pause between queries for the same architect
            time.sleep(2)  # pause between different architects

# Run the scraper only when this code is executed directly (as a script or in
# a notebook cell, where __name__ is "__main__"); importing the module, for
# example to reuse its helpers, must not start a long network scrape.
if __name__ == "__main__":
    main()
    print("Scraping completed. Check the 'Data' folder and CSV file for results.")


Searching Wikimedia Commons for: Frank Lloyd Wright interior building
Processing file: File:Interior of Frank Lloyd Wright's Pope-Leighy house.jpg
Saved image to: Data/Frank_Lloyd_Wright/interior/Interior_of_Frank_Lloyd_Wright's_Pope-Leighy_house.jpg
Processing file: File:Interior - Living Area - Frank Lloyd Wright Chair.jpg
Saved image to: Data/Frank_Lloyd_Wright/interior/Interior_-_Living_Area_-_Frank_Lloyd_Wright_Chair.jpg
Processing file: File:Frank Lloyd Wright - V.C. Morris Gift Shop, SF - 12.jpg
Saved image to: Data/Frank_Lloyd_Wright/interior/Frank_Lloyd_Wright_-_V.C._Morris_Gift_Shop,_SF_-_12.jpg
Processing file: File:Frank Lloyd Wright Home Studio.jpg
Saved image to: Data/Frank_Lloyd_Wright/interior/Frank_Lloyd_Wright_Home_Studio.jpg
Processing file: File:Frank Lloyd Wright Studio- Playroom, interior- playroom, Oak Park, IL mhsdalad 250099.jpg
Saved image to: Data/Frank_Lloyd_Wright/interior/Frank_Lloyd_Wright_Studio-_Playroom,_interior-_playroom,_Oak_Park,_IL_mhsdalad_250099