In [None]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    UnexpectedAlertPresentException,
    TimeoutException,
)

Collect every style page link from the https://www.wikiart.org/en/paintings-by-style website.

In [None]:
# Start the browser
driver = webdriver.Chrome()
driver.get("https://www.wikiart.org/en/paintings-by-style")
time.sleep(5)  # wait for page to load, increase if needed

# Get all <ul> with the desired class
style_list = driver.find_elements(By.CSS_SELECTOR, 'ul.dictionaries-list li.dottedItem a')

print(f"Found {len(style_list)} style links:\n")

style_urls = []
# Print the names and URLs
for link in style_list:
    name = link.text.strip()
    href = link.get_attribute("href")
    print(f"{name} - {href}")
    if href:  # skip if None
        style_urls.append(href)

driver.quit()


Found 218 style links:

Early Dynastic 2 - https://www.wikiart.org/en/paintings-by-style/early-dynastic-period
Old Kingdom 1 - https://www.wikiart.org/en/paintings-by-style/old-kingdom
Middle Kingdom 9 - https://www.wikiart.org/en/paintings-by-style/middle-kingdom
New Kingdom 114 - https://www.wikiart.org/en/paintings-by-style/new-kingdom
Amarna 12 - https://www.wikiart.org/en/paintings-by-style/amarna
3rd Intermediate Period 16 - https://www.wikiart.org/en/paintings-by-style/3rd-intermediate-period
Late Period 4 - https://www.wikiart.org/en/paintings-by-style/late-period
Ptolemaic 4 - https://www.wikiart.org/en/paintings-by-style/ptolemaic
2nd Intermediate Period 1 - https://www.wikiart.org/en/paintings-by-style/2nd-intermediate-period
Archaic 39 - https://www.wikiart.org/en/paintings-by-style/archaic-period
Classical 89 - https://www.wikiart.org/en/paintings-by-style/classical-period
Hellenistic 135 - https://www.wikiart.org/en/paintings-by-style/hellenistic-period?select=featured
Ge

Collect each artwork page links from each style page collected above.

In [None]:
driver = webdriver.Chrome()
artwork_links_by_style = {}

for style_url in style_urls:
    try:
        driver.get(style_url)
        time.sleep(2)

        # Keep clicking "LOAD MORE" if available
        while True:
            try:
                load_more_btn = driver.find_element(By.CSS_SELECTOR, "a.masonry-load-more-button")
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_btn)
                time.sleep(1)
                load_more_btn.click()
                time.sleep(2)
            except (NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException):
                break
            except UnexpectedAlertPresentException:
                alert = driver.switch_to.alert
                print(f"Alert appeared on {style_url}, accepting and continuing.")
                alert.accept()
                time.sleep(2)
                continue  # Try clicking again

        # Once all loaded, collect artwork links
        artwork_elements = driver.find_elements(By.CSS_SELECTOR, 'a.artwork-name')
        artwork_links = [a.get_attribute('href') for a in artwork_elements if a.get_attribute('href')]

        artwork_links_by_style[style_url] = artwork_links

    except UnexpectedAlertPresentException:
        # Handle unexpected alerts during page load
        alert = driver.switch_to.alert
        print(f"Alert during loading {style_url}, accepting and skipping.")
        alert.accept()
        continue
    except Exception as e:
        print(f"Error on {style_url}: {e}")
        continue


In [5]:
# Save to txt file
with open("artwork_links_by_style.txt", "w", encoding="utf-8") as f:
    for style_url, links in artwork_links_by_style.items():
        f.write(f"Style URL: {style_url}\n")
        for link in links:
            f.write(f"  {link}\n")
        f.write("\n")  # separate styles


Filter out the styles with less that 500 artworks in total

In [1]:
input_path = "artwork_links_by_style_eddited.txt"
output_path = "artwork_links_filtered.txt"

filtered_data = {}
current_style = None

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("Style URL:"):
            current_style = line[len("Style URL:"):].strip()
            filtered_data[current_style] = []
        elif line.startswith("http") and current_style:
            filtered_data[current_style].append(line)

# Filter styles with at least 500 artworks
filtered_data = {style: links for style, links in filtered_data.items() if len(links) >= 500}

# Save the filtered data
with open(output_path, "w", encoding="utf-8") as f:
    for style, links in filtered_data.items():
        f.write(f"Style URL: {style}\n")
        for link in links:
            f.write(f"  {link}\n")
        f.write("\n")

print(f"Saved filtered styles (≥500 artworks) to: {output_path}")


Saved filtered styles (≥500 artworks) to: artwork_links_filtered.txt


In [5]:
#file_path = "artwork_links_filtered.txt"
file_path = "artwork_links_by_style_eddited.txt"

style_names = []

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("Style URL:"):
            style_url = line[len("Style URL:"):].strip()
            style_names.append(style_url)

# Print results
print(f"Total styles: {len(style_names)}\n")


Total styles: 190



In [6]:
file_path = "artwork_links_filtered.txt"

style_counts = {}
current_style = None

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("Style URL:"):
            current_style = line[len("Style URL:"):].strip()
            style_counts[current_style] = 0
        elif line.startswith("http") and current_style:
            style_counts[current_style] += 1

# Compute min and max
min_count = min(style_counts.values())
max_count = max(style_counts.values())

# Get styles with those counts
min_styles = [style for style, count in style_counts.items() if count == min_count]
max_styles = [style for style, count in style_counts.items() if count == max_count]

# Print results
print(f"Maximum artwork count: {max_count}")
for s in max_styles:
    print(f"  ↳ Style with max count: {s}")

print(f"\nMinimum artwork count: {min_count}")
for s in min_styles:
    print(f"  ↳ Style with min count: {s}")


Maximum artwork count: 3600
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/baroque?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/rococo?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/neoclassicism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/romanticism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/realism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/na-ve-art-primitivism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/surrealism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/post-impressionism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/symbolism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-sty

In [8]:
file_path = "artwork_links_by_style.txt"

style_counts = {}
current_style = None

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("Style URL:"):
            current_style = line[len("Style URL:"):].strip()
            style_counts[current_style] = 0
        elif line.startswith("http") and current_style:
            style_counts[current_style] += 1

# Compute min and max
min_count = min(style_counts.values())
max_count = max(style_counts.values())

# Get styles with those counts
min_styles = [style for style, count in style_counts.items() if count == min_count]
max_styles = [style for style, count in style_counts.items() if count == max_count]

# Print results
print(f"Maximum artwork count: {max_count}")
for s in max_styles:
    print(f"  ↳ Style with max count: {s}")

print(f"\nMinimum artwork count: {min_count}")
for s in min_styles:
    print(f"  ↳ Style with min count: {s}")


Maximum artwork count: 3600
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/baroque?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/rococo?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/neoclassicism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/romanticism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/realism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/na-ve-art-primitivism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/surrealism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/post-impressionism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-style/symbolism?select=featured
  ↳ Style with max count: https://www.wikiart.org/en/paintings-by-sty

In [None]:
# Load filtered artwork links from file (ignoring style URLs)
with open("artwork_links_filtered.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Filter only artwork links (not style URLs)
artwork_links = [line.strip() for line in lines if line.startswith("  http") and not line.startswith("Style URL")]

print(len(artwork_links))

96498


Collect the information (features and image link which will be used to download the image later) from collected artwork page links 

In [None]:
# Output file
csv_file = "artwork_details.csv"
excel_file = "artwork_details.xlsx"

# Load already scraped data (if exists)    
if os.path.exists(csv_file) and os.path.getsize(csv_file) > 0:
    existing_df = pd.read_csv(csv_file)
    scraped_urls = set(existing_df["url"])
    all_data = existing_df.to_dict(orient="records")
else:
    scraped_urls = set()
    all_data = []

# Setup Selenium
driver = webdriver.Chrome()

# Helper function to safely get element text
def safe_find(by, value):
    try:
        return driver.find_element(by, value).text.strip()
    except NoSuchElementException:
        return None

# Scrape each URL (skip if already scraped)
for url in artwork_links:
    if url in scraped_urls:
        continue

    try:
        driver.get(url)
        time.sleep(0.5)

        data = {
            "url": url,
            "artwork_name": safe_find(By.TAG_NAME, "h3"),
            "artist_name": safe_find(By.CSS_SELECTOR, "h5[itemprop='creator'] span[itemprop='name']"),
            "date": safe_find(By.CSS_SELECTOR, "span[itemprop='dateCreated']"),
            "style": safe_find(By.XPATH, "//li[s/text()='Style:']/span"),
            "genre": safe_find(By.XPATH, "//li[s/text()='Genre:']//span[@itemprop='genre']"),
            "media": ", ".join(
                [e.text.strip() for e in driver.find_elements(By.XPATH, "//li[s/text()='Media:']/span/a")]
            ) or None,
            "tags": ", ".join(
                [e.text.strip() for e in driver.find_elements(By.CSS_SELECTOR, ".tags-cheaps__item__ref")]
            ) or None,
            "image_url": driver.find_element(By.CSS_SELECTOR, "img[itemprop='image']").get_attribute("src")
                if driver.find_elements(By.CSS_SELECTOR, "img[itemprop='image']") else None
        }

        all_data.append(data)
        scraped_urls.add(url)

        # Save after each artwork
        df = pd.DataFrame(all_data)
        df.to_csv(csv_file, index=False, encoding="utf-8")
        df.to_excel(excel_file, index=False)

        print(f"✅ Scraped and saved: {url}")

    except Exception as e:
        print(f"❌ Error with {url}: {e}")
        continue

# Close browser
driver.quit()


  existing_df = pd.read_csv(csv_file)


✅ Scraped and saved: https://www.wikiart.org/en/georges-valmier/place-du-village-1925
✅ Scraped and saved: https://www.wikiart.org/en/georges-valmier/still-life-in-front-of-a-window-1925
✅ Scraped and saved: https://www.wikiart.org/en/georges-valmier/still-life-in-front-of-the-window-1925
✅ Scraped and saved: https://www.wikiart.org/en/georges-valmier/the-village-fun-fair-1925
✅ Scraped and saved: https://www.wikiart.org/en/gustave-buchet/composition-avec-7-tubes-1925
✅ Scraped and saved: https://www.wikiart.org/en/vytautas-kairiukstis/laivelis-ant-ezero-1925
✅ Scraped and saved: https://www.wikiart.org/en/carlos-quizpez-asin/nocturno-1925
✅ Scraped and saved: https://www.wikiart.org/en/paul-nash/the-stackyard-1925
✅ Scraped and saved: https://www.wikiart.org/en/jindrich-styrsky/namesicna-1925
✅ Scraped and saved: https://www.wikiart.org/en/jindrich-styrsky/kominik-a-snehulak-1925
✅ Scraped and saved: https://www.wikiart.org/en/josef-capek/sasek-1925
✅ Scraped and saved: https://www.wi

KeyboardInterrupt: 