scrapes all artists from WikiArt, grouped by nationality, and saves them to a CSV.

In [10]:
import os
import json
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# ======================================
# CONFIG
# ======================================
BASE = "https://www.wikiart.org/en/artists-by-nation/"
OUT_CSV = "data/wikiart/wikiart_all_artists.csv"
CHECKPOINT_PATH = "data/wikiart/wikiart_artists_checkpoint.json"

# Browser headers (WikiArt blocks requests without these)
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0 Safari/537.36"
    ),
    "Referer": "https://www.google.com/",
}

# ======================================
# FULL NATIONALITY LIST
# ======================================
nationalities = [
    "american", "french", "british", "italian", "german", "russian", "romanian", "jewish",
    "dutch", "japanese", "spanish", "ukrainian", "swiss", "polish", "canadian", "austrian",
    "portuguese", "brazilian", "chinese", "hungarian", "australian", "belgian", "armenian",
    "flemish", "greek", "mexican", "iranian", "turkish", "catalan", "argentinean", "swedish",
    "bulgarian", "czech", "irish", "indian", "norwegian", "finnish", "south-african", "cuban",
    "south-korean", "danish", "belarusian", "israeli", "croatian", "lithuanian", "serbian",
    "egyptian", "latvian", "venezuelan", "colombian", "icelandic", "azerbaijani", "slovenian",
    "indigenous-north-americans", "scotland", "nigerian", "uruguayan", "palestinians",
    "dominican", "macedonian", "georgian", "ethiopian", "slovak", "filipino", "peruvian",
    "chilean", "indonesian", "new-zealander", "vietnamese", "lebanese", "moldovan", "albanian",
    "ecuadorian", "puerto-rican", "moroccan", "bosnian", "bengali", "thai", "jamaican",
    "syrian", "saudi", "iraqi", "north-koreans", "guatemalan", "kenyan", "estonian", "yemeni",
    "cypriot", "luxembourgers", "namibians", "bangladeshi", "mongol", "ghanaian", "qatari",
    "emirati", "angolan", "tunisian", "algerians", "libyan", "cameroonian", "costa-ricans",
    "sudanese", "bahraini", "guyanese", "montenegrins", "barbadian"
]

# ======================================
# HELPERS
# ======================================

def load_checkpoint():
    if os.path.exists(CHECKPOINT_PATH):
        with open(CHECKPOINT_PATH, "r") as f:
            return json.load(f)
    return []


def save_checkpoint(done_list):
    with open(CHECKPOINT_PATH, "w") as f:
        json.dump(done_list, f)


def append_to_csv(rows):
    df = pd.DataFrame(rows)
    header = not os.path.exists(OUT_CSV)
    df.to_csv(OUT_CSV, index=False, mode="a", header=header)


# ======================================
# SCRAPER
# ======================================

def fetch_artists(country):
    """Fetch list of artists for one nationality."""

    url = f"{BASE}{country}/text-list"

    # Retry up to 4 times with delay
    for attempt in range(4):
        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            if r.status_code == 200:
                break
            else:
                print(f"‚ö†Ô∏è {country}: HTTP {r.status_code}, retrying‚Ä¶")
        except Exception as e:
            print(f"‚ö†Ô∏è {country}: {e}, retrying‚Ä¶")
        time.sleep(2)
    else:
        # Failed all attempts
        return []

    soup = BeautifulSoup(r.text, "html.parser")

    main_ul = soup.select_one(".masonry-text-view.masonry-text-view-all ul")
    if not main_ul:
        print(f"‚ö†Ô∏è {country}: Could not find artist list container")
        return []

    artists = []
    for li in main_ul.select("li"):
        a = li.find("a")
        if not a:
            continue

        name = a.text.strip()
        href = "https://www.wikiart.org" + a["href"]

        spans = li.find_all("span")
        lifespan = spans[0].text.strip(", ") if len(spans) > 0 else ""
        artworks = spans[1].text.strip(", ") if len(spans) > 1 else ""

        artists.append({
            "name": name,
            "lifespan": lifespan,
            "artworks": artworks,
            "url": href,
            "nationality": country
        })

    return artists


def scrape_artists():
    done = load_checkpoint()
    print(f"üìç Already done: {len(done)} nationalities")

    for country in tqdm(nationalities):
        if country in done:
            continue

        print(f"\nüåç Fetching artists for {country}")
        artists = fetch_artists(country)
        print(f"üé® Found {len(artists)} artists")

        if artists:
            append_to_csv(artists)
        else:
            print(f"‚ö†Ô∏è No artists found for {country}")

        # Save checkpoint and continue
        done.append(country)
        save_checkpoint(done)

        # Polite delay
        time.sleep(random.uniform(1.5, 3.5))

    print("\n‚úÖ DONE ‚Äî all nationalities scraped!")


# ======================================
# RUN
# ======================================

if __name__ == "__main__":
    scrape_artists()


üìç Already done: 106 nationalities


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106/106 [00:00<00:00, 675678.15it/s]


‚úÖ DONE ‚Äî all nationalities scraped!





For every artist in wikiart_all_artists.csv, fetch all their artworks from WikiArt, extract each artwork‚Äôs image URL, and save everything to wikiart_all_artworks.csv, with per-artist checkpointing

In [11]:
import os
import json
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# ==============================
# CONFIG
# ==============================
ARTISTS_CSV = "data/wikiart/wikiart_all_artists.csv"
ARTWORKS_CSV = "data/wikiart/wikiart_all_artworks.csv"
CHECKPOINT_PATH = "data/wikiart/wikiart_artworks_checkpoint.json"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0 Safari/537.36"
    ),
    "Referer": "https://www.google.com/",
}

# ==============================
# HELPERS
# ==============================

def load_checkpoint():
    if os.path.exists(CHECKPOINT_PATH):
        with open(CHECKPOINT_PATH, "r") as f:
            return set(json.load(f))
    return set()

def save_checkpoint(done):
    with open(CHECKPOINT_PATH, "w") as f:
        json.dump(list(done), f)

def append_to_csv(rows):
    df = pd.DataFrame(rows)
    header = not os.path.exists(ARTWORKS_CSV)
    df.to_csv(ARTWORKS_CSV, index=False, mode="a", header=header)

# ==============================
# SCRAPER HELPERS
# ==============================

def fetch_artworks_list(artist_url):
    """
    Fetch artwork title + URL from /all-works/text-list
    """
    url = artist_url + "/all-works/text-list"

    for attempt in range(3):
        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            if r.status_code == 200:
                break
            else:
                print("‚ö†Ô∏è HTTP", r.status_code)
        except Exception as e:
            print("‚ö†Ô∏è", e)
        # time.sleep(1)
    else:
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select("ul.painting-list-text li a[href^='/en/']")

    artworks = []
    for a in items:
        artworks.append((
            a.text.strip(),
            "https://www.wikiart.org" + a["href"]
        ))
    return artworks


def fetch_image_url(artwork_page):
    """Extract image from og:image meta tag."""
    try:
        r = requests.get(artwork_page, headers=HEADERS, timeout=20)
        if r.status_code != 200:
            return None
        soup = BeautifulSoup(r.text, "html.parser")
        img_tag = soup.find("meta", property="og:image")
        if img_tag:
            return img_tag["content"]
        return None
    except:
        return None

# ==============================
# MAIN LOOP
# ==============================

def scrape_all_artworks():
    artists_df = pd.read_csv(ARTISTS_CSV)
    artists = artists_df.to_dict("records")

    done = load_checkpoint()
    print(f"üìç Already processed: {len(done)} artists")

    for artist in tqdm(artists):
        artist_name = artist["name"]
        artist_url = artist["url"]

        if artist_url in done:
            continue

        print(f"\nüé® Artist: {artist_name}")

        # === 1) Get list of artworks
        artworks = fetch_artworks_list(artist_url)
        print(f"üñºÔ∏è Found {len(artworks)} artworks")

        rows = []

        for title, art_url in artworks:
            img = fetch_image_url(art_url)
            if not img:
                print(f"‚ö†Ô∏è No image: {title}")

            rows.append({
                "artist": artist_name,
                "nationality": artist["nationality"],
                "artwork_title": title,
                "artwork_url": art_url,
                "image_url": img
            })

        # === Save after finishing this artist
        if rows:
            append_to_csv(rows)

        # === Mark artist as done
        done.add(artist_url)
        save_checkpoint(done)

    print("\nüéâ DONE ‚Äî all artworks scraped!")

# ==============================
# RUN
# ==============================

if __name__ == "__main__":
    scrape_all_artworks()


üìç Already processed: 3518 artists


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4202/4202 [00:00<00:00, 3755479.52it/s]


üéâ DONE ‚Äî all artworks scraped!





print number of artworks per nationality

In [12]:
pd.set_option("display.max_rows", None)     # show all rows
pd.set_option("display.max_colwidth", None)

df = pd.read_csv("data/wikiart/wikiart_all_artworks.csv", low_memory=False)

df.columns = df.columns.str.strip().str.replace(" ", "_")

counts = (
    df["nationality"]
    .dropna()
    .value_counts()
)

print(counts)


nationality
french                        35667
american                      30563
russian                       17060
italian                       16297
british                       16122
german                        10076
dutch                          8152
spanish                        7136
ukrainian                      5268
japanese                       4054
romanian                       3664
austrian                       3127
flemish                        2816
polish                         2668
jewish                         2188
swiss                          1883
australian                     1795
belgian                        1765
hungarian                      1549
mexican                        1297
greek                          1203
canadian                       1147
norwegian                      1111
chinese                        1099
brazilian                      1078
portuguese                     1068
armenian                       1010
czech           

scans the MET‚Äôs public-domain collection, keeps only artworks from less documented nationalities (less than 1000 artworks in wikiart), fetches their image metadata safely via the MET API, and saves everything to a CSV with full resume support.

In [14]:
# MET API ‚Äî culturally filtered scraper with resume + proper headers + SAFE RETRIES

import requests
import pandas as pd
from tqdm import tqdm
import time
import random
import csv
import os

HEADERS = {"User-Agent": "Mozilla/5.0"}

OUTPUT = "data/met/met_all_artworks.csv"
CHECKPOINT = "data/met/met_artworks_checkpoint.txt"


# Forbidden Western cultural groups
forbidden_keywords = [
    "french", "american", "russian", "italian", "british",
    "german", "dutch", "spanish", "ukrainian", "japanese",
    "romanian", "austrian", "flemish", "polish", "jewish",
    "swiss", "australian", "belgian", "hungarian", "mexican",
    "greek", "canadian", "norwegian", "chinese", "brazilian",
    "portuguese", "armenian", "european", 
    "france", "russia", "italy", "britain",
    "germany", "spain", "ukraine", "japan",
    "austria", "poland",
    "switzerland", "australia", "belgium", "hungary", "mexico",
    "greece", "canada", "norway", "china", "brazil",
    "portugal", "armenia", "europe",
    "slavic"
]

# Indigenous cultures to KEEP
native_terms = [
    "native", "indigenous", "american indian",
    "navajo", "hopi", "inuit", "cherokee",
    "haida", "tlingit"
]


def keep_culture(culture):
    """Return True if the culture is allowed."""
    if not culture or culture.strip() == "":
        return False
    c = culture.lower()
    if any(term in c for term in native_terms):
        return True
    if any(word in c for word in forbidden_keywords):
        return False
    return True


# ------------------------------------------------------------
# SAFE RETRY VERSION ‚Äî NEVER STOPS FOR 403 RATE LIMITING
# ------------------------------------------------------------
def fetch_met_image(obj_id, max_retries=5):
    """Fetch one MET object with retry/backoff on 403/timeout."""

    url = f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{obj_id}"

    for attempt in range(max_retries):

        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            code = r.status_code

            # 404 ‚Üí object does not exist ‚Üí skip safely
            if code == 404:
                print(f"‚ö†Ô∏è Skipping missing object {obj_id} (404)")
                return None

            # 403 = RATE LIMIT ‚Äî wait longer, then retry
            if code == 403:
                wait = 5 + random.random() * 15
                print(f"‚è≥ 403 rate limit at object {obj_id} ‚Äî waiting {wait:.1f}s‚Ä¶")
                time.sleep(wait)
                continue

            # Any other non-success HTTP error ‚Üí retry a few times
            if code != 200:
                wait = 3 + random.random() * 4
                print(f"‚ö†Ô∏è HTTP {code} for {obj_id}, retry {attempt+1}/{max_retries} after {wait:.1f}s‚Ä¶")
                time.sleep(wait)
                continue

            # Parse JSON
            try:
                data = r.json()
            except Exception as e:
                raise RuntimeError(f"JSON decoding failed for object {obj_id}: {e}")

            if not isinstance(data, dict) or data == {}:
                print(f"‚ö†Ô∏è Empty JSON for ID {obj_id}, skipping")
                return None

            # Success
            return {
                "objectID": obj_id,
                "title": data.get("title"),
                "artist": data.get("artistDisplayName"),
                "culture": data.get("culture"),
                "period": data.get("period"),
                "date": data.get("objectDate"),
                "medium": data.get("medium"),
                "department": data.get("department"),
                "primaryImage": data.get("primaryImage"),
                "primaryImageSmall": data.get("primaryImageSmall"),
                "additionalImages": ";".join(data.get("additionalImages", [])),
            }

        except requests.exceptions.Timeout:
            print(f"‚è≥ Timeout for {obj_id}, retrying in 5s‚Ä¶")
            time.sleep(5)

        except Exception as e:
            print(f"‚ö†Ô∏è Unexpected error for {obj_id}: {e}, retrying in 5s‚Ä¶")
            time.sleep(5)

    print(f"‚ùå Giving up on object {obj_id} after {max_retries} retries")
    return None



def scrape_met_images():
    print("üìÇ Loading MetObjects.txt...")
    met = pd.read_csv("data/met/MetObjects.txt", low_memory=False)

    # Normalize column names
    met.columns = (
        met.columns.str.replace(" ", "", regex=False)
        .str.replace("-", "", regex=False)
        .str.replace(".", "", regex=False)
        .str.strip()
    )

    if "IsPublicDomain" not in met.columns or "ObjectID" not in met.columns:
        print("‚ùå Column mismatch. Columns found:")
        print(met.columns.tolist())
        return

    public_df = met[met["IsPublicDomain"] == True]

    cultures = public_df["Culture"].fillna("")
    mask = [keep_culture(c) for c in cultures]
    diverse_df = public_df[mask]

    object_ids = diverse_df["ObjectID"].astype(int).tolist()
    print(f"üåç Found {len(object_ids)} culturally diverse candidate objects")

    # Resume support
    processed = set()
    if os.path.exists(CHECKPOINT):
        with open(CHECKPOINT, "r") as f:
            processed = {int(x.strip()) for x in f if x.strip().isdigit()}
        print(f"üîÅ Loaded checkpoint: {len(processed)} processed")

    remaining_ids = [oid for oid in object_ids if oid not in processed]
    print(f"‚û°Ô∏è Will scrape: {len(remaining_ids)} new objects")

    # Prepare output
    fields = [
        "objectID", "title", "artist", "culture", "period", "date",
        "medium", "department", "primaryImage", "primaryImageSmall",
        "additionalImages"
    ]

    file_exists = os.path.exists(OUTPUT)
    file_empty = (not file_exists) or os.path.getsize(OUTPUT) == 0
    mode = "a" if file_exists and not file_empty else "w"

    f_out = open(OUTPUT, mode, newline="", encoding="utf-8")
    writer = csv.DictWriter(f_out, fieldnames=fields)
    if file_empty:
        writer.writeheader()

    f_ckpt = open(CHECKPOINT, "a")

    # Scraping loop
    for i, oid in enumerate(tqdm(remaining_ids)):

        d = fetch_met_image(oid)

        # Always record checkpoint, even if the object is None
        f_ckpt.write(str(oid) + "\n")
        f_ckpt.flush()
        os.fsync(f_ckpt.fileno())

        if d:
            writer.writerow(d)
            f_out.flush()
            os.fsync(f_out.fileno())

        # SAFE DELAY (much slower)
        time.sleep(random.uniform(0.3, 0.7))

        # Print memory usage update every 200 items
        if i % 200 == 0 and i > 0:
            size_mb = os.path.getsize(OUTPUT) / 1e6
            print(f"üíæ {i} scraped ‚Äî CSV size: {size_mb:.2f} MB")

    f_out.close()
    f_ckpt.close()

    print("‚úî Done!")


scrape_met_images()


üìÇ Loading MetObjects.txt...
üåç Found 32866 culturally diverse candidate objects
üîÅ Loaded checkpoint: 32884 processed
‚û°Ô∏è Will scrape: 0 new objects


0it [00:00, ?it/s]

‚úî Done!





print sorted list of all unique culture labels in MET dataset

In [15]:
import pandas as pd

# Load your dataset
met = pd.read_csv("data/met/met_all_artworks.csv", low_memory=False)

# Normalize column names (as before)
met.columns = (
    met.columns.str.replace(" ", "", regex=False)
    .str.replace("-", "", regex=False)
    .str.replace(".", "", regex=False)
    .str.strip()
)

# Get unique cultures
cultures = (
    met["culture"]
    .dropna()
    .unique()
)

# Print them sorted
cultures_sorted = sorted(cultures)

print("üåç Unique Cultures in the MET dataset:")
for c in cultures_sorted:
    print("-", c)


üåç Unique Cultures in the MET dataset:
- Abor, Burmese
- Achaemenid
- Achaemenid (?)
- Achaemenid or Seleucid
- Acheen
- Acheulean
- Acoma Pueblo
- Acoma, Native American
- Aegean
- Afghan
- Afghan (Turkmen)
- Afghan, Khyber
- Afghanistan
- Afghanistan (Hadda)
- Afghanistan (found near Kabul)
- Afghanistan (possibly Hadda)
- Afghanistan (probably Hadda)
- Afghanistan or Central Asia
- Afghanistan, possibly of West Indian manufacture
- Africa
- Africa (Central)
- Africa (West)
- African
- African (Maasai peoples)
- African (North)
- African (Pokot peoples)
- African (Sao peoples)
- African (West)
- African, Sudan
- African; created in United Kingdom
- African?
- Aguada
- Akan
- Akan peoples
- Akan peoples, Asante
- Akan peoples, Asante group
- Akan, Ashanti (Asante)
- Akkadian
- Akkadian or Neo-Sumerian
- Alamblak or Yimam people
- Alamito
- Alanic
- Albanian
- Alemannic
- Aleut
- Aleutian Islands (?)
- Algamurra
- Algerian
- Algerian or Moroccan
- Algerian, Kabyle
- Alutiiq/ Sugpiaq,

merge met and wikiart datasets

combined wikiart and met data

In [27]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# ----------------
# LOAD DATA
# ----------------
wiki = pd.read_csv("data/wikiart/wikiart_all_artworks.csv")
met = pd.read_csv("data/met/met_all_artworks.csv")

# ----------------
# CLEAN WIKI DATA
# ----------------
wiki_clean = wiki[["artist", "nationality", "artwork_title", "image_url"]].copy()
wiki_clean["source"] = "wiki"
wiki_clean["original_culture"] = None
wiki_clean["similarity_score"] = None

# Extract WikiArt nationalities
wiki_nationalities = (
    wiki["nationality"]
        .dropna()
        .astype(str)
        .str.lower()
        .str.strip()
        .unique()
        .tolist()
)

print("WikiArt nationalities loaded:", len(wiki_nationalities))

# ----------------
# MET DATA
# ----------------
met_sub = met[["artist", "title", "culture", "primaryImage"]].rename(
    columns={"title": "artwork_title", "primaryImage": "image_url"}
)

met_sub["source"] = "met"
met_sub["original_culture"] = met_sub["culture"]

# ----------------
# LOAD MODEL
# ----------------
model = SentenceTransformer("all-mpnet-base-v2")
nat_emb = model.encode(wiki_nationalities, convert_to_tensor=True)

# ----------------
# ALWAYS CHOOSE CLOSEST NATIONALITY + RETURN SCORE
# ----------------
def map_culture(culture_string):
    cleaned = str(culture_string).lower().strip()
    
    # embed culture string
    emb = model.encode(cleaned, convert_to_tensor=True)
    sim = util.cos_sim(emb, nat_emb)[0]
    
    # best match
    best_idx = sim.argmax().item()
    best_score = sim[best_idx].item()
    best_nat = wiki_nationalities[best_idx]

    # allow everything ‚Äî only unknown is dropped later
    return best_nat, best_score


# Apply mapping
preds = met_sub["culture"].apply(map_culture)
met_sub["nationality"] = preds.apply(lambda x: x[0])
met_sub["similarity_score"] = preds.apply(lambda x: x[1])

# ----------------
# REMOVE UNKNOWN NATIONALITY ROWS
# ----------------
met_clean = met_sub[
    met_sub["nationality"] != "unknown"
][[
    "artist", "nationality", "artwork_title", "image_url",
    "source", "original_culture", "similarity_score"
]]

# ----------------
# FINAL COMBINED DATA
# ----------------
wiki_clean = wiki_clean[[
    "artist", "nationality", "artwork_title", "image_url",
    "source", "original_culture", "similarity_score"
]]

combined = pd.concat([wiki_clean, met_clean], ignore_index=True)
combined.to_csv("combined_artworks.csv", index=False)

print("Saved combined_artworks.csv with", len(combined), "rows.")

WikiArt nationalities loaded: 94


  combined = pd.concat([wiki_clean, met_clean], ignore_index=True)


Saved combined_artworks.csv with 223965 rows.


see how many artworks per nationality when combined

In [28]:
import pandas as pd

pd.set_option("display.max_rows", None)     # show all rows
pd.set_option("display.max_colwidth", None)

df = pd.read_csv("combined_artworks.csv", low_memory=False)

df.columns = df.columns.str.strip().str.replace(" ", "_")

counts = (
    df["nationality"]
    .dropna()
    .value_counts()
)

print(counts)


nationality
french                        36405
american                      30566
italian                       20355
russian                       17065
british                       16138
german                        10081
dutch                          8651
spanish                        7282
ukrainian                      5291
romanian                       4596
japanese                       4135
macedonian                     3782
flemish                        3613
austrian                       3174
greek                          2828
egyptian                       2706
polish                         2672
indonesian                     2596
indian                         2304
jewish                         2218
iranian                        2124
swiss                          1887
australian                     1801
belgian                        1774
armenian                       1649
norwegian                      1635
thai                           1595
hungarian       

for all nationalities under 1000 artworks downloaded from british art museum and code below combines them into one csv.

In [29]:
import pandas as pd
import glob
import os
import re

# ------------------------------------------
# 1. NATIONALITY ‚Üí KEYWORD MAP
# ------------------------------------------
NAT_KEYWORDS = {
    "albanian": ["albania", "albanian"],
    "algerians": ["algeria", "algerian", "maghreb"],
    "angolan": ["angola", "angolan"],
    "argentinean": ["argentina", "argentine"],
    "armenian": ["armenia", "armenian"],
    "azerbaijani": ["azerbaijan", "azerbaijani"],
    "bahraini": ["bahrain", "bahraini"],
    "bengali": ["bengal", "bangladesh", "bengali"],
    "belarusian": ["belarus", "belarusian"],
    "bosnian": ["bosnia", "bosnian"],
    "bulgarian": ["bulgaria", "bulgarian"],
    "cameroonian": ["cameroon", "cameroonian"],
    "canadian": ["canada", "canadian"],
    "catalan": ["catalonia", "catalan"],
    "chilean": ["chile", "chilean"],
    "colombian": ["colombia", "colombian"],
    "cuban": ["cuba", "cuban"],
    "croatian": ["croatia", "croatian"],
    "czech": ["czech", "bohemia", "bohemian"],
    "danish": ["denmark", "danish"],
    "dominican": ["dominican republic", "dominicana", "hispaniola"],
    "ecuadorian": ["ecuador", "ecuadorian"],
    "egyptian": ["egypt", "egyptian"],
    "emirati": ["united arab emirates", "uae", "dubai", "abudhabi", "emirati"],
    "estonian": ["estonia", "estonian"],
    "ethiopian": ["ethiopia", "ethiopian"],
    "finnish": ["finland", "finnish"],
    "filipino": ["philippines", "filipino"],
    "georgian": ["georgia (republic)", "tbilisi", "georgian"],
    "hungarian": ["hungary", "hungarian"],
    "icelandic": ["iceland", "icelandic"],
    "indian": ["india", "indian"],
    "indigenous-north-americans": ["native american", "pueblo", "navajo", "apache"],
    "indonesian": ["indonesia", "indonesian", "balinese", "javanese"],
    "iraqi": ["iraq", "mesopotamia", "nineveh", "assyrian"],
    "irish": ["ireland", "irish", "gaelic"],
    "jamaican": ["jamaica", "jamaican"],
    "kenyan": ["kenya", "kenyan"],
    "latvian": ["latvia", "latvian"],
    "libyan": ["libya", "libyan"],
    "lithuania": ["lithuania", "lithuanian"],
    "luxembourgers": ["luxembourg", "luxembourger"],
    "macedonian": ["north macedonia", "macedonia", "macedonian"],
    "moldovan": ["moldova", "moldovan"],
    "mongol": ["mongolia", "mongol", "ulan bator"],
    "moroccan": ["morocco", "moroccan"],
    "namibians": ["namibia", "namibian"],
    "new-zealander": ["new zealand", "maori"],
    "nigerian": ["nigeria", "yoruba", "benin kingdom", "igbo", "hausa"],
    "north-koreans": ["north korea", "dprk", "korean"],
    "norwegian": ["norway", "norwegian"],
    "palestinians": ["palestine", "palestinian"],
    "peruvian": ["peru", "peruvian", "nazca", "moche", "chimu"],
    "puerto-rican": ["puerto rico", "boricua", "puertorican"],
    "saudi": ["saudi arabia", "saudi", "arabia"],
    "serbian": ["serbia", "serbian"],
    "slovak": ["slovakia", "slovak"],
    "slovenian": ["slovenia", "slovenian"],
    "south-african": ["south africa", "zulu", "xhosa", "sotho"],
    "south-korean": ["south korea", "korea", "hangul"],
    "sudanese": ["sudan", "nubian"],
    "swedish": ["sweden", "swedish", "scandinavia"],
    "syrian": ["syria", "damascus", "aleppo", "levant"],
    "thai": ["thailand", "thai", "siam"],
    "tunisian": ["tunisia", "tunisian"],
    "turkish": ["turkey", "turkish", "anatolia", "ottoman"],
    "uruguayan": ["uruguay", "uruguayan"],
    "venezuelan": ["venezuela", "venezuelan"],
    "vietnamese": ["vietnam", "vietnamese"],
    "yemeni": ["yemen", "yemeni", "sana'a", "hadhramaut"]
}


# ------------------------------------------
# 2. Extract nationality prefix from filename
# ------------------------------------------
def extract_nationality(filename):
    """Takes 'estonian2.csv' ‚Üí 'estonian'."""
    base = filename.replace(".csv", "").lower()
    m = re.match(r"([a-z\-]+)", base)
    return m.group(1) if m else None


# ------------------------------------------
# 3. Cultural origin filter (photographs allowed)
# ------------------------------------------
def belongs_to_culture(row, keywords):
    culture = str(row.get("Culture", "")).lower()
    ethnic = str(row.get("Ethnic name (made by)", "")).lower()
    prod_place = str(row.get("Production place", "")).lower()

    if any(k in culture for k in keywords):
        return True
    if any(k in ethnic for k in keywords):
        return True
    if any(k in prod_place for k in keywords):
        return True

    return False


# ------------------------------------------
# 4. Process all files
# ------------------------------------------
all_files = glob.glob("data/british/british_downloaded_data/*.csv")
all_clean = []

for path in all_files:
    filename = os.path.basename(path)
    nationality = extract_nationality(filename)

    if not nationality:
        print(f"‚ö†Ô∏è Could not parse nationality from {filename}")
        continue

    if nationality not in NAT_KEYWORDS:
        print(f"‚ö†Ô∏è No keyword mapping for {nationality}, skipping {filename}")
        continue

    keywords = NAT_KEYWORDS[nationality]
    print(f"‚Üí Processing {filename} as nationality: {nationality}, keywords: {keywords}")

    df = pd.read_csv(path)

    mask = df.apply(lambda r: belongs_to_culture(r, keywords), axis=1)
    df_clean = df[mask].copy()
    df_clean["nationality"] = nationality

    all_clean.append(df_clean)


# ------------------------------------------
# 5. Combine all into one dataset
# ------------------------------------------
combined = pd.concat(all_clean, ignore_index=True)
combined.to_csv("data/british/britishmuseum_combined.csv", index=False)

print("üéâ DONE!")
print("Total final objects:", len(combined))


‚Üí Processing angolan.csv as nationality: angolan, keywords: ['angola', 'angolan']
‚Üí Processing luxembourgers.csv as nationality: luxembourgers, keywords: ['luxembourg', 'luxembourger']
‚Üí Processing bosnian.csv as nationality: bosnian, keywords: ['bosnia', 'bosnian']
‚Üí Processing new-zealander.csv as nationality: new-zealander, keywords: ['new zealand', 'maori']
‚Üí Processing bengali.csv as nationality: bengali, keywords: ['bengal', 'bangladesh', 'bengali']
‚Üí Processing cuban2.csv as nationality: cuban, keywords: ['cuba', 'cuban']
‚Üí Processing georgian.csv as nationality: georgian, keywords: ['georgia (republic)', 'tbilisi', 'georgian']
‚Üí Processing venezuelan2.csv as nationality: venezuelan, keywords: ['venezuela', 'venezuelan']
‚Üí Processing ecuadorian.csv as nationality: ecuadorian, keywords: ['ecuador', 'ecuadorian']
‚Üí Processing moroccan.csv as nationality: moroccan, keywords: ['morocco', 'moroccan']
‚Üí Processing lithuania.csv as nationality: lithuania, keywords

‚Üí Processing chilean2.csv as nationality: chilean, keywords: ['chile', 'chilean']
‚Üí Processing uruguayan.csv as nationality: uruguayan, keywords: ['uruguay', 'uruguayan']
‚Üí Processing jamaican.csv as nationality: jamaican, keywords: ['jamaica', 'jamaican']
‚Üí Processing albanian.csv as nationality: albanian, keywords: ['albania', 'albanian']
‚Üí Processing argentinean2.csv as nationality: argentinean, keywords: ['argentina', 'argentine']
‚Üí Processing icelandic.csv as nationality: icelandic, keywords: ['iceland', 'icelandic']
‚Üí Processing north-koreans.csv as nationality: north-koreans, keywords: ['north korea', 'dprk', 'korean']
‚Üí Processing chilean.csv as nationality: chilean, keywords: ['chile', 'chilean']
‚Üí Processing tunisian.csv as nationality: tunisian, keywords: ['tunisia', 'tunisian']
‚Üí Processing slovak.csv as nationality: slovak, keywords: ['slovakia', 'slovak']
‚Üí Processing croatian.csv as nationality: croatian, keywords: ['croatia', 'croatian']
‚Üí Proces

In [30]:
import pandas as pd

# Load existing combined dataset (Wiki + MET)
combined = pd.read_csv("combined_artworks.csv")

# Load the fully processed British Museum dataset
bm = pd.read_csv("data/british/britishmuseum_combined.csv")

# -----
# Convert BM data to your unified schema
# -----

bm_clean = pd.DataFrame({
    "artist": bm["Producer name"].fillna("unknown"),
    "nationality": bm["nationality"],
    "artwork_title": bm["Title"],
    "image_url": bm["Image"],
    "source": "british_museum",
    "original_culture": bm["Culture"],
    "similarity_score": None
})

# -----
# Append BM data to your main dataset
# -----

final = pd.concat([combined, bm_clean], ignore_index=True)

# Save new combined dataset
final.to_csv("combined_artworks.csv", index=False)

print("üéâ All British Museum objects added!")
print("‚Üí Total rows now:", len(final))


  combined = pd.read_csv("combined_artworks.csv")
  bm = pd.read_csv("data/british/britishmuseum_combined.csv")
  final = pd.concat([combined, bm_clean], ignore_index=True)


üéâ All British Museum objects added!
‚Üí Total rows now: 261774


print met + wikiart + british museum number of artworks per nationality

In [31]:
import pandas as pd

pd.set_option("display.max_rows", None)     # show all rows
pd.set_option("display.max_colwidth", None)

df = pd.read_csv("combined_artworks.csv", low_memory=False)

df.columns = df.columns.str.strip().str.replace(" ", "_")

counts = (
    df["nationality"]
    .dropna()
    .value_counts()
)

print(counts)


nationality
french                        36405
american                      30566
italian                       20355
russian                       17065
british                       16138
german                        10081
dutch                          8651
nigerian                       8612
spanish                        7282
south-african                  5758
ukrainian                      5291
romanian                       4596
japanese                       4135
macedonian                     3782
flemish                        3613
austrian                       3174
kenyan                         2985
greek                          2828
egyptian                       2706
polish                         2672
indonesian                     2596
syrian                         2536
indian                         2304
jewish                         2218
iranian                        2124
vietnamese                     1950
irish                          1926
swiss           