121 changes: 121 additions & 0 deletions specifyweb/backend/export/API_DOCS.md
@@ -0,0 +1,121 @@
# DwC Export API Documentation

## Endpoints

### Schema Mappings

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/export/list_mappings/` | List all schema mappings |
| POST | `/export/create_mapping/` | Create a new mapping |
| PUT | `/export/update_mapping/<id>/` | Update a mapping |
| DELETE | `/export/delete_mapping/<id>/` | Delete a mapping (fails if referenced by packages) |
| POST | `/export/clone_mapping/<id>/` | Clone a mapping |
| POST | `/export/save_mapping_fields/<id>/` | Save DwC term assignments |

### Export Packages

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/export/list_export_datasets/` | List all export packages |
| POST | `/export/create_dataset/` | Create a new package |
| PUT | `/export/update_dataset/<id>/` | Update a package |
| DELETE | `/export/delete_dataset/<id>/` | Delete a package |
| POST | `/export/clone_dataset/<id>/` | Clone a package |
| POST | `/export/generate_dwca/<id>/` | Generate and download DwCA ZIP |

### RSS Feed

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/export/rss/` | RSS feed of published exports |
| POST | `/export/force_update/` | Rebuild legacy RSS feed |
| POST | `/export/force_update_packages/` | Rebuild all RSS-enabled export packages |

### Vocabulary

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/export/schema_terms/` | Get DwC vocabulary terms |

## Scripted/Cron Usage

To automate DwC archive generation on a schedule, call the export API:

```bash
# Generate a DwCA for a specific export package
curl -X POST \
-b cookies.txt \
-H "X-CSRFToken: TOKEN" \
-o output.zip \
http://localhost:8001/export/generate_dwca/PACKAGE_ID/

# Rebuild all RSS-enabled packages
curl -X POST \
-b cookies.txt \
-H "X-CSRFToken: TOKEN" \
http://localhost:8001/export/force_update_packages/
```

### Authentication

All endpoints require an authenticated session. For scripted access:

1. POST to `/accounts/login/` with username/password
2. Extract `csrftoken` and `sessionid` cookies
3. Include both cookies and `X-CSRFToken` header in subsequent requests
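
The same flow in Python, as a minimal sketch using the `requests` library (the server URL, credentials, and package id are placeholders):

```python
# Minimal sketch; URL, credentials, and package id are placeholders.
import requests

BASE = "http://localhost:8001"
session = requests.Session()

# 1. GET the login page to obtain the initial csrftoken cookie.
session.get(f"{BASE}/accounts/login/")

# 2. POST credentials; Django rotates the CSRF token on login,
#    so re-read it from the cookie jar afterwards.
session.post(
    f"{BASE}/accounts/login/",
    data={
        "username": "admin",
        "password": "password",
        "csrfmiddlewaretoken": session.cookies["csrftoken"],
    },
    headers={"Referer": f"{BASE}/accounts/login/"},
)

# 3. Authenticated request with both cookies and the rotated token.
resp = session.post(
    f"{BASE}/export/generate_dwca/1/",
    headers={"X-CSRFToken": session.cookies["csrftoken"]},
)
with open("output.zip", "wb") as f:
    f.write(resp.content)
```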

### Example cron script

```bash
#!/bin/bash
# Export DwC archives nightly at 2 AM
# crontab: 0 2 * * * /path/to/export_dwca.sh

SPECIFY_URL="http://localhost:8001"
USERNAME="admin"
PASSWORD="password"
PACKAGE_ID=1

# Login
COOKIES=$(mktemp)
curl -s -c "$COOKIES" "$SPECIFY_URL/accounts/login/" > /dev/null
CSRF=$(grep csrftoken "$COOKIES" | awk '{print $NF}')
curl -s -c "$COOKIES" -b "$COOKIES" \
-d "username=$USERNAME&password=$PASSWORD&csrfmiddlewaretoken=$CSRF" \
-H "Referer: $SPECIFY_URL/accounts/login/" \
"$SPECIFY_URL/accounts/login/" > /dev/null

# Generate archive
CSRF=$(grep csrftoken "$COOKIES" | awk '{print $NF}')
curl -s -b "$COOKIES" \
-H "X-CSRFToken: $CSRF" \
-X POST \
-o "/path/to/exports/archive_$(date +%Y%m%d).zip" \
"$SPECIFY_URL/export/generate_dwca/$PACKAGE_ID/"

rm "$COOKIES"
```

### Idempotency

`generate_dwca` is safe to call repeatedly: each call regenerates the archive from current data, and `lastExported` is updated on success.

### Automatic RSS Scheduling

The management command `update_feed_v2` checks all Export Packages with `RSS = true` and rebuilds those for which `lastExported + frequency` has already passed:

```bash
# Run manually
python manage.py update_feed_v2

# Add to system cron for automatic scheduling (e.g., check every hour)
0 * * * * cd /path/to/specify7 && python manage.py update_feed_v2

# Force update all RSS packages regardless of schedule
python manage.py update_feed_v2 --force
```

A `Frequency` of 0 or null means the package is manual-only (never auto-updated).
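
The check the command performs amounts to the following; a minimal sketch, where the model field names and the day-based unit of `frequency` are assumptions:

```python
# Sketch only: field names and the frequency unit are assumptions.
from datetime import timedelta
from django.utils import timezone

def is_due(package, force=False):
    """Decide whether an RSS-enabled export package should be rebuilt."""
    if not package.rss:
        return False              # not published to the RSS feed
    if force:
        return True               # --force ignores the schedule
    if not package.frequency:
        return False              # 0 or null: manual-only
    if package.lastexported is None:
        return True               # never exported: due immediately
    return timezone.now() >= package.lastexported + timedelta(days=package.frequency)
```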

The "Update RSS Feed" button in the Export Packages UI triggers the same process via `POST /export/force_update_packages/`.
26 changes: 26 additions & 0 deletions specifyweb/backend/export/attachment_urls.py
@@ -0,0 +1,26 @@
"""Auto-construct attachment URLs for DwC exports."""
from django.conf import settings


def construct_attachment_url(collection, filename):
"""Build full URL to an attachment file on the web asset server.

Returns the URL string or empty string if no asset server configured.
"""
base_url = getattr(settings, 'WEB_ATTACHMENT_URL', None)
if not base_url:
return ''

# Strip trailing slash
base_url = base_url.rstrip('/')

    # Note: the collection name and filename are interpolated verbatim
    # (no URL-encoding is applied).
    collection_name = collection.collectionname if collection else ''
    return f'{base_url}/{collection_name}/{filename}'
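
# Illustrative usage (a sketch; the settings value and collection name are
# assumptions, not project defaults):
#   settings.WEB_ATTACHMENT_URL = 'https://assets.example.org/fileget'
#   construct_attachment_url(collection_named_fish, 'abc123.jpg')
#   -> 'https://assets.example.org/fileget/Fish/abc123.jpg'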


def is_attachment_field(field_name):
"""Check if a field name corresponds to an attachment field."""
attachment_fields = {
'attachmentlocation', 'origfilename', 'attachmentimageattribute',
}
return field_name.lower() in attachment_fields
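
# e.g. is_attachment_field('AttachmentLocation') -> True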
209 changes: 209 additions & 0 deletions specifyweb/backend/export/cache.py
@@ -0,0 +1,209 @@
"""Cache table operations for DwC export pipeline."""
import logging
import re
from django.db import connection

from .dwca_utils import sanitize_column_name

logger = logging.getLogger(__name__)


def get_cache_table_name(mapping_id, collection_id, prefix='dwc_cache'):
"""Generate a safe cache table name."""
return f'{prefix}_{mapping_id}_{collection_id}'
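
# e.g. get_cache_table_name(12, 4) -> 'dwc_cache_12_4'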


def create_cache_table(table_name, columns):
"""Create a cache table with the given columns.

columns: list of (column_name, column_type) tuples.
An auto-increment primary key is always added.
"""
safe_name = re.sub(r'[^a-zA-Z0-9_]', '', table_name)
col_defs = ', '.join(
f'`{re.sub(r"[^a-zA-Z0-9_]", "", name)}` {col_type}'
for name, col_type in columns
)
with connection.cursor() as cursor:
cursor.execute(f'DROP TABLE IF EXISTS `{safe_name}`')
cursor.execute(
f'CREATE TABLE `{safe_name}` ('
f'`id` INT AUTO_INCREMENT PRIMARY KEY, {col_defs}'
f') ENGINE=InnoDB DEFAULT CHARSET=utf8mb4'
)
logger.info('Created cache table %s', safe_name)
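
# Illustrative call (the column list is an example, not a fixed schema):
#   create_cache_table('dwc_cache_12_4', [
#       ('occurrenceID', 'VARCHAR(256)'),
#       ('decimalLatitude', 'DECIMAL(12,6)'),
#   ])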


def drop_cache_table(table_name):
"""Drop a cache table if it exists."""
safe_name = re.sub(r'[^a-zA-Z0-9_]', '', table_name)
with connection.cursor() as cursor:
cursor.execute(f'DROP TABLE IF EXISTS `{safe_name}`')
logger.info('Dropped cache table %s', safe_name)


def build_cache_tables(export_dataset, user=None, progress_callback=None):
"""Build cache tables for an ExportDataSet's core mapping and all extensions."""
core_mapping = export_dataset.coremapping
collection = export_dataset.collection

_build_single_cache(core_mapping, collection, user=user,
progress_callback=progress_callback)

for ext in export_dataset.extensions.all().order_by('sortorder').iterator(chunk_size=2000):
_build_single_cache(ext.schemamapping, collection,
prefix=f'dwc_cache_ext{ext.sortorder}',
user=user, progress_callback=progress_callback)


def _build_single_cache(mapping, collection, prefix='dwc_cache', user=None,
progress_callback=None):
"""Build a single cache table for one SchemaMapping."""
from .models import CacheTableMeta
from django.utils import timezone

table_name = get_cache_table_name(mapping.id, collection.id, prefix)

meta, _ = CacheTableMeta.objects.update_or_create(
schemamapping=mapping,
defaults={'tablename': table_name, 'buildstatus': 'building'}
)

try:
display_fields = [
f for f in mapping.query.fields.order_by('position')
if getattr(f, 'term', None)
]

columns = [
(sanitize_column_name(f.term), _infer_column_type(f))
for f in display_fields
]

create_cache_table(table_name, columns)

rowcount = _execute_and_populate(
table_name, mapping, collection, user, progress_callback
)

meta.buildstatus = 'idle'
meta.lastbuilt = timezone.now()
meta.rowcount = rowcount
meta.save()

logger.info('Cache table %s built with %d rows', table_name, rowcount)

except Exception:
meta.buildstatus = 'error'
meta.save()
logger.exception('Failed to build cache table %s', table_name)
raise


def _execute_and_populate(table_name, mapping, collection, user, progress_callback=None):
"""Execute a mapping's query and INSERT results into the cache table.

Uses SQLAlchemy build_query() to ensure output matches query_to_csv
(date formatting, null replacement, etc.), then batch-INSERTs rows.

Returns the number of rows inserted.
"""
from specifyweb.backend.stored_queries.execution import (
build_query, BuildQueryProps, set_group_concat_max_len,
apply_special_post_query_processing,
)
from specifyweb.backend.stored_queries.queryfield import QueryField
from specifyweb.backend.stored_queries.models import session_context
from .field_adapter import EphemeralFieldAdapter

query_obj = mapping.query
display_fields = [
f for f in query_obj.fields.order_by('position')
if getattr(f, 'term', None)
]
field_specs = [
QueryField.from_spqueryfield(EphemeralFieldAdapter(f, force_display=True))
for f in display_fields
]

safe_name = re.sub(r'[^a-zA-Z0-9_]', '', table_name)
col_count = len(display_fields)
placeholders = ', '.join(['%s'] * col_count)
col_names = ', '.join(
f'`{sanitize_column_name(f.term)}`'
for f in display_fields
)
insert_sql = f'INSERT INTO `{safe_name}` ({col_names}) VALUES ({placeholders})'

total = 0
BATCH_SIZE = 2000

with session_context() as session:
set_group_concat_max_len(session.connection())
sa_query, _ = build_query(
session, collection, user,
query_obj.contexttableid,
field_specs,
BuildQueryProps(
replace_nulls=True,
date_format_override='%Y-%m-%d',
),
)
sa_query = apply_special_post_query_processing(
sa_query, query_obj.contexttableid, field_specs, collection, user,
should_list_query=False,
)

        batch = []
        # Post-query processing may return a materialized list of rows
        # rather than a SQLAlchemy query; support both.
        if isinstance(sa_query, list):
            iterator = iter(sa_query)
        else:
            iterator = sa_query.yield_per(BATCH_SIZE)

        for row in iterator:
            # row[0] is the record id prepended by build_query; cache only
            # the display columns, stringified with None -> ''.
            batch.append(tuple(
                str(v) if v is not None else '' for v in row[1:]
            ))

if len(batch) >= BATCH_SIZE:
with connection.cursor() as cursor:
cursor.executemany(insert_sql, batch)
total += len(batch)
batch = []
if progress_callback:
progress_callback(total, None)

if batch:
with connection.cursor() as cursor:
cursor.executemany(insert_sql, batch)
total += len(batch)

if progress_callback:
progress_callback(total, total)

return total


def _infer_column_type(spqueryfield):
"""Infer a MySQL column type from a Specify query field."""
fname = (spqueryfield.fieldname or '').lower()

if 'guid' in fname or 'uuid' in fname:
return 'VARCHAR(256)'
if fname in ('id', 'rankid', 'number1', 'number2', 'countamt',
'sortorder', 'position', 'version'):
return 'INT'
if 'numericyear' in fname or 'numericmonth' in fname or 'numericday' in fname:
return 'INT'
if fname in ('latitude1', 'latitude2', 'longitude1', 'longitude2',
'latlongaccuracy', 'maxelevation', 'minelevation'):
return 'DECIMAL(12,6)'
    # Dates arrive pre-formatted as strings (see date_format_override in
    # _execute_and_populate), so they are cached as VARCHAR rather than DATE.
    if fname in ('startdate', 'enddate', 'determineddate', 'catalogeddate',
                 'timestampcreated', 'timestampmodified'):
        return 'VARCHAR(32)'
    # Boolean-style flags (e.g. iscurrent, yesno1) are cached as short strings.
    if fname.startswith('is') or fname.startswith('yes'):
        return 'VARCHAR(8)'
if fname in ('catalognumber', 'altcatalognumber', 'barcode', 'fieldnumber',
'code', 'abbreviation', 'datum'):
return 'VARCHAR(256)'
return 'TEXT'
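
# Illustrative results (field names are examples):
#   'guid'      -> 'VARCHAR(256)'
#   'latitude1' -> 'DECIMAL(12,6)'
#   'startdate' -> 'VARCHAR(32)'   (dates are cached as formatted strings)
#   'remarks'   -> 'TEXT'          (fallback)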