In [None]:
import requests
import re
import os
import time
import json
import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Base URL for the website
base_url = "https://uyghur.linguistics.indiana.edu/manuscripts/"

# Output directory
output_dir = "jarring_manuscripts_data"
images_dir = os.path.join(output_dir, "images")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

# Function to clean text and handle HTML entities
def clean_text(text):
    if not text:
        return ""
    # Decode HTML entities
    text = html.unescape(text)
    # Remove XML/HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to download an image and save it to the appropriate directory
def download_image(image_url, manuscript_id, surface_id):
    if not image_url:
        return None, None
    
    # Create directory for this manuscript if it doesn't exist
    manuscript_dir = os.path.join(images_dir, manuscript_id)
    os.makedirs(manuscript_dir, exist_ok=True)
    
    # Parse the filename from the URL
    parsed_url = urlparse(image_url)
    filename = os.path.basename(parsed_url.path)
    
    # Add surface ID to the filename if it doesn't already contain it
    if surface_id not in filename:
        name, ext = os.path.splitext(filename)
        filename = f"{name}_{surface_id}{ext}"
    
    # Full path for saving the image
    image_path = os.path.join(manuscript_dir, filename)
    
    # Save the relative path for the JSON structure
    rel_path = os.path.join("images", manuscript_id, filename)
    
    # Download and save the image if it doesn't exist already
    if not os.path.exists(image_path):
        print(f"Downloading image: {filename} for {manuscript_id} {surface_id}")
        try:
            response = requests.get(image_url)
            if response.status_code == 200:
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                print(f"Saved image to {image_path}")
            else:
                print(f"Failed to download image: {image_url}, status code: {response.status_code}")
                return None, None
        except Exception as e:
            print(f"Error downloading image {image_url}: {e}")
            return None, None
        
        # Sleep briefly to avoid overloading the server
        time.sleep(0.5)
    else:
        print(f"Image already exists: {filename}")
    
    return rel_path, image_path

# Function to extract transcript data from a single manuscript page
def scrape_transcript(transcript_url, manuscript_id):
    print(f"Scraping transcript: {transcript_url}")
    
    # Send a request to get the XML content
    try:
        response = requests.get(transcript_url)
        # Ensure proper encoding
        response.encoding = 'utf-8'
        xml_content = response.text
    except Exception as e:
        print(f"Error retrieving transcript: {e}")
        return []
    
    # Parse the XML content using BeautifulSoup with xml parser
    soup = BeautifulSoup(xml_content, 'xml')
    
    # List to store the parsed data for this manuscript
    manuscript_data = []
    
    # Find all surface elements which contain facsimiles and transcriptions
    surfaces = soup.find_all('surface') or soup.find_all('tei:surface')
    
    if not surfaces:
        # Try a more general approach using attribute-based search
        surfaces = soup.find_all(lambda tag: tag.name == 'surface' or tag.name.endswith(':surface'))
    
    print(f"Found {len(surfaces)} surface elements in {manuscript_id}")
    
    for surface in surfaces:
        # Get the surface ID (like "1a", "1b", etc.)
        surface_id = surface.get('n')
        if not surface_id:
            continue
        
        # Find the graphic elements (images)
        graphics = surface.find_all('graphic') or surface.find_all('tei:graphic')
        
        # Get the full-size image URL (skip thumbnails)
        image_url = None
        for graphic in graphics:
            # Skip thumbnails
            if graphic.get('rend') == 'thumbnail':
                continue
            # Use full-size images or images with no specific rend attribute
            elif graphic.get('rend') == 'fullpage' or graphic.get('rend') is None:
                image_url = graphic.get('url')
                break
        
        # Make URL absolute
        if image_url:
            image_url = urljoin(base_url, image_url)
        
        # Download image and get local path
        rel_image_path, abs_image_path = download_image(image_url, manuscript_id, surface_id)
        
        # Find the atmo:page element which contains the transcriptions
        page_elements = (surface.find_all('page') or 
                        surface.find_all('atmo:page') or 
                        surface.find_all(lambda tag: tag.name == 'page' or tag.name.endswith(':page')))
        
        if not page_elements:
            print(f"No page element found for {manuscript_id} {surface_id}")
            continue
        
        page = page_elements[0]
        
        # Get the page content areas (main)
        main_section = (page.find('main') or 
                        page.find('atmo:main') or 
                        page.find(lambda tag: tag.name == 'main' or tag.name.endswith(':main')))
        
        if not main_section:
            print(f"No main section found for {manuscript_id} {surface_id}")
            continue
        
        # Find all line groups and lines
        zones = main_section.find_all('zone') or main_section.find_all('tei:zone')
        if not zones:
            zones = [main_section]  # If no zones, treat the main section as a single zone
        
        # Store the lines for this surface
        surface_lines = []
        
        for zone_index, zone in enumerate(zones):
            # Find lines within this zone
            lines = (zone.find_all('line') or 
                    zone.find_all('tei:line') or 
                    zone.find_all(lambda tag: tag.name == 'line' or tag.name.endswith(':line')))
            
            for line_index, line in enumerate(lines):
                # Get the line number (if available) or generate one
                line_num = line.get('n')
                if not line_num:
                    line_num = f"z{zone_index+1}l{line_index+1}"  # Zone 1, Line 1, etc.
                
                # Get the Arabic text
                lit_tag = (line.find('lit') or 
                          line.find('atmo:lit') or 
                          line.find(lambda tag: tag.name == 'lit' or tag.name.endswith(':lit')))
                
                arabic_text = ""
                if lit_tag:
                    arabic_text = clean_text(''.join(str(content) for content in lit_tag.contents))
                
                # Get the Latin text
                lat_tag = (line.find('lat') or 
                          line.find('atmo:lat') or 
                          line.find(lambda tag: tag.name == 'lat' or tag.name.endswith(':lat')))
                
                latin_text = ""
                if lat_tag:
                    latin_text = clean_text(''.join(str(content) for content in lat_tag.contents))
                
                # Only add lines that have content
                if arabic_text.strip() or latin_text.strip():
                    surface_lines.append({
                        'line_num': line_num,
                        'zone_index': zone_index,
                        'arabic_text': arabic_text,
                        'latin_text': latin_text
                    })
        
        # Add the surface data if it has lines
        if surface_lines:
            manuscript_data.append({
                'manuscript_id': manuscript_id,
                'surface_id': surface_id,
                'online_image_url': image_url,
                'local_image_path': rel_image_path,
                'absolute_image_path': abs_image_path,
                'lines': surface_lines
            })
    
    return manuscript_data

# Function to get a standardized manuscript ID
def normalize_manuscript_id(id_text):
    # Extract just the "Jarring_Prov_X" part
    match = re.search(r'(Jarring_Prov_\d+)', id_text)
    if match:
        return match.group(1)
    return id_text

# Function to extract all manuscripts with transcripts from the main page
def get_manuscripts_with_transcripts():
    # Get the main index page
    main_page_url = urljoin(base_url, "index.xhtml")
    response = requests.get(main_page_url)
    response.encoding = 'utf-8'
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Dictionary to store manuscript IDs and their transcript URLs
    manuscripts = {}
    
    # Find all manuscript list items
    manuscript_items = soup.find_all('li', id=lambda x: x and x.startswith('Jarring_Prov_'))
    
    for item in manuscript_items:
        # Get the manuscript ID from the li's id attribute
        manuscript_id = normalize_manuscript_id(item.get('id'))
        if not manuscript_id:
            continue
        
        # Look for "Line-by-line transcript" or "Line by line transcript" links
        transcript_links = item.find_all('a', text=lambda t: t and 
                                        (('Line-by-line transcript' in t) or 
                                         ('Line by line transcript' in t) or
                                         ('Transcript' in t)))
        
        for link in transcript_links:
            href = link.get('href')
            if href:
                # Store the manuscript ID and its transcript URL
                # Remove ".xhtml" if present and add ".xml" to ensure we get the XML version
                href = re.sub(r'\.xhtml$', '', href)
                if not href.endswith('.xml'):
                    href = href + '.xml'
                
                transcript_url = urljoin(base_url, href)
                manuscripts[manuscript_id] = transcript_url
                print(f"Found transcript for {manuscript_id}: {transcript_url}")
                break
    
    # Add specific manuscripts that might be missed by the general approach
    known_manuscripts = {
        "Jarring_Prov_56": "https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_56.transcript.xml",
        "Jarring_Prov_461": "https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_461.transcript.xml",
    }
    
    for manuscript_id, transcript_url in known_manuscripts.items():
        if manuscript_id not in manuscripts:
            manuscripts[manuscript_id] = transcript_url
            print(f"Added known transcript for {manuscript_id}: {transcript_url}")
    
    return manuscripts

# Main execution
def main():
    # Get all manuscripts with transcripts
    manuscripts = get_manuscripts_with_transcripts()
    print(f"Found {len(manuscripts)} manuscripts with transcripts")
    
    # Store all manuscript data
    all_manuscript_data = []
    processed_count = 0
    success_count = 0
    
    # Process each manuscript
    for manuscript_id, transcript_url in sorted(manuscripts.items()):
        try:
            processed_count += 1
            print(f"Processing {processed_count}/{len(manuscripts)}: {manuscript_id}")
            
            # Scrape the transcript data
            manuscript_data = scrape_transcript(transcript_url, manuscript_id)
            
            if manuscript_data:
                # Add to the overall dataset
                all_manuscript_data.extend(manuscript_data)
                success_count += 1
                print(f"Successfully processed {manuscript_id} with {len(manuscript_data)} surfaces")
            else:
                print(f"No data extracted from {manuscript_id}")
            
            # Sleep briefly to avoid overloading the server
            time.sleep(1)
            
        except Exception as e:
            print(f"Error processing {manuscript_id}: {e}")
    
    print(f"Successfully processed {success_count} out of {len(manuscripts)} manuscripts")
    
    # Save the complete dataset to a JSON file
    json_path = os.path.join(output_dir, "jarring_manuscripts_structured.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_manuscript_data, f, ensure_ascii=False, indent=2)
    
    print(f"Saved complete dataset to {json_path}")
    
    # Create a summary of what was scraped
    manuscript_summary = {}
    for item in all_manuscript_data:
        manuscript_id = item['manuscript_id']
        if manuscript_id not in manuscript_summary:
            manuscript_summary[manuscript_id] = {
                'surfaces': set(), 
                'total_lines': 0, 
                'images': [],
                'image_paths': {}
            }
        
        manuscript_summary[manuscript_id]['surfaces'].add(item['surface_id'])
        manuscript_summary[manuscript_id]['total_lines'] += len(item['lines'])
        
        if item['local_image_path'] and item['local_image_path'] not in manuscript_summary[manuscript_id]['images']:
            manuscript_summary[manuscript_id]['images'].append(item['local_image_path'])
            manuscript_summary[manuscript_id]['image_paths'][item['surface_id']] = item['local_image_path']
    
    # Convert sets to lists for JSON serialization
    for manuscript_id in manuscript_summary:
        manuscript_summary[manuscript_id]['surfaces'] = sorted(list(manuscript_summary[manuscript_id]['surfaces']))
        manuscript_summary[manuscript_id]['surface_count'] = len(manuscript_summary[manuscript_id]['surfaces'])
        manuscript_summary[manuscript_id]['image_count'] = len(manuscript_summary[manuscript_id]['images'])
    
    # Save the summary to a JSON file
    summary_path = os.path.join(output_dir, "jarring_manuscripts_summary.json")
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(manuscript_summary, f, ensure_ascii=False, indent=2)
    
    print(f"Saved summary to {summary_path}")
    
    # Create a comprehensive corpus with manuscript information and structure
    corpus_data = {
        'collection_name': 'Jarring Collection Manuscripts',
        'description': 'Annotated Turki Manuscripts from the Jarring Collection',
        'source': 'Lund University Library',
        'website': base_url,
        'scrape_date': time.strftime('%Y-%m-%d'),
        'manuscripts': {}
    }
    
    # Group by manuscript ID for the corpus data
    for manuscript_id in sorted(manuscript_summary.keys()):
        corpus_data['manuscripts'][manuscript_id] = {
            'id': manuscript_id,
            'title': f"Manuscript {manuscript_id}",
            'surfaces': manuscript_summary[manuscript_id]['surfaces'],
            'surface_count': manuscript_summary[manuscript_id]['surface_count'],
            'total_lines': manuscript_summary[manuscript_id]['total_lines'],
            'images': manuscript_summary[manuscript_id]['images'],
            'image_by_surface': manuscript_summary[manuscript_id]['image_paths'],
            'surface_data': {}
        }
    
    # Add detailed surface data to the corpus
    for item in all_manuscript_data:
        manuscript_id = item['manuscript_id']
        surface_id = item['surface_id']
        
        corpus_data['manuscripts'][manuscript_id]['surface_data'][surface_id] = {
            'surface_id': surface_id,
            'image_path': item['local_image_path'],
            'online_image_url': item['online_image_url'],
            'lines': item['lines']
        }
    
    # Save the corpus to a JSON file
    corpus_path = os.path.join(output_dir, "jarring_corpus.json")
    with open(corpus_path, 'w', encoding='utf-8') as f:
        json.dump(corpus_data, f, ensure_ascii=False, indent=2)
    
    print(f"Saved comprehensive corpus to {corpus_path}")
    
    # Create a simple HTML index for the scraped manuscripts
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Jarring Manuscripts - Parallel Corpus</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            .summary { margin-bottom: 30px; }
            table { border-collapse: collapse; width: 100%; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            th { background-color: #f2f2f2; }
            tr:nth-child(even) { background-color: #f9f9f9; }
            .manuscript { margin-bottom: 40px; border: 1px solid #ccc; padding: 20px; }
            .manuscript-header { margin-bottom: 15px; }
            .manuscript-images { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 15px; }
            .manuscript-image { max-width: 200px; }
            .manuscript-link { margin-top: 10px; }
        </style>
    </head>
    <body>
        <h1>Jarring Manuscripts - Chaghatay Parallel Corpus</h1>
        <div class="summary">
            <h2>Summary of Scraped Manuscripts</h2>
            <table>
                <tr>
                    <th>Manuscript ID</th>
                    <th>Number of Surfaces</th>
                    <th>Number of Images</th>
                    <th>Total Lines</th>
                </tr>
    """
    
    for manuscript_id, data in sorted(manuscript_summary.items()):
        html_content += f"""
                <tr>
                    <td>{manuscript_id}</td>
                    <td>{data['surface_count']}</td>
                    <td>{data['image_count']}</td>
                    <td>{data['total_lines']}</td>
                </tr>
        """
    
    html_content += """
            </table>
        </div>
        
        <h2>Manuscript Previews</h2>
    """
    
    # Group manuscript data by manuscript_id
    manuscripts_by_id = {}
    for item in all_manuscript_data:
        manuscript_id = item['manuscript_id']
        if manuscript_id not in manuscripts_by_id:
            manuscripts_by_id[manuscript_id] = []
        manuscripts_by_id[manuscript_id].append(item)
    
    # Add a preview section for each manuscript
    for manuscript_id, surfaces in sorted(manuscripts_by_id.items()):
        html_content += f"""
        <div class="manuscript">
            <div class="manuscript-header">
                <h3>{manuscript_id}</h3>
                <p>Total surfaces: {len(surfaces)}</p>
            </div>
            <div class="manuscript-images">
        """
        
        # Show up to 5 images from this manuscript
        shown_images = set()
        for surface in surfaces[:5]:
            if surface['local_image_path'] and surface['local_image_path'] not in shown_images:
                shown_images.add(surface['local_image_path'])
                html_content += f"""
                <img class="manuscript-image" src="{surface['local_image_path']}" alt="{manuscript_id} {surface['surface_id']}">
                """
        
        html_content += f"""
            </div>
            <div class="manuscript-link">
                <a href="{manuscript_id}_viewer.html">View detailed transcript</a>
            </div>
        </div>
        """
    
    html_content += """
    </body>
    </html>
    """
    
    html_path = os.path.join(output_dir, "jarring_manuscripts_index.html")
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    
    print(f"Created HTML index at {html_path}")
    
    # Create a more detailed HTML viewer for each manuscript
    for manuscript_id, surfaces in manuscripts_by_id.items():
        manuscript_html = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>{manuscript_id} - Parallel Corpus</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                .page {{ margin-bottom: 40px; border: 1px solid #ccc; padding: 20px; }}
                .image {{ text-align: center; margin-bottom: 20px; }}
                img {{ max-width: 100%; height: auto; }}
                .transcription {{ display: flex; flex-direction: column; }}
                .line {{ display: flex; margin-bottom: 10px; }}
                .line-num {{ width: 30px; font-weight: bold; }}
                .arabic {{ width: 50%; direction: rtl; text-align: right; padding-right: 10px; font-size: 18px; }}
                .latin {{ width: 50%; padding-left: 10px; }}
                h2 {{ color: #333; }}
                .back-link {{ margin-bottom: 20px; }}
            </style>
        </head>
        <body>
            <div class="back-link">
                <a href="jarring_manuscripts_index.html">← Back to Index</a>
            </div>
            <h1>{manuscript_id} - Chaghatay Manuscript Parallel Corpus</h1>
        """
        
        # Sort surfaces by surface_id
        surfaces.sort(key=lambda x: x['surface_id'])
        
        for surface in surfaces:
            surface_id = surface['surface_id']
            local_image_path = surface['local_image_path']
            lines = surface['lines']
            
            manuscript_html += f"""
            <div class="page">
                <h2>Surface {surface_id}</h2>
                <div class="image">
                    <img src="../{local_image_path}" alt="Surface {surface_id}">
                </div>
                <div class="transcription">
            """
            
            # Sort lines by zone and line numbers
            sorted_lines = sorted(lines, key=lambda x: (x.get('zone_index', 0), x['line_num']))
            
            for line in sorted_lines:
                line_num = line['line_num']
                arabic_text = line['arabic_text']
                latin_text = line['latin_text']
                
                manuscript_html += f"""
                    <div class="line">
                        <div class="line-num">{line_num}</div>
                        <div class="arabic">{arabic_text}</div>
                        <div class="latin">{latin_text}</div>
                    </div>
                """
            
            manuscript_html += """
                </div>
            </div>
            """
        
        manuscript_html += """
        </body>
        </html>
        """
        
        manuscript_html_path = os.path.join(output_dir, f"{manuscript_id}_viewer.html")
        with open(manuscript_html_path, 'w', encoding='utf-8') as f:
            f.write(manuscript_html)
        
        print(f"Created HTML viewer for {manuscript_id} at {manuscript_html_path}")
    
    print("Scraping completed successfully!")

if __name__ == "__main__":
    main()

  transcript_links = item.find_all('a', text=lambda t: t and


Found transcript for Jarring_Prov_2: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_2.transcript.xml
Found transcript for Jarring_Prov_3: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_3.transcript.xml
Found transcript for Jarring_Prov_4: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_4.transcript.xml
Found transcript for Jarring_Prov_5: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_5.transcript.xml
Found transcript for Jarring_Prov_8: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_8.transcript.xml
Found transcript for Jarring_Prov_9: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_9.transcript.xml
Found transcript for Jarring_Prov_24: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_24.transcript.xml
Found transcript for Jarring_Prov_29: https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_29.transcript.xml
Found transcript for Jarring_Prov_45: https://uyghur

In [16]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import re
import os
import time
import json
import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
BASE_URL = "https://uyghur.linguistics.indiana.edu/manuscripts/"

OUTPUT_DIR  = "jarring_manuscripts_data"
IMAGES_DIR  = os.path.join(OUTPUT_DIR, "images")
os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ─────────────────────────────────────────────────────────────────────────────
# Helper utilities
# ─────────────────────────────────────────────────────────────────────────────
def clean_text(text: str) -> str:
    """Remove tags, decode entities, collapse whitespace."""
    if not text:
        return ""
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def download_image(image_url: str, manuscript_id: str, surface_id: str):
    """Download image if necessary; return (relative_path, absolute_path)."""
    if not image_url:
        return None, None

    # ensure directory for this manuscript exists
    m_dir = os.path.join(IMAGES_DIR, manuscript_id)
    os.makedirs(m_dir, exist_ok=True)

    filename = os.path.basename(urlparse(image_url).path)
    if surface_id and surface_id not in filename:
        name, ext = os.path.splitext(filename)
        filename = f"{name}_{surface_id}{ext}"

    abs_path = os.path.join(m_dir, filename)
    rel_path = os.path.join("images", manuscript_id, filename)

    if not os.path.exists(abs_path):
        print(f"  ↳ downloading {filename}")
        try:
            resp = requests.get(image_url)
            resp.raise_for_status()
            with open(abs_path, "wb") as fh:
                fh.write(resp.content)
            time.sleep(0.5)
        except Exception as e:
            print(f"  ! failed ({e})")
            return None, None
    else:
        print(f"  ↳ cached   {filename}")

    return rel_path, abs_path


def normalize_manuscript_id(id_text: str) -> str:
    """Return canonical ‘Jarring_Prov_NNN’."""
    m = re.search(r'(Jarring_Prov_\d+)', id_text)
    return m.group(1) if m else id_text


# ─────────────────────────────────────────────────────────────────────────────
# NEW: robust surface‑ID resolver
# ─────────────────────────────────────────────────────────────────────────────
def get_surface_id(surface, idx: int) -> str:
    """
    Return a usable ID for a <surface>.
    Priority:
        1. its own  @n
        2. first   <page n="…">
        3. first   <graphic n="…">
        4. synthetic  s1, s2, …
    """
    if surface.get("n"):
        return surface["n"]

    page = surface.find(lambda t: t.name.endswith("page") and t.get("n"))
    if page:
        return page["n"]

    graphic = surface.find(lambda t: t.name.endswith("graphic") and t.get("n"))
    if graphic:
        return graphic["n"]

    return f"s{idx+1}"


# ─────────────────────────────────────────────────────────────────────────────
# Scraping logic
# ─────────────────────────────────────────────────────────────────────────────
def scrape_transcript(transcript_url: str, manuscript_id: str):
    print(f"• scraping {manuscript_id}")
    try:
        r = requests.get(transcript_url)
        r.encoding = "utf-8"
    except Exception as e:
        print(f"  ! request failed: {e}")
        return []

    soup = BeautifulSoup(r.text, "xml")
    surfaces = soup.find_all('surface') or soup.find_all('tei:surface')
    if not surfaces:
        surfaces = soup.find_all(lambda t: t.name == "surface" or t.name.endswith(":surface"))

    print(f"  ↳ {len(surfaces)} surfaces")
    manuscript_data = []

    for s_idx, surface in enumerate(surfaces):
        surface_id = get_surface_id(surface, s_idx)

        # locate image URL
        image_url = None
        for g in surface.find_all(lambda t: t.name.endswith("graphic")):
            if g.get('rend') == 'thumbnail':
                continue
            if g.get('rend') in (None, 'fullpage'):
                image_url = urljoin(BASE_URL, g.get('url'))
                break

        rel_img, abs_img = download_image(image_url, manuscript_id, surface_id)

        # locate page + main zone
        page = surface.find(lambda t: t.name.endswith('page'))
        if not page:
            continue
        main = page.find(lambda t: t.name.endswith('main'))
        if not main:
            continue

        zones = main.find_all(lambda t: t.name.endswith('zone')) or [main]
        surface_lines = []

        for z_idx, zone in enumerate(zones):
            for l_idx, line in enumerate(zone.find_all(lambda t: t.name.endswith('line'))):
                line_num = line.get('n') or f"z{z_idx+1}l{l_idx+1}"
                lit  = line.find(lambda t: t.name.endswith('lit'))
                arab = clean_text(''.join(str(x) for x in lit.contents)) if lit else ''
                lat  = line.find(lambda t: t.name.endswith('lat'))
                latin = clean_text(''.join(str(x) for x in lat.contents)) if lat else ''

                if arab or latin:
                    surface_lines.append({
                        "line_num": line_num,
                        "zone_index": z_idx,
                        "arabic_text": arab,
                        "latin_text": latin
                    })

        if surface_lines:
            manuscript_data.append({
                "manuscript_id": manuscript_id,
                "surface_id": surface_id,
                "online_image_url": image_url,
                "local_image_path": rel_img,
                "absolute_image_path": abs_img,
                "lines": surface_lines
            })

    return manuscript_data


def get_manuscripts_with_transcripts():
    """Return {id: xml_url} for every manuscript that has a transcript."""
    index_url = urljoin(BASE_URL, "index.xhtml")
    soup = BeautifulSoup(requests.get(index_url).text, "html.parser")

    manuscripts = {}
    for li in soup.find_all('li', id=lambda x: x and x.startswith('Jarring_Prov_')):
        mid = normalize_manuscript_id(li['id'])
        for a in li.find_all('a', string=lambda t: t and "transcript" in t.lower()):
            href = re.sub(r'\.xhtml$', '', a['href'])
            if not href.endswith('.xml'):
                href += '.xml'
            manuscripts[mid] = urljoin(BASE_URL, href)
            break

    # add any that still aren’t picked up
    extra = {
        "Jarring_Prov_56":  "https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_56.transcript.xml",
        "Jarring_Prov_461": "https://uyghur.linguistics.indiana.edu/manuscripts/Jarring_Prov_461.transcript.xml",
    }
    manuscripts.update(extra)
    return manuscripts


# ─────────────────────────────────────────────────────────────────────────────
# Main driver
# ─────────────────────────────────────────────────────────────────────────────
def main():
    all_manuscript_data = []
    manuscripts = get_manuscripts_with_transcripts()
    print(f"Found {len(manuscripts)} manuscripts\n")

    for i, (mid, url) in enumerate(sorted(manuscripts.items()), 1):
        try:
            data = scrape_transcript(url, mid)
            if data:
                all_manuscript_data.extend(data)
                print(f"  ✔ {mid} ({len(data)} surfaces)\n")
            else:
                print(f"  ⚠ {mid}: no usable data\n")
            time.sleep(1)
        except Exception as e:
            print(f"  ✘ {mid}: {e}\n")

    # write combined JSON
    out_json = os.path.join(OUTPUT_DIR, "jarring_manuscripts_structured.json")
    with open(out_json, "w", encoding="utf-8") as fh:
        json.dump(all_manuscript_data, fh, ensure_ascii=False, indent=2)

    print(f"Saved {len(all_manuscript_data)} surface records → {out_json}")


if __name__ == "__main__":
    main()


Found 15 manuscripts

• scraping Jarring_Prov_2
  ↳ 96 surfaces
  ↳ downloading Jarring_Prov_2_001_r.Lanczos.800.65_1a.jpg
  ↳ downloading Jarring_Prov_2_001_v.Lanczos.800.65_1b.jpg
  ↳ downloading Jarring_Prov_2_002_r.Lanczos.800.65_2a.jpg
  ↳ downloading Jarring_Prov_2_002_v.Lanczos.800.65_2b.jpg
  ↳ downloading Jarring_Prov_2_003_r.Lanczos.800.65_3a.jpg
  ↳ downloading Jarring_Prov_2_003_v.Lanczos.800.65_3b.jpg
  ↳ downloading Jarring_Prov_2_004_r.Lanczos.800.65_4a.jpg
  ↳ downloading Jarring_Prov_2_004_v.Lanczos.800.65_4b.jpg
  ↳ downloading Jarring_Prov_2_005_r.Lanczos.800.65_5a.jpg
  ↳ downloading Jarring_Prov_2_005_v.Lanczos.800.65_5b.jpg
  ↳ downloading Jarring_Prov_2_006_r.Lanczos.800.65_6a.jpg
  ↳ downloading Jarring_Prov_2_006_v.Lanczos.800.65_6b.jpg
  ↳ downloading Jarring_Prov_2_007_r.Lanczos.800.65_7a.jpg
  ↳ downloading Jarring_Prov_2_007_v.Lanczos.800.65_7b.jpg
  ↳ downloading Jarring_Prov_2_008_r.Lanczos.800.65_8a.jpg
  ↳ downloading Jarring_Prov_2_008_v.Lanczos.800.65