In [75]:
from pdf2image import convert_from_path
from pytesseract import image_to_string
from PIL import Image
import os

In [85]:
# Build the absolute path of the supplementary-information PDF, anchored at
# the directory the notebook kernel is currently running in.
base_path = "papers/2018deSouza"
filename = "2018deSouza_supplementarry_information.pdf"

working_dir = os.getcwd()
print(f"Current working directory: {working_dir}")

file_path = os.path.join(working_dir, base_path, filename)
print(f"Full filepath: {file_path}")

Current working directory: /Users/jamesbyers/code/github/Kaggle/openai_to_z
Full filepath: /Users/jamesbyers/code/github/Kaggle/openai_to_z/papers/2018deSouza/2018deSouza_supplementarry_information.pdf


In [None]:
def extract_text_from_pdf(pdf_path, dpi=300, page_nums=None):
    """OCR selected pages of a PDF and return their text.

    Args:
        pdf_path: Path of the PDF, forwarded to ``convert_from_path``.
        dpi: Rendering resolution, forwarded to ``convert_from_path``.
        page_nums: Iterable of page numbers to OCR, counted the same way as
            ``convert_from_path``'s ``first_page``/``last_page``. The pages
            need not be contiguous or sorted. ``None`` (the default) means
            every page; an empty iterable returns ``[]`` without opening
            the PDF.

    Returns:
        A list of ``(page_number, text)`` tuples in the order the pages were
        requested (ascending page order when ``page_nums`` is ``None``).
        Requested pages that the conversion did not produce are skipped.
    """
    requested = None if page_nums is None else list(page_nums)
    if requested is not None and not requested:
        print("No pages requested; nothing to convert.")
        return []

    page_label = "all" if requested is None else requested
    print(f"Converting pages {page_label} of PDF to images...")
    first_page = min(requested) if requested is not None else None
    last_page = max(requested) if requested is not None else None
    images = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=first_page,
        last_page=last_page
    )
    print(f"Converted {len(images)} pages to images.")

    # The conversion renders the whole first..last range, so image i belongs
    # to page (first + i), NOT to page_nums[i]. Page numbers below 1 do not
    # exist, so numbering never starts lower than 1.
    start_page = 1 if first_page is None else max(1, first_page)
    image_by_page = {
        start_page + offset: image for offset, image in enumerate(images)
    }
    if requested is None:
        requested = sorted(image_by_page)

    all_text = []

    for actual_page in requested:
        image = image_by_page.get(actual_page)
        if image is None:
            # e.g. a page number beyond the end of the document
            print(f"Page {actual_page} was not produced by the conversion; skipping.")
            continue
        print(f"Processing OCR on page {actual_page}...")
        text = image_to_string(image, config='--psm 6')
        all_text.append((actual_page, text))
        print(f"Finished OCR for page {actual_page}.")

    print("Finished processing all pages.")
    return all_text

# Run OCR on pages 8-10 of the PDF located at `file_path`; the recorded output
# shows Supplementary Table 4 beginning on page 8.
pdf_path = file_path
pages_to_extract = [8, 9, 10]
# `extracted` is a list of (page_number, text) tuples, one per processed page.
extracted = extract_text_from_pdf(pdf_path, page_nums=pages_to_extract)

Converting pages [8, 9, 10] of PDF to images...
Converted 3 pages to images.
Processing OCR on page 8...
Finished OCR for page 8.
Processing OCR on page 9...
Finished OCR for page 9.
Processing OCR on page 10...
Finished OCR for page 10.
Finished processing all pages.
--- Page 8 ---
Supplementary Table 4. Archaeological sites identified in the Upper Tapajés Basin.

Population estimates were calculated based on the linear equation described in Curet **.

Site Structure Type Latitude Longitude thay opulation

Mto1 l circular enclosure -57.9452  -9.4071 1.61 239
hexagonal

Mto2 l enclosure -57.8872 -9.8711 3.18 454

Mto3 l circular enclosure  -57.8772 -9.7048 5.11 719

Mto4 l enclosure -57.8212 -9.8132 2.41 349

Mto5 l circular enclosure -57.7557 = -9.8221 1.77 262

Mt06 | circular enclosure -58.2325 -9.4150 1.82 268

Mto6 Il enclosure -58.2326 -9.4149 ;

Mt07 l circular enclosure -59.3228 -9.3501

Mto7 ll circular enclosure -59.3208 -9.3512 9.88 1373

Mto7 ll causeway -59.3146 -9.3476

M

In [96]:
# Assume `extracted` is a list of (page_num, text) tuples
# or a list of strings, or lists of strings

def flatten_text(extracted):
    """Join every piece of text in ``extracted`` into one newline-separated string.

    Each element may be a 2-tuple (only its second member is used), a list
    (each member contributes one piece), or any other object (one piece).
    Every piece is passed through ``str`` before joining.
    """
    chunks = []

    for entry in extracted:
        payload = entry
        if isinstance(entry, tuple):
            # Drop the leading member (e.g. a page number) and keep the text.
            _page, payload = entry

        if isinstance(payload, list):
            chunks.extend(str(part) for part in payload)
        else:
            chunks.append(str(payload))

    return "\n".join(chunks)

# Collapse the OCR results into a single newline-separated string for parsing.
raw_text = flatten_text(extracted)


In [98]:
import re
import json
from collections import defaultdict

# Optional manual override: paste raw OCR text between the quotes to parse it
# instead of the text already held in `raw_text`. While this stays empty the
# existing `raw_text` is left untouched, so a top-to-bottom run parses the real
# OCR output rather than a placeholder string.
MANUAL_RAW_TEXT = ""
if MANUAL_RAW_TEXT:
    raw_text = MANUAL_RAW_TEXT

# Normalise characters misread by OCR
# A run of numeral look-alikes ("I", "l", "|", "H", "]") that does not touch
# any other letter, i.e. a candidate structure numeral rather than part of a word.
_NUMERAL_RUN_RE = re.compile(r"(?<![A-Za-z])[Il|H\]]+(?![A-Za-z])")

# A site id such as "Mto1" where OCR produced the letter o instead of zero.
_SITE_ID_RE = re.compile(r"Mt[\doO]+(?![A-Za-z])")


def _repair_numeral(match):
    """Rewrite one isolated numeral-like run using capital-I Roman digits."""
    return (
        match.group(0)
        .replace("Il", "II").replace("lI", "II")
        .replace("ll", "II").replace("l", "I").replace("I|", "I")
        .replace("H]", "II").replace("HI", "III")
        .replace("|", "I")
    )


def _repair_site_id(match):
    """Replace the letter o/O with the digit 0 inside one site id."""
    return match.group(0).replace("o", "0").replace("O", "0")


def normalise(text):
    """Repair common OCR misreadings in the site table text.

    * ``Mto1``-style site ids get their letter ``o`` turned into ``0``.
    * ``=`` becomes a minus sign only when it is attached to a digit; a
      free-standing ``=`` is column noise and becomes a space, so it cannot
      split a coordinate pair.
    * ``l``/``|``/``H]``/``HI`` are rewritten to Roman ``I`` digits only when
      the run stands apart from other letters, so ordinary words such as
      "circular enclosure" are left intact.
    * Curly quotes and degree signs are removed; ``+`` and ``,`` become ``.``.

    Args:
        text: Raw OCR text (may span multiple lines).

    Returns:
        The cleaned text.
    """
    text = _SITE_ID_RE.sub(_repair_site_id, text)
    text = re.sub(r"=(?=\d)", "-", text).replace("=", " ")
    text = _NUMERAL_RUN_RE.sub(_repair_numeral, text)
    return (
        text
        .replace("“", "").replace("”", "")
        .replace("‘", "").replace("’", "")
        .replace("°", "").replace("+", ".").replace(",", ".")
    )

# Clean and extract entries
lines = normalise(raw_text).splitlines()
site_data = defaultdict(list)

# Capture groups: 1 = site id, 2 = structure type (optional), 3 and 4 = the two
# coordinate columns, 5 = area (optional), 6 = population (optional).
site_re = re.compile(
    r"(?P<site>Z?-?Mt[\d/-]+)[\s\-]*I{1,4}\b.*?"
    r"(?P<type>circular enclosure|mounded village|causeway|enclosure|circularenclosure)?"
    r"\s*(-?(?:\d{1,3}\.\d+))\s+(-?(?:\d{1,3}\.\d+))"  # lat long
    r"(?:\s+(\d+(?:\.\d+)?))?"                        # area (optional)
    r"(?:\s+(\d+))?"                                  # population (optional)
)

# Number of rows seen so far for each site, used to number its structures.
structure_index = defaultdict(int)


def _roman_numeral(number):
    """Return the Roman numeral for a positive integer (1 -> "I", 6 -> "VI")."""
    parts = []
    for value, symbol in ((10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I")):
        count, number = divmod(number, value)
        parts.append(symbol * count)
    return "".join(parts)


for line in lines:
    match = site_re.search(line)
    if not match:
        # Headers, wrapped fragments and anything else that is not a table row.
        continue

    # Strip the "Z-" prefix, then keep only the part before the first "/" or "-".
    site_id_raw = match.group(1).replace("Z-", "").replace("--", "-")
    site_base = re.sub(r"[/-].*", "", site_id_raw)
    structure_index[site_base] += 1
    structure_num = structure_index[site_base]

    structure_type = match.group(2)
    if structure_type:
        structure_type = structure_type.replace("circularenclosure", "circular enclosure")
    else:
        # No recognised type text directly before the coordinates.
        structure_type = "enclosure"

    site_entry = {
        "site": site_base,
        # Numbered by order of appearance for this site, not parsed from the
        # row; a computed numeral avoids an IndexError past the fifth structure.
        "structure": _roman_numeral(structure_num),
        "type": structure_type,
        "latitude": float(match.group(3)),
        "longitude": float(match.group(4)),
        "area_ha": float(match.group(5)) if match.group(5) else None,
        "potential_population": int(match.group(6)) if match.group(6) else None,
    }

    site_data[site_base].append(site_entry)

# Output JSON
with open("parsed_sites.json", "w") as f:
    json.dump(site_data, f, indent=2)
