# Lit Mining V2

In [None]:
import os
import json

In [2]:
import fitz

def has_text(paper_path):
    doc = fitz.open(paper_path)
    text = doc[0].get_text()
    
    return len(text.strip()) < 50

In [34]:
from grobid_client.grobid_client import GrobidClient
import xml.etree.ElementTree as ET

def extract_paper_details(client, pdf_path):
    """Extract Paper Details using GROBID"""
    # If pdf has text, extract abstract from text
    result = client.process_pdf(
        service='processHeaderDocument',  # Fast - only extracts header/metadata
        pdf_file=pdf_path,
        generateIDs=False,
        consolidate_header=True,  # Improve accuracy with CrossRef
        consolidate_citations=False,  # Not needed for abstract
        include_raw_citations=False,
        include_raw_affiliations=False,
        tei_coordinates=False,
        segment_sentences=False
    )
    # Extract XML content from tuple (pdf_file, status, xml_text)
    _, status, result_xml = result
    
    if status != 200:
        return f"Error processing PDF: Status {status}", "No title found", [], "No date found", "No journal found", "No volume found", "No issue found"
    
    # Parse the XML to extract abstract
    root = ET.fromstring(result_xml)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Find the abstract
    abstract_elem = root.find('.//tei:abstract//tei:p', ns)
    abstract = abstract_elem.text if abstract_elem is not None else "No abstract found"

    # Extract title
    title_elem = root.find('.//tei:title', ns)
    title = title_elem.text if title_elem is not None else "No title found"
    
    # Extract authors
    authors = []
    author_elements = root.findall('.//tei:author', ns)
    for author in author_elements:
        name = author.find('.//tei:name', ns)
        if name is not None:
            authors.append(name.text)
    
    # Extract date
    date_elem = root.find('.//tei:date', ns)
    date = date_elem.text if date_elem is not None else "No date found"

    # Extract journal
    journal_elem = root.find('.//tei:journal', ns)
    journal = journal_elem.text if journal_elem is not None else "No journal found"

    # Extract volume
    volume_elem = root.find('.//tei:volume', ns)
    volume = volume_elem.text if volume_elem is not None else "No volume found"

    # Extract issue
    issue_elem = root.find('.//tei:issue', ns)
    issue = issue_elem.text if issue_elem is not None else "No issue found"

    return abstract, title, authors, date, journal, volume, issue


In [35]:
pdf_dir = "./pdf_pub/"

client = GrobidClient(
        batch_size=10,
        sleep_time=10,
        timeout=1000
)
papers_with_abstracts = []
papers_without_abstracts = []

for file in os.listdir(pdf_dir):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, file)
        abstract, title, authors, date, journal, volume, issue = extract_paper_details(client, pdf_path)
        print(f"Abstract for {file}: abstract: {abstract[:100]}")
        if abstract == "No abstract found":
            papers_without_abstracts.append(file)
        else:
            papers_with_abstracts.append({
                "file": file,
                "abstract": abstract,
                "title": title,
                "authors": authors,
                "date": date,
                "journal": journal,
                "volume": volume,
                "issue": issue
            })

print(f"Papers with abstracts: {len(papers_with_abstracts)}")
print(f"Papers without abstracts: {len(papers_without_abstracts)}")

2025-10-08 16:41:23,961 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2025-10-08 16:41:24,038 - INFO - GROBID server http://localhost:8070 is up and running


Abstract for Kaneetal2012.pdf: abstract: Hawai'i's beaches are a focal point of modern lifestyle as well as cultural tradition. Yet coastal e
Abstract for Spirandellietal2016_ImprovingAdaptationPlanningforSLR.pdf: abstract: Sea-level rise (SLR) presents risks to communities and ecosystems because of hazards like coastal er
Abstract for sherman_JSR_1999.pdf: abstract: No abstract found
Abstract for Romine et al 2013 Beach Erosion and SLR in HI.pdf: abstract: The islands of Oahu and Maui, Hawaii, with significantly different rates of localized sea-level rise
Abstract for Fletcher-Chapter6-slr-hawaii.pdf: abstract: Error processing PDF: Status 500
Abstract for CoastalSedimentary.pdf: abstract: No abstract found
Abstract for wave_driven_cross_shore.pdf: abstract: Coastal erosion, intensified by sea level rise, poses significant threats to coastal communities in 
Abstract for Harney_Fletcher_JSR_2003.pdf: abstract: Sediments of the bay and coastal plain of Kailua (Oahu, Hawaii) are Ͼ 90% bi

In [None]:
## write papers_with_abstracts to json
with open("output/papers_with_abstracts.json", "w") as f:
    json.dump(papers_with_abstracts, f)

In [None]:
from marker.output import text_from_rendered
from pathlib import Path

def extract_paper_text(pdf_path, converter, output_folder):
    """
    Extract full text from a research paper
    Returns the text as a string
    """

    try:
        rendered = converter(str(pdf_path))
        text, _, _ = text_from_rendered(rendered)
        
        # Save
        output_file = Path(output_folder) / f"{Path(pdf_path).stem}.txt"
        output_file.write_text(text, encoding='utf-8')
        
        return f"✓ {Path(pdf_path).name}"
    except Exception as e:
        return f"✗ {Path(pdf_path).name}: {e}"
    
    return text

In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

# Initialize models (do once, reuse for multiple PDFs)
model_dict = create_model_dict()
converter = PdfConverter(artifact_dict=model_dict)
# Usage
for idx, paper in enumerate(papers_with_abstracts):
    
    paper_text = extract_paper_text(pdf_dir + paper["file"], converter, )
    papers_with_abstracts[idx]["full_text"] = paper_text
    print(f"Extracted full text for {paper['file']}")
    if idx % 10 == 0:
        print(f"Extracted full text for {idx} papers")

Downloading manifest.json: 100%|██████████| 262/262 [00:00<00:00, 228kB/s]
Downloading .gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 1.06MB/s]025_09_23:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading special_tokens_map.json: 100%|██████████| 278/278 [00:00<00:00, 181kB/s]09_23:   8%|▊         | 1/12 [00:00<00:01,  6.37it/s]

[A

Downloading README.md: 100%|██████████| 5.05k/5.05k [00:00<00:00, 2.63MB/s]



[A[A[A
Downloading training_args.bin: 100%|██████████| 7.45k/7.45k [00:00<00:00, 2.65MB/s]
Downloading config.json: 100%|██████████| 50.4k/50.4k [00:00<00:00, 8.83MB/s]
Downloading vocab_math.json: 100%|██████████| 20.1k/20.1k [00:00<00:00, 8.46MB/s]


Downloading tokenizer_config.json: 100%|██████████| 694/694 [00:00<00:00, 674kB/s]
Downloading specials_dict.json: 100%|██████████| 43.5k/43.5k [00:00<00:00, 10.1MB/s]
Downloading preprocessor_config.json: 100%|██████████| 419/419 [00:00<00:00, 402kB/s]
Downloading processor_config.json: 100%|██████████| 411/411 [

Extracted full text for Kaneetal2012.pdf
Extracted full text for 0 papers


Recognizing Layout: 100%|██████████| 14/14 [01:17<00:00,  5.56s/it]
Running OCR Error Detection: 100%|██████████| 4/4 [00:00<00:00,  5.28it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  3.18it/s]
Recognizing Text:  81%|████████  | 17/21 [02:04<01:23, 20.76s/it]

KeyboardInterrupt: 

In [None]:
# write papers_with_full_text  to json
with open("output/papers_with_full_text.json", "w") as f:
    json.dump(papers_with_abstracts, f)

In [None]:
def full_text_review_prompt(paper_metadata, full_text):
    return f"""You are reviewing a scientific paper for inclusion in a Hawaiian sea level rise database. Analyze the COMPLETE paper text and respond in valid JSON format.

=== PAPER INFORMATION ===
Title: {paper_metadata.get('title')}
Previous Classification: MEDIUM confidence (based on abstract only)

=== FULL TEXT ===
{full_text}

=== YOUR TASK ===
Review the complete paper, paying special attention to:
1. **Methods section**: Study location, data sources, modeling approaches
2. **Results section**: Quantitative findings, measurements, projections
3. **Discussion/Conclusions**: Specific implications for Hawaii

Determine:
- Final relevance (relevant/not relevant)
- Confidence level (HIGH/MEDIUM/LOW)
- Applicable database layers (maximum 2)
- Whether to upgrade/downgrade from abstract-only assessment

=== CONFIDENCE CRITERIA ===

**HIGH confidence** - All of these must be present:
- Study specifically focuses on Hawaiian locations (named islands, cities, or regions)
- Contains quantitative data (measurements, rates, projections with numbers)
- Results section includes specific Hawaii-relevant findings
- Clear methodology described for Hawaii context

**MEDIUM confidence** - At least two of these:
- Methodology applicable to Hawaii but not Hawaii-specific data
- Mentions Hawaii but focuses on broader Pacific/global context
- Qualitative findings relevant to Hawaii
- Modeling approach transferable to Hawaii

**LOW confidence** - Downgrade if:
- Hawaii mentioned only in passing or as example
- No actionable data or findings
- Methodology not applicable to Hawaii

=== LAYER DEFINITIONS ===

**FLOODING LAYERS** (choose max 2 most relevant):

1. **passive_marine_flooding**
   Direct ocean water inundation from sea level rise
   Keywords: "marine inundation", "coastal flooding", "flooded area", "inundation zone", "bathtub model", "passive flooding", "flood depth"
   Example: "3.2 ft SLR results in 1,200 acres of marine flooding"

2. **groundwater_inundation**
   Flooding from rising groundwater table
   Keywords: "groundwater", "water table rise", "subsurface flooding", "groundwater emergence", "drainage impacts", "aquifer"
   Example: "Water table will reach surface in X years"

3. **low_lying_flooding**
   Areas defined by elevation thresholds
   Keywords: "critical elevation", "below [X]m/ft", "elevation threshold", "low-lying areas", "DEM analysis"
   Example: "Areas below 1.5m elevation are vulnerable"

4. **compound_flooding**
   Multiple simultaneous flood mechanisms
   Keywords: "compound flooding", "multiple mechanisms", "combined effects", "storm surge + rain", "concurrent flooding"
   Example: "Combined storm surge and high tide flooding"

5. **drainage_backflow**
   Stormwater/sewer system flooding
   Keywords: "storm drain", "drainage backflow", "sewer flooding", "infrastructure flooding", "drain capacity"
   Example: "Storm drains will backflow at X cm SLR"

**EROSION/HAZARD LAYERS**:

6. **future_erosion_hazard_zone**
   Shoreline retreat rates and predictions
   Keywords: "erosion rate", "[X] m/year", "shoreline change", "beach loss", "hazard zone", "coastal retreat", "shoreline position"
   Example: "Average erosion rate of 0.3 m/year projected"

7. **annual_high_wave_flooding**
   Wave-driven coastal flooding events
   Keywords: "wave runup", "wave-driven flooding", "extreme waves", "overwash", "wave setup", "wave impact"
   Example: "Annual high wave events cause flooding to X elevation"

8. **emergent_and_shallow_groundwater**
   Groundwater near or at surface
   Keywords: "shallow groundwater", "emergent groundwater", "water table depth", "groundwater level", "subsurface water"
   Example: "Groundwater within 0.5m of surface"

=== LAYER SELECTION RULES ===
1. Select ONLY layers with explicit evidence in Results/Discussion sections
2. Maximum 2 layers per paper - choose the most prominent findings
3. If paper covers multiple aspects, prioritize quantitative results over methodology
4. Don't assign layers based solely on Methods - findings must be present
5. If uncertain between layers, choose the one with more quantitative support

=== RESPONSE FORMAT ===
Return ONLY valid JSON (no markdown, no extra text):

{{
    "relevant": true,
    "confidence": "HIGH",
    "relevant_layers": ["passive_marine_flooding"],
    "reasoning": "Results section (page X) reports 1,200 acres of Oahu coastal area will experience marine inundation under 3.2 ft SLR scenario. Study uses LiDAR elevation data and hydrodynamic modeling specific to Pearl Harbor area. Discussion quantifies impacts on infrastructure and population.",
    "changed_from_abstract": true,
    "change_explanation": "Upgraded from MEDIUM to HIGH. Abstract mentioned modeling approach but full text reveals extensive Hawaii-specific quantitative results including precise flood extents, elevation thresholds, and infrastructure impacts.",
    "key_findings": "3.2 ft SLR: 1,200 acres flooded, 2,400 structures affected in Oahu coastal zone. Critical elevation threshold: 1.5m NAVD88. Study period: 2020-2100 projections.",
    "quantitative_data": {{
        "locations": ["Pearl Harbor", "Waikiki Beach", "Honolulu Harbor"],
        "measurements": ["1,200 acres flood extent", "3.2 ft SLR scenario", "1.5m NAVD88 threshold"],
        "time_periods": ["2020-2100"]
    }}
}}

=== IMPORTANT ===
- Cite specific page numbers or section names when possible
- Quote exact quantitative values from the text
- If downgrading confidence, explain why full text reveals less relevance than abstract suggested
- If paper is not relevant, set "relevant": false and provide brief reasoning
- Ensure JSON is valid (use double quotes, proper escaping)

Begin your analysis:"""

In [None]:
from openai import OpenAI

client = OpenAI()
results = []
 # Create prompt
for paper in successful_extractions:
    full_text = extract_full_text_pdfplumber(f"./pdf_pub/{paper['filename']}")
    prompt = full_text_review_prompt(paper, full_text)
    # Call LLM
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system", 
                "content": "You are an expert in climate science and coastal hazards, specializing in Hawaiian environmental research."
            },
            {"role": "user", "content": prompt}
        ],
        response_format={"type": "json_object"},
        temperature=0.3
    )
    result = json.loads(response.choices[0].message.content)
    result['paper_metadata'] = paper
    
    results.append(result)