# Literature Mining

In [22]:
import pdfplumber
import os
import re
import openai
from dotenv import load_dotenv

load_dotenv()

True

In [6]:
def clean_abstract(abstract):
    """Clean extracted abstract text"""
    # Remove excessive whitespace
    abstract = re.sub(r'\s+', ' ', abstract)
    
    # Remove common artifacts
    abstract = re.sub(r'^[:\-\s]+', '', abstract)  # Leading punctuation
    abstract = re.sub(r'©.*?rights reserved\.?', '', abstract, flags=re.IGNORECASE)  # Copyright
    abstract = re.sub(r'\n+', ' ', abstract)  # Newlines to spaces
    
    # Remove page numbers or DOIs that might have leaked in
    abstract = re.sub(r'\b\d{1,3}\s*$', '', abstract)  # Trailing page numbers
    abstract = re.sub(r'doi:.*$', '', abstract, flags=re.IGNORECASE)
    
    # Remove "Abstract" if it's still at the start
    abstract = re.sub(r'^abstract[:\s]*', '', abstract, flags=re.IGNORECASE)
    
    return abstract.strip()


In [7]:
abstract_patterns = [
    # Your existing patterns are good, here are additions:
    
    # Your originals
    r'(?i)\babstract\b\s*[-:.\s]*\n*(.*?)(?=\n\s*(?:keywords?|introduction|1\.|references|background|\n\n))',
    r'(?i)\babstract\b\s*[-:.\s]*(.*?)(?=\n\s*(?:keywords?|introduction|1\.|references|background))',
    r'(?i)^abstract\s*$(.*?)^(?:keywords?|introduction|1\.|references)',
    r'(?i)abstract[—–-]\s*(.*?)(?=\nindex terms|keywords|introduction)',
    r'(?i)summary\s*(.*?)(?=\nmain|introduction|methods)',
    
    # Additional patterns for climate papers
    # AGU journals often use this format
    r'(?i)abstract\s*\n+(.*?)(?=\n\s*\d+\.?\s+introduction)',
    
    # Some papers have "Key Points" before abstract ends
    r'(?i)abstract[:\s]+(.*?)(?=\n\s*(?:key points|plain language summary))',
    
    # Elsevier format with article info
    r'(?i)a\s*b\s*s\s*t\s*r\s*a\s*c\s*t\s+(.*?)(?=\n\s*©|keywords|introduction)',
    
    # Format with numbered sections (common in older papers)
    r'(?i)abstract\s*[:\-]?\s*(.*?)(?=\n\s*(?:\d+\.\s*introduction|\d+\.\s*background))',
]

def extract_abstract(text):
    """Extract abstract from plain text using regex patterns"""
    for pattern in abstract_patterns:
        match = re.search(pattern, text, re.DOTALL | re.MULTILINE)
        if match:
            abstract = match.group(1).strip()
            
            # Clean up the abstract
            abstract = clean_abstract(abstract)
            
            # Filter out very short matches (likely false positives)
            # Also check it's not too long (might have captured too much)
            word_count = len(abstract.split())
            if 10 < word_count < 500:  # Most abstracts are 100-300 words
                return abstract
    
    return None

In [8]:
def extract_title(text):
    """Extract title from first page - complement to your abstract extraction"""
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Title is usually in first 5-10 lines, and is the longest substantial line
    candidate_titles = []
    
    for i, line in enumerate(lines[:15]):
        # Skip very short lines, page numbers, headers
        if len(line) < 20 or line.isdigit():
            continue
        
        # Skip lines that look like metadata
        if any(keyword in line.lower() for keyword in 
               ['journal', 'volume', 'issn', 'doi', 'author', 'received', 'published']):
            continue
            
        # Title candidates should be substantial
        if len(line.split()) >= 4:
            candidate_titles.append((i, line))
    
    # Return the first substantial line (usually the title)
    if candidate_titles:
        return candidate_titles[0][1]
    
    return None


In [9]:
import os
import pdfplumber
from pathlib import Path

def extract_metadata_batch(pdf_folder='pdf_pub', max_pages=3):
    """
    Extract abstracts and titles from all PDFs in folder
    Returns list of dicts with metadata for prefiltering
    """
    results = []
    failed = []
    
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
    print(f"Processing {len(pdf_files)} PDFs...\n")
    
    for idx, file in enumerate(pdf_files, 1):
        print(f"[{idx}/{len(pdf_files)}] {file[:50]}...", end=" ")
        
        try:
            with pdfplumber.open(f'{pdf_folder}/{file}') as pdf:
                # Extract text from first few pages
                text = ""
                for page_num in range(min(max_pages, len(pdf.pages))):
                    page_text = pdf.pages[page_num].extract_text()
                    if page_text:
                        text += page_text + "\n"
                
                # Extract metadata
                title = extract_title(text)
                abstract = extract_abstract(text)
                
                if abstract:
                    results.append({
                        'filename': file,
                        'title': title or file,  # Fallback to filename
                        'abstract': abstract,
                        'text_preview': text[:500],  # For debugging
                        'success': True
                    })
                    print("✓")
                else:
                    failed.append({
                        'filename': file,
                        'title': title,
                        'abstract': None,
                        'reason': 'No abstract found',
                        'success': False
                    })
                    print("✗ No abstract")
                    
        except Exception as e:
            failed.append({
                'filename': file,
                'title': None,
                'abstract': None,
                'reason': str(e),
                'success': False
            })
            print(f"✗ Error: {str(e)[:50]}")
            # Summary
    print(f"\n{'='*60}")
    print(f"Success: {len(results)}/{len(pdf_files)} papers")
    print(f"Failed: {len(failed)}/{len(pdf_files)} papers")
    
    if failed:
        print(f"\nFailed files:")
        for item in failed[:5]:  # Show first 5
            print(f"  - {item['filename']}: {item['reason']}")
        if len(failed) > 5:
            print(f"  ... and {len(failed)-5} more")
    
    return results, failed

In [10]:
# Usage
successful_extractions, failed_extractions = extract_metadata_batch('pdf_pub')

# Save results for prefiltering
import json

with open('extracted_abstracts.json', 'w') as f:
    json.dump(successful_extractions, f, indent=2)

print(f"\nSaved {len(successful_extractions)} abstracts to extracted_abstracts.json")

Processing 113 PDFs...

[1/113] Kaneetal2012.pdf... ✓
[2/113] Spirandellietal2016_ImprovingAdaptationPlanningfor... ✓
[3/113] sherman_JSR_1999.pdf... ✗ No abstract
[4/113] Romine et al 2013 Beach Erosion and SLR in HI.pdf... ✓
[5/113] Fletcher-Chapter6-slr-hawaii.pdf... ✗ No abstract
[6/113] CoastalSedimentary.pdf... ✗ No abstract
[7/113] wave_driven_cross_shore.pdf... ✗ No abstract
[8/113] Harney_Fletcher_JSR_2003.pdf... ✗ No abstract
[9/113] Genz_06-0757.pdf... ✓
[10/113] Genz_06-0756.pdf... ✓
[11/113] KCAP_ClimateWP_22_0302.pdf... ✓
[12/113] ClimateBrief_low.pdf... ✗ No abstract
[13/113] Andrade_et_al_2023-coas-40-02-338-352.pdf... ✓
[14/113] RooneyCoastalSed2003.pdf... ✓
[15/113] Romine et al 2016 Beach Erosion Under Rising Sea L... ✓
[16/113] s10584-018-2327-7.pdf... ✓
[17/113] Bochicchio_etal_2009.pdf... ✓
[18/113] remotesensing-12-00154.pdf... ✓
[19/113] Conger_TGARS.pdf... ✓
[20/113] annurev-marine-020923-120737.pdf... ✓
[21/113] KaneEtAl2014_SLRCriticalElevation.pdf... ✓
[22/1

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


✓
[83/113] Use surplus to protect Sunset Beach.pdf... ✗ No abstract
[84/113] WaikikiUAS_Defense_OnlineVersion.pdf... ✗ No abstract
[85/113] Anderson_Frazer_JCR_preprint.pdf... ✓
[86/113] GeologyofHawaiiReefs.pdf... ✗ No abstract
[87/113] ClimateChange_in_FSM_Exec_Summary.pdf... ✗ No abstract
[88/113] Rotzoll Fletcher NCC 2012.pdf... ✗ No abstract
[89/113] Cooper_etal_2013_2.pdf... ✓
[90/113] EngelsJSR04.pdf... ✗ No abstract
[91/113] Vitousek_SCD08.pdf... ✓
[92/113] Anderson_etal_2014_JCR.pdf... ✓
[93/113] AmSamoa Climate 2016.pdf... ✗ No abstract
[94/113] coastal_land_subsidence.pdf... ✗ No abstract
[95/113] Romine Fletcher 2013 Oahu Armoring.pdf... ✓
[96/113] Romine_Fletcher_inpress_HI_ShoreChange_Summary_JCR... ✓
[97/113] RichmondHCH2001.pdf... ✓
[98/113] computation_of_energetic_nearshore_waves.pdf... ✗ No abstract
[99/113] FletcherFiersten_Hawaiichaptercoasts.pdf... ✗ No abstract
[100/113] CNMI Climate 2016.pdf... ✗ No abstract
[101/113] Earth s Future - 2020 - Kane - Rethinking Re

In [23]:
openai_key = os.getenv("OPENAI_API_KEY")

In [34]:
from openai import OpenAI

client = OpenAI()

def create_rich_layer_embeddings(layer_docs):
    """
    Combine description and terms for comprehensive embeddings
    Your descriptions are technical, terms are user-focused - use both!
    """
    from openai import OpenAI
    client = OpenAI()
    
    layer_embeddings = {}
    
    for layer_id, layer_data in layer_docs.items():
        # Combine description and top terms
        description = layer_data['description'][:500]  # First 500 chars
        top_terms = ', '.join(layer_data['terms'][:20])  # Top 20 terms
        
        # Create rich semantic representation
        combined_text = f"""
        {layer_data['title']}
        
        Description: {description}
        
        Key terms: {top_terms}
        """
        
        response = client.embeddings.create(
            input=combined_text.strip(),
            model="text-embedding-3-small"
        )
        
        layer_embeddings[layer_id] = response.data[0].embedding
    
    return layer_embeddings

In [35]:
def prefilter_hybrid_with_metadata(paper_metadata, layer_docs, layer_embeddings=None):
    """
    Best approach: Use your rich metadata in stages
    1. Fast term matching from your 'terms' arrays
    2. Semantic matching using description + terms embeddings
    """
    title = paper_metadata.get('title', '').lower()
    abstract = paper_metadata.get('abstract', '').lower()
    combined_text_lower = f"{title} {abstract}"
    combined_text_original = f"{paper_metadata.get('title', '')}. {paper_metadata.get('abstract', '')}"
    
    # Stage 1: Term matching (fast)
    term_matches = {}
    
    for layer_id, layer_data in layer_docs.items():
        score = 0
        matched_terms = []
        
        for term in layer_data['terms']:
            if term.lower() in combined_text_lower:
                score += 1
                matched_terms.append(term)
        
        if score > 0:
            term_matches[layer_id] = {
                'score': score,
                'matched_terms': matched_terms[:5]  # Keep top 5 for reference
            }
    
    # Strong matches = immediate accept
    strong_matches = {
        layer: data for layer, data in term_matches.items() 
        if data['score'] >= 3
    }
    
    if strong_matches:
        return {
            'decision': 'accept',
            'method': 'term_matching',
            'relevant_layers': list(strong_matches.keys()),
            'layer_scores': {layer: data['score'] for layer, data in strong_matches.items()},
            'matched_terms': {layer: data['matched_terms'] for layer, data in strong_matches.items()}
        }
    
    # Stage 2: Semantic matching for unclear cases
    if layer_embeddings and term_matches:
        # Has some term matches but not strong - verify with embeddings
        import numpy as np
        
        client = OpenAI()
        response = client.embeddings.create(
            input=combined_text_original,
            model="text-embedding-3-small"
        )
        paper_embedding = response.data[0].embedding
        
        # Calculate similarity
        similarities = {}
        for layer_id, layer_embedding in layer_embeddings.items():
            similarity = np.dot(paper_embedding, layer_embedding) / (
                np.linalg.norm(paper_embedding) * np.linalg.norm(layer_embedding)
            )
            similarities[layer_id] = float(similarity)
        
        # Combine term and embedding scores
        combined_scores = {}
        for layer_id in set(list(term_matches.keys()) + list(similarities.keys())):
            term_score = term_matches.get(layer_id, {}).get('score', 0)
            embedding_score = similarities.get(layer_id, 0)
            
            # Weighted combination: terms are direct evidence, embeddings are contextual
            combined_scores[layer_id] = (term_score * 0.6) + (embedding_score * 10 * 0.4)
        
        relevant_layers = [
            layer for layer, score in combined_scores.items() 
            if score >= 2.0  # Adjust threshold as needed
        ]
        
        if relevant_layers:
            return {
                'decision': 'accept',
                'method': 'hybrid',
                'relevant_layers': relevant_layers,
                'combined_scores': combined_scores,
                'term_matches': term_matches,
                'embedding_similarities': similarities
            }
    
    # No significant matches
    return {
        'decision': 'reject',
        'method': 'hybrid',
        'relevant_layers': [],
        'term_matches': term_matches,
        'reason': 'Insufficient term and semantic matches'
    }

In [None]:
with open('../../data/documentation.json', 'r') as f:
    layer_docs = json.load(f)

layer_embeddings = create_rich_layer_embeddings(layer_docs)

In [38]:
layer_embeddings

{'passive_marine_flooding': [0.01859542541205883,
  0.06561864167451859,
  0.06911343336105347,
  0.04733618348836899,
  -0.0038012389559298754,
  -0.01894751377403736,
  0.010060569271445274,
  -0.026158783584833145,
  0.009356394410133362,
  0.06165440008044243,
  -0.018308540806174278,
  0.0032600676640868187,
  -0.0390164889395237,
  -0.012557781301438808,
  0.005574715789407492,
  0.032861482352018356,
  0.004339150153100491,
  0.010640861466526985,
  0.005115046165883541,
  0.04491069167852402,
  0.035521697252988815,
  -0.005838781129568815,
  -0.023302964866161346,
  0.02714984491467476,
  -0.029079804196953773,
  -0.037243012338876724,
  0.008208850398659706,
  0.026380468159914017,
  0.04198967292904854,
  0.06509703397750854,
  0.054247528314590454,
  -0.03385254368185997,
  -0.03622587397694588,
  0.013020710088312626,
  0.06947856396436691,
  0.044649887830019,
  -0.04900533705949783,
  0.05951579660177231,
  -0.02158164791762829,
  0.0395902618765831,
  -0.008808703161776

In [42]:
accepted_papers = []
rejected_papers = []    
for abstract in successful_extractions:
    result = prefilter_hybrid_with_metadata(abstract, layer_docs, layer_embeddings)
    if result['decision'] == 'accept':
        accepted_papers.append(abstract)
    else:
        rejected_papers.append(abstract)


In [46]:
rejected_papers

[{'filename': 'Romine et al 2013 Beach Erosion and SLR in HI.pdf',
  'title': 'Global and Planetary Change',
  'abstract': 'Articlehistory: The islands of Oahu and Maui, Hawaii, with significantly different rates of localized sea-level rise (SLR, Received30March2013 approximately65%higherrateonMaui)overthepastcenturyduetolithosphericflexureand/orvariations Accepted20June2013 inupperoceanwatermasses,provideauniquesettingtoinvestigatepossiblerelationsbetweenhistorical Availableonline28June2013 shorelinechangesandSLR.Island-wideandregionalhistoricalshorelinetrendsarecalculatedfortheislands usingshorelinepositionsmeasuredfromaerialphotographsandsurveycharts.Historicalshorelinedataare',
  'text_preview': 'GlobalandPlanetaryChange108(2013)149–157\nContentslistsavailableatSciVerseScienceDirect\nGlobal and Planetary Change\njournal homepage: www.elsevier.com/locate/gloplacha\nAre beach erosion rates and sea-level rise related in Hawaii?\nBradleyM.Rominea,⁎ ,CharlesH.Fletcherb,1,MatthewM.Barbee

In [78]:
def llm_review_paper(paper_metadata, layer_docs, model="gpt-4o-mini"):
    """
    Use LLM to review papers that were rejected by keyword/embedding filters
    LLM can understand nuanced connections that simple matching misses
    """
    prompt = f"""Review paper for Hawaiian sea level rise database. Be highly selective. Respond in JSON format.

PAPER:
Title: {paper_metadata.get('title')}
Abstract: {paper_metadata.get('abstract')}

=== AUTOMATIC REJECTIONS (stop analysis if ANY apply) ===
❌ Title contains: "Climate Action Plan", "Adaptation Plan", "Policy Report", "Legislative Act"
❌ Abstract is mostly section headings, table of contents, or policy recommendations
❌ Geographic focus: Brazil, Atlantic Coast, Europe, Asia (unless methodology explicitly for Hawaii)
❌ No quantitative data or findings mentioned in abstract
❌ Archaeological/historical paper without sea level rise connection

=== LAYER ASSIGNMENT (max 2 per paper) ===

FLOODING LAYERS (assign if paper quantifies inundation):
- passive_marine_flooding: Direct ocean inundation
  Match if: "marine inundation", "coastal flooding", "SLR exposure", "bathtub model", "flooded area"
  
- groundwater_inundation: Subsurface flooding from water table rise
  Match if: "groundwater", "water table", "subsurface flooding", "drainage", "infrastructure flooding"
  
- low_lying_flooding: Areas below critical elevations
  Match if: "critical elevation", "below [X]m", "low-lying areas", "elevation threshold"
  
- compound_flooding: Multiple flood sources combined
  Match if: "compound flooding", "multi-mechanism", "storm surge + rain", "combined impacts"
  
- drainage_backflow: Storm drain flooding
  Match if: "storm drain", "drainage backflow", "sewer flooding", "drainage system"

EROSION LAYERS (assign if paper has erosion data/projections):
- future_erosion_hazard_zone: Shoreline retreat rates/predictions
  Match if: "erosion rate", "[X] m/year", "shoreline change", "beach loss", "coastal retreat", "hazard zone"
  
- annual_high_wave_flooding: Wave-driven coastal flooding
  Match if: "wave runup", "wave-driven flooding", "extreme waves", "overwash", "wave inundation"
  
- emergent_and_shallow_groundwater: Groundwater near surface
  Match if: "shallow groundwater", "emergent groundwater", "groundwater depth", "water table depth"

=== ASSIGNMENT RULES ===
1. Maximum 2 layers per paper (prefer 1 layer)
2. Assign ONLY if abstract mentions:
   - Quantitative results (rates, percentages, measurements, areas)
   - Specific Hawaiian locations OR Pacific Islands for comparison
   - Methods with results (not just proposed methods)
3. Methodology papers: Maximum 1 layer, MEDIUM confidence

=== CONFIDENCE LEVELS ===
- HIGH: Hawaii data + quantitative findings + layer keywords present
- MEDIUM: Transferable methods OR Pacific Island analog OR indirect relevance
- Never assign LOW confidence (reject instead)

=== RESPONSE FORMAT (JSON) ===
{{
    "relevant": true or false,
    "confidence": "HIGH" or "MEDIUM",
    "relevant_layers": ["layer_name"],
    "reasoning": "Brief explanation citing specific abstract content"
}}

If relevant=false, set relevant_layers to empty array [].
Goal: Include papers that help MODEL or VISUALIZE sea level rise impacts in Hawaii."""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system", 
                    "content": "You are an expert in climate science and coastal hazards, specializing in Hawaiian environmental research."
                },
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.3
        )
        
        result = json.loads(response.choices[0].message.content)
        
        # Handle both 'relevant' and 'decision' keys for compatibility
        is_relevant = result.get('relevant', result.get('decision') == 'accept')
        
        return {
            'decision': 'accept' if is_relevant else 'reject',
            'method': 'llm_review',
            'confidence': result.get('confidence', 'UNKNOWN'),
            'relevant_layers': result.get('relevant_layers', []),
            'reasoning': result.get('reasoning', ''),
            'model_used': model
        }
        
    except Exception as e:
        return {
            'decision': 'error',
            'method': 'llm_review',
            'error': str(e),
            'relevant_layers': []
        }

In [79]:
# Loop through rejected papers and review them
papers_results = []
combined_papers = accepted_papers + rejected_papers
for paper in combined_papers:
    result = llm_review_paper(paper, layer_docs)
    papers_results.append(result)

papers_results

[{'decision': 'accept',
  'method': 'llm_review',
  'confidence': 'HIGH',
  'relevant_layers': ['future_erosion_hazard_zone'],
  'reasoning': 'The paper provides quantitative data on shoreline change rates and identifies erosion hazard zones based on historical photographs, which directly relates to future erosion projections in Hawaii.',
  'model_used': 'gpt-4o-mini'},
 {'decision': 'accept',
  'method': 'llm_review',
  'confidence': 'MEDIUM',
  'relevant_layers': ['future_erosion_hazard_zone'],
  'reasoning': 'The paper discusses a probabilistic shoreline model for SLR planning, focusing on shoreline change, which implies potential erosion rates and coastal retreat. Although it does not provide specific quantitative data in the abstract, it offers a methodology relevant to adaptation planning in Hawaii.',
  'model_used': 'gpt-4o-mini'},
 {'decision': 'accept',
  'method': 'llm_review',
  'confidence': 'HIGH',
  'relevant_layers': ['future_erosion_hazard_zone'],
  'reasoning': 'The ab

In [81]:
# Augment paper results with paper metadata

for idx, results in enumerate(papers_results):
    if idx < len(combined_papers):
        papers_results[idx]['paper_metadata'] = combined_papers[idx]
    else:
        papers_results[idx]['paper_metadata'] = None

papers_results

[{'decision': 'accept',
  'method': 'llm_review',
  'confidence': 'HIGH',
  'relevant_layers': ['future_erosion_hazard_zone'],
  'reasoning': 'The paper provides quantitative data on shoreline change rates and identifies erosion hazard zones based on historical photographs, which directly relates to future erosion projections in Hawaii.',
  'model_used': 'gpt-4o-mini',
  'paper_metadata': {'filename': 'Kaneetal2012.pdf',
   'title': 'Vulnerability Assessment of Hawai‘i’s Cultural Assets',
   'abstract': 'KANE, H.H.; FLETCHER, C.H.; ROMINE, B.M.; ANDERSON, T.R.; FRAZER, N.L., and BARBEE, M.M., 2012. VulnerabilityassessmentofHawai‘i’sculturalassetsattributabletoerosionusingshorelinetrendanalysistechniques. JournalofCoastalResearch,28(3),533–539.WestPalmBeach(Florida),ISSN0749-0208. Hawai‘i’sbeachesareafocalpointofmodernlifestyleaswellasculturaltradition.Yetcoastalerosionthreatensareas thathaveservedasburialgrounds,homesites,andotherformsofculturalsignificance.Toimproveunderstandingofthe 

In [82]:
# Save combined results
with open('combined_results_v3.json', 'w') as f:
    json.dump(papers_results, f, indent=2)


In [15]:
def extract_full_text_pdfplumber(pdf_path):
    """Extract text from PDF using pdfplumber"""
    text_parts = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)
        
        full_text = "\n\n".join(text_parts)
        
        # Truncate if too long (stay under ~40k tokens)
        words = full_text.split()
        if len(words) > 30000:
            full_text = " ".join(words[:30000])
            full_text += "\n\n[TEXT TRUNCATED - Analysis based on first 30,000 words]"
        
        return full_text
        
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None

In [31]:
def full_text_review_prompt(paper_metadata, full_text):
    return f"""Review FULL PAPER TEXT for Hawaiian sea level rise database. Respond in JSON format.

PAPER:
Title: {paper_metadata.get('title')}

Full Text:
{full_text}

This paper was previously classified as MEDIUM confidence from abstract alone.
Review the FULL TEXT (especially Results, Discussion, Conclusions) to:
1. Confirm or upgrade confidence level
2. Assign more accurate layers based on complete findings

=== LAYER ASSIGNMENT (max 2 per paper) ===

FLOODING LAYERS:
- passive_marine_flooding: Direct ocean inundation
  Match if: "marine inundation", "coastal flooding", "flooded area", "bathtub model"
  
- groundwater_inundation: Subsurface flooding from water table rise
  Match if: "groundwater", "water table", "subsurface flooding", "drainage impacts"
  
- low_lying_flooding: Areas below critical elevations
  Match if: "critical elevation", "below [X]m", "elevation threshold"
  
- compound_flooding: Multiple flood sources
  Match if: "compound flooding", "multiple mechanisms", "storm surge + rain"
  
- drainage_backflow: Storm drain flooding
  Match if: "storm drain", "drainage backflow", "sewer flooding"

EROSION LAYERS:
- future_erosion_hazard_zone: Shoreline retreat rates/predictions
  Match if: "erosion rate", "[X] m/year", "shoreline change", "beach loss", "hazard zone"
  
- annual_high_wave_flooding: Wave-driven coastal flooding
  Match if: "wave runup", "wave-driven flooding", "extreme waves", "overwash"
  
- emergent_and_shallow_groundwater: Groundwater near surface
  Match if: "shallow groundwater", "emergent groundwater", "water table depth"

=== RULES ===
1. Max 2 layers per paper
2. HIGH confidence: Quantitative Hawaii data + specific findings in results/discussion
3. MEDIUM confidence: Methodology or indirect relevance
4. If full text confirms abstract assessment, keep original
5. Look specifically for quantitative data in Results/Discussion sections

=== RESPONSE (JSON) ===
{{
    "relevant": true or false,
    "confidence": "HIGH" or "MEDIUM",
    "relevant_layers": ["layer1"],
    "reasoning": "Cite specific quantitative findings from Results/Discussion",
    "changed_from_abstract": true or false,
    "key_findings": "Brief summary of quantitative data found"
}}"""

In [32]:
from openai import OpenAI

client = OpenAI()
results = []
 # Create prompt
for paper in successful_extractions:
    full_text = extract_full_text_pdfplumber(f"./pdf_pub/{paper['filename']}")
    prompt = full_text_review_prompt(paper, full_text)
    # Call LLM
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system", 
                "content": "You are an expert in climate science and coastal hazards, specializing in Hawaiian environmental research."
            },
            {"role": "user", "content": prompt}
        ],
        response_format={"type": "json_object"},
        temperature=0.3
    )
    result = json.loads(response.choices[0].message.content)
    result['paper_metadata'] = paper
    
    results.append(result)

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P7' is an invalid float value
Cannot set gray stroke color because /'P8' is an invalid float value
Cannot set gray stroke color because /'P9' is an invalid float value
Cannot set gray stroke color b

In [34]:
# Save results
with open('full_text_w_paper_metadata_results.json', 'w') as f:
    json.dump(results, f, indent=2)
