In [39]:
#import packages
import os
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
import fitz
import pandas as pd
from bertopic import BERTopic
import spacy
from spacy.matcher import PhraseMatcher
import random
from collections import defaultdict, Counter
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
nlp.max_length = 2_000_000  # Adjust this as needed


In [36]:
# Load spaCy model for NER
nlp = spacy.load('en_core_web_sm')

# Define a dictionary of common abbreviations
abbreviation_map = {
    "WBGT": "Wet Bulb Globe Temperature",
    "AC": "Air Conditioning",
    "UHI": "Urban Heat Island",
    "COP": "Coefficient of Performance",
    "RH": "Relative Humidity",
    "UTCI": "Universal Thermal Climate Index",
    "PPE": "Personal Protective Equipment",
    "OHS": "Occupational Health and Safety",
    "HVAC": "Heating, Ventilation, and Air Conditioning",
    "EMS": "Emergency Medical Services"
}

# keywords focused on core heat stress, cooling methods, physiological terms, and occupational health basics
heat_stress_keywords = [
    # Core concepts
    "heat stress", "thermal stress", "thermal comfort", "heat load", "heat exposure", "heat risk",
    "heat-related illness", "heat exhaustion", "heat stroke", "heat-related mortality",
    "heat index", "WBGT", "UTCI", "Relative Humidity",

    # Cooling strategies
    "cooling zones", "cooling centers", "cooling shelters", "hydration stations", "green roofs",
    "cool roofs", "urban cooling", "natural ventilation", "fans", "air conditioning", 
    "shading structures", "permeable pavements", "green walls", "urban trees", "urban forestry",

    # Worker safety / occupational health
    "rest breaks", "shift scheduling", "hydration", "water supply", "protective clothing", 
    "personal protective equipment", "PPE compliance", "workplace cooling", "safety training", 
    "fatigue prevention", "first aid", "medical response", "heat stress prevention", 
    "occupational safety", "outdoor labor", "industrial heat", "construction safety",

    # Resilience / adaptation
    "acclimatization", "heat acclimatization", "climate resilience", "adaptive capacity",
    "emergency preparedness", "disaster risk reduction", "resilient cities", 
    "early warning systems", "cooling interventions"
]

# focused on broader environmental, urban, climate, health, safety, and single-word keywords to capture diverse contexts
relevant_terms = {
    # Environmental & urban planning
    "urban heat stress", "green infrastructure", "urban adaptation", "urban cooling systems",
    "greenery", "vegetation", "urban forestry", "evapotranspiration", "green roofs", "green walls",
    "cool roofs", "shade provision", "cool pavements", "permeable surfaces", "urban health", 
    "climate-sensitive planning", "climate-smart cities", "nature-based solutions", "albedo",
    "microclimate regulation", "urban heat mitigation", "resilient urban design", 
    "sustainable urban development", "built environment", "thermal insulation",

    # Climate and health
    "extreme heat", "climate change", "environmental heat", "solar radiation", "ambient temperature",
    "humidity", "heat wave", "overheating", "heat-related mortality", "climate vulnerability",
    "weather extremes", "heat adaptation", "public health", "health equity", "environmental justice",

    # Worker protection
    "occupational health", "worker safety", "heat safety training", "industrial safety", 
    "PPE", "shift schedules", "rest breaks", "water access", "dehydration", "outdoor workers",
    "high-risk workers", "fatigue prevention", "first aid", "medical emergency", 
    "construction workers", "farm workers", "workplace safety", "safety measures",

    # Broader interventions
    "cooling technologies", "air conditioning", "fans", "ventilation", "natural cooling", 
    "climate adaptation", "thermal comfort zones", "sustainable cooling", "zero-emission cities",
    "energy efficiency", "energy demand", "climate action plans", "urban ecosystem", "rainwater harvesting",

    # Single keywords (disambiguate based on context in code)
    "temperature", "humidity", "cooling", "shade", "air", "heat", "hydration", "safety", "stress", 
    "resilience", "protection", "exposure", "evaporation", "convection", "conduction", "radiation", 
    "climate", "health", "fatigue", "recovery", "acclimation", "heatstroke", "well-being", 
    "workforce", "monitoring", "risk", "intervention", "emergency", "prevention", "adaptation", 
    "protocol", "response", "vulnerability", "monitor", "workplace"
}

In [9]:
# Step 1: Load and preprocess
def load_and_clean_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [10]:
# Step 2: Tokenize and Remove Stopwords
def tokenize_and_remove_stopwords(text):
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

In [11]:
# Step 3: Named Entity Recognition (NER) for heat stress terms
def extract_heat_stress_entities(text):
    doc = nlp(text)
    heat_stress_entities = []
    
    # Customize the entities you're looking for (e.g., cooling shelters, hydration stations)
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'GPE', 'FAC', 'PRODUCT']:  # Organization, Location, Facility, Product
            heat_stress_entities.append(ent.text)
    
    return heat_stress_entities

In [12]:
# Step 4: Keyword Extraction using TF-IDF (focusing on heat stress)
def extract_heat_stress_keywords(texts, keyword_list):
    vectorizer = TfidfVectorizer(vocabulary=keyword_list, max_df=0.8, min_df=2)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer

In [13]:
# Step 5: Cluster and Extract Top Keywords from Clusters
def cluster_texts(tfidf_matrix, vectorizer, num_clusters=3):
    from sklearn.cluster import KMeans

    km = KMeans(n_clusters=num_clusters, random_state=42)
    km.fit(tfidf_matrix)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    
    cluster_keywords = {}
    for i in range(num_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :10]]
        cluster_keywords[f"Cluster {i}"] = top_terms
        print(f"\nCluster {i} top keywords:")
        print(", ".join(top_terms))
    
    return km.labels_, cluster_keywords


In [14]:
# Step 6: Process extracted entities (replace abbreviations, filter terms)
def process_entities(extracted_entities, relevant_terms):
    # Replace abbreviations with full forms
    extracted_entities = [abbreviation_map.get(entity, entity) for entity in extracted_entities]
    
    # Remove duplicates
    extracted_entities = list(set(extracted_entities))
    
    # Filter by relevant terms
    extracted_entities = [entity for entity in extracted_entities if entity.lower() in relevant_terms]
    
    # Remove unwanted symbols or references
    unwanted_symbols = {"•", "ref", "fig", "≥", "≥"}
    extracted_entities = [entity for entity in extracted_entities if entity not in unwanted_symbols]
    print(f'length of extracted_entities after processing: {len(extracted_entities)}')
    return extracted_entities

In [21]:
# Process pdfs - run above functions
def process_pdfs(random_sample, pdf_folder='papers/'):
    all_pdfs = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

    if random_sample == 1:
        sample_size = min(50, len(all_pdfs))
        sampled_pdfs = random.sample(all_pdfs, sample_size)
    else:
        sampled_pdfs = all_pdfs
        sample_size = len(sampled_pdfs)

    all_entities = []
    for pdf_file in sampled_pdfs:
        file_path = os.path.join(pdf_folder, pdf_file)
        print(f"Processing {pdf_file} ...")
        
        text = load_and_clean_text_from_pdf(file_path)
        cleaned_text = tokenize_and_remove_stopwords(text)
        entities = extract_heat_stress_entities(cleaned_text)
        processed_entities = process_entities(entities, relevant_terms)
        all_entities.extend(processed_entities)

    print(f"Total extracted entities from {sample_size} papers: {len(all_entities)}")
    return all_entities

# TESTRUN: Main pipeline running first on 50 random PDFs, then all papers
# First, only 50 papers will be sampled - to make sure that the code runs as expected
entities_50_papers = process_pdfs(1) 


Processing 1-s2.0-S0003687024000589-main.pdf ...
length of extracted_entities after processing: 1
Processing Wang-EstimatingOccupationalHeat-2019.pdf ...
length of extracted_entities after processing: 1
Processing Xiang-Extremeheatoccupational-2015.pdf ...
length of extracted_entities after processing: 0
Processing fpubh-09-713711.pdf ...
length of extracted_entities after processing: 0
Processing Morris et al 2020.pdf ...
length of extracted_entities after processing: 0
Processing HeatRelatedDeathsAmong-2008.pdf ...
length of extracted_entities after processing: 0
Processing Matthews-Communicatingdeadlyconsequences-2017.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0013935118305772-main.pdf ...
length of extracted_entities after processing: 0
Processing Hunt-HeatStrainHydration-2014.pdf ...
length of extracted_entities after processing: 0
Processing Effects_of_Heat_Stress_Exposure_and_Clim.pdf ...
length of extracted_entities after processing: 0
Processin

In [25]:
# Now, all papers will be processed
entities_all_papers = process_pdfs(0)

Processing 0.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0001457513003230-main.pdf ...
length of extracted_entities after processing: 1
Processing 1-s2.0-S0003687024000589-main.pdf ...
length of extracted_entities after processing: 1
Processing 1-s2.0-S0013935118305772-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S001393512100075X-main.pdf ...
length of extracted_entities after processing: 2
Processing 1-s2.0-S0013935125007467-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0048969718323313-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0048969721033313-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0169204614000498-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0306456513000521-main.pdf ...
length of extracted_entities after processing: 0
Processing 1-s2.0-S0360132325002653-main.pdf ...

<h3> In summary, we have 40 extracted entities from 138 research papers related to heat stress - this is only 0.28 entities per paper, which could indicate that the filtering is too restricted, or that spaCy's general NER model may not be well-suited for exploring heat stress. Now, we will try some different approaches. </h3>


In [29]:
# Prepare matcher
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Convert keyword list to patterns
phrases = heat_stress_keywords
patterns = [nlp.make_doc(text) for text in phrases]
matcher.add("HEAT_STRESS_TERM", patterns)

def extract_matched_phrases(text):
    doc = nlp(text)
    matches = matcher(doc)
    return list(set([doc[start:end].text for match_id, start, end in matches]))

In [31]:
# Improved entity recognition

# Approach 1: Use PhraseMatcher instead of NER for domain-specific terms
def extract_domain_entities_v2(text):
    """Extract entities using PhraseMatcher for better domain coverage"""
    doc = nlp(text)
    matches = matcher(doc)
    matched_entities = []
    
    for match_id, start, end in matches:
        span = doc[start:end]
        matched_entities.append(span.text)
    
    return list(set(matched_entities))

# Approach 2: Combine multiple extraction methods
def extract_comprehensive_entities(text):
    """Combine NER, PhraseMatcher, and regex patterns"""
    doc = nlp(text)
    all_entities = []
    
    # Method 1: Traditional NER
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'GPE', 'FAC', 'PRODUCT', 'PERSON']:
            all_entities.append(ent.text)
    
    # Method 2: PhraseMatcher for domain terms
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        all_entities.append(span.text)
    
    # Method 3: Regex for specific patterns (measurements, standards)
    # Find temperature measurements
    temp_pattern = r'\b\d+[°℃℉]\s*[CF]?\b'
    temps = re.findall(temp_pattern, text)
    all_entities.extend(temps)
    
    # Find WBGT measurements
    wbgt_pattern = r'WBGT\s*[><=]?\s*\d+[°℃℉]?'
    wbgts = re.findall(wbgt_pattern, text, re.IGNORECASE)
    all_entities.extend(wbgts)
    
    return list(set(all_entities))

# Approach 3: Expand relevant terms with more variations
# Combine with existing relevant_terms
expanded_relevant_terms = set(relevant_terms) | {
    # Measurement and assessment terms
    "wet bulb globe temperature", "wbgt", "heat index", "humidex", 
    "thermal work limit", "physiological strain index", "core temperature",
    "sweat rate", "metabolic rate", "thermal sensation", "thermal comfort",
    
    # Specific interventions
    "cooling vest", "ice vest", "cooling towel", "misting system",
    "evaporative cooling", "radiant cooling", "mechanical ventilation",
    "work-rest cycles", "heat acclimatization program", "hydration protocol",
    
    # Occupational categories
    "construction worker", "agricultural worker", "firefighter", 
    "steel worker", "foundry worker", "outdoor laborer", "roofer",
    
    # Organizations and standards
    "osha", "niosh", "acgih", "iso", "astm", "ashrae",
    "occupational safety and health administration",
    "american conference of governmental industrial hygienists",
    
    # Medical terms
    "heat exhaustion", "heat stroke", "heat cramps", "heat rash",
    "dehydration", "hyperthermia", "thermoregulation", "heat illness",
    
    # Geographic and temporal
    "summer months", "hot climate", "tropical region", "desert climate",
    "heat wave", "extreme temperature", "global warming"
}

# Approach 4: Use noun phrase extraction
def extract_noun_phrases(text):
    """Extract noun phrases that might contain relevant concepts"""
    doc = nlp(text)
    noun_phrases = []
    
    for chunk in doc.noun_chunks:
        # Filter for phrases containing heat-related terms
        chunk_lower = chunk.text.lower()
        if any(term in chunk_lower for term in ['heat', 'thermal', 'cool', 'temperature', 'stress', 'work']):
            if len(chunk.text.split()) <= 4:  # Keep shorter phrases
                noun_phrases.append(chunk.text)
    
    return list(set(noun_phrases))

# Approach 5: Frequency-based filtering
def extract_frequent_terms(texts, min_freq=3):
    """Extract terms that appear frequently across documents"""
    all_terms = []
    for text in texts:
        try:
            doc = nlp(text)
            # Extract noun phrases and named entities
            for chunk in doc.noun_chunks:
                if 2 <= len(chunk.text.split()) <= 3:  # 2-3 word phrases
                    all_terms.append(chunk.text.lower())
            
            for ent in doc.ents:
                all_terms.append(ent.text.lower())
        except Exception as e:
            print(f"Error processing text: {e}")
            continue
    
    # Count frequencies and filter
    term_counts = Counter(all_terms)
    frequent_terms = [term for term, count in term_counts.items() if count >= min_freq]
    
    return frequent_terms, term_counts

# Modified main processing function
def process_entities_improved(extracted_entities, relevant_terms, min_length=3):
    """Improved entity processing with better filtering"""
    
    # Replace abbreviations
    processed = [abbreviation_map.get(entity, entity) for entity in extracted_entities]
    
    # Remove very short terms (unless they're known abbreviations)
    known_abbrevs = set(abbreviation_map.keys())
    processed = [entity for entity in processed 
                if len(entity) >= min_length or entity.upper() in known_abbrevs]
    
    # Remove duplicates (case-insensitive)
    seen = set()
    unique_entities = []
    for entity in processed:
        entity_lower = entity.lower()
        if entity_lower not in seen:
            seen.add(entity_lower)
            unique_entities.append(entity)
    
    # Filter by relevance (more flexible matching)
    relevant_entities = []
    for entity in unique_entities:
        entity_lower = entity.lower()
        # Exact match or partial match for longer terms
        if (entity_lower in relevant_terms or 
            any(term in entity_lower for term in relevant_terms if len(term) > 5)):
            relevant_entities.append(entity)
    
    return relevant_entities

# Function to examine current entities
def examine_entities(entities_list):
    """Examine the extracted entities"""
    print(f"Total unique entities: {len(set(entities_list))}")
    print(f"Total entities (with duplicates): {len(entities_list)}")
    
    # Count frequencies
    entity_counts = Counter(entities_list)
    
    print("\nMost frequent entities:")
    for entity, count in entity_counts.most_common(20):
        print(f"  {entity}: {count}")
    
    print(f"\nAll unique entities:")
    for entity in sorted(set(entities_list)):
        print(f"  - {entity}")
    
    return entity_counts

# Quick test of improved extraction on a sample
def test_improved_extraction(sample_size=5):
    """Test the improved extraction on a few files"""
    pdf_folder = 'papers/'
    all_pdfs = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
    sample_pdfs = all_pdfs[:sample_size]
    
    print("Testing improved extraction...")
    all_entities = []
    
    for pdf_file in sample_pdfs:
        file_path = os.path.join(pdf_folder, pdf_file)
        print(f"\nProcessing {pdf_file}...")
        
        try:
            text = load_and_clean_text_from_pdf(file_path)
            
            # Method 1: Your original NER approach
            original_entities = extract_heat_stress_entities(text)
            original_processed = process_entities(original_entities, relevant_terms)
            
            # Method 2: PhraseMatcher approach
            phrase_entities = extract_domain_entities_v2(text)
            phrase_processed = process_entities_improved(phrase_entities, expanded_relevant_terms)
            
            # Method 3: Noun phrases
            noun_entities = extract_noun_phrases(text)
            noun_processed = process_entities_improved(noun_entities, expanded_relevant_terms)
            
            # Combine all methods
            combined = list(set(original_processed + phrase_processed + noun_processed))
            all_entities.extend(combined)
            
            print(f"  Original method: {len(original_processed)} entities")
            print(f"  PhraseMatcher: {len(phrase_processed)} entities") 
            print(f"  Noun phrases: {len(noun_processed)} entities")
            print(f"  Combined unique: {len(combined)} entities")
            
            if combined:
                print(f"  Sample entities: {combined[:5]}")
                
        except Exception as e:
            print(f"  Error processing {pdf_file}: {e}")
    
    print(f"\nTotal entities from {sample_size} files: {len(all_entities)}")
    return all_entities

# Run the test
print("=== EXAMINING THE CURRENT ENTITIES ===")
current_entity_counts = examine_entities(entities_all_papers)

print("\n" + "="*50)
print("=== TESTING IMPROVED EXTRACTION ===")
improved_entities = test_improved_extraction(5)

=== EXAMINING THE CURRENT ENTITIES ===
Total unique entities: 5
Total entities (with duplicates): 40

Most frequent entities:
  wbgt: 19
  ppe: 12
  albedo: 4
  greenery: 3
  healthcare: 2

All unique entities:
  - albedo
  - greenery
  - healthcare
  - ppe
  - wbgt

=== TESTING IMPROVED EXTRACTION ===
Testing improved extraction...

Processing 0.pdf...
length of extracted_entities after processing: 0
  Original method: 0 entities
  PhraseMatcher: 21 entities
  Noun phrases: 130 entities
  Combined unique: 146 entities
  Sample entities: ['physiological heat loss responses', 'core temperature', 'the temperature', 'body temperature', 'heat exposure']

Processing 1-s2.0-S0001457513003230-main.pdf...
length of extracted_entities after processing: 1
  Original method: 1 entities
  PhraseMatcher: 19 entities
  Noun phrases: 110 entities
  Combined unique: 122 entities
  Sample entities: ['workplace characteristics', 'an effective temperature scale', 'the environmental temperature', 'heat st

In [32]:
# Perform improved extraction on all papers
def extract_entities_all_papers_improved():
    """Apply improved entity extraction to all PDF papers"""
    pdf_folder = 'papers/'
    all_pdfs = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
    
    print(f"Found {len(all_pdfs)} PDF files to process")
    print("Starting improved entity extraction on all papers...")
    
    all_entities_improved = []
    file_entity_counts = {}
    processing_stats = {
        'successful': 0,
        'failed': 0,
        'total_entities': 0
    }
    
    for i, pdf_file in enumerate(all_pdfs, 1):
        file_path = os.path.join(pdf_folder, pdf_file)
        print(f"\n[{i}/{len(all_pdfs)}] Processing {pdf_file}...")
        
        try:
            # Load and clean text
            text = load_and_clean_text_from_pdf(file_path)
            
            if not text or len(text.strip()) < 100:
                print(f"  Warning: Very short or empty text extracted")
                continue
            
            # Apply all improved extraction methods
            file_entities = []
            
            # Method 1: Comprehensive extraction (NER + PhraseMatcher + Regex)
            comprehensive_entities = extract_comprehensive_entities(text)
            comprehensive_processed = process_entities_improved(comprehensive_entities, expanded_relevant_terms)
            file_entities.extend(comprehensive_processed)
            
            # Method 2: Domain-specific PhraseMatcher
            phrase_entities = extract_domain_entities_v2(text)
            phrase_processed = process_entities_improved(phrase_entities, expanded_relevant_terms)
            file_entities.extend(phrase_processed)
            
            # Method 3: Noun phrase extraction
            noun_entities = extract_noun_phrases(text)
            noun_processed = process_entities_improved(noun_entities, expanded_relevant_terms)
            file_entities.extend(noun_processed)
            
            # Remove duplicates from this file
            file_entities_unique = list(set(file_entities))
            
            # Add to overall collection
            all_entities_improved.extend(file_entities_unique)
            file_entity_counts[pdf_file] = len(file_entities_unique)
            
            processing_stats['successful'] += 1
            processing_stats['total_entities'] += len(file_entities_unique)
            
            print(f"  ✓ Extracted {len(file_entities_unique)} unique entities")
            if file_entities_unique:
                print(f"  Sample: {file_entities_unique[:3]}")
                
        except Exception as e:
            print(f"  ✗ Error processing {pdf_file}: {e}")
            processing_stats['failed'] += 1
            continue
    
    # Final statistics
    print(f"\n{'='*60}")
    print("PROCESSING COMPLETE")
    print(f"{'='*60}")
    print(f"Successfully processed: {processing_stats['successful']} files")
    print(f"Failed to process: {processing_stats['failed']} files")
    print(f"Total entities extracted: {processing_stats['total_entities']}")
    print(f"Unique entities across all papers: {len(set(all_entities_improved))}")
    
    return all_entities_improved, file_entity_counts, processing_stats

# Run the improved extraction on all papers
print("Starting improved entity extraction on all papers...")
entities_all_papers_improved, file_counts_improved, stats_improved = extract_entities_all_papers_improved()

# Analyze the results
print(f"\n{'='*60}")
print("ANALYSIS OF IMPROVED EXTRACTION RESULTS")
print(f"{'='*60}")
"""
# Compare with original results if available
if 'entities_all_papers' in globals():
    original_unique = len(set(entities_all_papers))
    improved_unique = len(set(entities_all_papers_improved))
    print(f"Original extraction: {original_unique} unique entities")
    print(f"Improved extraction: {improved_unique} unique entities")
    print(f"Improvement: +{improved_unique - original_unique} entities ({((improved_unique/original_unique)-1)*100:.1f}% increase)")
"""
# Entity frequency analysis
entity_counts_improved = Counter(entities_all_papers_improved)

print(f"\nTop 20 most frequent entities (improved method):")
for entity, count in entity_counts_improved.most_common(20):
    print(f"  {entity}: {count}")

# Files with most entities
print(f"\nTop 10 files by entity count:")
sorted_files = sorted(file_counts_improved.items(), key=lambda x: x[1], reverse=True)
for filename, count in sorted_files[:10]:
    print(f"  {filename}: {count} entities")


Starting improved entity extraction on all papers...
Found 138 PDF files to process
Starting improved entity extraction on all papers...

[1/138] Processing 0.pdf...
  ✓ Extracted 161 unique entities
  Sample: ['sydney school of health sciences prof', 'physiological heat loss responses', 'core temperature']

[2/138] Processing 1-s2.0-S0001457513003230-main.pdf...
  ✓ Extracted 131 unique entities
  Sample: ['workplace characteristics', 'an effective temperature scale', 'the environmental temperature']

[3/138] Processing 1-s2.0-S0003687024000589-main.pdf...
  ✓ Extracted 101 unique entities
  Sample: ['the maximum monthly temperatures', 'the temperature', 'heatstress mitigation strategies']

[4/138] Processing 1-s2.0-S0013935118305772-main.pdf...
  ✓ Extracted 70 unique entities
  Sample: ['the wet bulb temperature', 'farm workers', 'heat exposure risks']

[5/138] Processing 1-s2.0-S001393512100075X-main.pdf...
  ✓ Extracted 101 unique entities
  Sample: ['workplace characteristics', '

'\n# Save results to variables for further analysis\nprint(f"\nResults saved to variables:")\nprint(f"  - entities_all_papers_improved: List of all extracted entities")\nprint(f"  - entity_counts_improved: Counter object with entity frequencies") \nprint(f"  - file_counts_improved: Dictionary with entity counts per file")\nprint(f"  - stats_improved: Processing statistics")\n\n# Optional: Save to file\nsave_to_file = input("\nSave results to text file? (y/n): ").lower().strip() == \'y\'\nif save_to_file:\n    with open(\'improved_entity_extraction_results.txt\', \'w\', encoding=\'utf-8\') as f:\n        f.write("IMPROVED ENTITY EXTRACTION RESULTS\n")\n        f.write("="*50 + "\n\n")\n        \n        f.write(f"Processing Statistics:\n")\n        f.write(f"- Successfully processed: {stats_improved[\'successful\']} files\n")\n        f.write(f"- Failed to process: {stats_improved[\'failed\']} files\n")\n        f.write(f"- Total entities: {stats_improved[\'total_entities\']}\n")\n     

<h2> Summary of Results (Improved  Extraction) </h2>

(the code cell above took approx 30 minutes to run)

1. 137/138 papers were able to be processed, 6474 unique entities and 11,661 total entities were able to be recognized.
2. Core concepts: heat stress, heat exposure, thermal stress
3. Specific conditions: heat stroke, heat exhaustion, heat waves
4. Technical terms: WBGT, acclimatization, occupational heat stress
5. Interventions: air conditioning, shade, ventilation, hydration
6. Context: occupational health, the workplace
7. Distribution analysis: the frequency distribution (129 down to 57) shows a good balance - the analysis is not too concentrated on just a few terms. The entity counts per file (275 down to 13 for the top 10) indicate comprehensive exraction across documents.

In [37]:
# Save results to variables for further analysis
print(f"\nResults saved to variables:")
print(f"  - entities_all_papers_improved: List of all extracted entities")
print(f"  - entity_counts_improved: Counter object with entity frequencies") 
print(f"  - file_counts_improved: Dictionary with entity counts per file")
print(f"  - stats_improved: Processing statistics")

# Optional: Save to file
save_to_file = input("\nSave results to text file? (y/n): ").lower().strip() == 'y'
if save_to_file:
    with open('improved_entity_extraction_results.txt', 'w', encoding='utf-8') as f:
        f.write("IMPROVED ENTITY EXTRACTION RESULTS\n")
        f.write("="*50 + "\n\n")
        
        f.write(f"Processing Statistics:\n")
        f.write(f"- Successfully processed: {stats_improved['successful']} files\n")
        f.write(f"- Failed to process: {stats_improved['failed']} files\n")
        f.write(f"- Total entities: {stats_improved['total_entities']}\n")
        f.write(f"- Unique entities: {len(set(entities_all_papers_improved))}\n\n")
        
        f.write("Entity Frequencies:\n")
        for entity, count in entity_counts_improved.most_common(50):
            f.write(f"{entity}: {count}\n")
        
        f.write(f"\nAll Unique Entities ({len(set(entities_all_papers_improved))}):\n")
        for entity in sorted(set(entities_all_papers_improved)):
            f.write(f"- {entity}\n")
    
    print("Results saved to 'improved_entity_extraction_results.txt'")

print(f"\n{'='*60}")



Results saved to variables:
  - entities_all_papers_improved: List of all extracted entities
  - entity_counts_improved: Counter object with entity frequencies
  - file_counts_improved: Dictionary with entity counts per file
  - stats_improved: Processing statistics



Save results to text file? (y/n):  y


Results saved to 'improved_entity_extraction_results.txt'



<h2> Finding heat stress mitigation measures </h2>

In [40]:
# Define categories for heat stress problems and solutions
HEAT_CONDITIONS = {
    'heat stress', 'heat strain', 'heat exhaustion', 'heat stroke', 'heat illness',
    'hyperthermia', 'dehydration', 'heat cramps', 'heat rash', 'thermal stress',
    'heat exposure', 'occupational heat stress', 'heat-related illness'
}

MITIGATION_MEASURES = {
    'air conditioning', 'ventilation', 'shade', 'hydration', 'cooling',
    'acclimatization', 'work-rest cycles', 'cooling vest', 'ice vest',
    'misting system', 'evaporative cooling', 'mechanical ventilation',
    'hydration protocol', 'rest breaks', 'training', 'education',
    'personal protective equipment', 'ppe', 'engineering controls',
    'administrative controls', 'medical surveillance'
}

def categorize_entities(entities_list):
    """Categorize entities into problems, solutions, and other"""
    categories = {
        'conditions': [],
        'mitigations': [],
        'other': []
    }
    
    for entity in set(entities_list):
        entity_lower = entity.lower()
        
        if any(condition in entity_lower for condition in HEAT_CONDITIONS):
            categories['conditions'].append(entity)
        elif any(mitigation in entity_lower for mitigation in MITIGATION_MEASURES):
            categories['mitigations'].append(entity)
        else:
            categories['other'].append(entity)
    
    return categories

def calculate_cooccurrence_matrix(texts, entities_list, window_size=50):
    """Calculate co-occurrence matrix for entities within a sliding window"""
    unique_entities = list(set(entities_list))
    entity_to_idx = {entity: i for i, entity in enumerate(unique_entities)}
    n_entities = len(unique_entities)
    
    # Initialize co-occurrence matrix
    cooccurrence_matrix = np.zeros((n_entities, n_entities))
    
    print(f"Calculating co-occurrence for {n_entities} entities...")
    
    for text_idx, text in enumerate(texts):
        if text_idx % 20 == 0:
            print(f"Processing text {text_idx + 1}/{len(texts)}")
            
        text_lower = text.lower()
        
        # Find positions of all entities in this text
        entity_positions = []
        for entity in unique_entities:
            start = 0
            while True:
                pos = text_lower.find(entity.lower(), start)
                if pos == -1:
                    break
                entity_positions.append((pos, entity))
                start = pos + 1
        
        # Sort by position
        entity_positions.sort()
        
        # Calculate co-occurrences within window
        for i, (pos1, entity1) in enumerate(entity_positions):
            for j, (pos2, entity2) in enumerate(entity_positions[i+1:], i+1):
                if abs(pos2 - pos1) <= window_size:
                    idx1, idx2 = entity_to_idx[entity1], entity_to_idx[entity2]
                    cooccurrence_matrix[idx1][idx2] += 1
                    cooccurrence_matrix[idx2][idx1] += 1  # Symmetric
                else:
                    break  # No need to check further
    
    return cooccurrence_matrix, unique_entities

def find_condition_mitigation_pairs(cooccurrence_matrix, entities_list, min_cooccurrence=3):
    """Find condition-mitigation pairs based on co-occurrence"""
    categories = categorize_entities(entities_list)
    entity_to_idx = {entity: i for i, entity in enumerate(entities_list)}
    
    condition_mitigation_pairs = []
    
    for condition in categories['conditions']:
        if condition not in entity_to_idx:
            continue
            
        condition_idx = entity_to_idx[condition]
        
        for mitigation in categories['mitigations']:
            if mitigation not in entity_to_idx:
                continue
                
            mitigation_idx = entity_to_idx[mitigation]
            cooccurrence_count = cooccurrence_matrix[condition_idx][mitigation_idx]
            
            if cooccurrence_count >= min_cooccurrence:
                condition_mitigation_pairs.append({
                    'condition': condition,
                    'mitigation': mitigation,
                    'cooccurrence': int(cooccurrence_count),
                    'condition_freq': entities_list.count(condition),
                    'mitigation_freq': entities_list.count(mitigation)
                })
    
    # Sort by co-occurrence strength
    condition_mitigation_pairs.sort(key=lambda x: x['cooccurrence'], reverse=True)
    return condition_mitigation_pairs

def calculate_frequency_weights(entities_list):
    """Calculate frequency-based weights for entities"""
    entity_counts = Counter(entities_list)
    total_entities = len(entities_list)
    
    frequency_weights = {}
    for entity, count in entity_counts.items():
        frequency_weights[entity] = {
            'count': count,
            'frequency': count / total_entities,
            'weight': np.log(count + 1)  # Log-scaled weight
        }
    
    return frequency_weights

def create_mitigation_recommendations(condition_mitigation_pairs, frequency_weights, top_n=10):
    """Create prioritized mitigation recommendations"""
    recommendations = defaultdict(list)
    
    for pair in condition_mitigation_pairs:
        condition = pair['condition']
        mitigation = pair['mitigation']
        
        # Calculate composite score
        cooccurrence_score = pair['cooccurrence']
        frequency_score = frequency_weights[mitigation]['weight']
        composite_score = cooccurrence_score * frequency_score
        
        recommendations[condition].append({
            'mitigation': mitigation,
            'cooccurrence': pair['cooccurrence'],
            'mitigation_frequency': pair['mitigation_freq'],
            'composite_score': composite_score
        })
    
    # Sort recommendations for each condition
    for condition in recommendations:
        recommendations[condition].sort(key=lambda x: x['composite_score'], reverse=True)
        recommendations[condition] = recommendations[condition][:top_n]
    
    return dict(recommendations)

def visualize_cooccurrence_network(condition_mitigation_pairs, min_cooccurrence=5):
    """Create network visualization of condition-mitigation relationships"""
    G = nx.Graph()
    
    # Add nodes and edges
    for pair in condition_mitigation_pairs:
        if pair['cooccurrence'] >= min_cooccurrence:
            G.add_node(pair['condition'], node_type='condition')
            G.add_node(pair['mitigation'], node_type='mitigation')
            G.add_edge(pair['condition'], pair['mitigation'], weight=pair['cooccurrence'])
    
    # Create visualization
    plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(G, k=3, iterations=50)
    
    # Separate nodes by type
    condition_nodes = [n for n, d in G.nodes(data=True) if d.get('node_type') == 'condition']
    mitigation_nodes = [n for n, d in G.nodes(data=True) if d.get('node_type') == 'mitigation']
    
    # Draw nodes
    nx.draw_networkx_nodes(G, pos, nodelist=condition_nodes, node_color='red', 
                          node_size=1000, alpha=0.7, label='Heat Conditions')
    nx.draw_networkx_nodes(G, pos, nodelist=mitigation_nodes, node_color='blue', 
                          node_size=1000, alpha=0.7, label='Mitigation Measures')
    
    # Draw edges with varying thickness based on co-occurrence
    edges = G.edges(data=True)
    edge_weights = [d['weight'] for u, v, d in edges]
    nx.draw_networkx_edges(G, pos, width=[w/3 for w in edge_weights], alpha=0.6)
    
    # Add labels
    nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')
    
    plt.title('Heat Stress Conditions and Mitigation Measures Co-occurrence Network')
    plt.legend()
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    
    return G

# Main analysis function
def analyze_heat_stress_mitigations(entities_list, texts=None):
    """Main function to analyze heat stress mitigations"""
    print("Starting heat stress mitigation analysis...")
    
    # 1. Categorize entities
    print("\n1. Categorizing entities...")
    categories = categorize_entities(entities_list)
    print(f"Found {len(categories['conditions'])} heat conditions")
    print(f"Found {len(categories['mitigations'])} mitigation measures")
    print(f"Found {len(categories['other'])} other entities")
    
    # 2. Calculate frequency weights
    print("\n2. Calculating frequency weights...")
    frequency_weights = calculate_frequency_weights(entities_list)
    
    # 3. Calculate co-occurrence (if texts available)
    if texts is not None:
        print("\n3. Calculating co-occurrence matrix...")
        cooccurrence_matrix, unique_entities = calculate_cooccurrence_matrix(texts, entities_list)
        
        # 4. Find condition-mitigation pairs
        print("\n4. Finding condition-mitigation pairs...")
        condition_mitigation_pairs = find_condition_mitigation_pairs(
            cooccurrence_matrix, unique_entities, min_cooccurrence=2
        )
        
        # 5. Create recommendations
        print("\n5. Creating mitigation recommendations...")
        recommendations = create_mitigation_recommendations(
            condition_mitigation_pairs, frequency_weights
        )
        
        return {
            'categories': categories,
            'frequency_weights': frequency_weights,
            'cooccurrence_matrix': cooccurrence_matrix,
            'unique_entities': unique_entities,
            'condition_mitigation_pairs': condition_mitigation_pairs,
            'recommendations': recommendations
        }
    else:
        # Simple frequency-based analysis without co-occurrence
        print("\n3. Creating frequency-based recommendations...")
        
        # Create simple recommendations based on frequency
        simple_recommendations = {}
        for condition in categories['conditions']:
            condition_freq = frequency_weights.get(condition, {}).get('count', 0)
            if condition_freq > 5:  # Only for frequently mentioned conditions
                mitigation_scores = []
                for mitigation in categories['mitigations']:
                    mit_data = frequency_weights.get(mitigation, {})
                    if mit_data.get('count', 0) > 3:
                        mitigation_scores.append({
                            'mitigation': mitigation,
                            'frequency': mit_data['count'],
                            'weight': mit_data['weight']
                        })
                
                mitigation_scores.sort(key=lambda x: x['weight'], reverse=True)
                simple_recommendations[condition] = mitigation_scores[:10]
        
        return {
            'categories': categories,
            'frequency_weights': frequency_weights,
            'simple_recommendations': simple_recommendations
        }

# Display results function
def display_mitigation_analysis(analysis_results):
    """Display the analysis results in a readable format"""
    
    print("="*80)
    print("HEAT STRESS MITIGATION ANALYSIS RESULTS")
    print("="*80)
    
    categories = analysis_results['categories']
    frequency_weights = analysis_results['frequency_weights']
    
    # Show categorization
    print(f"\nENTITY CATEGORIZATION:")
    print(f"Heat Conditions ({len(categories['conditions'])}):")
    for condition in sorted(categories['conditions'], 
                           key=lambda x: frequency_weights.get(x, {}).get('count', 0), 
                           reverse=True)[:10]:
        count = frequency_weights.get(condition, {}).get('count', 0)
        print(f"  - {condition}: {count} mentions")
    
    print(f"\nMitigation Measures ({len(categories['mitigations'])}):")
    for mitigation in sorted(categories['mitigations'], 
                           key=lambda x: frequency_weights.get(x, {}).get('count', 0), 
                           reverse=True)[:10]:
        count = frequency_weights.get(mitigation, {}).get('count', 0)
        print(f"  - {mitigation}: {count} mentions")
    
    # Show recommendations
    if 'recommendations' in analysis_results:
        recommendations = analysis_results['recommendations']
        print(f"\nCONDITION-MITIGATION RECOMMENDATIONS:")
        print(f"(Based on co-occurrence analysis)")
        
        for condition, mitigations in list(recommendations.items())[:5]:  # Top 5 conditions
            print(f"\nFor '{condition}':")
            for i, rec in enumerate(mitigations[:5], 1):  # Top 5 recommendations
                print(f"  {i}. {rec['mitigation']}")
                print(f"     Co-occurrence: {rec['cooccurrence']}, "
                      f"Frequency: {rec['mitigation_frequency']}, "
                      f"Score: {rec['composite_score']:.2f}")
    
    elif 'simple_recommendations' in analysis_results:
        recommendations = analysis_results['simple_recommendations']
        print(f"\nFREQUENCY-BASED RECOMMENDATIONS:")
        
        for condition, mitigations in list(recommendations.items())[:5]:
            print(f"\nFor '{condition}':")
            for i, rec in enumerate(mitigations[:5], 1):
                print(f"  {i}. {rec['mitigation']} (frequency: {rec['frequency']})")

# Run the analysis (you'll need to provide texts if you want co-occurrence analysis)
print("Running heat stress mitigation analysis...")

# If you have the original texts from PDFs, use this:
# analysis_results = analyze_heat_stress_mitigations(entities_all_papers_improved, texts=all_texts)

# If you only have entities, use this simpler version:
analysis_results = analyze_heat_stress_mitigations(entities_all_papers_improved)

# Display results
display_mitigation_analysis(analysis_results)

Running heat stress mitigation analysis...
Starting heat stress mitigation analysis...

1. Categorizing entities...
Found 1356 heat conditions
Found 753 mitigation measures
Found 4365 other entities

2. Calculating frequency weights...

3. Creating frequency-based recommendations...
HEAT STRESS MITIGATION ANALYSIS RESULTS

ENTITY CATEGORIZATION:
Heat Conditions (1356):
  - heat stress: 129 mentions
  - heat exposure: 110 mentions
  - heat stroke: 84 mentions
  - heat exhaustion: 63 mentions
  - occupational heat stress: 60 mentions
  - thermal stress: 57 mentions
  - heat illness: 56 mentions
  - the heat stress: 39 mentions
  - occupational heat exposure: 36 mentions
  - workplace heat exposure: 27 mentions

Mitigation Measures (753):
  - acclimatization: 79 mentions
  - hydration: 70 mentions
  - air conditioning: 63 mentions
  - shade: 62 mentions
  - ventilation: 57 mentions
  - personal protective equipment: 44 mentions
  - rest breaks: 28 mentions
  - heat acclimatization: 25 men