In [None]:
# Install pdfplumber library to extract text from PDFs
!pip install pdfplumber

# Import necessary libraries
import pdfplumber
import pandas as pd
import os
import re
from google.colab import files  # For file download (works in Google Colab)

# Terms that mark a paragraph as economically relevant (damage, spending,
# recovery, housing, ...). Every entry is written in lowercase.
keywords = [
    "economic",
    "loss",
    "damage",
    "impact",
    "investment",
    "unmet",
    "cost",
    "expenditure",
    "estimate",
    "revenue",
    "budget",
    "expenses",
    "repair",
    "relief",
    "insurance",
    "business",
    "property",
    "crop",
    "recovery",
    "housing",
    "infrastructure",
    "funding",
]

# Pattern for dollar figures: a "$", an optional whitespace character, digits
# with optional comma separators and an optional decimal part, then an
# optional "billion"/"million" suffix (e.g. "$1 million" or "$500,000").
# The suffix is matched case-insensitively.
dollar_regex = re.compile(
    r'\$\s?\d[\d,]*(\.\d+)?\s?(billion|million)?',
    flags=re.I,
)

# Collect every paragraph that mentions an economic keyword or a dollar amount.
# Each entry is a dict holding the source "filename" and the cleaned "paragraph".
relevant_data = []

# Walk the PDFs in the current working directory. os.listdir() returns names
# in arbitrary order, so sort them to keep the output order reproducible.
for filename in sorted(os.listdir()):
    # Accept any capitalisation of the extension (".pdf", ".PDF", ".Pdf", ...)
    if not filename.lower().endswith(".pdf"):
        continue

    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Skip pages for which no text was extracted
            if not text:
                continue

            # Paragraphs are delimited by two or more consecutive newlines
            for para in re.split(r'\n{2,}', text):
                # Trim the paragraph and join its wrapped lines with spaces
                para_clean = para.strip().replace("\n", " ")
                if not para_clean:
                    # Nothing left to inspect (an empty string can match neither test)
                    continue

                # Lowercase once per paragraph instead of once per keyword.
                # NOTE: this is a substring test, so "cost" also matches
                # "costs" and "loss" also matches "glossary".
                para_lower = para_clean.lower()
                keyword_found = any(k in para_lower for k in keywords)
                # The dollar pattern runs on the original (non-lowercased) text
                dollar_found = bool(dollar_regex.search(para_clean))

                # Keep the paragraph if either signal is present
                if keyword_found or dollar_found:
                    relevant_data.append({
                        "filename": filename,  # Source PDF, for reference
                        "paragraph": para_clean,  # Cleaned paragraph text
                    })

# Build a table of the matches. Naming the columns explicitly keeps the
# "filename,paragraph" header in the CSV even when nothing matched; without
# it an empty result list would produce a column-less, header-less file.
df = pd.DataFrame(relevant_data, columns=["filename", "paragraph"])

# Name of the CSV file the results are written to
csv_filename = "relevant_paragraphs.csv"

# Write the table without the pandas row index
df.to_csv(csv_filename, index=False)

# Hand the CSV to the browser for download (this works in Google Colab)
files.download(csv_filename)

# Report how many paragraphs were collected
print(f"✅ Done! File downloaded. Total relevant paragraphs: {len(df)}")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! File downloaded. Total relevant paragraphs: 141
