WEB SCRAPER FOR PHIVOLCS

In [1]:
pip install selenium pandas beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.4.0 (from urllib3[socks]~=2.4.0->selenium)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.4.26 (from selenium)
  Downloading certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.30.0->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collect

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\Shahani\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import urllib3

# Disable SSL verification warnings
# NOTE(review): every request below is sent with verify=False, so the server's
# TLS certificate is NOT validated; this call only silences the resulting
# warning. Confirm that skipping verification is still required for this host.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Output directory
output_dir = "earthquake_data"
os.makedirs(output_dir, exist_ok=True)
missing_log = []  # one human-readable entry per month that produced no CSV

# Define months and years
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]
years = range(2015, 2026)

# Base URL
base_url = "https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly"

# Column order of every CSV written below
csv_columns = [
    "Date", "Time", "Latitude", "Longitude", "Depth", "Magnitude", "Location"
]


def _parse_earthquake_rows(html, month_names_upper):
    """Extract earthquake records from the HTML of one monthly bulletin page.

    Every ``<tr>`` in the document is inspected. A row is kept only when it has
    exactly six ``td``/``th`` cells and its first cell is not a header
    (contains "Date"), not a "SEISMICITY MAP" caption, and not a bare month
    name. The first cell must split into exactly two parts on " - "
    (date and time); rows where it does not are skipped.

    Parameters
    ----------
    html : str
        Markup of the page to parse.
    month_names_upper : collection of str
        Upper-cased month names used to recognise month-title rows.

    Returns
    -------
    list of list of str
        One 7-item list per kept row: date, time, then the remaining five cell
        texts (written to the CSV as Latitude, Longitude, Depth, Magnitude,
        Location). Empty when the page holds no matching rows.
    """
    records = []
    for row in BeautifulSoup(html, "html.parser").find_all("tr"):
        # get_text(strip=True) already trims each cell, so no further
        # stripping of the individual values is needed.
        values = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
        if len(values) != 6:
            continue

        first_cell = values[0]
        first_cell_upper = first_cell.upper()
        if (
            "Date" in first_cell
            or "SEISMICITY MAP" in first_cell_upper
            or first_cell_upper in month_names_upper
        ):
            continue

        try:
            date_part, time_part = first_cell.split(" - ")
        except ValueError:
            continue  # Skip rows with malformed date/time

        # The split can leave padding around the separator, so trim both parts.
        records.append([date_part.strip(), time_part.strip(), *values[1:]])
    return records


# Built once instead of once per table row; a set gives O(1) membership tests.
month_names_upper = {name.upper() for name in months}

# A single Session reuses the underlying connection across all requests to the
# same host instead of opening a new one for every month.
with requests.Session() as session:
    for year in years:
        for month in months:
            url = f"{base_url}/{year}/{year}_{month}.html"
            print(f"🔍 Fetching: {url}")
            try:
                # verify=False is passed per request (not set on the session) so
                # the behaviour matches the original call exactly.
                response = session.get(url, timeout=10, verify=False)

                if response.status_code != 200:
                    print(f"❌ Page not found: {url}")
                    missing_log.append(f"{year} {month} — HTTP {response.status_code}")
                    continue

                data = _parse_earthquake_rows(response.text, month_names_upper)

                if data:
                    df = pd.DataFrame(data, columns=csv_columns)
                    filename = f"{output_dir}/earthquakes_{year}_{month}.csv"
                    # utf-8-sig writes a BOM at the start of the file.
                    df.to_csv(filename, index=False, encoding="utf-8-sig")
                    print(f"✅ Saved {len(df)} rows to {filename}")
                else:
                    print(f"⚠️ No earthquake data for {month} {year}")
                    missing_log.append(f"{year} {month} — No earthquake entries")

            except Exception as e:
                # Deliberately broad: any failure for one month (network, parse,
                # or file write) is logged and the scrape moves on to the next.
                print(f"❌ Error: {year} {month} — {type(e).__name__}: {e}")
                missing_log.append(f"{year} {month} — {type(e).__name__}: {e}")

# Save missing logs
if missing_log:
    with open(f"{output_dir}/missing_files.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(missing_log))
    print(f"📄 Missing months saved to: {output_dir}/missing_files.txt")
else:
    print("✅ All months fetched successfully.")


🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_January.html
❌ Page not found: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_January.html
🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_February.html
❌ Page not found: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_February.html
🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_March.html
❌ Page not found: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_March.html
🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_April.html
❌ Page not found: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_April.html
🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_May.html
❌ Page not found: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2015/2015_May.html
🔍 Fetching: https://earthquake.phivolcs.dost.gov.ph/EQLatest-Mon