# In [6]:  (Jupyter notebook cell prompt; the cell's code follows)
import time
from datetime import datetime, timedelta
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# Station id of the personal weather station; it is inserted into the
# wunderground.com dashboard URL and into the output CSV file names.
PWS_ID = "IJERUS70"

# Date range to scrape, as 'YYYY-MM-DD' strings. Both ends are included.
START_DATE = "2025-11-13"
END_DATE   = "2025-11-14"


def _parse_temp_celsius(raw):
    """
    Turn a Series of temperature cells into floats in °C.

    `raw` holds table cells such as '15.6 °C' or '60.1 °F'. The first number
    in each cell is extracted; cells without a number become NaN. Cells that
    are explicitly labelled °F are converted, so the result is °C either way.
    """
    text = raw.astype(str)
    values = text.str.extract(r"([-+]?\d*\.?\d+)")[0].astype(float)
    is_fahrenheit = text.str.contains(r"°\s*F", regex=True)
    return values.where(~is_fahrenheit, (values - 32.0) * 5.0 / 9.0)


def scrape_one_day(driver, pws_id, date_str):
    """
    Load the WU daily table page in a real browser (with JS),
    find the table that has a 'Time' column, and return it as a DataFrame.

    Parameters:
        driver:   Selenium WebDriver used to load and query the page.
        pws_id:   station id inserted into the dashboard URL.
        date_str: day to fetch as 'YYYY-MM-DD'; used for both ends of the
                  URL date range and as the date part of the 'datetime' column.

    Returns:
        A DataFrame with the scraped columns plus 'datetime' and, when a
        temperature column is found, 'temp_C'. Returns None when no table
        with a 'Time' header can be found or parsed.

    Raises:
        Selenium's TimeoutException propagates if no <table> element
        appears within 30 seconds.
    """
    url = f"https://www.wunderground.com/dashboard/pws/{pws_id}/table/{date_str}/{date_str}/daily"
    print("Opening:", url)
    driver.get(url)

    # wait for any table to appear (JS needs a bit of time)
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )

    # find all tables and pick the first one with a 'Time' header
    target_html = None
    for tbl in driver.find_elements(By.TAG_NAME, "table"):
        headers = [h.text.strip() for h in tbl.find_elements(By.TAG_NAME, "th")]
        if any("time" in h.lower() for h in headers):
            target_html = tbl.get_attribute("outerHTML")
            break

    if target_html is None:
        print("No suitable table found for", date_str)
        return None

    # parse the table HTML with pandas; wrap it in StringIO because passing
    # literal HTML strings to read_html is deprecated since pandas 2.1
    dfs = pd.read_html(StringIO(target_html))
    if not dfs:
        print("read_html returned no tables for", date_str)
        return None

    df = dfs[0]

    if "Time" not in df.columns:
        print("Table for", date_str, "has no 'Time' column. Columns:", df.columns)
        return None

    # drop rows with missing time; copy so the column assignments below
    # write to an independent frame (avoids SettingWithCopyWarning)
    df = df.dropna(subset=["Time"]).copy()

    # build datetime column; unparseable times become NaT
    df["datetime"] = pd.to_datetime(
        df["Time"].apply(lambda t: f"{date_str} {t}"),
        errors="coerce"
    )

    # detect temperature column (first one with 'temp' in its name);
    # str() guards against non-string column labels
    temp_col = next(
        (c for c in df.columns if "temp" in str(c).lower()),
        None,
    )

    if temp_col is not None:
        df["temp_C"] = _parse_temp_celsius(df[temp_col])
    else:
        print("No temperature column found for", date_str)

    return df


def date_range(start_str, end_str):
    """
    Yield each calendar day from start_str through end_str as 'YYYY-MM-DD'.

    Both arguments are 'YYYY-MM-DD' strings and both ends are included.
    Nothing is yielded when the start date lies after the end date.
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_str, day_format)
    last_day = datetime.strptime(end_str, day_format)
    one_day = timedelta(days=1)

    day = first_day
    while not day > last_day:
        yield day.strftime(day_format)
        day = day + one_day


def main():
    """
    Scrape every day from START_DATE to END_DATE for station PWS_ID.

    Each day with data is written to its own CSV, all days are concatenated
    into one combined CSV, and the temperature of the last day that actually
    returned data is plotted. The browser is always closed, even when a
    scrape raises.
    """
    # set up Chrome WebDriver (will auto-download driver if needed)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run without opening a visible window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=options)

    all_days = []
    # date of the most recent day that produced data; this is not necessarily
    # END_DATE, because days without data are skipped
    last_date = None

    try:
        for date_str in date_range(START_DATE, END_DATE):
            df_day = scrape_one_day(driver, PWS_ID, date_str)
            if df_day is None or df_day.empty:
                print("No data for", date_str)
            else:
                # save per-day CSV
                day_fname = f"{PWS_ID}_{date_str}.csv"
                df_day.to_csv(day_fname, index=False)
                print("Saved:", day_fname)
                all_days.append(df_day)
                last_date = date_str

            # be nice to the server
            time.sleep(3)

    finally:
        driver.quit()

    if not all_days:
        print("No data collected at all.")
        return

    # concatenate all days to one big CSV
    df_all = pd.concat(all_days, ignore_index=True)
    all_fname = f"{PWS_ID}_{START_DATE}_to_{END_DATE}.csv"
    df_all.to_csv(all_fname, index=False)
    print("Saved combined CSV:", all_fname)

    # quick temperature plot for the last day with data (if we have temp_C)
    last_day = all_days[-1]
    if "temp_C" in last_day.columns:
        plt.figure(figsize=(10, 5))
        plt.plot(last_day["datetime"], last_day["temp_C"], marker="o")
        # title the plot with the date of the plotted data, which may be
        # earlier than END_DATE when the final day(s) returned nothing
        plt.title(f"Temperature on {last_date} for {PWS_ID}")
        plt.xlabel("Time")
        plt.ylabel("Temperature (°C)")
        plt.grid(True)
        plt.tight_layout()
        plt.show()
    else:
        print("No temp_C column found to plot.")

    print("Done.")


# Start the scrape only when this file runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


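# Cell output: ModuleNotFoundError: No module named 'selenium'
# (the third-party packages imported above must be installed first,
#  e.g. `pip install selenium webdriver-manager`)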