# List of Weather Stations for Wunderground

**Kuala Pilah & Alor Gajah** 
Location: MALACCA AIRPORT STATION https://www.wunderground.com/history/daily/my/malacca/WMKM/date/


**Jengka & Gambang** 
Location: KUANTAN AIRPORT STATION https://www.wunderground.com/history/daily/my/kuantan/WMKD/date/ 


**Permatang Pauh**
Location: PENANG INTERNATIONAL AIRPORT STATION https://www.wunderground.com/history/daily/my/bayan-lepas/WMKP/date/

**Segamat**
Location: SEGAMAT AIRPORT STATION https://www.wunderground.com/history/daily/my/segamat/WMAZ/date/

**Dungun**
  Location: KERTEH AIRPORT STATION https://www.wunderground.com/history/daily/my/kerteh/WMKE/date/

In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup as BS
from functools import reduce
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Function to render the page with optional settings clicks
def render_page(url, type):
    """Load *url* in headless Chrome and return the rendered HTML source.

    When ``type == "C"`` the function first opens the site's settings menu
    and clicks the second quick-settings link (intended to switch the page
    to Celsius), then saves a screenshot to ``page_debug.png``. Failures in
    that step are reported and otherwise ignored, so the page is still
    returned in whatever units it rendered with.

    A copy of the HTML is always written to
    ``debug_output_<type>_<unix-time>.html`` in the working directory.

    Args:
        url: Full page URL to load.
        type: Unit selector; only the value ``"C"`` triggers the settings
            clicks. (The name shadows the builtin but is kept because
            callers may pass it by keyword.)

    Returns:
        The page source as a string.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    options.add_argument('--window-size=1920,1080')
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the browser process is shut down even when
    # navigation or reading the page source raises.
    try:
        driver.get(url)
        # Fixed pause to let the client-side rendering finish.
        time.sleep(5)

        if type == "C":
            try:
                # Open settings and switch to Celsius. Wait until each
                # element is actually clickable, not merely present in the
                # DOM, before clicking it.
                settings_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, 'wuSettings'))
                )
                settings_button.click()
                unit_link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, '//*[@id="wuSettings-quick"]/div/a[2]')
                    )
                )
                unit_link.click()
                # Give the page time to re-render with the new units.
                time.sleep(15)

                # Screenshot to confirm what renders
                driver.save_screenshot("page_debug.png")

            except Exception as e:
                # Best effort: keep going with the default units.
                print("Error setting temperature unit:", e)

        page_source = driver.page_source
    finally:
        driver.quit()

    # Save page source for debugging
    with open(f"debug_output_{type}_{int(time.time())}.html", "w", encoding="utf-8") as f:
        f.write(page_source)

    return page_source

# Column names assigned to the scraped table, in page order.
_OBSERVATION_COLUMNS = [
    "Time", "Temperature", "Dew Point", "Humidity", "Wind", "Wind Speed",
    "Wind Gust", "Pressure", "Precipitation", "Condition",
]

# Columns whose text (e.g. value plus unit suffix) is converted to numbers.
_NUMERIC_COLUMNS = [
    "Temperature", "Dew Point", "Humidity", "Wind Speed", "Wind Gust",
    "Pressure", "Precipitation",
]


def _clean_numeric(col):
    """Drop every character except digits, '.' and '-'; coerce to numeric (NaN on failure)."""
    return pd.to_numeric(col.str.replace(r"[^\d\.\-]", "", regex=True), errors='coerce')


def _table_to_frame(table):
    """Turn one parsed <table> into a DataFrame.

    Returns ``(frame, None)`` on success, or ``(None, reason)`` where
    *reason* is the message explaining why the table was unusable.
    """
    rows = table.find_all('tr')
    if len(rows) <= 1:
        return None, "No data rows found."

    records = []
    for row in rows[1:]:  # Skip header
        values = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if values:
            records.append(values)

    if not records:
        return None, "No usable data extracted."

    try:
        frame = pd.DataFrame(records, columns=_OBSERVATION_COLUMNS)
    except Exception as e:
        # The constructor raises when the rows do not fit the expected
        # columns; report it instead of aborting the whole run.
        return None, f"Error creating DataFrame: {e}"
    return frame, None


# Main scraping function
def hourly_scraper(page, dates, type):
    """Scrape one table of observations per date and combine them.

    For each entry in *dates* the URL ``page + date`` is rendered with
    ``render_page`` and parsed. Every ``<table>`` on the page is tried in
    document order and the first one that fits the expected ten columns is
    used. NOTE(review): the page may contain other tables before the
    observation table - confirm against a saved debug HTML file.

    Dates that yield no usable table are reported on stdout and skipped.

    Args:
        page: Base URL ending just before the date component.
        dates: Iterable of date strings appended to *page*.
        type: Unit selector forwarded unchanged to ``render_page``.

    Returns:
        A DataFrame with the ten observation columns plus ``Date``, or an
        empty DataFrame when nothing was collected.
    """
    all_data = []

    for d in dates:
        url = f"{page}{d}"
        print(f"Fetching {url}")
        soup = BS(render_page(url, type), "html.parser")

        tables = soup.find_all('table')
        if not tables:
            print(f"{d}: No <table> found in the page.")
            continue

        # Try each table in order; remember why the first one failed so the
        # message matches what a single-table page would have reported.
        df = None
        first_reason = None
        for table in tables:
            df, reason = _table_to_frame(table)
            if df is not None:
                break
            if first_reason is None:
                first_reason = reason

        if df is None:
            print(f"{d}: {first_reason}")
            continue

        # Clean numeric values
        for col in _NUMERIC_COLUMNS:
            df[col] = _clean_numeric(df[col])

        df['Date'] = d
        all_data.append(df)
        print(f"{d} finished!")

    # Combine all daily DataFrames
    if all_data:
        return pd.concat(all_data, ignore_index=True)

    print("No data collected for any dates.")
    return pd.DataFrame()


In [None]:
# Driver cell: scrape the configured station for the listed dates and save
# the combined result as CSV.
page_url = "https://www.wunderground.com/history/daily/my/malacca/WMKM/date/"
dates = ["2024-05-01", "2024-05-02"]  # Example date list
temperature_unit = "C"  # "C" for Celsius, "F" for Fahrenheit
output_csv = "weather_dataClean.csv"

df = hourly_scraper(page_url, dates, temperature_unit)

# Save result (nothing is written when no data was collected)
if not df.empty:
    df.to_csv(output_csv, index=False)
    # Report the path actually written; the old message named a different file.
    print(f"Data saved to {output_csv}")
