In [1]:
import pandas as pd
import requests as re
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
def fetch_weather_data(date):
    """Scrape one day's observation table for KLGA from Weather Underground.

    A headless Chrome session loads the daily-history page, waits for the
    JavaScript-rendered observation table to appear, and the resulting page
    source is parsed with BeautifulSoup.

    Args:
        date: Date string interpolated verbatim into the history URL
            (this notebook passes values such as "2024-5-1").

    Returns:
        A list of rows; each row is a list of the stripped text of every
        ``<td>`` in one ``<tr>`` of the table. Rows without any ``<td>``
        (e.g. header rows) are skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: if the observation
            table does not appear within 5 seconds.
        RuntimeError: if the table is missing from the captured page source.
    """
    # An alternative data source is the api.weather.com v1 JSON endpoint
    # (/v1/location/KLGA:9:US/observations/historical.json with apiKey, units,
    # startDate and endDate query parameters). The API key is intentionally
    # not recorded here; supply it via configuration if that route is used.
    url = f"https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/{date}"

    # One selector shared by the wait and the parse so both target the same element.
    table_selector = 'div.observation-table.ng-star-inserted'

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_selector))
        )
        html = driver.page_source
    finally:
        # Always shut the browser down, even when the page load or wait fails;
        # otherwise every failed day leaves a headless Chrome process behind.
        driver.quit()

    soup = bs(html, 'html.parser')
    table = soup.select_one(table_selector)
    if table is None:
        raise RuntimeError(f"observation table not found in page source for {date}")

    data = []
    for row in table.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if cells:
            data.append(cells)

    return data


In [3]:
def collect_may_weather_data():
    """Gather the observation rows for every day of May 2024.

    Calls ``fetch_weather_data`` once per day (1 through 31) and prefixes
    each returned row with the date string used for that request.

    Returns:
        A flat list of rows, each of the form ``[date, *row_cells]``.
    """
    all_rows = []

    for day_of_month in range(1, 32):
        date = "2024-5-" + str(day_of_month)
        # Tag every observation of the day with the date it was requested for.
        all_rows.extend([date] + entry for entry in fetch_weather_data(date))
        print(date, 'done')

    return all_rows

## extract

In [4]:
# Column labels for the scraped rows: the request date first, then the
# observation fields in the order the row cells are collected.
columns = [
    'Date',
    'Time',
    'Temperature',
    'Dew Point',
    'Humidity',
    'Wind',
    'Wind Speed',
    'Wind Gust',
    'Pressure',
    'Precipitation',
    'Condition',
]

# Scrape the whole month (one browser session per day) and tabulate the result.
may_weather_data = collect_may_weather_data()
df = pd.DataFrame(data=may_weather_data, columns=columns)
df

2024-5-1 done
2024-5-2 done
2024-5-3 done
2024-5-4 done
2024-5-5 done
2024-5-6 done
2024-5-7 done
2024-5-8 done
2024-5-9 done
2024-5-10 done
2024-5-11 done
2024-5-12 done
2024-5-13 done
2024-5-14 done
2024-5-15 done
2024-5-16 done
2024-5-17 done
2024-5-18 done
2024-5-19 done
2024-5-20 done
2024-5-21 done
2024-5-22 done
2024-5-23 done
2024-5-24 done
2024-5-25 done
2024-5-26 done
2024-5-27 done
2024-5-28 done
2024-5-29 done
2024-5-30 done
2024-5-31 done


Unnamed: 0,Date,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precipitation,Condition
0,2024-5-1,12:44 AM,51°F,47°F,86°%,NNE,5°mph,0°mph,29.87°in,0.0°in,Light Rain
1,2024-5-1,12:51 AM,51°F,47°F,86°%,NE,6°mph,0°mph,29.85°in,0.0°in,Light Rain
2,2024-5-1,1:46 AM,52°F,46°F,82°%,NE,5°mph,0°mph,29.86°in,0.0°in,Light Rain
3,2024-5-1,1:51 AM,51°F,48°F,89°%,ENE,5°mph,0°mph,29.85°in,0.0°in,Light Rain
4,2024-5-1,2:51 AM,51°F,47°F,86°%,E,5°mph,0°mph,29.85°in,0.0°in,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...
896,2024-5-31,7:51 PM,73°F,41°F,31°%,NW,10°mph,0°mph,30.07°in,0.0°in,Partly Cloudy
897,2024-5-31,8:51 PM,72°F,41°F,33°%,NW,15°mph,21°mph,30.08°in,0.0°in,Fair
898,2024-5-31,9:51 PM,71°F,40°F,32°%,NW,12°mph,0°mph,30.11°in,0.0°in,Fair
899,2024-5-31,10:51 PM,69°F,41°F,36°%,NW,9°mph,0°mph,30.12°in,0.0°in,Fair


## transform

In [5]:
# Assuming df is your DataFrame
def fahrenheit_to_celsius(f_temp):
    """Convert a Fahrenheit temperature to Celsius.

    Args:
        f_temp: Temperature in degrees Fahrenheit; anything ``float()`` accepts.

    Returns:
        The temperature in degrees Celsius as a float (not rounded).
    """
    degrees_above_freezing = float(f_temp) - 32
    return degrees_above_freezing * 5.0 / 9.0

# Convert the 12-hour clock strings to 24-hour HH:MM, then combine them with
# the date string into a single Datetime column.
df['Time'] = pd.to_datetime(df['Time'], format='%I:%M %p').dt.strftime('%H:%M')
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])

# Temperature-like columns: drop the unit suffix, convert to Celsius, round to
# two decimals and store the result as text.
for _temp_col in ('Temperature', 'Dew Point'):
    df[_temp_col] = (
        df[_temp_col]
        .str.replace('°F', '')
        .astype(float)
        .apply(fahrenheit_to_celsius)
        .round(2)
        .astype(str)
    )

# These columns only lose the stray degree sign; any unit text after it stays.
for _deg_col in ('Humidity', 'Wind', 'Wind Speed', 'Wind Gust'):
    df[_deg_col] = df[_deg_col].str.replace('°', '')

# Pressure loses both the degree sign and its "in" unit.
df['Pressure'] = df['Pressure'].str.replace('°in', '')

# Keep only the columns used downstream.
df = df.drop(columns=['Date', 'Time', 'Dew Point', 'Wind', 'Wind Gust', 'Precipitation'])

df

Unnamed: 0,Temperature,Humidity,Wind Speed,Pressure,Condition,Datetime
0,10.56,86%,5mph,29.87,Light Rain,2024-05-01 00:44:00
1,10.56,86%,6mph,29.85,Light Rain,2024-05-01 00:51:00
2,11.11,82%,5mph,29.86,Light Rain,2024-05-01 01:46:00
3,10.56,89%,5mph,29.85,Light Rain,2024-05-01 01:51:00
4,10.56,86%,5mph,29.85,Mostly Cloudy,2024-05-01 02:51:00
...,...,...,...,...,...,...
896,22.78,31%,10mph,30.07,Partly Cloudy,2024-05-31 19:51:00
897,22.22,33%,15mph,30.08,Fair,2024-05-31 20:51:00
898,21.67,32%,12mph,30.11,Fair,2024-05-31 21:51:00
899,20.56,36%,9mph,30.12,Fair,2024-05-31 22:51:00


#### 빈 시간 채우기

In [6]:
# Boundaries that the minute-level series must span.
_range_start = pd.Timestamp("2024-05-01 00:00:00")
_range_end = pd.Timestamp("2024-05-31 23:59:00")

# Synthetic boundary rows: copies of the first and last observations,
# re-stamped to the start and end of the range.
first = df.iloc[0].copy()
first['Datetime'] = _range_start
last = df.iloc[-1].copy()
last['Datetime'] = _range_end

# Add a boundary row only when the data does not already begin / finish there.
if df.iloc[0]['Datetime'] != _range_start:
    df = pd.concat([pd.DataFrame([first]), df])
if df.iloc[-1]['Datetime'] != _range_end:
    df = pd.concat([df, pd.DataFrame([last])])

df = df.set_index('Datetime').sort_index()

# Upsample to one row per minute, carrying the latest observation forward.
df_resampled = df.resample('min').ffill()
df_resampled

Unnamed: 0_level_0,Temperature,Humidity,Wind Speed,Pressure,Condition
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-05-01 00:00:00,10.56,86%,5mph,29.87,Light Rain
2024-05-01 00:01:00,10.56,86%,5mph,29.87,Light Rain
2024-05-01 00:02:00,10.56,86%,5mph,29.87,Light Rain
2024-05-01 00:03:00,10.56,86%,5mph,29.87,Light Rain
2024-05-01 00:04:00,10.56,86%,5mph,29.87,Light Rain
...,...,...,...,...,...
2024-05-31 23:55:00,20.0,37%,8mph,30.13,Fair
2024-05-31 23:56:00,20.0,37%,8mph,30.13,Fair
2024-05-31 23:57:00,20.0,37%,8mph,30.13,Fair
2024-05-31 23:58:00,20.0,37%,8mph,30.13,Fair


## load

In [9]:
# Write the minute-level frame (including its Datetime index) to a CSV file
# in the current working directory.
filename = 'may_2024_weather_data.csv'
df_resampled.to_csv(path_or_buf=filename)

Copy the exported CSV into the `w4m2` container:

`docker cp may_2024_weather_data.csv w4m2:/root/`