In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# SCRAPE STATION INFO

In [119]:
# URL of PWS list:
# https://www.wunderground.com/weatherstation/ListStations.asp?selectedState=WA&selectedCountry=United+States&MR=1

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_station_info(state="WA"):

    url = "https://www.wunderground.com/weatherstation/ListStations.asp?selectedState=" + state + "&selectedCountry=United+States&MR=1"
    raw_site_content = requests.get(url).content
    soup = BeautifulSoup(raw_site_content, 'html.parser')
    
    list_stations_info = soup.find_all("tr")  # one text element in list for each station

    all_station_info = np.array(['id','neighborhood','city','type'])
    
    for i in range(1, len(list_stations_info)):  # start at 1 to omit first element (col headers)

        station_info = str(list_stations_info[i]).splitlines()

        # pull out station info
        station_id = station_info[1].split('ID=')[1].split('"')[0].strip()
        station_neighborhood = station_info[2].split('<td>')[1].split('\xa0')[0].strip()
        station_city = station_info[3].split('<td>')[1].split('\xa0')[0].strip()
        station_type = station_info[4].split('station-type">')[1].split('\xa0')[0].strip()

        all_station_info = np.vstack([all_station_info,[station_id,station_neighborhood,station_city,station_type]])

    return all_station_info

array([['id', 'neighborhood', 'city', 'type'],
       ['KWAVENER2', 'Alpine, WA',
        '0.8Mi NNE of Venersborg Store, Venersborg', 'Davis Vantage Vue'],
       ['KWAMARYS7', 'Kruse Junction', 'North Marysville',
        'Davis Vantage Pro 2'],
       ['KWAWALLA2', 'Foothills of the Blue Mountains', 'Walla Walla',
        'Davis Vantage Pro2 (Wireless)']], 
      dtype='<U66')

In [140]:
all_info = scrape_station_info()

array(['id', 'KWAVENER2', 'KWAMARYS7', ..., 'MC9363', 'MCMOW1', 'MTZILL'], 
      dtype='<U66')

In [150]:
all_info[all_info[:,2]=="Seattle"]

array([['KWASEATT107', 'Roosevelt - Tomster', 'Seattle',
        'Davis Vantage Pro2'],
       ['KWASEATT295', 'Haller Lake', 'Seattle',
        'Davis Vantage Vue (Wireless)'],
       ['KWASEATT313', 'Fauntleroy', 'Seattle',
        'AcuRite Pro Weather Center'],
       ..., 
       ['MBNBMD', '', 'Seattle', ''],
       ['ME6711', '', 'Seattle', ''],
       ['MXGGN', '', 'Seattle', '']], 
      dtype='<U66')

### requests

In [27]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.wunderground.com/weatherstation/ListStations.asp?selectedState=WA&selectedCountry=United+States&MR=1"

raw_site_content = requests.get(url).content
soup = BeautifulSoup(raw_site_content, 'html.parser')

In [82]:
list_stations_info = soup.find_all("tr")  # one text element in list for each station

all_stations = np.array(['id','neighborhood','city','type'])

for i in range(1, len(list_stations_info)):  # start at 1 to omit first element (col headers)
    
    station_info = str(list_stations_info[i]).splitlines()
    
    # pull out station info
    station_id = station_info[1].split('ID=')[1].split('"')[0].strip()
    station_neighborhood = station_info[2].split('<td>')[1].split('\xa0')[0].strip()
    station_city = station_info[3].split('<td>')[1].split('\xa0')[0].strip()
    station_type = station_info[4].split('station-type">')[1].split('\xa0')[0].strip()

    all_stations = np.vstack([all_stations,[station_id,station_neighborhood,station_city,station_type]])


### urllib3

In [277]:
import urllib3
http = urllib3.PoolManager()
raw_site_urllib3 = http.request('GET', url)

raw_site_content = raw_site_urllib3.data;



In [77]:
#for link in soup.find_all('a'):
#    print(link.get('href'))

/login.asp
/member/registration
javascript:void(0);
/member/membersettings.html
/email/mainmenu.php
/webcams/signup.html
/email/logout.php
#
javascript:void(0);
None
None
/cgi-bin/findweather/getForecast?setpref=EXPFCT&value=1&referer=%2fweatherstation%2fListStations%2easp%3fselectedState%3dWA%26selectedCountry%3dUnited%2bStates%26MR%3d1
/cgi-bin/findweather/getForecast?setpref=EXPFCT&value=0&referer=%2fweatherstation%2fListStations%2easp%3fselectedState%3dWA%26selectedCountry%3dUnited%2bStates%26MR%3d1
/about/data.asp#differences
javascript:void(0);
javascript:void(0);
/weather-radar/
/wundermap/?radar=1
/wundermap/?sat=1
/wundermap/
/maps/
/ndfdimage/viewimage/
/maps/catalog/
javascript:void(0);
javascript:void(0);
/severe.asp
/severe/europe.asp
/hurricane/
/severeconvective.asp
/wundermap?fire=1
/prepare/
/email/emailsettings.asp
javascript:void(0);
javascript:void(0);
/cat6
/blog/
/news/
/weather-infographics/
/weather-posters/
javascript:void(0);
javascript:void(0);
/wximage/
/web

# SCRAPE OBSERVATION DATA

In [1]:
start_date = 20160909
end_date = 20160912

In [2]:
import pandas as pd

start_date = str(start_date)
start_date_yyyy = int(start_date[0:4])
start_date_mm = int(start_date[4:6])
start_date_dd = int(start_date[6:8])

end_date = str(end_date)
end_date_yyyy = int(end_date[0:4])
end_date_mm = int(end_date[4:6])
end_date_dd = int(end_date[6:8])

start_date = pd.datetime(start_date_yyyy, start_date_mm, start_date_dd)
end_date = pd.datetime(end_date_yyyy, end_date_mm, end_date_dd)

date_range = pd.date_range(start_date, end_date)

In [63]:
day = date_range[0]

In [69]:
day.year

2016

In [10]:
import requests
import csv
import os

temp_file_name = 'temp_csv.csv'
url = 'https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=KWAEDMON15&day=10&month=9&year=2016&graphspan=day&format=1'
download = requests.get(url)

# with open(temp_file_name, 'w') as temp_file:
#     temp_file.writelines(download.text)

In [18]:
temp = download.text
type(temp)
temp = temp.replace("\n","")
temp = temp.splitlines()
temp

['Time,TemperatureF,DewpointF,PressureIn,WindDirection,WindDirectionDegrees,WindSpeedMPH,WindSpeedGustMPH,Humidity,HourlyPrecipIn,Conditions,Clouds,dailyrainin,SoftwareType,DateUTC<br>2016-09-10 00:00:00,52.9,48.8,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:00:00,<br>2016-09-10 00:05:00,52.8,48.7,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:05:00,<br>2016-09-10 00:10:00,52.6,48.5,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:10:00,<br>2016-09-10 00:15:00,52.4,48.6,30.20,N/A,-737280,0.0,-999.0,87,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:15:00,<br>2016-09-10 00:20:00,52.3,48.9,30.19,N/A,-737280,0.0,-999.0,88,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:20:00,<br>2016-09-10 00:25:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:25:00,<br>2016-09-10 00:30:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:

In [20]:
import urllib3
from bs4 import BeautifulSoup as Soup

http = urllib3.PoolManager()
r = http.request('GET', url)
r.data



b'\nTime,TemperatureF,DewpointF,PressureIn,WindDirection,WindDirectionDegrees,WindSpeedMPH,WindSpeedGustMPH,Humidity,HourlyPrecipIn,Conditions,Clouds,dailyrainin,SoftwareType,DateUTC<br>\n2016-09-10 00:00:00,52.9,48.8,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:00:00,\n<br>\n2016-09-10 00:05:00,52.8,48.7,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:05:00,\n<br>\n2016-09-10 00:10:00,52.6,48.5,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:10:00,\n<br>\n2016-09-10 00:15:00,52.4,48.6,30.20,N/A,-737280,0.0,-999.0,87,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:15:00,\n<br>\n2016-09-10 00:20:00,52.3,48.9,30.19,N/A,-737280,0.0,-999.0,88,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:20:00,\n<br>\n2016-09-10 00:25:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:25:00,\n<br>\n2016-09-10 00:30:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunder

In [180]:
temp.replace("<br>","")

'\nTime,TemperatureF,DewpointF,PressureIn,WindDirection,WindDirectionDegrees,WindSpeedMPH,WindSpeedGustMPH,Humidity,HourlyPrecipIn,Conditions,Clouds,dailyrainin,SoftwareType,DateUTC\n2016-09-10 00:00:00,52.9,48.8,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:00:00,\n\n2016-09-10 00:05:00,52.8,48.7,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:05:00,\n\n2016-09-10 00:10:00,52.6,48.5,30.20,N/A,-737280,0.0,-999.0,86,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:10:00,\n\n2016-09-10 00:15:00,52.4,48.6,30.20,N/A,-737280,0.0,-999.0,87,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:15:00,\n\n2016-09-10 00:20:00,52.3,48.9,30.19,N/A,-737280,0.0,-999.0,88,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:20:00,\n\n2016-09-10 00:25:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:25:00,\n\n2016-09-10 00:30:00,52.3,49.5,30.19,N/A,-737280,0.0,-999.0,90,0.00,,,0.00,Wunderground v.1.15,2016-09-10 07:3

In [174]:
# temp = download.text.split('<br>')
# temp = [ line.strip() for line in temp ]
# temp

In [4]:
def scrape_data(station_id="KWAEDMON15", year=2017, month=4, day=18):

    test_url = "https: // www.wunderground.com / weatherstation / WXDailyHistory.asp?" \
               "ID = KWAEDMON15 & day = 18 & month = 4 & year = 2017 & graphspan = day & format = 1"

    filename = station_id + "_" + str(year) + str(month) + str(day)

    print(filename)

In [5]:
scrape_data()

KWAEDMON15_2017418


In [28]:
import requests

In [22]:
station_id="KWAEDMON15"
year=2017
month=4
day=18

print("https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={}&day={}&month={}&year={}&graphspan=day&format=1".format(station_id, day, month, year))

https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=KWAEDMON15&day=18&month=4&year=2017&graphspan=day&format=1


In [27]:
station_id="KWAEDMON15"
year=2017
month=4
day=18

url = "https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={}&day={}&month={}&year={}&graphspan=day&format=1".format(station_id, day, month, year)
print(url)

https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=KWAEDMON15&day=18&month=4&year=2017&graphspan=day&format=1
