In [1]:
!pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote
import pandas as pd
import json
from concurrent.futures import ThreadPoolExecutor

In [3]:
state_to_fips = {
    'ALABAMA': '01',
    'ALASKA': '02',
    'ARIZONA': '04',
    'ARKANSAS': '05',
    'CALIFORNIA': '06',
    'COLORADO': '08',
    'CONNECTICUT': '09',
    'DELAWARE': '10',
    'FLORIDA': '12',
    'GEORGIA': '13',
    'HAWAII': '15',
    'IDAHO': '16',
    'ILLINOIS': '17',
    'INDIANA': '18',
    'IOWA': '19',
    'KANSAS': '20',
    'KENTUCKY': '21',
    'LOUISIANA': '22',
    'MAINE': '23',
    'MARYLAND': '24',
    'MASSACHUSETTS': '25',
    'MICHIGAN': '26',
    'MINNESOTA': '27',
    'MISSISSIPPI': '28',
    'MISSOURI': '29',
    'MONTANA': '30',
    'NEBRASKA': '31',
    'NEVADA': '32',
    'NEW HAMPSHIRE': '33',
    'NEW JERSEY': '34',
    'NEW MEXICO': '35',
    'NEW YORK': '36',
    'NORTH CAROLINA': '37',
    'NORTH DAKOTA': '38',
    'OHIO': '39',
    'OKLAHOMA': '40',
    'OREGON': '41',
    'PENNSYLVANIA': '42',
    'RHODE ISLAND': '44',
    'SOUTH CAROLINA': '45',
    'SOUTH DAKOTA': '46',
    'TENNESSEE': '47',
    'TEXAS': '48',
    'UTAH': '49',
    'VERMONT': '50',
    'VIRGINIA': '51',
    'WASHINGTON': '53',
    'WEST VIRGINIA': '54',
    'WISCONSIN': '55',
    'WYOMING': '56',
    'DISTRICT OF COLUMBIA': '11',
    'PUERTO RICO': '72',
    'GUAM': '66',
    'AMERICAN SAMOA': '60',
    'U.S. VIRGIN ISLANDS': '78',
    'NORTHERN MARIANA ISLANDS': '69'
}



In [26]:
def create_noaa_url(state_fips, start_date, end_date, state):

    def format_date_component(date_component):
      """Ensure date components are two digits."""
      if len(date_component) == 1:
          return '0' + date_component
      return date_component
    # Split the dates to extract year, month, and day components
    start_year, start_month, start_day = start_date.split('-')
    end_year, end_month, end_day = end_date.split('-')

    start_month = format_date_component(start_month)
    start_day = format_date_component(start_day)
    end_month = format_date_component(end_month)
    end_day = format_date_component(end_day)

    base_url = 'https://www.ncdc.noaa.gov/stormevents/listevents.jsp?'
    url = (f"{base_url}eventType=ALL"
           f"&beginDate_mm={start_month}&beginDate_dd={start_day}&beginDate_yyyy={start_year}"
           f"&endDate_mm={end_month}&endDate_dd={end_day}&endDate_yyyy={end_year}"
           f"&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000"
           f"&sort=DT&submitbutton=Search&statefips={state_fips}%2C{state}")

    return url

In [27]:
state = 'NEW YORK'

start_date = '01/12/2020'
url = create_noaa_url(state_to_fips[state], start_date, start_date, quote(state))
print(url)

ValueError: not enough values to unpack (expected 3, got 1)

In [28]:
def mine_info(url):
    session = requests.Session()
    response = requests.get(url)
    event_list = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', id='results')
        table_rows = table.find_all('tr')
        
        # For each event. A resutls table contains a list of events with a link to more data
        for row in table_rows:
            # Open a dictionary to store the data for an event
            data_table = {}
            cells = row.find_all('td')
            if cells:  # Make sure cells list is not empty
                first_cell_link = cells[0].find('a')
                if first_cell_link and first_cell_link.has_attr('href') and len(cells) > 9:

                    county_zone = cells[1].text
                    tz = cells[5].text
                    event_type = cells[6].text
                    prd = cells[10].text
                    
                    href = first_cell_link['href'].strip(' ')
                    
                    # Use urljoin to resolve relative URL
                    full_url = urljoin('https://www.ncdc.noaa.gov/stormevents/', href)

                    link_response = requests.get(full_url)
                    
                    # Super dirty error handling - looking to fix later
                    
                    if link_response.status_code == 200:
                        linked_soup = BeautifulSoup(link_response.text, 'html.parser')
                        
                        if (linked_soup):
                            rows = linked_soup.find_all('tr')
                            event_narrative = rows[len(rows) - 1].get_text(strip=True)
                            data_table['event narrative'] = event_narrative
                        else:
                            data_table['event narrative'] = None
                            
                        data_table['county/zone'] = county_zone
                        data_table['tz'] = tz
                        data_table['event type'] = event_type
                        data_table['prd'] = prd
                        event_list.append(data_table)
                    else:
                        print(f"Failed to retrieve the linked webpage, status code: {link_response.status_code}")
                        return link_response.status_code
        return json.dumps(event_list)
    else:
        print("Failed to retrieve the webpage, status code: ", response.status_code)
        return response.status_code

In [29]:
#info = mine_info(url)
#print(info)

In [30]:
# We can now synthesize these to add description for multiple events
df = pd.read_csv('../tables/state_regularized_v2.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,Event Type,Valid Date,Valid Time,State,Cities,Counties,Dirty
0,1,Ice Storm,2002-01-29,,,Kansas City,,1.0
1,0,Ice Storm,2002-01-30,6:00:00,Oklahoma,,,
2,2,Ice Storm,2002-01-30,16:00:00,Missouri,,,
3,3,Severe Weather,2002-03-09,0:00:00,Michigan,,,1.0
4,4,Fire,2002-07-20,12:40:00,New York,,,
5,5,Severe Weather,2002-08-28,14:09:00,Florida,,,
6,6,Hurricane Lily,2002-10-03,3:33:00,Louisiana,,,1.0
7,7,Winter Storm,2002-11-06,22:00:00,California,,,1.0
8,8,Ice Storm,2002-11-17,6:00:00,Connecticut,,,1.0
9,9,Ice Storm,2002-12-03,18:30:00,Arkansas,,,


In [31]:
def get_description(row):

    try:
      if (type(row['State']) != str):
        return None
      state_upper = row['State'].upper()

      # Construct the URL from date and state (this is hypothetical and needs to match your URL scheme)
      url = create_noaa_url(state_to_fips[state_upper], row['Valid Date'], row['Valid Date'], quote(state_upper))

      print(url)

      # Call mine_info and get description
      description = mine_info(url)

      # Return the description
      return description
    except:
      print('Failed on ' + row['State'] + ' ' + str(row['Valid Date']))
      return 'FAILED'

In [None]:
# Spawn 16 workers
# switching and scheduling
workers = 16

with ThreadPoolExecutor(max_workers=workers) as executor:
    rows_list = [row for _, row in df.iterrows()]
    result = list(executor.map(get_description, rows_list))

# Assign the result back to the DataFrame
df['description'] = result

https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=01&beginDate_dd=30&beginDate_yyyy=2002&endDate_mm=01&endDate_dd=30&endDate_yyyy=2002&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=40%2COKLAHOMAhttps://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=01&beginDate_dd=30&beginDate_yyyy=2002&endDate_mm=01&endDate_dd=30&endDate_yyyy=2002&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=29%2CMISSOURI

https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=03&beginDate_dd=09&beginDate_yyyy=2002&endDate_mm=03&endDate_dd=09&endDate_yyyy=2002&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=26%2CMICHIGAN
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=07&beginDate_dd=20&beginDate_yyyy=2002&endDate_mm=07&endDate_dd=20&endDate_yyyy=2002&county=ALL&hailfilt

https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=09&beginDate_dd=18&beginDate_yyyy=2003&endDate_mm=09&endDate_dd=18&endDate_yyyy=2003&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=24%2CMARYLAND
Failed on Pennsylvania, Maryland, Virginia 2003-09-18
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=09&beginDate_dd=18&beginDate_yyyy=2003&endDate_mm=09&endDate_dd=18&endDate_yyyy=2003&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=37%2CNORTH%20CAROLINA
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=09&beginDate_dd=18&beginDate_yyyy=2003&endDate_mm=09&endDate_dd=18&endDate_yyyy=2003&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=24%2CMARYLAND
Failed on All PPL includingDIRTY 2003-09-18
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_m

https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=05&beginDate_dd=21&beginDate_yyyy=2004&endDate_mm=05&endDate_dd=21&endDate_yyyy=2004&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=39%2COHIO
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=06&beginDate_dd=01&beginDate_yyyy=2004&endDate_mm=06&endDate_dd=01&endDate_yyyy=2004&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=48%2CTEXAS
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=06&beginDate_dd=02&beginDate_yyyy=2004&endDate_mm=06&endDate_dd=02&endDate_yyyy=2004&county=ALL&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=40%2COKLAHOMA
https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=ALL&beginDate_mm=06&beginDate_dd=02&beginDate_yyyy=2004&endDate_mm=06&endDate_dd=02&endDate_yyyy=2004&county=ALL&hailfilter=0.00

In [11]:
df.to_csv('all_weather_events_labeled.csv')

In [12]:
#for m in gai.list_models():
#  if 'generateContent' in m.supported_generation_methods:
#    print(m)

In [13]:
#def send_prompt(model, text):
#  response = model.generate_content([text])
#  print(response)
#  return response

In [14]:
#model = gai.GenerativeModel('gemini-1.0-pro')

#response = send_prompt(model, "Hi")
#print(response)

In [15]:
#response.text