# Save the weather details and drought information for every county that had a wildfire

## Install the Python requirements

In [None]:
pip install pandas==1.3.2
pip install tqdm==4.62.2
pip install psycopg2-binary==2.9.1
pip install sqlalchemy==1.4.23

## Import the required packages

In [39]:
import numpy as np
import pandas as pd
import re
import io
import json
import requests
import psycopg2
import http.client
import os

from io import StringIO
from datetime import date
from datetime import timedelta
from requests.exceptions import ConnectionError

from tqdm.auto import tqdm  # for notebooks
from amazon_cred import ENDPOINT, PORT, USER, PASSWORD, DATABASE

from data_constants import ZONE_COUNTY_CORR_URL

## Constant URLs, storm categories, and strings

In [5]:
# Base URL of the U.S. Drought Monitor county-level table export; callers
# append the requested date string to it.
USDM_CSV_BASE_URL = 'https://droughtmonitor.unl.edu/DmData/GISData.aspx?mode=table&aoi=county&date='

# Token sent in the 'token' header of NCDC web-service requests. Prefer the
# NCDC_TOKEN environment variable so the secret does not live in the notebook;
# the literal below (which appears to be a redaction placeholder) is only the
# fallback when the variable is unset.
NCDC_TOKEN = os.environ.get('NCDC_TOKEN', '***REMOVED***')
# Host name and request path used to list the weather stations of one county;
# the county FIPS code is appended to the path.
NCDC_BASE_URL = 'www.ncdc.noaa.gov'
NCDC_WEATHER_STATIONS_LST_URL = '/cdo-web/api/v2/stations?locationid=FIPS:'

In [38]:
# Load the pipe-delimited zone/county correlation file (it has no header row,
# so the column names are supplied here) and keep only the county FIPS code
# and its LAT/LON, which are later used as the county position when ranking
# weather stations by distance.
df_zone_county = pd.read_csv(
    ZONE_COUNTY_CORR_URL,
    sep='|',
    header=None,
    names=[
        'STATE', 'ZONE', 'CWA', 'NAME', 'STATE_ZONE', 'CZ_NAME',
        'FIPS', 'TIME_ZONE', 'FE_AREA', 'LAT', 'LON',
    ],
).loc[:, ['FIPS', 'LAT', 'LON']]

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points given in degrees.

    Uses the haversine formula on a sphere of radius 3959 (presumably
    Earth's mean radius in miles, which would make the result miles).
    Only NumPy ufuncs and arithmetic are used, so scalars and array-likes
    are both accepted.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in the unit of the radius.
    """
    sphere_radius = 3959

    phi1 = np.deg2rad(lat1)
    lam1 = np.deg2rad(lon1)
    phi2 = np.deg2rad(lat2)
    lam2 = np.deg2rad(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine of the central angle between the two points.
    hav_angle = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav_angle))
    return sphere_radius * central_angle

def closestn(df_stations, county, n=3):
    """Return the ``n`` stations closest to ``county``.

    The distance is computed for all stations in a single vectorised
    ``haversine`` call instead of a row-by-row ``DataFrame.apply``; that
    avoids one Python-level call per station and keeps the original column
    dtypes, because rows are never converted to object Series.

    Args:
        df_stations: DataFrame with ``latitude`` and ``longitude`` columns
            (degrees).
        county: Mapping/Series providing ``LAT`` and ``LON`` (degrees).
        n: Number of nearest stations to return.

    Returns:
        A new DataFrame holding the (at most) ``n`` nearest rows of
        ``df_stations`` with an added ``hav`` column containing the distance
        returned by ``haversine``, ordered from nearest to farthest.
        ``df_stations`` itself is not modified.
    """
    distances = haversine(df_stations['latitude'], df_stations['longitude'],
                          county['LAT'], county['LON'])
    return df_stations.assign(hav=distances).nsmallest(n, ['hav'])

def donwload_climate_info(station_number, start_date, end_date):
    """Download the NCEI daily-summaries data for one station and date range.

    The request is retried indefinitely until it succeeds.  Both connection
    failures and timeouts are retried: with ``timeout=5`` a slow response
    raises ``requests.exceptions.ReadTimeout``, which is *not* a
    ``ConnectionError``, so catching only ``ConnectionError`` let a single
    slow response abort the whole run.  A short pause between attempts keeps
    the loop from hammering the service while it is unreachable.

    Args:
        station_number: Station identifier placed in the ``stations`` query
            parameter.
        start_date: First day to request, formatted as the API expects
            (callers in this file pass ``YYYY-MM-DD`` strings).
        end_date: Last day to request, same format as ``start_date``.

    Returns:
        The raw response body as ``bytes``.  The HTTP status code is not
        checked, so an error page is returned as-is.
    """
    import time  # local import: only needed for the pause between retries

    retry_delay_seconds = 1

    # NOTE(review): the "v1." path segment (with the trailing dot) is kept
    # exactly as it was; confirm against the NCEI API docs whether "v1" is meant.
    api =  f'https://www.ncei.noaa.gov/access/services/data/v1.?dataset=daily-summaries&stations={station_number}'
    api += f'&startDate={start_date}&endDate={end_date}&units=metric'
    while True:
        try:
            res = requests.get(api, timeout=5)
        except (ConnectionError, requests.exceptions.Timeout):
            print('Connection error')
            time.sleep(retry_delay_seconds)
        else:
            return res.content


def _fetch_wildfire_events(year):
    """Return the rows of the ``counties`` table that are wildfires in ``year``."""
    conn = psycopg2.connect(
        host=ENDPOINT,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    try:
        # In psycopg2 ``with conn`` only scopes the transaction; the
        # connection itself is closed by the ``finally`` clause, even when
        # the query raises.
        with conn:
            select = ('SELECT * from counties '
                      'WHERE "Year" = %(year)s AND "EVENT_TYPE" = %(event_type)s')
            # Values are passed as driver parameters instead of being
            # formatted into the SQL text.
            return pd.read_sql(select, con=conn,
                               params={'year': int(year), 'event_type': 'Wildfire'})
    finally:
        conn.close()


def _fetch_station_list(fips, start_date_str, end_date_str):
    """Return the decoded JSON station listing for the county ``fips``."""
    headers = {'token': NCDC_TOKEN}
    conn_http = http.client.HTTPSConnection(NCDC_BASE_URL)
    try:
        # list of weather stations using NCDC api: https://www.ncdc.noaa.gov/cdo-web/webservices/v2#gettingStarted
        conn_http.request('GET', NCDC_WEATHER_STATIONS_LST_URL + fips +
                          f'&startdate={start_date_str}&enddate={end_date_str}&limit=100' +
                          '&dataset=GHCND&datacategoryid=TEMP,PRCP', # Daily summaries of temperature and precipitation
                          headers=headers)
        return json.loads(conn_http.getresponse().read())
    finally:
        # One connection is opened per county; close it so sockets do not pile up.
        conn_http.close()


def _first_usable_weather(df_closestn, start_date_str, end_date_str):
    """Return weather data from the first station with enough records, else ``None``."""
    for _, station in df_closestn.iterrows():
        # Station ids look like "<prefix>:<number>"; the download API wants the number.
        station_number = station['id'].split(':')[1]
        response = donwload_climate_info(station_number, start_date_str, end_date_str)

        df_weather = pd.read_csv(StringIO(response.decode("UTF-8")))[
            ['DATE', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']]
        df_complete = df_weather.dropna(subset=['TMIN', 'TMAX', 'PRCP'])
        if df_complete.shape[0] <= (0.5*df_weather.shape[0]): # 50 % of the records missing!
            continue
        df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
        return df_weather
    return None


def getWildFireEvents(year):
    """Save drought and weather data for every wildfire county event of ``year``.

    For each wildfire row read from the ``counties`` table a directory
    ``<year>/<MM_DD>_<FIPS>`` is created containing:

    * ``usdm.csv``    - the drought-monitor table for the most recent Tuesday
      on or before the event date;
    * ``weather.csv`` - daily PRCP/SNOW/SNWD/TMAX/TMIN from the nearest
      station with enough data, covering 1 January of the year roughly ten
      years before that Tuesday up to the event date, with missing days
      filled with NaN.

    Events whose directory already exists are skipped, so an interrupted run
    can be restarted.  Compared with the previous implementation:

    * the year directory is created with ``exist_ok=True`` so a restart no
      longer fails with ``FileExistsError``;
    * when no station passes the completeness check, no ``weather.csv`` is
      written (previously the data of the last rejected station was written
      as an all-NaN file);
    * an event whose county has no coordinates is skipped instead of raising
      ``IndexError``;
    * the database and HTTPS connections are always closed.

    Args:
        year: Year whose wildfire events are processed.

    Returns:
        None.  Results are written to disk.
    """
    df_counties = _fetch_wildfire_events(year)

    os.makedirs(str(year), exist_ok=True)
    for _, row in tqdm(df_counties.iterrows()):
        today = row['EVENT_DATE']
        dirname = str(year) + '/' + today.strftime('%m_%d') + '_' + row['FIPS']
        if os.path.exists(dirname):
            continue # we already have the county weather

        os.mkdir(dirname)

        # weekday() is 0 for Monday, so this is the number of days since the
        # most recent Tuesday (0 when the event itself is on a Tuesday).
        offset = (today.weekday() - 1)%7
        last_tuesday = today - timedelta(days=offset)
        twelve_years_past = last_tuesday - timedelta(days=12*365.25) # request for additional data
        twelve_years_str = twelve_years_past.strftime('%Y-%m-%d')
        last_tuesday_str = last_tuesday.strftime('%Y-%m-%d')

        usdm_response = requests.get(USDM_CSV_BASE_URL+last_tuesday_str)
        df_usdm = pd.read_csv(StringIO(usdm_response.content.decode('utf-8')))
        df_usdm.to_csv(dirname + '/' + 'usdm.csv')

        df_usdm_merged = df_usdm.merge(df_zone_county, on=['FIPS'])
        wildfire_county_df = df_usdm_merged[df_usdm_merged['FIPS'] == int(row['FIPS'])]

        # Precipitation data
        data = _fetch_station_list(row['FIPS'], twelve_years_str, last_tuesday_str)
        try:
            df_stations = pd.DataFrame(data["results"])
        except KeyError:
            print("No weather stations found ", row['NAME'])
            continue

        # Keep only stations whose record spans the whole requested period.
        df_stations['mindate'] = pd.to_datetime(df_stations['mindate'])
        df_stations['maxdate'] = pd.to_datetime(df_stations['maxdate'])
        df_stations = df_stations[(df_stations['mindate'] <= twelve_years_past) &
                                  (df_stations['maxdate'] >= last_tuesday)]
        if not df_stations.shape[0]:
            continue

        if wildfire_county_df.empty:
            # Without LAT/LON for the county the stations cannot be ranked.
            print("No county coordinates found for ", row['NAME'])
            continue

        # Weather is requested from 1 January of the year ten years back.
        last_10_yr_date = last_tuesday - timedelta(days=10*365.25)
        last_10_yr_date = pd.to_datetime(str(last_10_yr_date.year) + '-01-01')
        last_10_yr_date_str = last_10_yr_date.strftime('%Y-%m-%d')

        df_closestn = closestn(df_stations, wildfire_county_df.iloc[0])
        df_weather = _first_usable_weather(df_closestn, last_10_yr_date_str, last_tuesday_str)
        if df_weather is None:
            print("No weather record found for ", row['NAME'], " out of ", df_closestn.shape[0])
            continue

        # Fill in the missing dates with NaN.  The index runs up to the event
        # date even though data was only requested up to the last Tuesday.
        df_weather.set_index('DATE', inplace=True)
        idx = pd.date_range(last_10_yr_date_str, today.strftime('%Y-%m-%d'))
        df_weather = df_weather.reindex(idx, fill_value=np.nan)
        df_weather.reset_index(inplace=True)
        df_weather.rename(columns={'index':'DATE'}, inplace=True)

        # Save it to a CSV file instead of database
        df_weather.to_csv(dirname + '/weather.csv', index = False)

# Download the data one year at a time. range() excludes its stop value, so
# with these settings the years 2000 through 2010 are processed.
start_year = 2000
end_year = 2011
for i in tqdm(range(start_year, end_year)):
    getWildFireEvents(i)

  0%|          | 0/11 [00:00<?, ?it/s]

0it [00:00, ?it/s]

No weather record found for  SAN MIGUEL, NEW MEXICO  out of  3


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

No weather record found for  FORSYTH, GEORGIA  out of  2
No weather record found for  DOUGLAS, GEORGIA  out of  2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

No weather stations found  NORTHAMPTON, PENNSYLVANIA
No weather record found for  GREATER LAKE TAHOE AREA, CALIFORNIA  out of  3


0it [00:00, ?it/s]

No weather record found for  WISE, TEXAS  out of  3


0it [00:00, ?it/s]

0it [00:00, ?it/s]