# Libraries and Settings

#### Libraries

In [1]:
##########==========##########==========##########==========##########==========

## general purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request as url

from datetime       import datetime
from bs4            import BeautifulSoup
from os             import listdir, mkdir
from os.path        import isdir, isfile
from geopy.distance import distance


## time stamp functions
set_timing = dict()
def time_check(id_str = 'ZZ'):
    """Record a wall-clock time stamp under the label `id_str`.

    The stamp is stored as formatted text in the module-level `set_timing`
    dict, keyed by `id_str` (a repeated label overwrites its earlier stamp).
    When `id_str` is 'ZZ' (the default) every stamp recorded so far is
    printed, in the order the labels were first recorded.

    Returns None.
    """
    ## HH:MM:SS, each field zero-padded to two digits
    clock = datetime.now().strftime('%H:%M:%S')
    set_timing[id_str] = 'Time Check {0} = {1}'.format(id_str.ljust(4), clock)

    ## only the closing label triggers the report
    if id_str != 'ZZ':
        return
    for stamp in set_timing.values():
        print(stamp)

time_check('AA')

#### Settings

In [2]:
## url source of the data; the {0} slot is filled with a year from set_year
set_url = "https://www.ncei.noaa.gov/data/local-climatological-data/access/{0}/"
set_year = range(2012, 2022)  ## 2012 through 2021; the stop value is excluded
## only files whose names start with one of these two characters are rostered
set_prefix = ['69', '72', '74', '91', '99']

## cache settings
set_dev_mode = True
set_cache = dict(
    hard_reset = False,
    testing = True,
    station_roster = True,
    collection = True,
    compilation = True,
    modeling = True,
)

## defines dtype for the columns in the raw weather data files
set_col_list = dict(
    DATE = str,
    LATITUDE = float,
    LONGITUDE = float,
    ELEVATION = float,
    HourlyDryBulbTemperature = float,
    HourlyPrecipitation = float,
)

## set modeling parameters
set_param = dict(window = 14, mid_summer = 213)

#### set up file system (if needed)

In [3]:
## make directories that script expects
def make_directories():
    """Create the folders this notebook reads from and writes to.

    Folders that already exist are left untouched. The paths are relative
    to the current working directory. Returns None.
    """
    ## parents are listed before children: mkdir does not create
    ## intermediate folders, so 'B_Process' must exist before 'downloads'
    expected = ('A_Input', 'B_Process', 'C_Output', 'B_Process/downloads')
    for folder in expected:
        if isdir(folder):
            continue
        mkdir(folder)

make_directories()

# Import Data

#### retrieve city roster file

In [4]:
## column name -> dtype for the city roster sheet. read_excel's `usecols`
## only selects columns (a dict passed there contributes just its keys), so
## the types have to be handed over separately through `dtype`
city_col_types = {'City': str, 'Route': str, 'lon': float, 'lat': float}
city_roster = pd.read_excel('A_Input/city_list.xlsx',
    usecols = list(city_col_types), dtype = city_col_types)

#### retrieve links page for the most recent year

In [5]:
def make_data_roster(the_year, the_url = set_url, valid_prefix = set_prefix):
    """Build a roster of the station csv files listed for one year.

    Parameters
    ----------
    the_year : value substituted into the `{0}` slot of `the_url`
    the_url : url template of the yearly directory listing
    valid_prefix : collection of two-character file name prefixes to keep

    Returns
    -------
    pandas.DataFrame with one row per retained csv file and the columns
    'prefix' (first two characters of the file name), 'file', 'lon', 'lat'
    and 'dist'; the last three are filled with NaN. The index is a fresh
    0..n-1 range.
    """

    ## retrieve raw web page (the with-block closes the connection on exit)
    with url.urlopen(the_url.format(the_year)) as conn:
        raw_page = conn.read()

    ## extract links to csv files; the parser is named explicitly so the
    ## result does not depend on which optional parsers happen to be installed
    soup = BeautifulSoup(raw_page, 'html.parser')
    ## .string is None for an anchor that wraps nested tags, so skip those
    ## instead of failing on None.find
    csv_files = [str(a.string) for a in soup.find_all('a')
                 if a.string is not None and '.csv' in a.string]

    ## package as pandas files; limit to valid prefix
    roster = pd.DataFrame({
        'prefix': [name[0:2] for name in csv_files], 'file': csv_files,
        'lon': np.nan, 'lat': np.nan, 'dist': np.nan})
    roster = roster[roster.prefix.isin(valid_prefix)]

    ## renumber the surviving rows from zero and discard the old index
    return roster.reset_index(drop = True)

## execute code
data_roster = make_data_roster(set_year[-1])
## dev mode keeps only rows labelled 0 through 9 (.loc slices include the end
## label). The copy makes the subset an independent frame, so the in-place
## writes that filter_to_proximate_data makes to it are not flagged with
## SettingWithCopyWarning.
if set_dev_mode: data_roster = data_roster.loc[0:9, ].copy()

#### determine which stations are near route cities

In [6]:
def read_station_data(x, ucl = set_col_list):
    """Read one station csv and coerce its numeric columns.

    Parameters
    ----------
    x : path or url handed to pandas.read_csv
    ucl : dict mapping column name -> type; its keys pick the columns to
        read and the columns mapped to `float` are converted to numbers

    Returns
    -------
    pandas.DataFrame limited to the columns of `ucl`, with 'DATE' parsed as
    dates and the float columns numeric (unparseable values become NaN).
    """
    ## everything is read as text first so that malformed numbers cannot
    ## break the read; they are coerced to NaN below
    station = pd.read_csv(x, usecols = ucl, parse_dates = ['DATE'], dtype = str)
    float_cols = [col for col, kind in ucl.items() if kind == float]
    for col in float_cols:
        station[col] = pd.to_numeric(station[col], errors = 'coerce')
    return station

def filter_to_proximate_data(
    dat_rost = data_roster, the_url = set_url, city = city_roster):
    """Measure each station's distance to the nearest rostered city.

    For every row of `dat_rost` the station file for the last year of
    `set_year` is downloaded, the coordinates on its first row are compared
    with every row of `city`, and the smallest distance, rounded to whole
    miles, is written to the 'dist' column.

    Parameters
    ----------
    dat_rost : DataFrame with a 'file' column; modified in place
    the_url : url template with a `{0}` slot for the year
    city : DataFrame with 'lat' and 'lon' columns

    Returns
    -------
    `dat_rost` itself, after it has also been written to
    'B_Process/station_roster.csv'. Stations whose file is empty or whose
    first row lacks a coordinate keep their existing 'dist' value.
    """

    ## the city coordinates do not change between stations, so build the
    ## (lat, lon) tuples once instead of once per station
    city_points = [tuple(city.loc[j, ['lat', 'lon']]) for j in city.index]

    for i in dat_rost.index:

        ## download file and coerce numeric columns to numeric; the url comes
        ## from the `the_url` argument rather than the global template
        full_url = the_url.format(set_year[-1]) + dat_rost.loc[i, 'file']
        the_csv = read_station_data(full_url)

        ## a station without usable coordinates cannot be measured; skip it
        ## rather than abort the whole run
        if the_csv.empty: continue
        station_point = the_csv.loc[0, ['LATITUDE', 'LONGITUDE']]
        if station_point.isna().any(): continue
        station_point = tuple(station_point)

        ## find distance to closest rostered metro area
        min_dist = 999999
        for city_point in city_points:
            min_dist = min(min_dist, distance(station_point, city_point).miles)
        dat_rost.loc[i, 'dist'] = np.round(min_dist)

    ## save roster file to disk; the row index is left out so that reading
    ## the file back does not add an 'Unnamed: 0' column
    dat_rost.to_csv('B_Process/station_roster.csv', index = False)
    return dat_rost
    
## rebuild the roster when caching is switched off or no saved copy exists
if not set_cache['station_roster'] or not isfile('B_Process/station_roster.csv'):
    station_roster = filter_to_proximate_data()
else:
    ## keep 'prefix' as text ('69', not 69), as in a freshly built roster
    station_roster = pd.read_csv(
        'B_Process/station_roster.csv', dtype = {'prefix': str})
    ## a roster saved together with its row index comes back with stray
    ## 'Unnamed: N' columns; drop them so they do not pile up on every save
    stray_cols = [c for c in station_roster.columns if c.startswith('Unnamed')]
    station_roster = station_roster.drop(columns = stray_cols)

#### retrieve data from every year for proximate stations

In [7]:
def retrieve_station_data(
    roster = station_roster, url = set_url, year = set_year, ucl = set_col_list,
    max_dist = 30):
    """Download the yearly data files of every station close to a city.

    For each year a boolean column named after the year is kept in `roster`;
    it is True once that year's file of the station has been saved. Only
    stations with 'dist' below `max_dist` whose flag is still False are
    downloaded, so a re-run resumes where the previous one stopped.

    Parameters
    ----------
    roster : DataFrame with 'file', 'dist', 'lon' and 'lat' columns;
        modified in place
    url : url template with a `{0}` slot for the year. Inside this function
        the name hides the module-level `url` alias of urllib.request.
    year : indexable sequence of years
    ucl : column -> type mapping forwarded to read_station_data
    max_dist : stations at this distance or farther are ignored

    Returns
    -------
    None. Files are written to 'B_Process/downloads/<year>_<file>' and the
    roster is saved to 'B_Process/station_roster.csv' after every attempt.
    """

    ## download all relevant data files
    for i in year:
        year_col = str(i)
        if year_col not in roster.columns: roster[year_col] = False
        needed_file = (roster.dist < max_dist) & ~roster[year_col]
        for j in roster.index[needed_file]:
            ## download data file
            try:
                x = read_station_data(url.format(i) + roster.loc[j, 'file'], ucl)
                ## coordinates are taken from the first year's file only
                if i == year[0]:
                    roster.loc[j, 'lon'] = x.loc[0, 'LONGITUDE']
                    roster.loc[j, 'lat'] = x.loc[0, 'LATITUDE']
                x.to_csv(
                    'B_Process/downloads/' + year_col + '_'+ roster.loc[j, 'file'])
                error_flag = False
            ## a failed download only marks the file as missing. Exception
            ## (not a bare except) lets KeyboardInterrupt stop a long run.
            except Exception:
                error_flag = True

            ## update roster; saved after every file so progress survives an
            ## interruption, and without the row index so that reloading the
            ## file does not add 'Unnamed' columns
            roster.loc[j, year_col] = not error_flag
            roster.to_csv('B_Process/station_roster.csv', index = False)

retrieve_station_data()

#### record completion time stamp

In [8]:
## store the current time under the label 'ID' (end of the Import Data part)
time_check('ID')

# Refine Data

In [9]:
## store the current time under the label 'RD' (Refine Data part)
time_check('RD')

# Build Model

In [10]:
## store the current time under the label 'BM' (Build Model part)
time_check('BM')

# Model Routes

In [11]:
## store the current time under the label 'MR' (Model Routes part)
time_check('MR')

# Render Visualization

In [12]:
## store the current time under the label 'RV' (Render Visualization part)
time_check('RV')

# Test Code

In [13]:
## store the current time under the label 'TC' (Test Code part)
time_check('TC')

# Miscellaneous

In [14]:
## Display time statistics
## (the label 'ZZ' makes time_check print every stamp recorded so far)
time_check('ZZ')

Time Check AA   = 09:35:06
Time Check ID   = 09:36:05
Time Check RD   = 09:36:05
Time Check BM   = 09:36:05
Time Check MR   = 09:36:05
Time Check RV   = 09:36:05
Time Check TC   = 09:36:05
Time Check ZZ   = 09:36:05
