In [1]:
##########----------##########----------##########----------##########----------

# Libraries, setup, and general use objects

In [2]:
import urllib.request as url
import pandas as pd
import ipyparallel as ipp

from bs4 import BeautifulSoup
from os import listdir, mkdir
from os.path import isdir
## NOTE: this line used to be the unfinished statement `from sklearn import`,
## which is a SyntaxError and prevented the whole cell from running.
## TODO: import the specific estimators once the model type is chosen.
import sklearn

## master switch: when True, the download cell and the refine/compile cell
## run; when False they are skipped and only previously saved data is used
set_gather_data = False

## two-digit file-name prefixes kept by extract_links as likely US stations
valid_prefix = [69, 72, 74, 99]

## columns read from each station file, mapped to the dtype requested for them
use_col_list = {'DATE':str, 'LATITUDE':float, 'LONGITUDE':float,
    'ELEVATION':float, "HourlyDryBulbTemperature":float,
                "HourlyPrecipitation":float} # "HourlyRelativeHumidity":float,

#### Generate directories as needed

In [3]:
def make_directories():
    """Create the project's working folders, skipping any that already exist."""
    ## parent folders come first so that mkdir never meets a missing parent
    top_level = ['A_Input', 'B_Process', 'C_Output']
    nested = ['B_Process/downloads', 'B_Process/model_data']
    for folder in top_level + nested:
        if isdir(folder):
            continue
        mkdir(folder)

make_directories()

# Import Data

#### read_year_links

In [4]:
def read_year_links(dir_address = 'A_Input/year_links.txt'):
    """Read the list of per-year directory URLs from a text file.

    Parameters
    ----------
    dir_address : str
        Path to a text file holding one URL per line.

    Returns
    -------
    list of str
        The lines of the file with surrounding whitespace removed. Blank
        lines are skipped so that no empty string is later treated as a URL.
    """
    ## the context manager closes the file even if reading fails
    with open(dir_address, 'r') as the_connection:
        stripped = [line.strip() for line in the_connection]
    return [line for line in stripped if line]

#### extract_links

In [5]:


def extract_links(address):
    """List the station csv files on a directory page that still need downloading.

    Parameters
    ----------
    address : str
        URL of one directory listing page. ``address[-5:-1]`` is used as the
        tag that prefixes local file names, so the URL is expected to end in
        a four-character year followed by one trailing character
        (presumably ``/`` -- confirm against A_Input/year_links.txt).

    Returns
    -------
    list of (str, str)
        ``(local_file_name, remote_url)`` pairs. ``local_file_name`` is
        ``<tag>_<link text>.gz``; pairs whose local file already exists in
        ``B_Process/downloads`` are left out.
    """

    ## retrieve raw web page; the context manager closes the connection even
    ## when the read fails
    with url.urlopen(address) as url_connect:
        raw_page = url_connect.read()

    ## extract all links to csv files
    ## - the parser is named explicitly so the result does not depend on
    ##   which optional parsers happen to be installed
    ## - anchors without a single text child have .string == None; they are
    ##   skipped instead of raising AttributeError
    anchors = BeautifulSoup(raw_page, 'html.parser').find_all('a')
    csv_names = [str(a.string) for a in anchors
                 if a.string is not None and '.csv' in a.string]

    ## eliminate files unlikely to represent US weather stations
    def us_range(x, target_range = valid_prefix):
        try:
            return int(x[0:2]) in target_range
        except ValueError:
            ## the first two characters are not a number
            return False

    ## remove url if files have already been downloaded
    year_tag = address[-5:-1]
    already_downloaded = set(listdir('B_Process/downloads'))
    valid_links = list()
    for name in filter(us_range, csv_names):
        local_name = year_tag + '_' + name + '.gz'
        if local_name not in already_downloaded:
            valid_links.append((local_name, address + name))

    return valid_links

#### download_files

In [6]:
def download_files(links_ext, ucl = use_col_list):
    """Download one station csv, keep the selected columns, and save it locally.

    Parameters
    ----------
    links_ext : (str, str)
        ``(local_file_name, remote_url)`` pair, the shape returned by
        extract_links.
    ucl : dict
        Column name -> dtype. Its keys select the columns to read; its
        values are the dtypes requested for them.

    Returns
    -------
    None
        The result is written to ``B_Process/downloads/<local_file_name>``.
    """
    ## imported inside the function on purpose: this function is handed to
    ## ipyparallel's map_async, and presumably the engines do not share the
    ## notebook's module-level imports -- confirm before moving it
    import pandas as pd

    columns = list(ucl.keys())
    try:
        the_csv = pd.read_csv(links_ext[1], usecols = columns,
            parse_dates = ['DATE'], dtype = ucl)
    except Exception:
        ## the typed read failed (e.g. a float column holds non-numeric
        ## text): read everything as text and coerce the float columns,
        ## turning unparseable values into NaN.
        ## `Exception` rather than a bare except, so that KeyboardInterrupt
        ## and SystemExit still stop the run.
        the_csv = pd.read_csv(links_ext[1], usecols = columns,
            parse_dates = ['DATE'], dtype = str)
        for column, kind in ucl.items():
            if kind == float:
                the_csv[column] = pd.to_numeric(the_csv[column],
                    errors = 'coerce')

    the_csv = the_csv.round(3)
    the_csv.to_csv('B_Process/downloads/' + links_ext[0], encoding = 'utf-8',
                    index = False)

#### Execute code

In [7]:
if set_gather_data:

    ## read in list of links to file directory pages for each year of the data
    year_links = read_year_links()

    ## bound up front so the cleanup below works even if no year has any
    ## new files (previously `del` raised NameError in that case)
    par_result = None

    ## start the engines once and reuse them for every year, instead of
    ## launching and tearing down a 4-engine cluster per directory page
    with ipp.Cluster(n = 4) as rc:
        par_processes = rc.load_balanced_view()

        ## iteratively extract and download the files linked on each page
        for i in year_links:
            links_extracted = extract_links(i)
            if not links_extracted:
                ## nothing new to download for this year
                continue

            par_result = par_processes.map_async(download_files,
                links_extracted)
            par_result.wait_interactive()
            ## download_files returns nothing useful; get() is called only
            ## so that a failure on an engine is raised here
            par_result.get()

    del par_processes, par_result

# Refine Data

#### refine_data (and compile)

In [8]:
def refine_data(file_dir, segment, ucl = use_col_list):
    """Compile the downloaded station files of one segment into a model table.

    Parameters
    ----------
    file_dir : str
        Directory holding the downloaded station files.
    segment : list of str
        File-name prefixes to include; a file is used when its first seven
        characters are a member of ``segment`` (e.g. ``['2010_72']``).
        ``segment[0]`` also names the output file.
    ucl : dict
        Column name -> dtype passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        Columns Lat, Lon, Elev, Day, Hour, Temperate for every accepted
        file, also written to
        ``B_Process/model_data/<segment[0]>_weather_data.csv.gz``.
        ``None`` (and no file written) when no file in the segment passes
        the lat/lon filter.
    """

    ## generate roster of files: skip hidden files, keep the requested
    ## segment, and sort so the compiled output does not depend on the
    ## arbitrary order returned by listdir
    list_files = sorted(i for i in listdir(file_dir)
                        if i[0] != '.' and i[0:7] in segment)

    ## assemble files
    all_data = list()
    for i in list_files:
        file_iter = pd.read_csv(file_dir + '/' + i, parse_dates = ['DATE'],
                                dtype = ucl)

        ## deconstruct day/times
        file_iter['Day'] = file_iter['DATE'].dt.dayofyear
        file_iter['Hour'] = file_iter['DATE'].dt.hour
        file_iter = file_iter.drop('DATE', axis = 1)

        ## score weather: missing precipitation counts as none; a row is
        ## Temperate (1) when temperature is strictly between 55 and 75 and
        ## precipitation is below 0.1, otherwise 0. Rows with a missing
        ## temperature therefore score 0.
        precipitation = file_iter['HourlyPrecipitation'].fillna(0)
        temperature = file_iter['HourlyDryBulbTemperature']
        file_iter['Temperate'] = ((temperature > 55) & (temperature < 75) &
            (precipitation < 0.1)).astype(int)
        file_iter = file_iter.drop(['HourlyDryBulbTemperature',
            'HourlyPrecipitation'], axis = 1)

        ## rename columns to make capitalization consistent
        file_iter = file_iter.rename(columns = {'LATITUDE':'Lat',
            'LONGITUDE': 'Lon', 'ELEVATION':"Elev"})

        ## drop files outside the US's rough lat/lon box
        ## Series.max()/min() ignore NaN and return NaN for an empty file
        ## (every comparison below is then False, so the file is dropped);
        ## the built-in max()/min() give order-dependent results with NaN
        ## and raise on an empty file.
        lat_ok = file_iter['Lat'].max() > 25 and file_iter['Lat'].min() < 50
        lon_ok = file_iter['Lon'].max() < -60 and file_iter['Lon'].min() > -130
        if lat_ok and lon_ok:
            all_data.append(file_iter)

    ## compile files and save
    if len(all_data) == 0:
        return None

    all_data = pd.concat(all_data, axis = 0)
    all_data.to_csv('B_Process/model_data/' + str(segment[0]) +\
        '_weather_data.csv.gz', index = False, encoding = 'utf-8')
    return all_data

#### Execute Code

In [25]:
if set_gather_data:
    ## compile one output file per (station prefix, year) pair,
    ## for the years 2010 through 2019
    for prefix in valid_prefix:
        for year in range(2010, 2020):
            segment_iter = ['_'.join([str(year), str(prefix)])]
            refine_data('B_Process/downloads', segment = segment_iter)

# Build Model

#### load_model_data

In [33]:
def load_model_data(file_directory = 'B_Process/model_data'):
    """Load every compiled ``*csv.gz`` file in a directory into one DataFrame.

    Parameters
    ----------
    file_directory : str
        Directory to scan. Only file names ending in ``csv.gz`` are read.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise, in sorted file-name order, with a fresh
        0..n-1 index so that row labels are unique across files.

    Raises
    ------
    ValueError
        If the directory contains no ``csv.gz`` file.
    """
    ## sorted so the row order does not depend on listdir's arbitrary order
    file_names = sorted(i for i in listdir(file_directory)
                        if i.endswith('csv.gz'))
    if not file_names:
        raise ValueError('No csv.gz files found in ' + file_directory)

    frames = [pd.read_csv(file_directory + '/' + i) for i in file_names]

    ## each file is read with its own 0-based index; ignore_index avoids
    ## duplicate row labels in the combined frame
    return pd.concat(frames, axis = 0, ignore_index = True)

#### train_model

In [11]:
def train_model():
    """Placeholder for model training; not implemented yet, returns None."""
    return None

#### Execute Code

In [34]:
## read every csv.gz file in the default model-data directory
## (B_Process/model_data) into a single DataFrame held in memory
model_data = load_model_data()

# Model Routes

In [37]:
## LogisticRegression
## PLSRegression
## SVR
## DecisionTreeRegressor
## RandomForestRegressor
## MLPRegressor

Unnamed: 0,Lat,Lon,Elev,Day,Hour,Temperate
0,34.05,-94.401,108.2,1,0,0
1,34.05,-94.401,108.2,1,0,0
2,34.05,-94.401,108.2,1,1,0
3,34.05,-94.401,108.2,1,1,0
4,34.05,-94.401,108.2,1,1,0
...,...,...,...,...,...,...
33715676,44.65,-73.467,71.3,365,21,0
33715677,44.65,-73.467,71.3,365,22,0
33715678,44.65,-73.467,71.3,365,23,0
33715679,44.65,-73.467,71.3,365,23,0


# Render Dashboard