In [1]:
##########----------##########----------##########----------##########----------

# Libraries, setup, and general use objects

In [2]:
import urllib.request as url
import pandas as pd
import numpy as np
import ipyparallel as ipp
import pickle
import datetime as dt

from bs4 import BeautifulSoup
from os import listdir, mkdir
from os.path import isdir, isfile

from sklearn.metrics   import f1_score
from sklearn.metrics   import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble  import RandomForestClassifier
from sklearn.svm       import SVC

## run-level switches, gathered here so they can be tuned in one place
## set_gather_data   : when True the download and refine cells do their work
## set_sample_frac   : share of rows kept from each station file when refining
## set_parallel_cores: worker counts, keyed by the stage that uses them
set_gather_data = False
set_sample_frac = 0.05
set_parallel_cores = dict(Download = 3, Model = 12)

def time_check(s = 'A'):
    """Print a labelled checkpoint line followed by the current local time.

    s -- string label appended to 'Time Check: Point '
    """
    print('Time Check: Point ' + s, dt.datetime.now(), sep = '\n')

##
## two-digit file-name prefixes that are kept when screening station files
valid_prefix = [69, 72, 74, 99]

## columns read from every station file, mapped to the dtype asked of pandas
## ("HourlyRelativeHumidity" is currently left out)
use_col_list = {
    'DATE': str,
    'LATITUDE': float,
    'LONGITUDE': float,
    'ELEVATION': float,
    'HourlyDryBulbTemperature': float,
    'HourlyPrecipitation': float,
}

#### Generate directories as needed

In [3]:
def make_directories():
    """Create the project's working folders, skipping any that already exist.

    Paths are relative to the current working directory.
    """
    top_level = ['A_Input', 'B_Process', 'C_Output']
    nested = ['B_Process/downloads', 'B_Process/model_data']

    ## parents are listed first so each nested mkdir finds its parent in place
    for folder in top_level + nested:
        if isdir(folder):
            continue
        mkdir(folder)
        
## build the folder tree now, before later cells read from or write to it
make_directories()

# Import Data

#### read_year_links

In [4]:
def read_year_links(dir_address = 'A_Input/year_links.txt'):
    """Read the list of yearly directory-page addresses, one per line.

    dir_address -- path of a text file holding one address per line

    Returns a list of the lines with surrounding whitespace removed. Blank
    lines are dropped so that a trailing newline or spacer line does not
    produce an empty address.
    """
    ## the with-block closes the file even if reading raises
    with open(dir_address, 'r') as the_connection:
        year_links = [i.strip() for i in the_connection]
    return [i for i in year_links if i != '']

#### extract_links

In [5]:
def extract_links(address):
    """List the csv files on a directory page that still need downloading.

    address -- URL of a directory page. The four characters address[-5:-1]
               are used as the file-name tag, so this presumably expects an
               address ending in 'YYYY/' (verify against year_links.txt).

    Returns a list of (local_file_name, file_url) tuples, where local_file_name
    is '<tag>_<csv name>.gz' and file_url is address + csv name. Files whose
    local name is already present in 'B_Process/downloads' are left out.
    """

    ## retrieve raw web page; the with-block closes the connection even if
    ## the read fails
    with url.urlopen(address) as url_connect:
        raw_page = url_connect.read()

    ## extract the text of all links to csv files; naming the parser keeps the
    ## result independent of which optional parsers are installed, and anchors
    ## whose .string is None (no single text node) are skipped rather than
    ## crashing the scan
    page = BeautifulSoup(raw_page, 'html.parser')
    csv_names = [str(i.string) for i in page.find_all('a')
        if i.string is not None and '.csv' in i.string]

    ## eliminate files unlikely to represent US weather stations
    def us_range(x, target_range = valid_prefix):
        try:
            return int(x[0:2]) in target_range
        except ValueError:
            ## the first two characters are not a number
            return False
    csv_names = [i for i in csv_names if us_range(i)]

    ## remove url if files have already been downloaded
    year_tag = address[-5:-1]
    already_downloaded = set(listdir('B_Process/downloads'))
    valid_links = list()
    for i in csv_names:
        file_name = year_tag + '_' + i + '.gz'
        if file_name not in already_downloaded:
            valid_links.append((file_name, address + i))

    return valid_links

#### download_files

In [6]:
def download_files(links_ext, ucl = use_col_list):
    """Download one station csv, keep the columns in ucl, and save it locally.

    links_ext -- (local_file_name, file_url) tuple as built by extract_links
    ucl       -- dict mapping the columns to read to the dtype requested

    The file is first read with the requested dtypes. If that read fails, it
    is read again as text and every column requested as float is converted
    with invalid entries coerced to NaN. Values are rounded to 3 decimals and
    written to 'B_Process/downloads/<local_file_name>'. Returns None.
    """
    ## NOTE(review): pandas is imported inside the function presumably so the
    ## name exists on the parallel workers that run this function -- confirm
    import pandas as pd

    source_url = links_ext[1]
    try:
        the_csv = pd.read_csv(source_url, usecols = list(ucl.keys()),
            parse_dates = ['DATE'], dtype = ucl)
    except Exception:
        ## only ordinary errors trigger the tolerant re-read; an interrupt or
        ## interpreter exit must propagate instead of starting a new download
        the_csv = pd.read_csv(source_url, usecols = list(ucl.keys()),
            parse_dates = ['DATE'], dtype = str)
        for j in ucl.keys():
            if ucl[j] == float:
                the_csv[j] = pd.to_numeric(the_csv[j], errors = 'coerce')

    the_csv = the_csv.round(3)
    the_csv.to_csv('B_Process/downloads/' + links_ext[0], encoding = 'utf-8',
                    index = False)

#### Execute code

In [7]:
if set_gather_data:

    ## addresses of the directory pages, one page per year of data
    year_links = read_year_links()

    ## visit each directory page and download whatever is not on disk yet
    for i in year_links:
        links_extracted = extract_links(i)

        ## start a cluster only when this page has something left to fetch
        if len(links_extracted) >= 1:
            with ipp.Cluster(n = set_parallel_cores['Download']) as rc:
                par_processes = rc.load_balanced_view()
                par_result = par_processes.map_async(
                    download_files, links_extracted)
                par_result.wait_interactive()
                final_result = par_result.get()
                del par_processes, par_result, final_result

# Refine Data

#### refine_data (and compile)

In [8]:
def refine_data(file_dir, segment, ucl = use_col_list,
                sample_fraction = set_sample_frac, random_state = None):
    """Score, sample and compile the downloaded station files of one segment.

    file_dir        -- directory holding the downloaded station files
    segment         -- collection of 7-character file-name prefixes to keep;
                       the first entry also names the output file
    ucl             -- dict of column dtypes passed to pandas.read_csv
    sample_fraction -- share of each file's rows to keep; values of 1 or more
                       turn sampling off, values of 0 or less fail an assert
    random_state    -- optional seed passed to DataFrame.sample so a run can
                       be repeated; None (default) leaves sampling unseeded

    Each file gets 'Day' and 'Hour' columns derived from 'DATE' and a 0/1
    'Temperate' flag (temperature strictly between 55 and 75 with
    precipitation below 0.1, missing precipitation counted as 0). Files with
    no rows, or whose coordinates fall outside the rough lat/lon box, are
    skipped.

    Returns the compiled DataFrame after also writing it to
    'B_Process/model_data/<segment[0]>_weather_data.csv.gz', or None when no
    file contributed data.
    """

    ## smallest sample taken from a file; round() is used because truncating
    ## this float expression with int() appears to land on 99, not 100
    min_sample_n = int(round((0.5 ** 2) / (0.05 ** 2)))

    ## generate roster of visible files belonging to the specified segment;
    ## sorted so the compiled row order does not depend on listdir ordering
    list_files = sorted(i for i in listdir(file_dir)
        if not i.startswith('.') and i[0:7] in segment)

    ## assemble files
    all_data = list()
    for i in list_files:
        file_iter = pd.read_csv(file_dir + '/' + i, parse_dates = ['DATE'],
                                dtype = ucl)

        ## a file without rows has nothing to score, sample or locate
        if file_iter.shape[0] == 0:
            continue

        ## deconstruct day/times
        file_iter['Day'] = file_iter['DATE'].dt.dayofyear
        file_iter['Hour'] = file_iter['DATE'].dt.hour
        file_iter = file_iter.drop('DATE', axis = 1)

        ## score weather
        file_iter['HourlyPrecipitation'] = file_iter[
            'HourlyPrecipitation'].fillna(0)
        file_iter['Temperate'] = (file_iter['HourlyDryBulbTemperature'] > 55) &\
            (file_iter['HourlyDryBulbTemperature'] < 75) &\
            (file_iter['HourlyPrecipitation'] < 0.1)
        file_iter['Temperate'] = file_iter['Temperate'].astype(int)
        file_iter = file_iter.drop(['HourlyDryBulbTemperature',
            'HourlyPrecipitation'], axis = 1)

        ## rename columns to make capitalization consistent
        file_iter = file_iter.rename(columns = {'LATITUDE':'Lat',
            'LONGITUDE': 'Lon', 'ELEVATION':"Elev"})

        ## sample data if specified; temperate rows carry weight 2, others 1
        if sample_fraction < 1:
            assert sample_fraction > 0
            n_rows = file_iter.shape[0]
            sample_n = max(int(n_rows * sample_fraction), min_sample_n)
            sample_n = min(sample_n, n_rows)
            file_iter = file_iter.sample(
                n = int(sample_n),
                weights = (file_iter['Temperate'] * 1) + 1,
                random_state = random_state
            )

        ## drop files outside the US's rough lat/lon box; append others.
        ## Series.max/min skip missing coordinates, whereas the builtin
        ## max/min give order-dependent answers when NaN is present
        lat, lon = file_iter['Lat'], file_iter['Lon']
        lat_ok = lat.max() > 25 and lat.min() < 50
        lon_ok = lon.max() < -60 and lon.min() > -130
        if lat_ok and lon_ok:
            all_data.append(file_iter)

    ## nothing usable in this segment
    if len(all_data) == 0:
        return None

    ## compile files and save
    all_data = pd.concat(all_data, axis = 0)
    all_data.to_csv('B_Process/model_data/' + str(segment[0]) +\
        '_weather_data.csv.gz', index = False, encoding = 'utf-8')
    return all_data

#### load_model_data

In [9]:
def load_model_data(file_directory = 'B_Process/model_data'):
    """Load every compiled 'csv.gz' file in a directory into one DataFrame.

    file_directory -- directory to scan; only names ending in 'csv.gz' are read

    Returns the row-wise concatenation of the files, read in sorted file-name
    order, with a fresh 0..n-1 index so that row labels are unique.

    Raises ValueError when the directory holds no matching file.
    """
    file_names = sorted(i for i in listdir(file_directory)
        if i[-6:] == 'csv.gz')
    if len(file_names) == 0:
        raise ValueError('No csv.gz files found in ' + file_directory)

    frames = [pd.read_csv(file_directory + '/' + i) for i in file_names]

    ## ignore_index replaces the per-file row labels, which would otherwise
    ## repeat across files (the previous reset_index() result was discarded)
    return pd.concat(frames, axis = 0, ignore_index = True)

#### Execute Code

In [10]:
## rebuild the compiled model files, one per (station prefix, year) segment,
## only when data gathering is switched on
if set_gather_data:
    for i in valid_prefix:
        for j in range(2015, 2020):
            segment_iter = ['{}_{}'.format(j, i)]
            refine_data(file_dir = 'B_Process/downloads',
                        segment = segment_iter)

## always load whatever compiled files are on disk
model_data = load_model_data()

# Build Model

#### split data into train and test subsets

In [11]:
## split data into train and test subsets: 'Split' is True for training rows
## (each row has an 0.8 chance) and False for test rows. The draw comes from a
## seeded generator so the split, and every score computed from it, is the
## same on each run of the notebook.
set_split_seed = 42
model_data.loc[:, 'Split'] = np.random.default_rng(set_split_seed).binomial(
    n = 1, p = 0.8, size = (model_data.shape[0],)).astype(bool)

#### model_weather

In [35]:
def model_weather(dat = model_data):
    """Fit a nearest-neighbour model of 'Temperate' and return its predictor.

    dat -- DataFrame with a boolean 'Split' column (True = training row) plus
           'Elev', 'Lon', 'Lat', 'Day', 'Hour' and 'Temperate' columns. The
           default is the model_data object that exists when this definition
           is executed; rebinding model_data later does not change the default
           until the cell is run again.

    Prints the F1 score obtained on the rows where 'Split' is False, using 0.5
    as the cut-off on the predicted value.

    Returns a function that takes a DataFrame containing 'Lon', 'Lat', 'Day'
    and 'Hour' columns and returns the model's prediction for each row.
    """
    
    ## split data into train and test data
    test_data = dat[~dat['Split']]
    dat = dat[dat['Split']]
    
    ## round off data and average for each rounded grid:
    ## every remaining column is rounded to a whole number, then Day is pushed
    ## up to the next multiple of 5 and Hour to the next multiple of 2
    simple_dat = dat.drop(['Elev', 'Split'], axis = 1).round()
    simple_dat.loc[:, 'Day']  = np.ceil(simple_dat.loc[:, 'Day']  / 5) * 5
    simple_dat.loc[:, 'Hour'] = np.ceil(simple_dat.loc[:, 'Hour'] / 2) * 2
    ## NOTE(review): the right-hand side casts the whole frame to int; only
    ## the four listed columns are assigned back -- confirm that the stored
    ## dtype ends up as intended on the pandas version in use
    simple_dat.loc[:, ['Lon', 'Lat', 'Day', 'Hour']] = simple_dat.astype(int)
    ## after averaging, 'Temperate' holds the mean of the 0/1 flag per grid cell
    simple_dat = simple_dat.groupby(['Lon', 'Lat', 'Day', 'Hour']).mean()
    simple_dat = simple_dat.reset_index()
    
    ## train k nearest neighbor model (neighbors weighted by distance)
    knn_model = KNeighborsRegressor(weights = 'distance')
    knn_model = knn_model.fit(
                                simple_dat[['Lon', 'Lat', 'Day', 'Hour']],
                                simple_dat['Temperate']
                                )
    
    ## construct predictor function; it is a closure, so it keeps access to
    ## the fitted knn_model after model_weather returns
    def find_closest_mean(new_data):
        new_data = new_data[['Lon', 'Lat', 'Day', 'Hour']]
        return knn_model.predict(new_data)
    
    ## announce accuracy of predictor function; the test rows are passed as
    ## they are, without the rounding and binning applied to the training rows
    ml_score = find_closest_mean(test_data) > 0.5
    ml_score = ml_score.astype(int)
    ml_score = f1_score(test_data['Temperate'].values, ml_score).round(3)
    print('Weather Model F1: ' + str(ml_score) + ' (Threshold = 0.5)')
    
    return find_closest_mean

Weather Model F1: 0.833 (Cutoff = 0.5)


In [14]:
def model_temperate(dat, chron_mod, elev_mod):
    """Fit an RBF support vector classifier for 'Temperate' and return a predictor.

    dat       -- DataFrame with a boolean 'Split' column (True = training row)
                 and 'Lon', 'Lat' and 'Temperate' columns, plus whatever
                 columns chron_mod and elev_mod read
    chron_mod -- function taking a DataFrame and returning one value per row,
                 stored as the 'Chron' feature
    elev_mod  -- function taking a DataFrame and returning one value per row,
                 stored as the 'Elev' feature

    dat is not modified; the work is done on a copy. The same feature
    construction and scaling is used for training and for prediction, so the
    classifier always sees inputs on the scale it was fitted on.

    Prints the F1 score obtained on the rows where 'Split' is False.

    Returns a function that takes a DataFrame and returns the predicted
    'Temperate' class for each row; the DataFrame passed in is left unchanged.
    """

    def build_features(frame):
        ## one code path for train and predict keeps the scaling consistent
        features = frame.copy()

        ## model elevation and chronology from the unscaled coordinates
        features['Elev'] = elev_mod(features)
        features['Chron'] = chron_mod(features)

        ## scale data: Lon and Lat relative to the -130..-60 / 25..50 box
        features['Lon'] = (features['Lon'] + 130) / (-60 + 130)
        features['Lat'] = (features['Lat'] - 25) / (50 - 25)
        ## NOTE(review): 5280 looks like a feet-to-miles factor -- confirm
        features['Elev'] = features['Elev'] / 5280
        return features[['Lon', 'Lat', 'Elev', 'Chron']]

    ## prepare data on a copy; Elev and Chron start out blank, as before,
    ## so the two models see the same inputs they were given previously
    work_data = dat.copy()
    work_data['Elev'] = np.nan
    work_data['Chron'] = np.nan
    test_data = work_data[~work_data['Split']].drop('Split', axis = 1)
    train_data = work_data[work_data['Split']].drop('Split', axis = 1)

    ## declare model - support vector machine
    ## (a RandomForestClassifier alternative was previously sketched here as
    ## commented-out code)
    main_model = SVC(kernel = 'rbf')

    ## fit model
    main_model = main_model.fit(
        build_features(train_data),
        train_data['Temperate']
        )

    ## construct predictor function
    def temperate_predictor(x_pd):
        return main_model.predict(build_features(x_pd))

    ## announce accuracy of predictor function; round() works whether the
    ## score comes back as a numpy scalar or a plain float
    ml_score = temperate_predictor(test_data).astype(int)
    ml_score = f1_score(test_data['Temperate'].values, ml_score)
    ml_score = round(float(ml_score), 3)
    print('Temperate Weather Model - F1 Score is: ' + str(ml_score))

    return temperate_predictor

#### Execute Code

In [39]:
## print a timestamp on either side of the fit to show how long it takes
time_check(s = 'A')
weather_model = model_weather()
time_check(s = 'Z')

Time Check: Point A
2022-03-04 09:17:39.659477
Weather Model F1: 0.833 (Cutoff = 0.5)
Time Check: Point Z
2022-03-04 09:17:49.361891


# Model Routes

# Render Dashboard