In [1]:
# Make the project's local Python modules importable from this notebook.
# The notebook is expected to be run from a directory that sits next to
# `src/` (i.e. `../src` relative to the current working directory).
import sys
import os
src_path = os.path.abspath(os.path.join('..'))
# Build the directory that is actually placed on sys.path once, and use that
# same value for both the membership test and the append. Testing `src_path`
# but appending `src_path + "/src"` added a duplicate entry on every re-run
# and skipped the append entirely if the parent directory was already listed.
src_dir = os.path.join(src_path, "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
%load_ext autoreload
%autoreload 2
import prep_data as prep

In [3]:
import pandas as pd
import numpy as np


In [4]:
# Base URL of the archive that hosts the weather data files.
base_url = "http://kopiko.ifa.hawaii.edu/weather/archivedata/"
# Collect the URLs of all data files found under base_url.
# NOTE(review): presumably this performs a network request to the archive
# each time the cell runs — confirm in prep_data.get_csv_file_links.
csv_urls = prep.get_csv_file_links(base_url)

# Names assigned to the columns of the raw data files.
column_names = [
    'date_time',
    'temperature',
    'pressure',
    'humidity',
    'wind_speed',
    'wind_direction',
    'visibility',
    'co2',
    'insolation',
    'vertical_wind_speed',
    'precipitation',
    '10min',
    'dewpoint',
]
# Subset of the columns above that is kept for this analysis.
columns_of_interest = [
    'date_time',
    'temperature',
    'humidity',
    'wind_speed',
    'visibility',
    'precipitation',
    'dewpoint',
    '10min',
]

# Plausible (lower, upper) bounds per measurement; passed to
# prep.remove_unreasonable_measurements to screen out bad readings.
acceptable_ranges = dict(
    temperature=(-273, 40),
    humidity=(0, 100),
    wind_speed=(0, 100),
    visibility=(0, 100000),
    precipitation=(0, 100),
    dewpoint=(-273, 40),
)

# ('Green', 'Red') cut-offs per quantity used to label the weather status.
# These are hard-coded for now; the plan is to move them to a config file.
thresholds = dict(
    humidity=(75, 85),
    wind_sust=(10, 12),
    wind_gust=(15, 15),
    visibility=(50000, 40000),
    precipitation=(0, 0),
    dewpoint_delta=(6, 3),
)


In [5]:
#----TEMP----#
# For initial testing only a single year is loaded; a loop over all years
# is planned to replace this cell later.
year = 2018
# Pick the data-file link for the chosen year out of the full list of URLs.
link = prep.get_specific_year(year,csv_urls)
# Read that file, naming the columns with column_names and keeping only
# columns_of_interest.
# NOTE(review): presumably this downloads the file on every run — confirm
# in prep_data.read_data_of_interest and consider caching.
df_2018 = prep.read_data_of_interest(link, column_names,columns_of_interest)

In [6]:
 # Create a small subset (first 20 rows) of the 2018 data to test the
# cleaning and status logic on. `.copy()` keeps df_2018 itself unmodified.
df = df_2018.iloc[:20].copy()
# Disabled experiment: overwrite humidity / visibility / dewpoint with random
# numbers so the subset contains no NaNs.
# df.humidity.mask([True]*len(df),np.random.uniform(50,100,size=len(df)),inplace=True)
# df.visibility.mask([True]*len(df),np.random.uniform(30000,50000,size=len(df)),inplace=True)
# df.dewpoint.mask([True]*len(df),np.random.uniform(0,40,size=len(df)),inplace=True)

# Screen the measurements against acceptable_ranges. Called with
# inplace=True, so df is modified directly and the return value is ignored.
prep.remove_unreasonable_measurements(df,acceptable_ranges,inplace=True)


# Split wind into sustained and gust components. The displayed result gains
# 'wind_sust' and 'wind_gust' columns.
df = prep.determine_wind_sust_and_gust(df)


# Add the dew point delta: temperature minus dew point.
df['dewpoint_delta'] = df['temperature'] - df['dewpoint']

In [7]:
# Display the prepared subset, including the derived wind_sust, wind_gust
# and dewpoint_delta columns.
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 00:00:05,12.08,17.8,5.6,43682.0,,-42.26,0,5.6,5.6,54.34
2018-01-01 00:00:15,12.09,18.1,5.3,43682.0,,-46.06,0,5.45,5.3,58.15
2018-01-01 00:00:25,12.07,18.3,4.8,46022.0,,-46.98,0,5.233333,4.8,59.05
2018-01-01 00:00:36,12.07,18.3,3.9,48408.0,,-38.2,0,4.9,3.9,50.27
2018-01-01 00:00:46,12.06,18.8,3.6,48408.0,,-45.39,0,4.64,3.6,57.45
2018-01-01 00:00:56,12.07,19.1,3.7,48233.0,,-43.03,0,4.483333,3.7,55.1
2018-01-01 00:01:06,12.06,19.5,3.9,48233.0,,-33.74,0,4.4,3.9,45.8
2018-01-01 00:01:16,12.07,19.4,4.6,48233.0,,-40.99,0,4.425,4.6,53.06
2018-01-01 00:01:27,12.06,19.8,5.1,48233.0,,-33.48,0,4.5,5.1,45.54
2018-01-01 00:01:37,12.07,20.0,5.2,48233.0,,-33.13,0,4.57,5.2,45.2


In [8]:
# Label every row via prep.get_weather_status using the thresholds dict and
# store the result in a 'status' column, then display the frame.
# At this point the precipitation column is entirely NaN.
df['status'] = prep.get_weather_status(df,thresholds)
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:05,12.08,17.8,5.6,43682.0,,-42.26,0,5.6,5.6,54.34,Yellow
2018-01-01 00:00:15,12.09,18.1,5.3,43682.0,,-46.06,0,5.45,5.3,58.15,Yellow
2018-01-01 00:00:25,12.07,18.3,4.8,46022.0,,-46.98,0,5.233333,4.8,59.05,Yellow
2018-01-01 00:00:36,12.07,18.3,3.9,48408.0,,-38.2,0,4.9,3.9,50.27,Yellow
2018-01-01 00:00:46,12.06,18.8,3.6,48408.0,,-45.39,0,4.64,3.6,57.45,Yellow
2018-01-01 00:00:56,12.07,19.1,3.7,48233.0,,-43.03,0,4.483333,3.7,55.1,Yellow
2018-01-01 00:01:06,12.06,19.5,3.9,48233.0,,-33.74,0,4.4,3.9,45.8,Yellow
2018-01-01 00:01:16,12.07,19.4,4.6,48233.0,,-40.99,0,4.425,4.6,53.06,Yellow
2018-01-01 00:01:27,12.06,19.8,5.1,48233.0,,-33.48,0,4.5,5.1,45.54,Yellow
2018-01-01 00:01:37,12.07,20.0,5.2,48233.0,,-33.13,0,4.57,5.2,45.2,Yellow


In [9]:
# Check that the status logic works when there are no NaNs and the
# visibility column contains some clearly Green and some clearly Red values.
# A seeded generator makes the synthetic values identical on every run.
rng = np.random.default_rng(0)
# Size the draw from the frame itself instead of hard-coding the row count,
# so the assignment cannot fail if the subset size changes.
df['precipitation'] = rng.binomial(1, .2, len(df))
# Draw all affected rows at once without replacement and split them in two:
# each group then has exactly n_each distinct rows, and the second assignment
# can no longer overwrite rows set by the first.
n_each = min(5, len(df) // 2)
picked = rng.choice(df.index.values, 2 * n_each, replace=False)
df.loc[picked[:n_each], 'visibility'] = 50000  # meets the Green visibility threshold
df.loc[picked[n_each:], 'visibility'] = 35000  # below the Red visibility threshold

In [10]:
# Recompute the status labels now that precipitation holds no NaNs and some
# visibility values were forced to 50000 / 35000, then display the frame.
df['status'] = prep.get_weather_status(df,thresholds)
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:05,12.08,17.8,5.6,43682.0,0,-42.26,0,5.6,5.6,54.34,Yellow
2018-01-01 00:00:15,12.09,18.1,5.3,50000.0,0,-46.06,0,5.45,5.3,58.15,Green
2018-01-01 00:00:25,12.07,18.3,4.8,46022.0,0,-46.98,0,5.233333,4.8,59.05,Yellow
2018-01-01 00:00:36,12.07,18.3,3.9,50000.0,0,-38.2,0,4.9,3.9,50.27,Green
2018-01-01 00:00:46,12.06,18.8,3.6,35000.0,0,-45.39,0,4.64,3.6,57.45,Red
2018-01-01 00:00:56,12.07,19.1,3.7,48233.0,1,-43.03,0,4.483333,3.7,55.1,Red
2018-01-01 00:01:06,12.06,19.5,3.9,35000.0,0,-33.74,0,4.4,3.9,45.8,Red
2018-01-01 00:01:16,12.07,19.4,4.6,50000.0,0,-40.99,0,4.425,4.6,53.06,Green
2018-01-01 00:01:27,12.06,19.8,5.1,48233.0,0,-33.48,0,4.5,5.1,45.54,Yellow
2018-01-01 00:01:37,12.07,20.0,5.2,35000.0,1,-33.13,0,4.57,5.2,45.2,Red


The status classification works as expected when the checked columns contain no NaNs.

-----------------
# Now what if there are NaNs?

In [11]:
# Blank out the gust value in 5 distinct rows. Sampling is seeded and done
# without replacement so that the same rows, and exactly 5 of them (or all
# rows if the frame is shorter), are affected on every run.
nan_rows = np.random.default_rng(1).choice(
    df.index.values, min(5, len(df)), replace=False
)
df.loc[nan_rows, 'wind_gust'] = np.nan
# Make the whole precipitation column missing. Bracket assignment is used
# because attribute-style assignment silently creates a plain Python
# attribute instead of a column if the name does not already exist.
df['precipitation'] = np.nan

In [12]:
# Recompute the status labels with NaNs present in wind_gust and in the
# entire precipitation column, then display the frame.
df['status'] = prep.get_weather_status(df,thresholds)
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:05,12.08,17.8,5.6,43682.0,,-42.26,0,5.6,,54.34,Yellow
2018-01-01 00:00:15,12.09,18.1,5.3,50000.0,,-46.06,0,5.45,5.3,58.15,Yellow
2018-01-01 00:00:25,12.07,18.3,4.8,46022.0,,-46.98,0,5.233333,4.8,59.05,Yellow
2018-01-01 00:00:36,12.07,18.3,3.9,50000.0,,-38.2,0,4.9,3.9,50.27,Yellow
2018-01-01 00:00:46,12.06,18.8,3.6,35000.0,,-45.39,0,4.64,3.6,57.45,Red
2018-01-01 00:00:56,12.07,19.1,3.7,48233.0,,-43.03,0,4.483333,3.7,55.1,Yellow
2018-01-01 00:01:06,12.06,19.5,3.9,35000.0,,-33.74,0,4.4,3.9,45.8,Red
2018-01-01 00:01:16,12.07,19.4,4.6,50000.0,,-40.99,0,4.425,,53.06,Yellow
2018-01-01 00:01:27,12.06,19.8,5.1,48233.0,,-33.48,0,4.5,5.1,45.54,Yellow
2018-01-01 00:01:37,12.07,20.0,5.2,35000.0,,-33.13,0,4.57,,45.2,Red


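Red still works because only one Red condition needs to be true. Green, by contrast, requires every condition to pass, and a comparison against NaN always evaluates to False, so the all-NaN precipitation column causes every Green check to fail. Rows that would otherwise be Green are therefore labelled Yellow.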