# Determining the hours

In [38]:
# set up path to import my python scripts
import sys
import os
src_path = os.path.abspath(os.path.join('..'))
if src_path not in sys.path:
    sys.path.append(src_path+"/src")

In [39]:
%load_ext autoreload
%autoreload 2
import prep_data as prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
import pandas as pd
import numpy as np


In [41]:
# link for data files
base_url = "http://kopiko.ifa.hawaii.edu/weather/archivedata/"
# get list of all data file urls
csv_urls = prep.get_csv_file_links(base_url)

# data column names
column_names = ['date_time','temperature','pressure','humidity','wind_speed','wind_direction','visibility','co2','insolation','vertical_wind_speed','precipitation','10min','dewpoint']
columns_of_interest = ['date_time','temperature','humidity','wind_speed','visibility','precipitation','dewpoint','10min']

# Define reasonable ranges for each column
acceptable_ranges = {
    'temperature': (-273,40),
    'humidity': (0,100),
    'wind_speed': (0,100),
    'visibility': (0,100000),
    'precipitation': (0,100),
    'dewpoint': (-273,40)
    }
# Define the thresholds for ('Green', 'Red') weather - plan to use config file in future
thresholds = {
        'humidity': (75,85),
        'wind_sust': (10,12),
        'wind_gust': (15,15),
        'visibility': (50000,40000),
        'precipitation': (0,0),
        'dewpoint_delta': (6,3)
        }


In [5]:
#----TEMP----#
# for initial testing just grab one year - will  use a loop for this later
year = 2018
link = prep.get_specific_year(year,csv_urls)
df_2018 = prep.read_data_of_interest(link, column_names,columns_of_interest)

In [64]:
year = 1994
link = prep.get_specific_year(year,csv_urls)
df_1994 = prep.read_data_of_interest(link, column_names,columns_of_interest)
df = df_1994.copy()

In [65]:
 # create a small subset of data for testing
df = df_2018.iloc[:100000].copy()

# check for reasonable values
prep.remove_unreasonable_measurements(df,acceptable_ranges,inplace=True)
# split wind into sustaind and gusts
df = prep.determine_wind_sust_and_gust(df)
# add delta dew point
df['dewpoint_delta'] = df['temperature'] - df['dewpoint']
# give other values for precipitation and visibility
# df['precipitation'] = np.random.binomial(1,.2,len(df))
# df.loc[np.random.choice(df.index.values,5),'visibility'] = 50000
# df.loc[np.random.choice(df.index.values,5),'visibility'] = 35000
# add status column
df['status'] = prep.get_weather_status(df,thresholds)
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1994-09-11 17:20:00,13.93,,7.350,,0.0,,1,7.350,,,Green
1994-09-11 17:30:00,13.48,,7.290,,0.0,,1,7.290,,,Green
1994-09-11 17:40:00,13.67,,7.140,,0.0,,1,7.140,,,Green
1994-09-11 17:50:00,13.95,,7.310,,0.0,,1,7.310,,,Green
1994-09-11 18:00:00,13.85,,7.960,,0.0,,1,7.960,,,Green
...,...,...,...,...,...,...,...,...,...,...,...
1994-12-31 23:10:00,17.21,,2.825,,0.0,,1,2.825,,,Green
1994-12-31 23:20:00,17.18,,2.209,,0.0,,1,2.209,,,Green
1994-12-31 23:30:00,17.55,,3.314,,0.0,,1,3.314,,,Green
1994-12-31 23:40:00,17.09,,3.867,,0.0,,1,3.867,,,Green


In [43]:
df['status_change'] = df.status.shift() != df.status
df

Unnamed: 0,date_time,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status,status_change
0,2018-01-01 00:00:05,12.08,17.8,5.6,43682.0,,-42.26,0,5.600000,5.6,54.34,Yellow,True
1,2018-01-01 00:00:15,12.09,18.1,5.3,43682.0,,-46.06,0,5.450000,5.3,58.15,Yellow,False
2,2018-01-01 00:00:25,12.07,18.3,4.8,46022.0,,-46.98,0,5.233333,4.8,59.05,Yellow,False
3,2018-01-01 00:00:36,12.07,18.3,3.9,48408.0,,-38.20,0,4.900000,3.9,50.27,Yellow,False
4,2018-01-01 00:00:46,12.06,18.8,3.6,48408.0,,-45.39,0,4.640000,3.6,57.45,Yellow,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2018-01-12 23:25:33,11.81,57.7,5.3,49728.0,,-48.86,0,3.433333,5.3,60.67,Yellow,False
99996,2018-01-12 23:25:43,11.79,57.5,4.3,49633.0,,-42.51,0,3.500000,4.3,54.30,Yellow,False
99997,2018-01-12 23:25:54,11.80,57.7,4.4,49633.0,,-41.70,0,3.566667,4.4,53.50,Yellow,False
99998,2018-01-12 23:26:04,11.80,57.7,3.1,50000.0,,-46.89,0,3.500000,3.1,58.69,Yellow,False


In [44]:
sum(df.status_change)

9615

In [45]:
df.loc[df.status_change].index

Int64Index([    0,   153,   156,   285,   286,   292,   293,   351,   353,
              392,
            ...
            99855, 99865, 99927, 99932, 99953, 99965, 99970, 99974, 99980,
            99984],
           dtype='int64', length=9615)

In [46]:
df.index[df.status_change]

Int64Index([    0,   153,   156,   285,   286,   292,   293,   351,   353,
              392,
            ...
            99855, 99865, 99927, 99932, 99953, 99965, 99970, 99974, 99980,
            99984],
           dtype='int64', length=9615)

## For MVP
Group by day and sum the number of status labels by time step (10 sec or 600 sec). This would leave some incomplete hours if time is skipped, but is a good start.


In [45]:
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min,wind_sust,wind_gust,dewpoint_delta,status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1994-09-11 17:20:00,13.93,,7.350,,0.0,,1,7.350,,,Green
1994-09-11 17:30:00,13.48,,7.290,,0.0,,1,7.290,,,Green
1994-09-11 17:40:00,13.67,,7.140,,0.0,,1,7.140,,,Green
1994-09-11 17:50:00,13.95,,7.310,,0.0,,1,7.310,,,Green
1994-09-11 18:00:00,13.85,,7.960,,0.0,,1,7.960,,,Green
...,...,...,...,...,...,...,...,...,...,...,...
1994-12-31 23:10:00,17.21,,2.825,,0.0,,1,2.825,,,Green
1994-12-31 23:20:00,17.18,,2.209,,0.0,,1,2.209,,,Green
1994-12-31 23:30:00,17.55,,3.314,,0.0,,1,3.314,,,Green
1994-12-31 23:40:00,17.09,,3.867,,0.0,,1,3.867,,,Green


In [66]:
df_status_hours = prep.generate_status_hours_df(df)
df_status_hours

Unnamed: 0_level_0,Green,Yellow,Red
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1994-09-11,6.000000,,0.666667
1994-09-12,23.000000,,1.000000
1994-09-13,22.500000,,1.500000
1994-09-14,24.000000,,
1994-09-15,15.333333,,0.166667
...,...,...,...
1994-12-27,24.000000,,
1994-12-28,24.000000,,
1994-12-29,10.666667,6.333333,0.333333
1994-12-30,5.500000,,


In [67]:
prep.save_df_to_csv(df_status_hours,year,base_path='../data/')

<br>

## Looking at days with low total hours
1994-12-30 only had 5.5 total, all green

In [15]:
pd.to_datetime('1994-12-30').date()

datetime.date(1994, 12, 30)

In [27]:
df[(df.date==(pd.to_datetime('1994-12-30').date())) | (df.date==(pd.to_datetime('1994-12-29').date()))].index.to_list()

[Timestamp('1994-12-29 00:00:00'),
 Timestamp('1994-12-29 00:10:00'),
 Timestamp('1994-12-29 00:20:00'),
 Timestamp('1994-12-29 00:30:00'),
 Timestamp('1994-12-29 00:40:00'),
 Timestamp('1994-12-29 00:50:00'),
 Timestamp('1994-12-29 01:00:00'),
 Timestamp('1994-12-29 01:10:00'),
 Timestamp('1994-12-29 01:20:00'),
 Timestamp('1994-12-29 01:30:00'),
 Timestamp('1994-12-29 01:40:00'),
 Timestamp('1994-12-29 01:50:00'),
 Timestamp('1994-12-29 02:00:00'),
 Timestamp('1994-12-29 02:10:00'),
 Timestamp('1994-12-29 02:20:00'),
 Timestamp('1994-12-29 02:30:00'),
 Timestamp('1994-12-29 02:40:00'),
 Timestamp('1994-12-29 02:50:00'),
 Timestamp('1994-12-29 03:00:00'),
 Timestamp('1994-12-29 03:10:00'),
 Timestamp('1994-12-29 03:20:00'),
 Timestamp('1994-12-29 03:30:00'),
 Timestamp('1994-12-29 03:40:00'),
 Timestamp('1994-12-29 03:50:00'),
 Timestamp('1994-12-29 04:00:00'),
 Timestamp('1994-12-29 04:10:00'),
 Timestamp('1994-12-29 04:20:00'),
 Timestamp('1994-12-29 04:30:00'),
 Timestamp('1994-12-

Data is missing between 1994-12-29 17:10:00 and 1994-12-30 18:30:00

In [34]:
df.loc[pd.to_datetime('1994-12-29 17:10:00')]

temperature            12.51
humidity                 NaN
wind_speed              7.44
visibility               NaN
precipitation            0.0
dewpoint                 NaN
10min                      1
wind_sust               7.44
wind_gust                NaN
dewpoint_delta           NaN
status                 Green
seconds                  600
date              1994-12-29
Name: 1994-12-29 17:10:00, dtype: object

In [37]:
df_status_hours

Unnamed: 0,Green,Yellow,Red
1994-09-11,6.000000,,0.666667
1994-09-12,23.000000,,1.000000
1994-09-13,22.500000,,1.500000
1994-09-14,24.000000,,
1994-09-15,15.333333,,0.166667
...,...,...,...
1994-12-27,24.000000,,
1994-12-28,24.000000,,
1994-12-29,10.666667,6.333333,0.333333
1994-12-30,5.500000,,


To deal with incomplete days I will adjust the daily status so the total hours = 24, keeping the proportions the same. (Normalize and then multiply by 24)  
This will prevent times missing data from pulling down the average.