# Establish Prep Data Pipeline

In [2]:
# set up path to import my python scripts
import sys
import os
src_path = os.path.abspath(os.path.join('..'))
if src_path not in sys.path:
    sys.path.append(src_path+"/src")

In [3]:
%load_ext autoreload
%autoreload 2
import prep_data as prep

In [4]:
import pandas as pd
import numpy as np


In [5]:
# link for data files
base_url = "http://kopiko.ifa.hawaii.edu/weather/archivedata/"

# data column names
column_names = ['date_time','temperature','pressure','humidity','wind_speed','wind_direction','visibility','co2','insolation','vertical_wind_speed','precipitation','10min','dewpoint']
columns_of_interest = ['date_time','temperature','humidity','wind_speed','visibility','precipitation','dewpoint','10min']

# Define reasonable ranges for each column
acceptable_ranges = {
    'temperature': (-273,40),
    'humidity': (0,100),
    'wind_speed': (0,100),
    'visibility': (0,100000),
    'precipitation': (0,100),
    'dewpoint': (-273,40)
    }
# Define the thresholds for ('Green', 'Red') weather - plan to use config file in future
thresholds = {
        'humidity': (75,85),
        'wind_sust': (10,12),
        'wind_gust': (15,15),
        'visibility': (50000,40000),
        'precipitation': (0,0),
        'dewpoint_delta': (6,3)
        }


## Set up for loop with try except in case the file doesn't load

In [19]:
# get list of all data file urls
csv_urls = prep.get_csv_file_links(base_url)


In [25]:
# get list of all data file urls
temp_csv_urls = ['1564.csv',csv_urls[0],csv_urls[1],'2023.csv']

for url in temp_csv_urls:
    # grab the year 
    year = url.split('/')[-1].split('.')[0]
    try:
        df = prep.read_data_of_interest(url,column_names,columns_of_interest)
    except:
        print(f'Failed to read data for {year} at: {url} ')
        continue
    print(f'{year} data loaded, run rest of code')



Failed to read data for 1564 at: 1564.csv 
1993 data loaded, run rest of code
1994 data loaded, run rest of code
Failed to read data for 2023 at: 2023.csv 


In [27]:
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1994-09-11 17:20:00,13.93,,7.350,,0.0,,1
1994-09-11 17:30:00,13.48,,7.290,,0.0,,1
1994-09-11 17:40:00,13.67,,7.140,,0.0,,1
1994-09-11 17:50:00,13.95,,7.310,,0.0,,1
1994-09-11 18:00:00,13.85,,7.960,,0.0,,1
...,...,...,...,...,...,...,...
1994-12-31 23:10:00,17.21,,2.825,,0.0,,1
1994-12-31 23:20:00,17.18,,2.209,,0.0,,1
1994-12-31 23:30:00,17.55,,3.314,,0.0,,1
1994-12-31 23:40:00,17.09,,3.867,,0.0,,1


In [20]:
year = 2020
link = prep.get_specific_year(2020,csv_urls)
df = prep.read_data_of_interest(url,column_names,columns_of_interest)

  df = prep.read_data_of_interest(url,column_names,columns_of_interest)


In [22]:
df

Unnamed: 0_level_0,temperature,humidity,wind_speed,visibility,precipitation,dewpoint,10min
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 00:00:00,11.33,10.1,16.7,50000.0,,,0.0
2020-01-01 00:00:10,11.31,10.4,17.1,50000.0,,,0.0
2020-01-01 00:00:20,11.31,10.3,16.5,50000.0,,,0.0
2020-01-01 00:00:30,11.31,9.6,17.1,50000.0,,,0.0
2020-01-01 00:00:41,11.32,9.2,18.2,50000.0,,,0.0
...,...,...,...,...,...,...,...
2020-12-31 23:59:16,6.46,25.8,11.1,50000.0,,-10.26,
2020-12-31 23:59:26,6.47,25.2,12.1,50000.0,,,
2020-12-31 23:59:36,6.46,24.9,15.1,50000.0,,,
2020-12-31 23:59:46,6.47,26.8,14.1,50000.0,,,


## Rest of pipeline (actually in the for loop)

In [23]:
# check for reasonable values
prep.remove_unreasonable_measurements(df,acceptable_ranges,inplace=True)

# split wind into sustaind and gusts
df = prep.determine_wind_sust_and_gust(df)

# add delta dew point
df['dewpoint_delta'] = df['temperature'] - df['dewpoint']

# convert thresholds to status
df['status'] = prep.get_weather_status(df,thresholds)

# make new df with daily hours
df_status_hours = prep.generate_status_hours_df(df)

# save new df
prep.save_df_to_csv(df_status_hours,year)

TypeError: '<' not supported between instances of 'str' and 'int'