In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

import os, re, glob

from IPython import display

from spatial_interpolation.pipelines.noaa import (
    data_processing as noaa_proceesing,
    metadata_extraction as noaa_metadata,
    data_extraction as noaa_extraction,
    utils as noaa_utils
)

from spatial_interpolation import (
    utils,
    visualization as viz,
)

from spatial_interpolation.visualization import map_viz

%load_ext autoreload
%autoreload 2
%matplotlib inline

%load_ext line_profiler

In [2]:
# Load data: the buoy location reference table plus every processed
# standard-meteorological parquet file, stacked into one frame that is
# ordered by time and indexed by (buoy_id, time).
buoy_locations_info_df = pd.read_csv("references/buoy_locations.csv")
parquet_files = glob.glob("data/03_processed/buoy_stdmet_data/*.parquet")
buoy_df = (
    pd.concat(map(pd.read_parquet, parquet_files), axis=0)
    .sort_values(by="time")
    .set_index(["buoy_id", "time"])
)
utils.print_memory_usage()

memory use: 1.9521 GB


In [None]:
# Build a GeoDataFrame of buoy locations with one point geometry per row.
# The coordinate columns are dropped from the *frame* (the original chained
# `.drop(columns=...)` onto the GeoSeries, where `columns=` is a no-op, so
# latitude/longitude were never removed). `DataFrame.drop` returns a new
# object, so `buoy_locations_info_df` keeps its latitude/longitude columns
# for the cells that still need them.
# NOTE(review): no CRS is set, matching the original; confirm whether the
# coordinates are WGS84 and pass `crs=` if so.
gdf = gpd.GeoDataFrame(
    buoy_locations_info_df.drop(columns=["latitude", "longitude"]),
    geometry=gpd.GeoSeries.from_xy(
        buoy_locations_info_df.longitude,
        buoy_locations_info_df.latitude,
    ),
)

In [5]:
# Feature-engineering input: average the observations of each buoy within
# every hourly bin, then flatten the (buoy_id, time) index back into columns.
df = (
    buoy_df
    .groupby(
        [
            buoy_df.index.get_level_values("buoy_id"),
            pd.Grouper(level="time", freq="H"),
        ]
    )
    .mean()
    .reset_index()
)

In [17]:
# Save the hourly features to parquet, one file per buoy.
# The target directory uses the same "data/..." root as every read in this
# notebook; the previous "../data/..." prefix wrote the files to a location
# the reload cell never looks at.
feature_dir = "data/04_feature/buoy_stdmet_data"
os.makedirs(feature_dir, exist_ok=True)  # to_parquet does not create parent directories

# A single groupby pass replaces one full-frame boolean scan per buoy.
# sort=False keeps buoys in first-appearance order, like `unique()` did;
# observed=True only matters if buoy_id is categorical (skips unused categories).
for buoy_id, buoy_rows in df.groupby("buoy_id", sort=False, observed=True):
    buoy_rows.to_parquet(
        f"{feature_dir}/{buoy_id}.parquet",
        index=False,
    )

In [16]:
# Reload the per-buoy hourly feature files into a single frame.
# glob returns paths in arbitrary, filesystem-dependent order, so sort them
# to get the same row order on every run.
parquet_files = sorted(glob.glob("data/04_feature/buoy_stdmet_data/*.parquet"))
if not parquet_files:
    # Fail with an actionable message instead of concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No parquet files found in data/04_feature/buoy_stdmet_data/"
    )
df = pd.concat(
    [pd.read_parquet(f) for f in parquet_files],
    axis="index",
    ignore_index=True,  # files were written without an index; avoid duplicate labels
)

# Data to model and estimate wave height.
# Rows with a missing wave_height are deliberately kept for now: the
# `dropna(subset=["wave_height"])` filter is disabled, so wh_df is simply
# another name for df.
wh_df = df

In [None]:
# Draw one random observation time and look at the buoys around it.
# No random_state is set, so every run picks a different timestamp.
t = df.time.sample().values[0]
delta = pd.Timedelta(hours=3)
k_nearest = 5

print("Timestamp:",t)

# presumably the buoys reporting within `delta` of `t` — confirm in noaa_utils
buoy_ids = noaa_utils.get_active_buoys_at_time(t, df, delta)
print("buoy:",buoy_ids[0])

# Distances from the first listed buoy, restricted to the buoys found above.
buoy_dists = noaa_utils.get_buoy_distances_to(
    buoy_ids[0],
    buoy_locations_info_df.loc[buoy_locations_info_df.buoy_id.isin(buoy_ids)],
    year=pd.Timestamp(t).year,
)

# Takes the first k entries; assumes the helper returns them nearest-first — TODO confirm
nearest_buoys = buoy_dists[:k_nearest].index.tolist()
print("Nearest buoys:", nearest_buoys)

# Per-buoy mean of every column over the window [t - delta, t + delta].
trange = (t - delta, t + delta)
df.loc[
    df.time.between(trange[0], trange[1]) & df.buoy_id.isin(nearest_buoys)
].groupby("buoy_id").mean()

Timestamp: 2010-05-29T21:00:00.000000000
buoy: 41001
Nearest buoys: ['41025', '41048', '41035', '41013', '41004']


Unnamed: 0_level_0,air_temperature,average_wave_period,dew_point,dominant_wave_period,mean_wave_direction,sea_level_pressure,tide_level,visibility,water_temperature,wave_height,wind_direction,wind_gust,wind_speed,barometer
buoy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
41004,24.728571,4.801429,21.657143,8.027143,,1014.442857,,,24.1,1.048571,115.0,5.928571,4.942857,
41013,24.183333,5.186667,18.133333,8.025,93.333333,1014.75,,,24.7,1.295,142.166667,6.35,5.3,
41025,21.828571,5.178571,16.942857,8.147143,,1015.7,,,20.7,1.26,183.285714,4.457143,3.842857,
41035,23.6,4.575714,18.971429,5.185714,139.571429,1015.228571,,,23.342857,0.894286,157.714286,5.3,4.557143,
41048,20.957143,5.871429,12.4,9.111429,,1016.771429,,,23.114286,1.515714,116.714286,6.857143,5.485714,


In [None]:
# Direction values from the reference buoy (buoy_ids[0]) to each of its
# nearest neighbours. The location table is narrowed to the neighbours plus
# the reference buoy, and the result is reordered to match `nearest_buoys`.
noaa_utils.get_buoy_relative_directions_to(
    buoy_ids[0],
    buoy_locations_info_df.loc[
        buoy_locations_info_df.buoy_id.isin([*nearest_buoys, buoy_ids[0]])
    ],
    year=pd.Timestamp(t).year,
)[nearest_buoys]

buoy_id
41025    263.021073
41048     48.505559
41035    272.487382
41013    283.798170
41004    288.759256
Name: direction, dtype: float64