In [1]:
import pandas as pd
import numpy as np
import mlflow

# experiments
from experiments.noaa.machine_learning import NOAAMLTraining, FeatureExtractionExperiment
from experiments.noaa.deterministic import NOAADeterministicExperiment
from experiments.noaa.kriging_experiment import NOAAKrigingExperiment

In [17]:
mlflow_experiment = mlflow.get_experiment_by_name(NOAADeterministicExperiment.experiment_name)
df = mlflow.search_runs([mlflow_experiment.experiment_id]).sort_values(["start_time"], ascending=False)
metric_cols =  df.columns[df.columns.str.contains("metrics")].tolist()
param_cols = ["rbf","params.epsilon","params.eval_set", "tags.config"]
eps_results_df = (
    df.dropna(how="all",axis=1)
    .loc[(df["tags.config"].str.contains("rbf"))&(df.start_time>"2022-06-19")]
    .assign(rbf=lambda x: x["tags.config"].str.replace("rbf_(.*)_eps_(.*)", r"\1", regex=True))
    .loc[:,param_cols+sorted(metric_cols)]
    .groupby(["params.eval_set","rbf"])
    .apply(lambda x: x.sort_values("metrics.mae",ascending=True))
    .drop(columns=["params.eval_set","rbf"])
    .dropna(subset=["metrics.rmse"])
)
eps_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,params.epsilon,tags.config,metrics.mae,metrics.r2,metrics.rmse,metrics.time_to_eval,metrics.time_to_evaluate
params.eval_set,rbf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"""set1""",gaussian,54,0.5,rbf_gaussian_eps_0_5_set1,0.216903,0.754345,0.32575,162.74674,
"""set1""",gaussian,63,1.0,rbf_gaussian_eps_1_set1,0.25039,0.712908,0.352153,162.285774,
"""set1""",gaussian,50,0.1,rbf_gaussian_eps_0_1_set1,0.272163,0.614247,0.408202,185.055384,
"""set1""",gaussian,62,1.5,rbf_gaussian_eps_1_5_set1,0.294929,0.630311,0.399613,165.248395,
"""set1""",gaussian,61,2.0,rbf_gaussian_eps_2_set1,0.308527,0.60111,0.415095,193.488767,
"""set1""",gaussian,44,5.0,rbf_gaussian_eps_5_set1,0.310586,0.596542,0.417465,211.388195,
"""set1""",gaussian,67,0.05,rbf_gaussian_eps_0_05_set1,0.318156,0.453311,0.485949,167.746082,
"""set1""",gaussian,55,0.01,rbf_gaussian_eps_0_01_set1,0.454036,-0.148589,0.704374,156.10574,
"""set1""",inverse_multiquadric,46,1.0,rbf_inverse_multiquadric_eps_1_set1,0.205769,0.779647,0.308518,180.211232,
"""set1""",inverse_multiquadric,59,0.5,rbf_inverse_multiquadric_eps_0_5_set1,0.210978,0.766285,0.317734,169.106194,


In [2]:
available_configs = NOAADeterministicExperiment.config.config.keys()
mlflow_experiment = mlflow.get_experiment_by_name(NOAADeterministicExperiment.experiment_name)
df = mlflow.search_runs([mlflow_experiment.experiment_id]).sort_values(["start_time"], ascending=False)
metric_cols =  df.columns[df.columns.str.contains("metrics")].tolist()
param_cols = ["params.epsilon","params.eval_set", "tags.config"]
without_eps = ["thin_plate_spline", "cubic", "idw","linear"]
results_by_set = (
    df.dropna(how="all",axis=1)
    .loc[(df.start_time>"2022-06-19 18:00:00")&(df["tags.config"].isin(available_configs))]
    # .assign(rbf=lambda x: x["tags.config"].str.replace("rbf_(.*)_eps_(.*)", r"\1", regex=True))
    .loc[:,param_cols+sorted(metric_cols)]
    .rename(columns={col: col.replace("metrics.","").replace("params.","") for col in df.columns})
    .rename(columns={"eval_set":"eval_area", "tags.config":"algorithm"})
    .assign(eval_area = lambda df: df.eval_area.str.replace("set","Area "))
    .replace(
        {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
        value=None,
        regex=True
    )
    .assign(epsilon = lambda df: df.apply(lambda row: row.epsilon if not any(a in row.algorithm for a in without_eps) else "-", axis=1))
    .drop_duplicates(subset=["eval_area","algorithm"], keep="first")
    .groupby(["eval_area"])
    .apply(lambda x: x.sort_values("rmse",ascending=True))
    .reset_index(level=-1,drop=True)
    .set_index("algorithm", append=True)
    .drop(columns=["eval_area","time_to_evaluate"])#, "params.epsilon"])
    .dropna(subset=["rmse"])
    .sort_index(axis=1)    
)
# results_by_set = results_by_set[~results_by_set.index.get_level_values("algorithm").str.contains("time")]
results_by_set

Unnamed: 0_level_0,Unnamed: 1_level_0,epsilon,mae,r2,rmse,time_to_eval
eval_area,algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Area 1,linear barycentric time,-,0.204399,0.772895,0.312554,122.803654
Area 1,rbf cubic,-,0.207246,0.772344,0.31487,574.081649
Area 1,linear barycentric,-,0.203498,0.770427,0.31533,571.922491
Area 1,rbf thin plate spline,1.0,0.207805,0.768965,0.317198,564.232491
Area 1,rbf multiquadric,1.0,0.209071,0.767002,0.318542,222.836038
Area 1,rbf inverse multiquadric,0.5,0.20956,0.765577,0.319515,227.52367
Area 1,idw,-,0.223756,0.757033,0.325286,572.280333
Area 1,rbf gaussian,0.5,0.216872,0.750819,0.329419,227.593938
Area 2,linear barycentric,-,0.200597,0.769605,0.306951,411.565576
Area 2,rbf cubic,-,0.212199,0.773006,0.315696,421.51487


In [None]:
print(results_by_set.round(2).to_latex(index=True))

\begin{tabular}{lllrrrr}
\toprule
       &     & epsilon &   mae &    r2 &  rmse &  time\_to\_eval \\
eval\_area & algorithm &         &       &       &       &               \\
\midrule
Area 1 & rbf cubic &       - &  0.21 &  0.77 &  0.31 &        574.08 \\
       & linear barycentric &       - &  0.20 &  0.77 &  0.32 &        571.92 \\
       & rbf thin plate spline &     1.0 &  0.21 &  0.77 &  0.32 &        564.23 \\
       & idw &       - &  0.22 &  0.76 &  0.33 &        572.28 \\
Area 2 & linear barycentric &       - &  0.20 &  0.77 &  0.31 &        411.57 \\
       & rbf cubic &       - &  0.21 &  0.77 &  0.32 &        421.51 \\
       & rbf thin plate spline &     1.0 &  0.21 &  0.77 &  0.32 &        403.41 \\
       & rbf multiquadric  &     1.0 &  0.22 &  0.76 &  0.32 &       3248.54 \\
       & rbf inverse multiquadric  &     0.5 &  0.22 &  0.76 &  0.32 &       3259.89 \\
       & rbf gaussian  &     0.5 &  0.23 &  0.73 &  0.34 &       3239.66 \\
       & idw &       - &  0.2

In [4]:
import subprocess
available_configs = NOAADeterministicExperiment.config.config.keys()
c = 0
for config_name in available_configs: 
    # if "set3" in config_name and "eps" in config_name:
    if config_name in df.loc[(df.start_time>"2022-06-19 18:00:00")&(df["tags.config"].isin(available_configs)), "tags.config"].values:
        # print(config_name)
        continue
    c += 1
    print(config_name)
    experiment_name = NOAADeterministicExperiment.__name__
    eval_frac = "0.2" if "set1" in config_name else "0.1"
    p = subprocess.Popen(["python", "-m", "experiments", experiment_name , config_name, f"--eval_frac={eval_frac}"])
    print(f"Started experiment {config_name} on process {p.pid}")
    if c%5==0:
        p.communicate()

linear_time_set2
Started experiment linear_time_set2 on process 14472
linear_time_set3
Started experiment linear_time_set3 on process 14473
rbf_time_cubic_set1
Started experiment rbf_time_cubic_set1 on process 14474
rbf_time_cubic_set2
Started experiment rbf_time_cubic_set2 on process 14475
rbf_time_cubic_set3
Started experiment rbf_time_cubic_set3 on process 14476
Attempting to run experiment NOAADeterministicExperiment with config rbf_time_cubic_set3...
Running experiment NOAADeterministicExperiment with experiment_config='rbf_time_cubic_set3' and params: {'eval_frac': 0.1}
Attempting to run experiment NOAADeterministicExperiment with config linear_time_set3...
Running experiment NOAADeterministicExperiment with experiment_config='linear_time_set3' and params: {'eval_frac': 0.1}
Attempting to run experiment NOAADeterministicExperiment with config rbf_time_cubic_set2...
Running experiment NOAADeterministicExperiment with experiment_config='rbf_time_cubic_set2' and params: {'eval_frac'

INFO:NOAADeterministicExperiment:Filtering data on area POLYGON ((-45 12.282308, -45 35.55, -98.251934 35.55, -98.251934 12.282308, -45 12.282308))
INFO:NOAADeterministicExperiment:Filtering data on area POLYGON ((-45 12.282308, -45 35.55, -98.251934 35.55, -98.251934 12.282308, -45 12.282308))
INFO:NOAADeterministicExperiment:Filtered data has shapes (2543380, 14) and (333, 4)
INFO:NOAADeterministicExperiment:Filtered data has shapes (2543380, 14) and (333, 4)
INFO:NOAADeterministicExperiment:Filtering data on area POLYGON ((-80 18.930645, -97.806644 18.930645, -97.806644 30.366655, -82.836914 30.366655, -82.836914 35.55, -68.115234 35.55, -68.115234 25.799891, -80 25.799891, -80 18.930645))
INFO:NOAADeterministicExperiment:Filtered data has shapes (1760006, 14) and (232, 4)
INFO:NOAADeterministicExperiment:Filtering data on area POLYGON ((-80 18.930645, -97.806644 18.930645, -97.806644 30.366655, -82.836914 30.366655, -82.836914 35.55, -68.115234 35.55, -68.115234 25.799891, -80 25.7

Split test obtained with dataset(s)=['buoys_data'] and shape(s)=[(80437, 14)]
Split test obtained with dataset(s)=['buoys_data'] and shape(s)=[(116336, 14)]
Split test obtained with dataset(s)=['buoys_data'] and shape(s)=[(116336, 14)]
Split test obtained with dataset(s)=['buoys_data'] and shape(s)=[(122017, 14)]


INFO:NOAADeterministicExperiment:Interpolator is temporal?: True
INFO:NOAADeterministicExperiment:A random subset of 20.0% of the test data will be used for evaluation
fitting interpolator at eval times...:   0%|          | 6/13179 [00:04<2:57:35,  1.24it/s]

Split test obtained with dataset(s)=['buoys_data'] and shape(s)=[(122017, 14)]


fitting interpolator at eval times...:   0%|          | 10/13179 [00:06<2:31:41,  1.45it/s]INFO:NOAADeterministicExperiment:Interpolator is temporal?: True
INFO:NOAADeterministicExperiment:A random subset of 10.0% of the test data will be used for evaluation
fitting interpolator at eval times...:   0%|          | 6/7967 [00:12<4:33:17,  2.06s/it]INFO:NOAADeterministicExperiment:Interpolator is temporal?: True
fitting interpolator at eval times...:   0%|          | 25/13179 [00:19<3:28:19,  1.05it/s]INFO:NOAADeterministicExperiment:A random subset of 10.0% of the test data will be used for evaluation
fitting interpolator at eval times...:   2%|▏         | 163/7967 [00:29<28:01,  4.64it/s]INFO:NOAADeterministicExperiment:Interpolator is temporal?: True
INFO:NOAADeterministicExperiment:A random subset of 10.0% of the test data will be used for evaluation
fitting interpolator at eval times...:   3%|▎         | 215/7967 [00:39<14:53,  8.67it/s]INFO:NOAADeterministicExperiment:Interpolator i

------

In [15]:
config = FeatureExtractionExperiment("set3").get_config()
train_df = pd.concat(
    [pd.read_parquet(f"{config.output.train_dir}/{year}.parquet") for year in range(2011,2022)],
    axis=0).sort_index()
test_df = pd.concat(
    [pd.read_parquet(f"{config.output.eval_dir}/{year}.parquet") for year in range(2011,2022)],
    axis=0).sort_index()

In [16]:
train_df.distance_to_shore.isnull().sum()

AttributeError: 'DataFrame' object has no attribute 'distance_to_shore'