In [None]:
import pandas as pd
import numpy as np
import mlflow, re

# experiments
from experiments.noaa.machine_learning import NOAAMLTraining, FeatureExtractionExperiment
from experiments.noaa.deterministic import NOAADeterministicExperiment
from experiments.noaa.kriging_experiment import NOAAKrigingExperiment
from experiments.noaa import *
# Shared MLflow tracking client; later cells use it to download run artifacts.
client = mlflow.tracking.MlflowClient()

In [None]:
# RBF epsilon sweep: take the deterministic-experiment runs whose config name
# contains "rbf" and rank them by MAE inside each (evaluation set, RBF kernel) group.
mlflow_experiment = mlflow.get_experiment_by_name(NOAADeterministicExperiment.experiment_name)
available_configs = NOAADeterministicExperiment.config.config.keys()
df = mlflow.search_runs([mlflow_experiment.experiment_id]).sort_values(["start_time"], ascending=False)
metric_cols =  df.columns[df.columns.str.contains("metrics")].tolist()
param_cols = ["rbf","params.epsilon","params.eval_set", "tags.config"]
without_eps = ["thin_plate_spline", "cubic", "idw","linear"]
eps_results_df = (
    df.dropna(how="all",axis=1)
    # Single row filter. The former second date bound (> "2022-06-19") was implied by
    # this stricter one, and na=False makes runs without a config tag explicitly excluded.
    .loc[(df.start_time>"2022-06-19 18:00:00")&(df["tags.config"].str.contains("rbf", na=False))]
    # Kernel name = the part of the config name between "rbf_" and "_eps_".
    .assign(rbf=lambda x: x["tags.config"].str.replace("rbf_(.*)_eps_(.*)", r"\1", regex=True))
    .loc[:,param_cols+sorted(metric_cols)]
    # Regex clean-up applied to every string cell (strips quotes, set suffixes, underscores).
    .replace(
        {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
        value=None,
        regex=True
    )
    .groupby(["params.eval_set","rbf"])
    .apply(lambda x: x.sort_values("metrics.mae",ascending=True))
    .drop(columns=["params.eval_set","rbf", "tags.config"])
    .dropna(subset=["metrics.rmse"])
    # Strip the "metrics." / "params." prefixes for display.
    .rename(columns={col: col.replace("metrics.","").replace("params.","") for col in df.columns})
)
# Only evaluation sets 1 and 3 are shown.
eps_results_df.loc[["set1","set3"]]

In [None]:
# Deterministic interpolation summary: one row per (evaluation area, algorithm),
# ordered by RMSE within each area.
available_configs = NOAADeterministicExperiment.config.config.keys()
mlflow_experiment = mlflow.get_experiment_by_name(NOAADeterministicExperiment.experiment_name)
df = (
    mlflow.search_runs([mlflow_experiment.experiment_id])
    .dropna(how="all",axis=1)
    .set_index("run_id")
    .sort_values(["start_time"], ascending=False)
)
# Keep recent runs whose config is still declared by the experiment class.
df = df.loc[(df.start_time>"2022-06-19 18:00:00")&(df["tags.config"].isin(available_configs))]
metric_cols =  df.columns[df.columns.str.contains("metrics")].sort_values(ascending=False).tolist()
param_cols = ["params.epsilon","params.eval_frac","params.num_evaluated_points","params.eval_set", "tags.config"]
# Algorithms for which the epsilon parameter is shown as "-".
without_eps = ["thin_plate_spline", "cubic", "idw","linear"]
results_by_set = (
    df.dropna(how="all",axis=1)
    # Runs are sorted newest first, so keep="first" retains the latest run per config / set.
    .drop_duplicates(subset=["tags.config", "params.eval_set"], keep="first")
    .loc[:,param_cols+metric_cols]
    .rename(columns={col: col.replace("metrics.","").replace("params.","") for col in df.columns})
    .rename(columns={"eval_set":"eval_area"})
    .assign(
        eval_area = lambda df: df.eval_area.str.replace("set","Area "), 
        inference_time_per_point = lambda df: (df.time_to_eval/df.num_evaluated_points.astype(int))*1000, # ms
        algorithm = lambda df: df["tags.config"].values
    )
    # Regex clean-up applied to every string cell; note that it turns "_" into " ".
    .replace(
        {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
        value=None,
        regex=True
    )
    # The clean-up above has already replaced underscores in `algorithm`, so the
    # underscore-separated names in `without_eps` (e.g. "thin_plate_spline") are
    # compared with separators normalised on both sides; otherwise they never match.
    .assign(epsilon = lambda df: df.apply(lambda row: row.epsilon if not any(a.replace("_", " ") in row.algorithm.replace("_", " ") for a in without_eps) else "-", axis=1))
    .groupby(["eval_area"])
    .apply(lambda x: x.sort_values("rmse",ascending=True))
    .reset_index(level=-1,drop=True)
    .drop_duplicates(subset=["eval_area","algorithm"], keep="last")
    .set_index("algorithm", append=True)
    .drop(columns=["eval_area", "time_to_eval", "tags.config"])
    .dropna(subset=["rmse"])
)
results_by_set

In [None]:
# LaTeX export of the deterministic summary. `n_jobs` is not among the columns selected
# for this table, so missing labels are ignored instead of raising KeyError.
print(results_by_set.drop(columns=["eval_frac", "n_jobs"], errors="ignore").round(3).to_latex(index=True))

In [None]:
# Partial-set RMSE table for the deterministic experiment: one row per
# (evaluation area, algorithm), one column per partial set.
df_runs = df.drop_duplicates(
    subset=["tags.config", "params.eval_set"], keep="first"
).dropna(subset=metric_cols)

df_partials = []
for run_id in df_runs.index:
    conf_name = df_runs.loc[run_id, "tags.config"].lower()
    # Configurations whose name mentions any of these tokens are left out.
    if any(token in conf_name for token in ("time", "gaussian", "idw")):
        continue
    path = client.download_artifacts(run_id, "partial_eval_metrics.html")
    df_partial = pd.read_html(path)[0]
    # Drop the innermost header level and label the first column.
    df_partial.columns = ["partial_set", *df_partial.columns.droplevel(-1)[1:]]
    df_partial = (
        df_partial
        .assign(
            algorithm=lambda _: conf_name,
            eval_frac=lambda _: df_runs.loc[run_id, "params.eval_frac"].lower(),
            eval_area=df_runs.loc[run_id, "params.eval_set"].replace("set", "Area ").replace('"', ""),
        )
        # Regex clean-up applied to every string cell.
        .replace(
            {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
            value=None,
            regex=True,
        )
        .loc[:, ["eval_area", "algorithm", "partial_set", "rmse", "r2", "mae"]]
        .round(3)
        # Only RMSE is kept in the pivoted table.
        .pivot_table(index=["eval_area", "algorithm"], columns="partial_set", values=["rmse"])
    )
    df_partials.append(df_partial)
partials_df = pd.concat(df_partials).sort_index()
partials_df

In [None]:
# LaTeX export of the partial-set table; "NaN" entries are printed as "-".
print(partials_df.to_latex(index=True).replace("NaN", "-"))

------- 
### Kriging

In [None]:
# Ordinary-kriging summary: one row per (evaluation area, variogram model),
# ordered by RMSE within each area.
available_configs = NOAAKrigingExperiment.config.config.keys()
mlflow_experiment = mlflow.get_experiment_by_name(NOAAKrigingExperiment.experiment_name)
df = (
    mlflow.search_runs([mlflow_experiment.experiment_id])
    .dropna(how="all",axis=1)
    .set_index("run_id")
    .sort_values(["start_time"], ascending=False)
)
# Keep recent runs whose config is still declared by the experiment class.
df = df.loc[(df.start_time>"2022-06-19 18:00:00")&(df["tags.config"].isin(available_configs))]
metric_cols =  df.columns[df.columns.str.contains("metrics")].sort_values(ascending=False).tolist()
param_cols = ["params.model","params.eval_frac","params.num_evaluated_points","params.eval_set", "tags.config", "params.n_jobs"]
results_by_set = (
    df
    # Runs are sorted newest first, so keep="first" retains the latest run per config / set.
    .drop_duplicates(subset=["tags.config", "params.eval_set"], keep="first")
    .loc[:,param_cols+metric_cols]
    .dropna()
    .rename(columns={col: col.replace("metrics.","").replace("params.","") for col in df.columns})
    # "tags.config" becomes the only `algorithm` column. A former .assign(algorithm=...)
    # before this rename produced a second column with the same label; both were
    # dropped further down, so it has been removed.
    .rename(columns={"eval_set":"eval_area", "tags.config":"algorithm", "model":"variogram"})
    .assign(
        eval_area = lambda df: df.eval_area.str.replace("set","Area "), 
        inference_time_per_point = lambda df: (df.time_to_eval/df.num_evaluated_points.astype(int))*1000, # convert to ms
    )
    # Regex clean-up applied to every string cell.
    .replace(
        {"_set.": "", "ok":"ordinary kriging", '"':'',"_": " "},
        value=None,
        regex=True
    )
    .drop_duplicates(subset=["eval_area","variogram"], keep="first")
    .groupby(["eval_area"])
    .apply(lambda x: x.sort_values("rmse",ascending=True))
    .reset_index(level=-1,drop=True)
    .set_index("variogram", append=True)
    .drop(columns=["eval_area","time_to_eval","algorithm", "num_evaluated_points"])
    .dropna(subset=["rmse"])
)
results_by_set

In [None]:
# LaTeX export of the kriging summary without the bookkeeping columns.
print(
    results_by_set
    .drop(columns=["eval_frac", "n_jobs"])
    .round(3)
    .to_latex(index=True)
)

In [None]:
# Partial-set RMSE table for the kriging experiment: one row per
# (evaluation area, algorithm), one column per partial set.
df_runs = df.drop_duplicates(
    subset=["tags.config", "params.eval_set"], keep="first"
).dropna(subset=metric_cols)

df_partials = []
for run_id in df_runs.index:
    conf_name = df_runs.loc[run_id, "tags.config"].lower()
    # Configurations whose name mentions any of these tokens are left out.
    if any(token in conf_name for token in ("time", "gaussian", "idw")):
        continue
    path = client.download_artifacts(run_id, "partial_eval_metrics.html")
    df_partial = pd.read_html(path)[0]
    # Drop the innermost header level and label the first column.
    df_partial.columns = ["partial_set", *df_partial.columns.droplevel(-1)[1:]]
    df_partial = (
        df_partial
        .assign(
            # "OK " + capitalised text captured between "ok_" and the last "_" of the config name.
            algorithm="OK " + re.sub("ok_(.*)_.*", r"\1", df_runs.loc[run_id, "tags.config"]).capitalize(),
            eval_area=df_runs.loc[run_id, "params.eval_set"].replace("set", "Area ").replace('"', ""),
        )
        # Regex clean-up applied to every string cell.
        .replace(
            {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
            value=None,
            regex=True,
        )
        .loc[:, ["eval_area", "algorithm", "partial_set", "rmse", "r2", "mae"]]
        .round(3)
        # Only RMSE is kept in the pivoted table.
        .pivot_table(index=["eval_area", "algorithm"], columns="partial_set", values=["rmse"])
    )
    df_partials.append(df_partial)
partials_df = pd.concat(df_partials).sort_index()
# Remove the outer "rmse" level left on the columns by pivot_table.
partials_df.columns = partials_df.columns.droplevel(0)
partials_df

In [None]:
# LaTeX export of the kriging partial-set table with tidied headers.
print(
    re.sub(
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # Upper-cases single-letter (a-f) cells that sit between two column separators.
        r"&\s+([a-f])\s+&",
        lambda m: " & " + m.group(1).upper() + " & ",
        partials_df
        .to_latex(index=True)
        .replace("NaN", "-")
        # to_latex may or may not escape underscores depending on the pandas version
        # and settings, so both spellings of the index name are handled.
        .replace(r"eval\_area", "Area")
        .replace("eval_area", "Area")
        .replace("algorithm", "Algorithm")
    )

)

In [None]:
# LaTeX export of a results table keyed by (eval_area, rbf).
# NOTE(review): this chain needs `results_by_set` to carry an `algorithm` index level
# (renamed to `rbf` here), as built by the deterministic summary cell. The kriging
# summary cell indexes by `variogram` instead, so `df.rbf` would not exist if this
# cell runs right after it - confirm the intended cell order.
print(
    results_by_set
    .reset_index()
    .rename(columns={
        "algorithm":"rbf", 
        "num_evaluated_points":"evaluated points", 
        "inference_time_per_point":"inference time / point (ms)",
    })
    # Strip the "rbf " prefix; a cell that is exactly "rbf" becomes empty.
    .assign(rbf=lambda df: df.rbf.str.replace("rbf ",""))
    .replace({"rbf":""},value=None)
    .drop(columns=["eval_frac"])
    # Three decimals everywhere, then two for the per-point inference time.
    .round(3)
    .round({"inference time / point (ms)":2})
    .set_index(["eval_area","rbf"])
    .to_latex(index=True)
)

----

In [None]:
# Machine-learning summary: one row per (evaluation area, model), ordered by RMSE
# within each area.
available_configs = NOAAMLTraining.config.config.keys()
mlflow_experiment = mlflow.get_experiment_by_name(NOAAMLTraining.experiment_name)
df = (
    mlflow.search_runs([mlflow_experiment.experiment_id])
    .dropna(how="all", axis=1)
    .set_index("run_id")
    .sort_values(["start_time"], ascending=False)
)
# Keep recent runs whose config is still declared by the experiment class.
df = df.loc[
    (df.start_time > "2022-06-22 22:00:00")
    & (df["tags.config"].isin(available_configs))
]
metric_cols = [
    "metrics.rmse",
    "metrics.r2",
    "metrics.mae",
    "metrics.inference_time",
    "metrics.fit_time",
]
param_cols = ["params.model", "tags.config", "params.eval_set"]
results_by_set = (
    df
    # Where a metric is missing, fall back to the column holding the same quantity
    # under another name (a param for the timings, "metrics.eval_mae" for MAE).
    .assign(**{
        "metrics.inference_time": lambda runs: np.where(
            runs["metrics.inference_time"].isnull(),
            runs["params.inference_time"],
            runs["metrics.inference_time"],
        ).astype(float),
        "metrics.fit_time": lambda runs: np.where(
            runs["metrics.fit_time"].isnull(),
            runs["params.fit_time"],
            runs["metrics.fit_time"],
        ).astype(float),
        "metrics.mae": lambda runs: np.where(
            runs["metrics.mae"].isnull(),
            runs["metrics.eval_mae"],
            runs["metrics.mae"],
        ).astype(float),
    })
    .loc[:, param_cols + metric_cols]
    .rename(columns={col: col.replace("metrics.", "").replace("params.", "") for col in df.columns})
    .assign(
        # The evaluation area is the last "_"-separated token of the config name.
        eval_area=lambda runs: runs["tags.config"].str.split("_").str[-1].str.replace("set", "Area "),
        # Scale by 1000 (seconds to milliseconds, per the original "convert to ms" note).
        inference_time=lambda runs: (runs.inference_time) * 1000,
    )
    # Regex clean-up applied to every string cell.
    .replace(
        {"_set.": "", "ok":"ordinary kriging", '"':'',"_": " "},
        value=None,
        regex=True,
    )
    # Last dotted component of the model string minus its final two characters
    # (presumably the tail of a class repr - TODO confirm), capitalised.
    .assign(model=lambda runs: runs["model"].str.split(".").str[-1].str[:-2].str.capitalize())
    .groupby(["eval_area"])
    .apply(lambda group: group.sort_values("rmse", ascending=True))
    .reset_index(level=-1, drop=True)
    .set_index("model", append=True)
    .drop(columns=["eval_area", "eval_set", "tags.config"])
    .sort_index(axis=1, ascending=False)
    .round(4)
)
results_by_set

In [None]:
# LaTeX export of the ML summary with the "regressor" suffix removed from the text.
print(results_by_set.to_latex(index=True).replace("regressor", ""))

In [None]:
# Partial-set RMSE table for the ML experiment: one row per (evaluation area, model),
# one column per partial set.
# NOTE(review): `metric_cols` here includes the inference/fit time metrics, so runs
# that only logged those values as params are removed by the dropna below - confirm
# that this is intended.
df_runs = (
    df
    .drop_duplicates(subset=["tags.config", "params.eval_set"], keep="first")
    .dropna(subset=metric_cols)
)
df_partials = []
for run_id in df_runs.index:
    conf_name = df_runs.loc[run_id,"tags.config"].lower()
    # Configurations whose name mentions "time", "gaussian" or "idw" are left out.
    if "time" in conf_name or any(s in conf_name for s in ["gaussian", "idw"]):
        continue
    path = client.download_artifacts(run_id, "partial_eval_metrics.html")
    df_partial = pd.read_html(path)[0]
    # Drop the innermost header level, then label the first column.
    df_partial.columns = df_partial.columns.droplevel(-1)
    df_partial.columns = ["partial_set"] + df_partial.columns.tolist()[1:]
    df_partial = (
        df_partial.assign(
            # Model label: config name without "config", trimmed, cut at the first ".".
            # The original `.replace("config", "".strip())` stripped the empty
            # replacement string (a no-op); the strip now applies to the name itself.
            algorithm = re.sub("ok_(.*)_.*",r"\1",df_runs.loc[run_id, "tags.config"]).replace("config","").strip().split(".")[0],
            eval_area = df_runs.loc[run_id,"params.eval_set"].replace("set","Area ").replace('"',"")
        ).replace(
            {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
            value=None,
            regex=True
        )
        .loc[:, ["eval_area","algorithm", "partial_set"]+["rmse", "r2", "mae"]]
        .round(3)
        # Only RMSE is kept in the pivoted table.
        .pivot_table(index=["eval_area","algorithm"], columns="partial_set", values=["rmse"])
    )
    df_partials.append(df_partial)
partials_df = pd.concat(df_partials).sort_index()
# Remove the outer "rmse" level left on the columns by pivot_table.
partials_df.columns = partials_df.columns.droplevel(0)
partials_df

In [None]:
# LaTeX export of the ML partial-set table with tidied headers.
print(
    re.sub(
        # Raw strings throughout: "\s" and "\_" in plain literals are invalid escape
        # sequences. The matched text is unchanged.
        # Upper-cases single-letter (a-f) cells that sit between two column separators.
        r"&\s+([a-f])\s+&",
        lambda m: " & " + m.group(1).upper() + " & ",
        partials_df
        .to_latex(index=True)
        .replace("NaN", "-")
        .replace(r"eval\_area", "Area")
        .replace(r"partial\_set", "Partial Set")
        .replace("algorithm", "Model")
    )

)

In [None]:
# Regression-kriging summary: one row per (evaluation area, variogram model),
# ordered by RMSE within each area.
available_configs = NOAARegressionKrigingExperiment.config.config.keys()
# NOTE(review): runs are looked up under NOAAKrigingExperiment's experiment name while
# the configs come from NOAARegressionKrigingExperiment - presumably both log to the
# same MLflow experiment (hence the "rk" prefix filter below); confirm.
mlflow_experiment = mlflow.get_experiment_by_name(NOAAKrigingExperiment.experiment_name)
df = (
    mlflow.search_runs([mlflow_experiment.experiment_id])
    .dropna(how="all",axis=1)
    .set_index("run_id")
    .sort_values(["start_time"], ascending=False)
)
# na=False: a run without a config tag yields NaN in the mask, and indexing with a
# mask that contains NaN raises; such runs are excluded instead.
df = df[df["tags.config"].str.startswith("rk", na=False)]

# Keep recent runs whose config is still declared by the experiment class.
df = df.loc[(df.start_time>"2022-06-22 22:00:00")&(df["tags.config"].isin(available_configs))]
metric_cols =  df.columns[df.columns.str.contains("metrics")].sort_values(ascending=False).tolist()
param_cols = ["params.model","params.eval_frac","params.num_evaluated_points","params.eval_set", "tags.config", "params.n_jobs"]

results_by_set = (
    df
    # Runs are sorted newest first, so keep="first" retains the latest run per config / set.
    .drop_duplicates(subset=["tags.config", "params.eval_set"], keep="first")
    .loc[:,param_cols+metric_cols]
    .dropna()
    .rename(columns={col: col.replace("metrics.","").replace("params.","") for col in df.columns})
    # "tags.config" becomes the only `algorithm` column. A former .assign(algorithm=...)
    # before this rename produced a second column with the same label; both were
    # dropped further down, so it has been removed.
    .rename(columns={"eval_set":"eval_area", "tags.config":"algorithm", "model":"variogram"})
    .assign(
        eval_area = lambda df: df.eval_area.str.replace("set","Area "), 
        inference_time_per_point = lambda df: (df.time_to_eval/df.num_evaluated_points.astype(int))*1000, # convert to ms
    )
    # Regex clean-up applied to every string cell.
    .replace(
        {"_set.": "", "ok":"ordinary kriging", '"':'',"_": " "},
        value=None,
        regex=True
    )
    .drop_duplicates(subset=["eval_area","variogram"], keep="first")
    .groupby(["eval_area"])
    .apply(lambda x: x.sort_values("rmse",ascending=True))
    .reset_index(level=-1,drop=True)
    .set_index("variogram", append=True)
    .drop(columns=["eval_area","time_to_eval","algorithm", "num_evaluated_points"])
    .dropna(subset=["rmse"])
)
results_by_set

In [None]:
# LaTeX export of the regression-kriging summary without the bookkeeping columns.
print(
    results_by_set
    .drop(columns=["eval_frac", "n_jobs"])
    .round(3)
    .to_latex(index=True)
)

In [None]:
# Partial-set RMSE table for the regression-kriging runs: one row per
# (evaluation area, algorithm), one column per partial set.
df_runs = df.drop_duplicates(
    subset=["tags.config", "params.eval_set"], keep="first"
).dropna(subset=metric_cols)

df_partials = []
for run_id in df_runs.index:
    conf_name = df_runs.loc[run_id, "tags.config"].lower()
    # Configurations whose name mentions "time" are left out.
    if "time" in conf_name:
        continue
    path = client.download_artifacts(run_id, "partial_eval_metrics.html")
    df_partial = pd.read_html(path)[0]
    # Drop the innermost header level and label the first column.
    df_partial.columns = ["partial_set", *df_partial.columns.droplevel(-1)[1:]]
    df_partial = (
        df_partial
        .assign(
            # Capitalised text captured between "rk_" and the last "_" of the config name.
            algorithm=re.sub("rk_(.*)_.*", r"\1", df_runs.loc[run_id, "tags.config"]).capitalize(),
            eval_area=df_runs.loc[run_id, "params.eval_set"].replace("set", "Area ").replace('"', ""),
        )
        # Regex clean-up applied to every string cell.
        .replace(
            {"_set.": "", "linear":"linear_barycentric", '"':'', "eps_.*":"","_": " "},
            value=None,
            regex=True,
        )
        .loc[:, ["eval_area", "algorithm", "partial_set", "rmse", "r2", "mae"]]
        .round(3)
        # Only RMSE is kept in the pivoted table.
        .pivot_table(index=["eval_area", "algorithm"], columns="partial_set", values=["rmse"])
    )
    df_partials.append(df_partial)
partials_df = pd.concat(df_partials).sort_index()
# Remove the outer "rmse" level left on the columns by pivot_table.
partials_df.columns = partials_df.columns.droplevel(0)
partials_df

In [None]:
# LaTeX export of the regression-kriging partial-set table.
print(
    partials_df
    .to_latex(index=True)
)

In [None]:
import subprocess

# Launch each regression-kriging configuration (except those whose name contains
# "linear" or "matern") as its own `python -m experiments` process, one at a time.
available_configs = NOAARegressionKrigingExperiment.config.config.keys()
experiment_name = NOAARegressionKrigingExperiment.__name__
# Value passed as --eval_frac. It used to be hard-coded as 0.5 in the command while an
# unused variable next to it said "0.2"; the variable now drives the command and keeps
# the value that was actually being run.
eval_frac = "0.5"
for config_name in available_configs:
    if "linear" in config_name or "matern" in config_name:
        continue
    print(config_name)
    p = subprocess.Popen(["python", "-m", "experiments", experiment_name, config_name, f"--eval_frac={eval_frac}"])
    print(f"Started experiment {config_name} on process {p.pid}")
    # Block until the child exits so the experiments run sequentially.
    p.communicate()
    # Report failures instead of silently moving on to the next configuration.
    if p.returncode != 0:
        print(f"Experiment {config_name} exited with code {p.returncode}")

In [None]:
# Display the configuration names iterated by the launcher cell above.
available_configs

------