FIX

In [4]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import glob
import json
import os

from joblib import Parallel, delayed
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import mlflow
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Optimize database creation")

# import custom modules
import sys
sys.path.append('../')
from HARP import harptools as ht
from ML.learn import cross_validation

from make_tab_db.create_agg_db import aggregate_orbit_data

ROOT = "../../Data/"
PARALELLIZE = True
OVERWRITE = False

In [5]:
def tune_hyperparameters(Hparams, show_progress=True):
    """
    Function that tunes the hyperparameters for the aggregation function
    """
    with mlflow.start_run():
        mlflow.log_params(Hparams)

        with open(ROOT+"aoi.json", "r") as f:
            aoi = json.load(f)

        aucs = []
        for aoi_name, area_of_interest in aoi.items():
            # temporary
            if aoi_name != "Mediterranean":
                continue
            
            file_name = "_".join([aoi_name]+[str(val) for val in Hparams.values()])
            if os.path.exists(ROOT+f"AGG_DB/{file_name}.csv") and not OVERWRITE:
                # print(f"File {file_name} already exists, skipping...")
                all_data = pd.read_csv(ROOT+f"AGG_DB/{file_name}.csv")
                results = cross_validation(all_data, groupkfold=True)
                opt_score = np.mean(results["NO2, HCHO, SO2"]["test_roc_auc"])
                aucs.append(opt_score)
                continue
            
            tropomi_files = glob.glob(ROOT+f"TinyEODATA/{aoi_name}/Sentinel-5P/TROPOMI/L3__Merged_/*/*/*/*.nc")
            # replace \ with / for windows
            tropomi_files = [file.replace("\\", "/") for file in tropomi_files]
            
            sea_polygon = ht.get_water_geometry(area_of_interest, ROOT)
            
            if PARALELLIZE:
                if show_progress:
                    results = list(
                        tqdm(
                            Parallel(return_as="generator", n_jobs=-1)(
                                delayed(aggregate_orbit_data)
                                (orbit_file=orbit_file, area_of_interest=area_of_interest, ocean_poly=sea_polygon, aoi_name=aoi_name, Hparams=Hparams) 
                                for orbit_file in tropomi_files
                                ), 
                            total=len(tropomi_files), 
                            colour="green", 
                            desc=f"Processing {aoi_name} data",
                            leave=False
                            )
                        )
                else:
                    results = Parallel(n_jobs=-1)(
                        delayed(aggregate_orbit_data)
                        (orbit_file=orbit_file, area_of_interest=area_of_interest, ocean_poly=sea_polygon, aoi_name=aoi_name, Hparams=Hparams) 
                        for orbit_file in tropomi_files
                        )
                all_data = pd.concat([result for result in results if result is not None])
            else:
                all_data = []
                for orbit_file in tqdm(tropomi_files):
                    result = aggregate_orbit_data(orbit_file=orbit_file, area_of_interest=area_of_interest, ocean_poly=sea_polygon, aoi_name=aoi_name, Hparams=Hparams)
                    if result is not None:
                        all_data.append(result)
                all_data = pd.concat(all_data)
            
            # shuffle the data
            all_data = all_data.sample(frac=1).reset_index(drop=True)
            all_data.to_csv(ROOT+f"AGG_DB/{file_name}.csv", index=False)
            
            results = cross_validation(all_data, groupkfold=True)
            opt_score = np.mean(results["NO2, HCHO, SO2"]["test_roc_auc"])
            aucs.append(opt_score)
        
        mlflow.log_metric("mean_roc_auc", np.mean(aucs))

    # print(f"Mean ROC AUC: {1-np.mean(losses)}")
    return {
        "loss": -np.mean(aucs),
        "params": Hparams,
        "status": STATUS_OK
    }

In [6]:
choice_lists = {
    "ORBIT_TIME_METHOD": ["max", "mean", "min"],
    "INTERPOLATE_SHIP_LENGTH": [True, False],
    "RESOLUTION": [0.005, 0.05, 0.1],
    "MIN_LENGTH": [50, 90, 120, 180],
    "MAX_SPARSITY": [0.1, 0.25, 0.5, 0.75],
    "TILING": ["random", "grid"]
}

space = {
        'LOOKBACK_M': hp.randint("LOOKBACK_M", 60, 60*5), # time in minutes
        'MIN_SPEED': hp.randint('MIN_SPEED', 4, 10), 
        'MIN_LENGTH': hp.choice('MIN_LENGTH', choice_lists["MIN_LENGTH"]),
        'RESOLUTION': hp.choice('RESOLUTION', choice_lists["RESOLUTION"]),
        'INTERPOLATE_SHIP_LENGTH': hp.choice('ESTIMATE_SHIP_LENGTH', [True, False]),
        'MAX_SPARSITY': hp.choice('MAX_SPARSITY', choice_lists["MAX_SPARSITY"]),
        'ORBIT_TIME_METHOD': hp.choice('ORBIT_TIME_METHOD', choice_lists["ORBIT_TIME_METHOD"]),
        'TILING': hp.choice('TILING', choice_lists["TILING"])
    }

trials = Trials()
# sparktrials = SparkTrials()
best = fmin(tune_hyperparameters, space, algo=tpe.suggest, max_evals=50, trials=trials, trials_save_file="trials", show_progressbar=True)

for trial in trials.trials:
    roc_auc = -trial['result']['loss']
    values = trial['misc']['vals']
    for key, value in values.items():
        values[key] = value[0]
    # convert the indices to the actual values
    for key, value in choice_lists.items():
        values[key] = choice_lists[key][values[key]]

  2%|▏         | 1/50 [07:08<5:50:11, 428.81s/trial, best loss: -0.6545601219495469]


KeyboardInterrupt: 

In [5]:
# print(type(values["LOOKBACK_M"]))
# print(best)

best_values = best.copy()
for key, value in choice_lists.items():
    best_values[key] = value[best_values[key]]
    
file_name = "_".join(["Mediterranean"]+[str(val) for val in best_values.values()])
assert os.path.exists(ROOT+f"AGG_DB/{file_name}.csv")
print(f"File `{file_name}.csv` has the best ROC AUC score overall.")

File `Mediterranean_False_202_0.1_180_6_min_0.05.csv` has the best ROC AUC score overall.
