In [1]:
import pandas as pd
import glob
import os
from tqdm.notebook import tqdm


results_paths = glob.glob("results/results_*/raw")
results_paths


['results/results_ICPSR_1992_EPEST_2003/raw',
 'results/results_ICPSR_1992_EPEST_2002/raw',
 'results/results_ICPSR_1992_EPEST_1997/raw',
 'results/results_ICPSR_1992_EPEST_2001/raw']

In [2]:
def combine_metrics_to_csv(folder_path, save_path):
    # Use glob to get all files matching the pattern
    files = glob.glob(os.path.join(folder_path, "*_metrics.json"))
    # print(files)

    # List to hold individual DataFrames
    dfs = []

    # Process each file
    for file in tqdm(files):
        # Extract the filename without extension
        base_name = os.path.basename(file).replace("_metrics.json", "")

        # Split the filename into pesticide and model_type
        pesticide, model_type = base_name.rsplit("_", 1)

        # Load the JSON data into a DataFrame
        df = pd.read_json(file, typ="series").to_frame().T
        df["pesticide"] = pesticide
        df["model_type"] = model_type

        # Append the DataFrame to dfs list
        dfs.append(df)

    # Concatenate all DataFrames
    final_df = pd.concat(dfs, ignore_index=True)

    # Rearrange columns
    final_df = final_df[["pesticide",
                         "model_type", "mse", "rmse", "mae", "r2"]]
    final_df = final_df.sort_values(by=["pesticide", "model_type"])

    # Write to CSV

    final_df.to_csv(os.path.join(save_path, "metrics.csv"), index=False)


# Usage
print(f"Combining metrics from {len(results_paths)} folders...")
for folder_path in tqdm(results_paths):
    save_path = os.path.dirname(folder_path)
    print(save_path)

    os.makedirs(save_path, exist_ok=True)

    combine_metrics_to_csv(folder_path, save_path)


Combining metrics from 4 folders...


  0%|          | 0/4 [00:00<?, ?it/s]

results/results_ICPSR_1992_EPEST_2003


  0%|          | 0/321 [00:00<?, ?it/s]

results/results_ICPSR_1992_EPEST_2002


  0%|          | 0/320 [00:00<?, ?it/s]

results/results_ICPSR_1992_EPEST_1997


  0%|          | 0/295 [00:00<?, ?it/s]

results/results_ICPSR_1992_EPEST_2001


  0%|          | 0/314 [00:00<?, ?it/s]

In [3]:
from pprint import pprint
import pandas as pd
import os
import glob


def get_files_with_prefix(folder_path, prefix_pattern):
    """Return a list of files matching the prefix pattern."""
    return glob.glob(os.path.join(folder_path, prefix_pattern))


def group_files_by_prefix(files):
    """Groups files by their shared prefix."""
    grouped_files = {}
    for file in files:
        prefix = "_".join(os.path.basename(file).split("_")[:1])
        if prefix not in grouped_files:
            grouped_files[prefix] = []
        grouped_files[prefix].append(file)
    # print(grouped_files)
    return grouped_files


def merge_multiple_files_on_index(files):
    """Merge multiple files on the index and return the merged DataFrame."""
    # Start with the first file
    # print(files)

    cols = get_cols(files)

    merged_df = pd.read_csv(files[0])

    # Merge remaining files one by one
    # display(merged_df)
    for file in files[1:]:
        df = pd.read_csv(file).drop(columns=["feature_name"])
        # display(df)
        merged_df = pd.merge(merged_df, df, how="outer", on="feature_code")
        # display(merged_df)
    merged_df.columns = cols

    return merged_df


def get_cols(files):
    cols = ["feature_code", "feature_name"]
    for f_ in files:
        filename = os.path.basename(f_)
        col_name = ""
        if "_xgboost" in filename:
            col_name += "XGBoost"
        elif "_rf" in filename:
            col_name += "RF"
        if "_permute" in filename:
            col_name += " importance (permute)"
        elif "_mdi" in filename:
            col_name += " importance (MDI)"
        else:
            cols_to_add = [" p-value (permute)", " std (permute)"]
            cols_to_add = [col_name + col for col in cols_to_add]
            cols += cols_to_add
            continue
        cols.append(col_name)
    # print(f"Getting cols: {cols}")
    return cols


def merge_matching_files(folder_path, prefix, save_path):
    files = get_files_with_prefix(folder_path, f"{prefix}_*.csv")
    grouped_files = group_files_by_prefix(files)
    print(f"Found {len(grouped_files)} groups of files with prefix {prefix}.")

    for prefix, file_group in grouped_files.items():
        if len(file_group) < 2:
            print(f"Skipping {prefix}, found only {len(file_group)} file(s).")
            continue

        merged_df = merge_multiple_files_on_index(file_group)
        merged_df.set_index("feature_code", inplace=True)

        merged_df = merged_df.sort_index(axis=1)
        save_path = os.path.join(save_path, f"{prefix}_merged.csv")
        merged_df.to_csv(save_path)
        # print(f"Merged {prefix} files into {save_path}")


def get_all_prefixes(files):
    """Return all unique prefixes from a list of files."""
    prefixes = set()

    for file in files:
        prefix = "_".join(os.path.basename(file).split("_")[:1])
        prefixes.add(prefix)

    return list(prefixes)


In [4]:
for folder_path in tqdm(results_paths):
    glob_pattern = os.path.join(folder_path, "*.csv")

    files = glob.glob(glob_pattern)
    unique_prefixes = get_all_prefixes(files)

    save_path = os.path.dirname(folder_path)

    #
    print(f"Found {len(unique_prefixes)} unique prefixes in {folder_path}")
    print(f"Saving to {save_path}")
    for prefix in tqdm(unique_prefixes, desc="Merging files"):
        merge_matching_files(folder_path, prefix, save_path)


  0%|          | 0/4 [00:00<?, ?it/s]

Found 321 unique prefixes in results/results_ICPSR_1992_EPEST_2003/raw
Saving to results/results_ICPSR_1992_EPEST_2003


Merging files:   0%|          | 0/321 [00:00<?, ?it/s]

Found 1 groups of files with prefix Chloropicrin.
Found 1 groups of files with prefix Zeta-Cypermethrin.
Found 1 groups of files with prefix Clodinafop.
Found 1 groups of files with prefix Butralin.
Found 1 groups of files with prefix Pelargonic Acid.
Found 1 groups of files with prefix Maneb.
Found 1 groups of files with prefix Pyridaben.
Found 1 groups of files with prefix Diquat.
Found 1 groups of files with prefix Pyraflufen Ethyl.
Found 1 groups of files with prefix Iodosulfuron.
Found 1 groups of files with prefix Metam.
Found 1 groups of files with prefix Atrazine.
Found 1 groups of files with prefix Thiodicarb.
Found 1 groups of files with prefix Hexythiazox.
Found 1 groups of files with prefix Bacillus Subtilis.
Found 1 groups of files with prefix Thiram.
Found 1 groups of files with prefix Etridiazole.
Found 1 groups of files with prefix Imazapic.
Found 1 groups of files with prefix Potassium Oleate.
Found 1 groups of files with prefix Myclobutanil.
Found 1 groups of files wi

Merging files:   0%|          | 0/320 [00:00<?, ?it/s]

Found 1 groups of files with prefix Chloropicrin.
Found 1 groups of files with prefix Zeta-Cypermethrin.
Found 1 groups of files with prefix Clodinafop.
Found 1 groups of files with prefix Butralin.
Found 1 groups of files with prefix Pelargonic Acid.
Found 1 groups of files with prefix Maneb.
Found 1 groups of files with prefix Pyridaben.
Found 1 groups of files with prefix Diquat.
Found 1 groups of files with prefix Pyraflufen Ethyl.
Found 1 groups of files with prefix Iodosulfuron.
Found 1 groups of files with prefix Metam.
Found 1 groups of files with prefix Atrazine.
Found 1 groups of files with prefix Thiodicarb.
Found 1 groups of files with prefix Hexythiazox.
Found 1 groups of files with prefix Bacillus Subtilis.
Found 1 groups of files with prefix Thiram.
Found 1 groups of files with prefix Etridiazole.
Found 1 groups of files with prefix Imazapic.
Found 1 groups of files with prefix Potassium Oleate.
Found 1 groups of files with prefix Myclobutanil.
Found 1 groups of files wi

Merging files:   0%|          | 0/295 [00:00<?, ?it/s]

Found 1 groups of files with prefix Cyhalothrin-Lambda.
Found 1 groups of files with prefix Copper.
Found 1 groups of files with prefix Sulfur.
Found 1 groups of files with prefix Imidacloprid.
Found 1 groups of files with prefix Harpin Protein.
Found 1 groups of files with prefix Imazethapyr.
Found 1 groups of files with prefix Oxydemeton-Methyl.
Found 1 groups of files with prefix Pymetrozine.
Found 1 groups of files with prefix Tribufos.
Found 1 groups of files with prefix Chloropicrin.
Found 1 groups of files with prefix Lactofen.
Found 1 groups of files with prefix Etridiazole.
Found 1 groups of files with prefix Thiobencarb.
Found 1 groups of files with prefix Zeta-Cypermethrin.
Found 1 groups of files with prefix Metam Potassium.
Found 1 groups of files with prefix Clodinafop.
Found 1 groups of files with prefix Imazapic.
Found 1 groups of files with prefix Mcpa.
Found 1 groups of files with prefix Cuprous Oxide.
Found 1 groups of files with prefix Benfluralin.
Found 1 groups of

Merging files:   0%|          | 0/314 [00:00<?, ?it/s]

Found 1 groups of files with prefix Chloropicrin.
Found 1 groups of files with prefix Zeta-Cypermethrin.
Found 1 groups of files with prefix Clodinafop.
Found 1 groups of files with prefix Butralin.
Found 1 groups of files with prefix Pelargonic Acid.
Found 1 groups of files with prefix Maneb.
Found 1 groups of files with prefix Pyridaben.
Found 1 groups of files with prefix Diquat.
Found 1 groups of files with prefix Pyraflufen Ethyl.
Found 1 groups of files with prefix Iodosulfuron.
Found 1 groups of files with prefix Metam.
Found 1 groups of files with prefix Atrazine.
Found 1 groups of files with prefix Thiodicarb.
Found 1 groups of files with prefix Hexythiazox.
Found 1 groups of files with prefix Bacillus Subtilis.
Found 1 groups of files with prefix Thiram.
Found 1 groups of files with prefix Etridiazole.
Found 1 groups of files with prefix Imazapic.
Found 1 groups of files with prefix Potassium Oleate.
Found 1 groups of files with prefix Myclobutanil.
Found 1 groups of files wi