# Experiment analysis

This notebook compares the results of the experiments whose output files are found in the configured outputs path.

In [None]:
%pip install seaborn

In [None]:
import json
import os
import pandas as pd
from IPython.display import display
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import warnings

# NumPy 2.0 removed the top-level ``np.VisibleDeprecationWarning`` alias; the
# class lives in ``numpy.exceptions`` (added in NumPy 1.25). Fall back to the
# old location so the notebook still runs on older NumPy releases.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
    VisibleDeprecationWarning = np.VisibleDeprecationWarning

warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

import seaborn as sns

# Select seaborn's "talk" plotting context for all subsequent figures.
sns.set_context("talk")

## Configs

Configure these values to analyze your results. This notebook will compare all the `.jsonl` output files found in the given path.

In [None]:
# Name of the experiment whose output files should be analyzed.
experiment_name = "nl2sql"

# Set to True to also write each figure to disk; False skips saving.
save_figures = True

# Directory that receives the saved figures.
figures_output_path = "figures"

# Directory holding the experiment's output files: outputs/<experiment_name>.
experiment_outputs_path = os.path.join("outputs", experiment_name)

## Analysis

In [None]:
class ExperimentOutput:
    """In-memory view of one experiment's JSON Lines output file.

    Each non-blank line of the file is parsed as one JSON record. Records are
    kept, in file order, in ``self.data``.
    """

    def __init__(self, path):
        """Remember ``path`` and eagerly load its records."""
        self.path = path
        self.data = []
        self.load_data()

    def load_data(self):
        """(Re)load all records from ``self.path`` into ``self.data``.

        Blank lines (e.g. a trailing newline at the end of the file) are
        skipped instead of being handed to the JSON parser. Calling this
        method again replaces the previously loaded records rather than
        appending duplicates.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        records = []
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(self.path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    records.append(json.loads(line))
        self.data = records

    def get_data(self):
        """Return the list of loaded records."""
        return self.data

    def get_error_points(self):
        """Return the records whose ``"error"`` field is present and truthy."""
        # .get(): a record without an "error" key simply has no error.
        return [d for d in self.data if d.get("error")]

    def to_dataframe(self) -> pd.DataFrame:
        """Return the raw records as a DataFrame, one row per record.

        Nested values (such as the ``"experiment_metrics"`` mapping) are left
        as-is in object columns; use ``to_metric_dataframe`` for a flat table
        of metrics.
        """
        return pd.DataFrame(self.data)

    def to_metric_dataframe(
        self, fill_value=0.0, included_metrics=None
    ) -> pd.DataFrame:
        """Flatten every record's ``"experiment_metrics"`` into a DataFrame.

        ``"experiment_metrics"`` is read as a mapping of component name to a
        mapping of metric name to value. Each value becomes a column named
        ``"<last dot-separated segment of the component>.<metric>"``. A list
        value contributes its first element.

        Args:
            fill_value: Value used where a row has no value for a column, and
                for metrics whose list value is empty.
            included_metrics: Optional collection of flattened column names.
                When non-empty, the result has exactly these columns (in the
                given order, duplicates dropped); otherwise it has every
                metric seen, in first-seen order.

        Returns:
            A DataFrame with one row per record that has
            ``"experiment_metrics"``. Records without that key are printed
            and skipped.
        """
        evaluation_metrics: List[Dict[str, Dict[str, float]]] = []
        for record in self.data:
            try:
                evaluation_metrics.append(record["experiment_metrics"])
            except KeyError:
                # Best-effort: surface the record lacking metrics and move on.
                print(record)

        # Flatten the per-component dicts into one dict per record.
        flattened: List[Dict[str, float]] = []
        # A dict is used as an ordered set so the column order is
        # deterministic (first-seen) rather than depending on set hashing.
        seen_metrics: Dict[str, None] = {}
        for per_component in evaluation_metrics:
            datum = {}
            for component, metrics in per_component.items():
                prefix = component.split(".")[-1]
                for metric, value in metrics.items():
                    new_metric_name = f"{prefix}.{metric}"
                    if isinstance(value, list):
                        # An empty list carries no value; treat it as missing.
                        value = value[0] if value else fill_value
                    datum[new_metric_name] = value
                    seen_metrics[new_metric_name] = None
            flattened.append(datum)

        if included_metrics:
            columns = list(dict.fromkeys(included_metrics))
        else:
            columns = list(seen_metrics)

        # Project every row onto the selected columns, filling the gaps.
        rows = [
            {name: datum.get(name, fill_value) for name in columns}
            for datum in flattened
        ]
        return pd.DataFrame(rows)

In [None]:
# Collect the experiment's JSON Lines outputs. os.listdir returns entries in
# arbitrary order, so sort them to make the figure and table order
# reproducible across runs and platforms.
files = sorted(
    f for f in os.listdir(experiment_outputs_path) if f.endswith(".jsonl")
)
print(files)

# Ensure the figures path exists
if save_figures:
    Path(figures_output_path).mkdir(parents=True, exist_ok=True)

# Load every output file, keyed by its file name without the extension.
data: Dict[str, ExperimentOutput] = {}
for file in files:
    data[os.path.splitext(file)[0]] = ExperimentOutput(
        os.path.join(experiment_outputs_path, file)
    )

# Metrics to plot; loop-invariant, so defined once instead of per figure.
included_metrics = [
    "components.evaluators.rouge",
    "components.evaluators.fuzzy",
]

# One histogram figure per experiment output file.
for key, value in data.items():
    fig = plt.figure(figsize=(8, 4))
    ax = fig.gca()
    # Bins are 5 wide over 0..100 — assumes scores are on a 0-100 scale
    # (TODO confirm against the evaluators).
    value.to_metric_dataframe(included_metrics=included_metrics).hist(
        ax=ax, bins=range(0, 101, 5)
    )
    fig.suptitle(f"Experiment {key}", fontsize=12)
    fig.tight_layout()
    if save_figures:
        fig_path = os.path.join(figures_output_path, f"{key}.png")
        fig.savefig(fig_path)

In [None]:
# Metrics to aggregate for every experiment.
included_metrics = [
    "components.evaluators.rouge",
    "components.evaluators.fuzzy",
]

# Build one metric table per experiment, tagged with the experiment's key.
dfs = []
for key, value in data.items():
    df = value.to_metric_dataframe(included_metrics=included_metrics).assign(
        experiment=key
    )
    dfs.append(df)

# Stack all experiments and show the per-experiment mean of each metric.
join = pd.concat(dfs)
join.groupby("experiment").mean().head(10)