In [60]:
import pandas as pd
import json

# Load the models from .csv file
models = pd.read_csv("./output/full-dataset.csv")

# Get evaluation_df from models dataframe
evaluation_column = models[["id", "evaluation_metrics"]]
evaluation_column.to_csv("./output/evaluation_metrics.csv", index=False)
evaluation_df = pd.read_csv("./output/evaluation_metrics.csv")

#### Helper function that prints all non-numeric values in evaluation_df. 
#### It helps me to understand what I've to clean/format

In [61]:
def non_numeric_metric_values(df):
    for idx, row in df.iterrows():
        metrics_list = json.loads(row["evaluation_metrics"])
        for metric in metrics_list:
            if "metric_value" in metric: 
                value = metric["metric_value"]
                try:
                    float(value)
                except Exception:
                    print(f"At idx {idx}, non-numeric metric_value: {value}")
            else:
                error_value = metric["metric_error"]
                mean_value = metric["metric_mean"]
                try:
                    float(error_value)
                    float(mean_value)
                except Exception:
                    print(f"At idx {idx}, non-numeric metric_error: {error_value} or metric_mean: {mean_value}")

non_numeric_metric_values(df = evaluation_df)

At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-nu

#### Datetime formatting on models.csv

In [62]:
models['created_at'] = models['created_at'].str.split('+').str[0].str.replace(' ', 'T')

models.to_csv("./data/full-dataset-preprocessed.csv", index=False)

#### Delete metric_values stored as a list because we don't have any reference to the metric_type

In [63]:
def remove_list_values(df):
    for idx, row in df.iterrows():
        row_json = row["evaluation_metrics"]
        try:
            metrics_list = json.loads(row_json)
        except Exception as e:
            print(f"Error parsing JSON in row {idx}: {e}")
            continue
        filtered_metrics = [metric for metric in metrics_list 
                            if not isinstance(metric.get("metric_value"), list)]
        removed = len(metrics_list) - len(filtered_metrics)
        if removed > 0:
            print(f"Row {idx}: removed {removed} evaluation metric(s) with list values.")
        df.at[idx, "evaluation_metrics"] = json.dumps(filtered_metrics)
    return df

df = remove_list_values(df = evaluation_df)
df.to_csv("./data/evaluation_metrics_updated.csv", index=False)
print("Update complete.")

Row 18: removed 11 evaluation metric(s) with list values.
Row 51: removed 18 evaluation metric(s) with list values.
Row 338: removed 11 evaluation metric(s) with list values.
Row 373: removed 18 evaluation metric(s) with list values.
Row 385: removed 18 evaluation metric(s) with list values.
Row 400: removed 7 evaluation metric(s) with list values.
Row 520: removed 11 evaluation metric(s) with list values.
Row 1231: removed 2 evaluation metric(s) with list values.
Row 1689: removed 1 evaluation metric(s) with list values.
Row 11881: removed 3 evaluation metric(s) with list values.
Row 11882: removed 3 evaluation metric(s) with list values.
Row 11884: removed 3 evaluation metric(s) with list values.
Row 11885: removed 3 evaluation metric(s) with list values.
Row 11887: removed 3 evaluation metric(s) with list values.
Row 11896: removed 3 evaluation metric(s) with list values.
Row 11899: removed 3 evaluation metric(s) with list values.
Row 11917: removed 3 evaluation metric(s) with list 

In [64]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-nu

#### Metrics percentages to float

In [65]:
def convert_percentage_to_float(df):
    for idx, row in df.iterrows():
        metrics_list = json.loads(row["evaluation_metrics"])
        for metric in metrics_list:
            value = metric["metric_value"]
            if isinstance(value, str) and value.strip().endswith("%"):
                try:
                    numeric_value = float(value.replace('%', 'e-2'))
                    metric["metric_value"] = numeric_value
                    print(f"Converted {value} to {numeric_value}")
                except ValueError:
                    print(f"Could not convert value to float: {value} at idx {idx}")
                    pass
            elif isinstance(value, str) and "," in value.strip():
                try:
                    numeric_value = float(value.replace(',', ''))
                    metric["metric_value"] = numeric_value
                    print(f"Converted {value} to {numeric_value}")
                except ValueError:
                    print(f"Could not convert comma float value to dot float: {value} at idx {idx}")
                    pass
        
        df.at[idx, "evaluation_metrics"] = json.dumps(metrics_list)
    return df

df = convert_percentage_to_float(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))
df.to_csv("./data/evaluation_metrics_updated.csv", index=False)
print("Conversion complete")

Converted 0,857 to 857.0
Converted 0,856 to 856.0
Converted 0,537 to 537.0
Converted 0,497 to 497.0
Converted 0,732 to 732.0
Converted 0,788 to 788.0
Converted 0,761 to 761.0
Converted 92% to 0.92
Converted 98.38% to 0.9838
Could not convert comma float value to dot float: 0.14 [0.14, 0.15] at idx 278
Could not convert comma float value to dot float: 0.38 [0.37, 0.39] at idx 278
Could not convert comma float value to dot float: 0.99 [0.99, 0.99] at idx 278
Could not convert comma float value to dot float: 0.65 [0.64, 0.67] at idx 278
Could not convert comma float value to dot float: 0.85 [0.83, 0.86] at idx 278
Converted 34.3% to 0.343
Converted 93% to 0.93
Converted 88% to 0.88
Converted 90% to 0.9
Converted 11.00% to 0.11
Converted 11.53% to 0.1153
Converted 17% to 0.17
Converted 64.2% to 0.642
Converted 29,603788751645216 to 2.9603788751645216e+16
Converted 20.41% to 0.2041
Converted 19.30% to 0.193
Converted 19.735723% to 0.19735723
Converted 13.914924% to 0.13914924
Converted 13.7

In [66]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-numeric metric_value: None
At idx 0, non-nu

#### Delete metrics that are "None"

In [67]:
def remove_invalid_metric_values(df):
    invalid_values = {"NA", "N/A", "na", "n/a", "???", ""}
    for idx, row in df.iterrows():
        row_json = row["evaluation_metrics"]
        try:
            metrics_list = json.loads(row_json)
        except Exception as e:
            print(f"Error parsing JSON in row {idx}: {e}")
            continue

        filtered_metrics = []
        for metric in metrics_list:
            value = metric.get("metric_value")
            if value is None:
                continue
            value_str = str(value).strip()
            if value_str in invalid_values:
                continue
            filtered_metrics.append(metric)
        removed = len(metrics_list) - len(filtered_metrics)
        if removed > 0:
            print(f"Row {idx}: removed {removed} evaluation metric(s) with invalid metric_value.")
        df.at[idx, "evaluation_metrics"] = json.dumps(filtered_metrics)
    return df


df = remove_invalid_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))
df.to_csv("./data/evaluation_metrics_updated.csv", index=False)
print("Update complete.")

Row 0: removed 39 evaluation metric(s) with invalid metric_value.
Row 56: removed 42 evaluation metric(s) with invalid metric_value.
Row 77: removed 1 evaluation metric(s) with invalid metric_value.
Row 78: removed 3 evaluation metric(s) with invalid metric_value.
Row 92: removed 2 evaluation metric(s) with invalid metric_value.
Row 143: removed 1 evaluation metric(s) with invalid metric_value.
Row 369: removed 9 evaluation metric(s) with invalid metric_value.
Row 388: removed 21 evaluation metric(s) with invalid metric_value.
Row 393: removed 51 evaluation metric(s) with invalid metric_value.
Row 407: removed 12 evaluation metric(s) with invalid metric_value.
Row 438: removed 30 evaluation metric(s) with invalid metric_value.
Row 439: removed 3 evaluation metric(s) with invalid metric_value.
Row 476: removed 45 evaluation metric(s) with invalid metric_value.
Row 477: removed 42 evaluation metric(s) with invalid metric_value.
Row 480: removed 3 evaluation metric(s) with invalid metric_

In [68]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

At idx 61, non-numeric metric_value: 261.42 +/- 16.66
At idx 131, non-numeric metric_value: 6251.93 +/- 35.95
At idx 278, non-numeric metric_value: 0.14 [0.14, 0.15]
At idx 278, non-numeric metric_value: 0.38 [0.37, 0.39]
At idx 278, non-numeric metric_value: 0.99 [0.99, 0.99]
At idx 278, non-numeric metric_value: 0.65 [0.64, 0.67]
At idx 278, non-numeric metric_value: 0.85 [0.83, 0.86]
At idx 278, non-numeric metric_value: 1518.70 +/- 568.14
At idx 278, non-numeric metric_value: 0.08 +/- 0.03
At idx 278, non-numeric metric_value: 0.19 +/- 0.08
At idx 278, non-numeric metric_value: 89.17 +/- 78.73
At idx 278, non-numeric metric_value: 0.04 +/- 0.04
At idx 278, non-numeric metric_value: 0.05 +/- 0.05
At idx 278, non-numeric metric_value: 1676.91 +/- 780.73
At idx 278, non-numeric metric_value: 0.09 +/- 0.05
At idx 278, non-numeric metric_value: 2.80 +/- 1.50
At idx 278, non-numeric metric_value: 844.50 +/- 546.85
At idx 278, non-numeric metric_value: 0.18 +/- 0.16
At idx 278, non-numeri

### Handle metric mean and error values by splitting them into two separate metrics

In [69]:
def convert_composite_metric_values(df):
    for idx, row in df.iterrows():
        try:
            metrics_list = json.loads(row["evaluation_metrics"])
        except Exception as e:
            print(f"Error parsing JSON in row {idx}: {e}")
            continue
        
        updated_metrics = []
        for metric in metrics_list:
            value = metric.get("metric_value")
            if isinstance(value, str) and ("+/-" in value or "±" in value):
                if "+/-" in value:
                    parts = value.split("+/-")
                else:
                    parts = value.split("±")
                if len(parts) == 2:
                    mean_val = parts[0].strip()
                    error_val = parts[1].strip()
                    metric.pop("metric_value", None)
                    metric["metric_mean"] = mean_val  
                    try:
                        metric["metric_error"] = float(error_val)
                    except ValueError:
                        metric["metric_error"] = error_val
            updated_metrics.append(metric)
        df.at[idx, "evaluation_metrics"] = json.dumps(updated_metrics)
    return df

df = convert_composite_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))
df.to_csv("./data/evaluation_metrics_updated.csv", index=False)
print("Update complete.")


Update complete.


In [70]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

At idx 278, non-numeric metric_value: 0.14 [0.14, 0.15]
At idx 278, non-numeric metric_value: 0.38 [0.37, 0.39]
At idx 278, non-numeric metric_value: 0.99 [0.99, 0.99]
At idx 278, non-numeric metric_value: 0.65 [0.64, 0.67]
At idx 278, non-numeric metric_value: 0.85 [0.83, 0.86]
At idx 523, non-numeric metric_value: experimental
At idx 523, non-numeric metric_value: high
At idx 523, non-numeric metric_value: moderate
At idx 795, non-numeric metric_value: xx.xx
At idx 809, non-numeric metric_value: to recompute with STEP 24000
At idx 809, non-numeric metric_value: to recompute with STEP 24000
At idx 1385, non-numeric metric_value: {'wer_result_on_test': None}
At idx 1654, non-numeric metric_value: No
At idx 3241, non-numeric metric_value: {'f1': 0.8276613385259164}
At idx 5301, non-numeric metric_value: {'accuracy': 0.8733333333333333}
At idx 6066, non-numeric metric_value: {'accuracy': 0.6011306532663316}
At idx 6066, non-numeric metric_value: {'f1': 0.5956396413406886}
At idx 6253, no

In [71]:
def unpack_metric_values(df):
    for idx, row in df.iterrows():
        metrics_list = json.loads(row["evaluation_metrics"])
        new_metrics = []
        for metric in metrics_list:
            value = metric.get("metric_value")
            # Check if metric_value is a dictionary
            if isinstance(value, dict):
                for k, v in value.items():
                    # Create a new metric dictionary based on the original
                    new_metric = metric.copy()
                    # Remove the original metric_value
                    new_metric.pop("metric_value", None)
                    # Update metric_type to include the sub-key
                    new_metric["metric_type"] = f"{metric.get('metric_type')}/{k}"
                    # Set the new metric_value to the corresponding value
                    new_metric["metric_value"] = v
                    new_metrics.append(new_metric)
            else:
                # If it's not a dict, leave the metric unchanged
                new_metrics.append(metric)
        df.at[idx, "evaluation_metrics"] = json.dumps(new_metrics)
    return df

df = unpack_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))
df.to_csv("./data/evaluation_metrics_preprocessed.csv", index=False)
print("Update complete.")


Update complete.


In [72]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

At idx 278, non-numeric metric_value: 0.14 [0.14, 0.15]
At idx 278, non-numeric metric_value: 0.38 [0.37, 0.39]
At idx 278, non-numeric metric_value: 0.99 [0.99, 0.99]
At idx 278, non-numeric metric_value: 0.65 [0.64, 0.67]
At idx 278, non-numeric metric_value: 0.85 [0.83, 0.86]
At idx 523, non-numeric metric_value: experimental
At idx 523, non-numeric metric_value: high
At idx 523, non-numeric metric_value: moderate
At idx 795, non-numeric metric_value: xx.xx
At idx 809, non-numeric metric_value: to recompute with STEP 24000
At idx 809, non-numeric metric_value: to recompute with STEP 24000
At idx 1385, non-numeric metric_value: {'wer_result_on_test': None}
At idx 1654, non-numeric metric_value: No
At idx 3241, non-numeric metric_value: {'f1': 0.8276613385259164}
At idx 5301, non-numeric metric_value: {'accuracy': 0.8733333333333333}
At idx 6066, non-numeric metric_value: {'accuracy': 0.6011306532663316}
At idx 6066, non-numeric metric_value: {'f1': 0.5956396413406886}
At idx 6253, no

#### Delete all the leftovers non-numeric metric-value that have weird formatting

In [73]:
def remove_non_numeric_metrics(df):
    for idx, row in df.iterrows():
        try:
            metrics_list = json.loads(row["evaluation_metrics"])
        except Exception as e:
            print(f"Error parsing JSON in row {idx}: {e}")
            continue
        filtered_metrics = []
        for metric in metrics_list:
            if "metric_value" in metric and metric["metric_value"] is not None:
                value = metric["metric_value"]
                if isinstance(value, list):
                    print(f"At idx {idx}, deleting metric with metric_value as list: {value}")
                    continue
                try:
                    float(value)
                    filtered_metrics.append(metric)
                except Exception:
                    print(f"At idx {idx}, deleting metric with non-numeric metric_value: {value}")
            elif "metric_mean" in metric and "metric_error" in metric:
                mean_val = metric["metric_mean"]
                error_val = metric["metric_error"]
                try:
                    float(mean_val)
                    float(error_val)
                    filtered_metrics.append(metric)
                except Exception:
                    print(f"At idx {idx}, deleting composite metric with non-numeric values: metric_mean: {mean_val}, metric_error: {error_val}")
            else:
                print(f"At idx {idx}, deleting metric with unrecognized numeric field: {metric}")
        df.at[idx, "evaluation_metrics"] = json.dumps(filtered_metrics)
    return df

df = remove_non_numeric_metrics(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))
df.to_csv("./data/evaluation_metrics_updated.csv", index=False)
print("Deletion complete.")

At idx 278, deleting metric with non-numeric metric_value: 0.14 [0.14, 0.15]
At idx 278, deleting metric with non-numeric metric_value: 0.38 [0.37, 0.39]
At idx 278, deleting metric with non-numeric metric_value: 0.99 [0.99, 0.99]
At idx 278, deleting metric with non-numeric metric_value: 0.65 [0.64, 0.67]
At idx 278, deleting metric with non-numeric metric_value: 0.85 [0.83, 0.86]
At idx 523, deleting metric with non-numeric metric_value: experimental
At idx 523, deleting metric with non-numeric metric_value: high
At idx 523, deleting metric with non-numeric metric_value: moderate
At idx 795, deleting metric with non-numeric metric_value: xx.xx
At idx 809, deleting metric with non-numeric metric_value: to recompute with STEP 24000
At idx 809, deleting metric with non-numeric metric_value: to recompute with STEP 24000
At idx 1385, deleting metric with non-numeric metric_value: {'wer_result_on_test': None}
At idx 1654, deleting metric with non-numeric metric_value: No
At idx 3241, delet

In [74]:
non_numeric_metric_values(df=pd.read_csv("./data/evaluation_metrics_updated.csv"))

#### Check whether there are empty evaluation metrics lists, eventually dropping them

In [75]:
df = pd.read_csv("./data/evaluation_metrics_updated.csv")
for idx, row in df.iterrows():
    metrics_list = json.loads(row["evaluation_metrics"])
    if len(metrics_list) == 0:
        print(f"Row {idx} has no evaluation metrics, dropping it.")
        df.drop(idx, inplace=True)
df.to_csv("./data/evaluation_metrics_preprocessed", index=False)


Row 77 has no evaluation metrics, dropping it.
Row 523 has no evaluation metrics, dropping it.
Row 795 has no evaluation metrics, dropping it.
Row 1157 has no evaluation metrics, dropping it.
Row 1231 has no evaluation metrics, dropping it.
Row 1385 has no evaluation metrics, dropping it.
Row 1654 has no evaluation metrics, dropping it.
Row 1689 has no evaluation metrics, dropping it.
Row 3241 has no evaluation metrics, dropping it.
Row 6066 has no evaluation metrics, dropping it.
Row 6253 has no evaluation metrics, dropping it.
Row 6254 has no evaluation metrics, dropping it.
Row 6280 has no evaluation metrics, dropping it.
Row 6287 has no evaluation metrics, dropping it.
Row 6426 has no evaluation metrics, dropping it.
Row 7413 has no evaluation metrics, dropping it.
Row 7702 has no evaluation metrics, dropping it.
Row 8359 has no evaluation metrics, dropping it.
Row 8540 has no evaluation metrics, dropping it.
Row 8786 has no evaluation metrics, dropping it.
Row 8819 has no evaluati