In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import matplotlib.cm as cm

# Root directory of the fine-tuning outputs; results for each subset size are
# looked up in a sub-directory named after that size.
output_base_dir = "/gpfs/data/fs72607/juarezs98/subsets_finetune/Outputs_finetune/"

# Subset sizes to compare: 50..500 in steps of 50, then 600..1000 in steps of 100.
subset_sizes = [*range(50, 501, 50), *range(600, 1001, 100)]

In [None]:
# Gather one (subset size, weighted-average F1) pair per subset size.
# The score stays None when the report file does not exist or when it has no
# "weighted avg" row; a warning is printed in either case.
subset_data = []
for subset in subset_sizes:
    report_path = os.path.join(output_base_dir, str(subset), "classification_report.csv")
    weighted_f1 = None

    if os.path.exists(report_path):
        # The first CSV column becomes the index, so rows are addressed by label.
        report_df = pd.read_csv(report_path, index_col=0)
        if "weighted avg" not in report_df.index:
            print(f"Warning: No weighted average F1-score found in report for subset {subset}")
        else:
            weighted_f1 = report_df.loc["weighted avg", "f1-score"]
    else:
        print(f"Warning: Classification report missing for subset {subset}")

    subset_data.append((subset, weighted_f1))

# Tabular view of the collected pairs, consumed by the plotting code.
metrics_df = pd.DataFrame(subset_data, columns=["Subset Size", "F1-Score"])

# Plotting: weighted F1-score as a function of subset size.
# Rows without a score are dropped so they do not appear on the curve.
valid_df = metrics_df.dropna(subset=["F1-Score"])
sizes = valid_df["Subset Size"]
scores = valid_df["F1-Score"]

f1_fig, f1_ax = plt.subplots(figsize=(12, 6))
f1_ax.plot(sizes, scores, marker='o', linestyle='-', label="F1-Score")

# Write each score (three decimals) just above its marker.
for size, score in zip(sizes, scores):
    f1_ax.text(size, score, f"{score:.3f}", ha='center', va='bottom', fontsize=9, color='blue')

# Put a tick at every subset size that actually has a score.
f1_ax.set_xticks(sizes)
f1_ax.set_xlabel("Subset Size", fontsize=14)
f1_ax.set_ylabel("F1-Score", fontsize=14)
f1_ax.set_title("Performance (F1 Score) across incremental subsets", fontsize=16)
f1_ax.legend(fontsize=12)
f1_ax.grid(True)
plt.show()

In [None]:
# Two side-by-side panels: validation loss (left) and validation accuracy (right).
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(20, 6))

# One colour per subset size, taken from "tab20" resampled to len(subset_sizes) entries.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9; it accepts the same (name, lut) arguments.
colormap = plt.get_cmap("tab20", len(subset_sizes))
# Integer arguments index the colormap's lookup table directly, which avoids
# relying on float rounding when converting i / N back to an index.
colors = [colormap(i) for i in range(len(subset_sizes))]

# Legend handles and their texts (one entry per subset that was actually plotted).
lines, labels = [], []
# Union of every epoch value seen in any history file.
all_epochs = set()

# Columns read from each training_history.csv below.
required_columns = ("epoch", "val_loss", "val_acc")

# Loop through each subset size and plot its training history.
for subset, color in zip(subset_sizes, colors):
    history_csv_path = os.path.join(output_base_dir, str(subset), "training_history.csv")

    if not os.path.isfile(history_csv_path):
        print(f"Warning: No training_history.csv for subset {subset}")
        continue

    df = pd.read_csv(history_csv_path)

    # Skip (with a warning) instead of aborting the whole figure on a malformed file.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Warning: training_history.csv for subset {subset} is missing columns {missing_columns}")
        continue

    all_epochs.update(df["epoch"])

    # Same colour on both panels so one legend entry describes both curves.
    line_loss, = ax_loss.plot(df["epoch"], df["val_loss"], color=color, linewidth=2)
    ax_acc.plot(df["epoch"], df["val_acc"], color=color, linewidth=2)

    # Only the loss line is kept as the legend handle for this subset.
    lines.append(line_loss)
    labels.append(str(subset))

# Every epoch seen in any history file becomes a tick on both panels.
sorted_epochs = sorted(all_epochs)

# Per-panel cosmetics: ticks, grid, title and axis labels.
for ax, title, ylabel in (
    (ax_loss, "Loss over epochs", "Loss"),
    (ax_acc, "Accuracy over epochs", "Accuracy"),
):
    ax.set_xticks(sorted_epochs)
    ax.set_xticklabels(sorted_epochs, rotation=45, ha="right")
    ax.grid(True)
    ax.set_title(title, fontsize=12)
    ax.set_xlabel("Epoch", fontsize=10)
    ax.set_ylabel(ylabel, fontsize=10)

# Confine the panels to the left 92% of the figure, leaving a margin on the right.
fig.tight_layout(rect=[0, 0, 0.92, 1])

# A single figure-level legend for all subset lines, anchored in that margin.
fig.legend(lines, labels, title="Subset Size",
           loc="center left", bbox_to_anchor=(0.93, 0.5))

plt.show()