In [14]:
import pickle
import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt

# Load
def load_and_visualize_phenotyping_dataset(
    path="data/biomass_sample_data_reps_separate.pkl",
):
    """Load the pickled phenotyping dataset, plot an overview, and return it.

    The function performs these steps in order:

    1. Unpickle the object stored at ``path`` (it is used below through the
       pandas DataFrame API: column indexing, ``dropna``, ``head``, ...).
    2. Divide the ``digital_biomass`` column by 1e6, in place.
    3. Draw one small distribution plot per displayed column.
    4. Print a caption and display the first five rows of those columns.
    5. Draw a line plot of biomass over the days of phenotyping.

    Args:
        path: Location of the pickle file. Defaults to the sample dataset
            path that was previously hard-coded, so existing zero-argument
            calls behave exactly as before.

    Returns:
        The loaded frame, with ``digital_biomass`` already rescaled.

    Raises:
        OSError: If ``path`` cannot be opened (e.g. ``FileNotFoundError``).
        KeyError: Presumably raised by the frame if one of the expected
            columns is missing -- verify against the actual dataset.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only point
    # ``path`` at files that come from a trusted source.
    with open(path, "rb") as f:
        df_loaded = pickle.load(f)

    # Rescale for easier plotting/metrics. According to the original comment
    # the raw values are in mm³; dividing by 1e6 then yields dm³, which is the
    # unit shown on the y-axis of the line plot below.
    df_loaded["digital_biomass"] = df_loaded["digital_biomass"] / 1e6

    columns_to_display = [
        "days_of_phenotyping",
        "species",
        "nitrogen_applied",
        "drought_stress",
        "digital_biomass",
    ]

    # Small inline histograms (like DataWrangler) above the table.
    _plot_column_histograms(df_loaded, columns_to_display)

    # Show a few rows below the histograms.
    print("Sample Data (showing first 5 rows of dataset):")
    display(df_loaded[columns_to_display].head())

    _plot_biomass_over_time(df_loaded)

    return df_loaded


def _plot_column_histograms(df, columns):
    """Draw one compact distribution plot per entry of ``columns`` in a row."""
    n_columns = len(columns)
    # squeeze=False always returns a 2-D array of axes, so a single column
    # needs no special-casing; row 0 holds the axes of our single row.
    fig, axes = plt.subplots(
        1,
        n_columns,
        figsize=(2.4 * n_columns, 2.2),
        constrained_layout=True,
        squeeze=False,
    )

    for ax, col in zip(axes[0], columns):
        series = df[col].dropna()
        if series.dtype.kind in "biufc":  # bool/int/uint/float/complex -> histogram
            sns.histplot(series, bins=20, ax=ax, color="C0", edgecolor=None)
        else:  # anything else is treated as categorical -> counts of top 10 values
            counts = series.value_counts().nlargest(10)
            sns.barplot(
                x=counts.index.astype(str), y=counts.values, ax=ax, color="C0"
            )
            ax.tick_params(axis="x", rotation=45)
        ax.set_title(col, fontsize=10)
        ax.set_yticks([])
        ax.set_xlabel("")
        ax.set_ylabel("")
        # Remove spines (all but the bottom one) for a cleaner "sparkline" look.
        for spine in ["top", "right", "left"]:
            ax.spines[spine].set_visible(False)

    fig.suptitle("Phenotyping Dataset - Column Histograms", fontsize=14)
    plt.show()


def _plot_biomass_over_time(df):
    """Draw biomass vs. day of phenotyping, split by species and nitrogen."""
    plt.figure(figsize=(12, 7))
    sns.lineplot(
        data=df,
        x="days_of_phenotyping",
        y="digital_biomass",
        hue="species",  # one colour per species
        style="nitrogen_applied",  # one marker per nitrogen level
        markers=True,
        markersize=10,
        dashes=False,  # keep all lines solid; only markers distinguish style
    )
    plt.title("Biomass vs. Day of Phenotyping by Species and Nitrogen Applied")
    plt.xlabel("Day of Phenotyping", fontsize=14)
    plt.ylabel("Biomass [dm³]", fontsize=14)
    plt.legend(title="Species and Nitrogen Applied")
    plt.grid(True)
    plt.tight_layout()
    plt.show()