In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
csv_file = "/Users/robertjost/Fall_2025/Biogeochemistry/dust_dataHW1.csv"

In [3]:
def visualize_dataset(csv_path: str = None, df: pd.DataFrame = None, group_by: str = "Ecosystem") -> None:
    """
    Visualize ecosystem dataset using pandas and matplotlib.

    Parameters
    ----------
    csv_path : str, optional
        Path to the CSV file. Required if df is not provided.
    df : pandas.DataFrame, optional
        A DataFrame with the required columns:
        ['ID', 'Ecosystem', 'Season', 'P_conc', 'Ca_conc', 'flux_gm2yr'].
    group_by : str, default="Ecosystem"
        Column to group by in plots. Options: "Ecosystem", "Season", or None.

    Returns
    -------
    None
        Displays matplotlib plots (no return value).
    """
    if df is None:
        if csv_path is None:
            raise ValueError("Provide either a CSV file path or a DataFrame.")
        df = pd.read_csv(csv_path)

    # Validate group_by
    if group_by not in ["Ecosystem", "Season", None]:
        raise ValueError("group_by must be 'Ecosystem', 'Season', or None")

    # Ensure correct dtypes
    if "Ecosystem" in df.columns:
        df["Ecosystem"] = df["Ecosystem"].astype("category")
    if "Season" in df.columns:
        df["Season"] = df["Season"].astype("category")

    # 1. Boxplot of flux by group
    if group_by:
        plt.figure(figsize=(8, 5))
        df.boxplot(column="flux_gm2yr", by=group_by, grid=False)
        plt.title(f"Flux by {group_by}")
        plt.suptitle("")
        plt.xlabel(group_by)
        plt.ylabel("Flux (g/m²/yr)")
        plt.show()

    # 2. Scatter plot: P vs Ca (colored by group if specified)
    plt.figure(figsize=(7, 6))
    if group_by:
        for key, group in df.groupby(group_by):
            plt.scatter(group["P_conc"], group["Ca_conc"], label=key, alpha=0.7)
        plt.legend(title=group_by)
    else:
        plt.scatter(df["P_conc"], df["Ca_conc"], c="blue", alpha=0.7)
    plt.title(f"P vs Ca Concentration{' by ' + group_by if group_by else ''}")
    plt.xlabel("P concentration")
    plt.ylabel("Ca concentration")
    plt.show()

    # 3. Scatter plot: Flux vs P concentration
    plt.figure(figsize=(7, 6))
    if group_by:
        for key, group in df.groupby(group_by):
            plt.scatter(group["P_conc"], group["flux_gm2yr"], label=key, alpha=0.7)
        plt.legend(title=group_by)
    else:
        plt.scatter(df["P_conc"], df["flux_gm2yr"], c="blue", alpha=0.7)
    plt.title(f"Flux vs P Concentration{' by ' + group_by if group_by else ''}")
    plt.xlabel("P concentration")
    plt.ylabel("Flux (g/m²/yr)")
    plt.show()


In [None]:
# concentration is g / ml, flux is g / m^2 / yr. g = mg/1000
dust_df = pd.read_csv(csv_file)
dust_df

Unnamed: 0,ID,Ecosystem,Season,P_conc,Ca_conc,flux_gm2yr
0,1,Foothills,NDJ,1028.0,10861.0,5.307173
1,2,Montane,NDJ,778.0,7455.0,8.608287
2,3,Foothills,FMA,1134.0,9418.0,6.999662
3,4,Montane,FMA,1235.5,8839.5,7.370548
4,5,Plains,MJ,2417.95,8247.0,15.986543
5,6,Foothills,MJ,3245.3,5843.0,21.734655
6,7,Montane,MJ,3558.5,4760.5,12.983546
7,8,Plains,JAS,2007.5,9701.5,14.282538
8,9,Foothills,JAS,2652.0,8372.5,9.707827
9,10,Montane,JAS,2813.0,7236.5,10.445604


In [9]:
def load_dataset(csv_path: str = None, df: pd.DataFrame = None, group_by: str = None) -> pd.DataFrame:
    """
    Load the ecosystem dataset from CSV or an existing DataFrame.

    Parameters
    ----------
    csv_path : str, optional
        Path to the CSV file. Required if df is not provided.
    df : pandas.DataFrame, optional
        A DataFrame with the required columns:
        ['ID', 'Ecosystem', 'Season', 'P_conc', 'Ca_conc', 'flux_gm2yr'].

    Returns
    -------
    pandas.DataFrame
        A cleaned DataFrame with categorical types applied.
    """
    if df is None:
        if csv_path is None:
            raise ValueError("Provide either a CSV file path or a DataFrame.")
        df = pd.read_csv(csv_path)

    # Ensure correct dtypes
    if "Ecosystem" in df.columns:
        df["Ecosystem"] = df["Ecosystem"].astype("category")
    if "Season" in df.columns:
        df["Season"] = df["Season"].astype("category")

    # Return grouped object if requested
    if group_by:
        if group_by not in df.columns:
            raise ValueError(f"group_by must be a column in DataFrame. Got '{group_by}'.")
        return df.groupby(group_by)

    return df

In [14]:
seasons = dust_df.groupby("Season", observed=True)

In [20]:
for name, group in seasons:
    print(name)
    print(group)

FMA
   ID  Ecosystem Season  P_conc  Ca_conc  flux_gm2yr
2   3  Foothills    FMA  1134.0   9418.0    6.999662
3   4    Montane    FMA  1235.5   8839.5    7.370548
JAS
    ID  Ecosystem Season  P_conc  Ca_conc  flux_gm2yr
7    8     Plains    JAS  2007.5   9701.5   14.282538
8    9  Foothills    JAS  2652.0   8372.5    9.707827
9   10    Montane    JAS  2813.0   7236.5   10.445604
10  11  Subalpine    JAS  1958.0   7061.0   11.843223
11  12     Alpine    JAS  3351.0  12149.5    6.371747
MJ
   ID  Ecosystem Season   P_conc  Ca_conc  flux_gm2yr
4   5     Plains     MJ  2417.95   8247.0   15.986543
5   6  Foothills     MJ  3245.30   5843.0   21.734655
6   7    Montane     MJ  3558.50   4760.5   12.983546
NDJ
   ID  Ecosystem Season  P_conc  Ca_conc  flux_gm2yr
0   1  Foothills    NDJ  1028.0  10861.0    5.307173
1   2    Montane    NDJ   778.0   7455.0    8.608287
