In [None]:
# Plotting utilities for EDA: box plots and scatter plots.
# - uses matplotlib only
# - draws exactly one plot per figure
# - treats the sentinel value -200 as missing data (default; configurable per call)
# - saves figures to figures/eda/ by default

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

def _ensure_dir(path):
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p

def _sanitize_filename(text: str) -> str:
    return "".join([c if c.isalnum() or c in ("-", "_", ".", "+") else "_" for c in str(text)])

def _drop_sentinel(series: pd.Series, sentinel=-200):
    return series.replace(sentinel, pd.NA).dropna()

def plot_box(df, column, drop_sentinel=True, sentinel=-200,
             save=True, save_dir="figures/eda", filename=None, show=False):
    """Draw a box plot of one column and optionally save and/or show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : label
        Column to plot; must be present in ``df.columns``.
    drop_sentinel : bool, default True
        When true, entries equal to ``sentinel`` are removed before plotting.
    sentinel : scalar, default -200
        Value that marks a missing measurement.
    save : bool, default True
        Write the figure to disk.
    save_dir : str or Path, default "figures/eda"
        Output directory; created if it does not exist.
    filename : str, optional
        Output file name. Defaults to the sanitized ``"<column>_box.png"``.
    show : bool, default False
        Call ``plt.show()`` before the figure is closed.

    Returns
    -------
    pathlib.Path or None
        Path of the saved figure, or ``None`` when ``save`` is false.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if column not in df.columns:
        raise KeyError(f"{column} not in df.columns")

    series = df[column]
    if drop_sentinel:
        series = _drop_sentinel(series, sentinel)
    # A single NaN turns every box statistic into NaN and leaves the plot
    # empty, so missing values are always removed. The float conversion
    # guards against object-dtype columns reaching matplotlib.
    values = series.dropna().to_numpy(dtype=float)

    fig, ax = plt.subplots()
    try:
        ax.boxplot(values)
        ax.set_title(f"Box plot of {column}")
        ax.set_ylabel(column)

        out_path = None
        if save:
            out_dir = _ensure_dir(save_dir)
            fname = filename or _sanitize_filename(f"{column}_box.png")
            out_path = out_dir / fname
            fig.savefig(out_path, bbox_inches="tight", dpi=150)
        if show:
            plt.show()
    finally:
        # Close even when saving/showing fails so figures do not pile up in
        # memory when this is called in a loop.
        plt.close(fig)
    return out_path

def plot_box_all(df, columns, drop_sentinel=True, sentinel=-200, save_dir="figures/eda"):
    """Save one box plot per entry of ``columns`` and return the results.

    Each column is passed to ``plot_box`` with saving enabled and showing
    disabled; the returned list holds the ``plot_box`` return values in the
    same order as ``columns``.
    """
    shared_options = dict(
        drop_sentinel=drop_sentinel,
        sentinel=sentinel,
        save=True,
        save_dir=save_dir,
        show=False,
    )
    return [plot_box(df, name, **shared_options) for name in columns]

def plot_scatter(df, x, y, hue=None, drop_sentinel=True, sentinel=-200,
                 alpha=0.6, save=True, save_dir="figures/eda", filename=None, show=False):
    """Draw a scatter plot of ``y`` against ``x``, optionally coloured by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    x, y : label
        Columns for the horizontal and vertical axis.
    hue : label, optional
        Column whose distinct values each get their own scatter series and
        legend entry. Rows with a missing hue value are drawn as a separate
        "(missing)" series.
    drop_sentinel : bool, default True
        When true, rows whose ``x`` or ``y`` equals ``sentinel`` (or is
        missing) are removed before plotting.
    sentinel : scalar, default -200
        Value that marks a missing measurement.
    alpha : float, default 0.6
        Marker opacity passed to matplotlib.
    save : bool, default True
        Write the figure to disk.
    save_dir : str or Path, default "figures/eda"
        Output directory; created if it does not exist.
    filename : str, optional
        Output file name. Defaults to the sanitized
        ``"<y>_vs_<x>[_by_<hue>]_scatter.png"``.
    show : bool, default False
        Call ``plt.show()`` before the figure is closed.

    Returns
    -------
    pathlib.Path or None
        Path of the saved figure, or ``None`` when ``save`` is false.

    Raises
    ------
    KeyError
        If ``x``, ``y`` or ``hue`` is not a column of ``df``.
    """
    # Test against None everywhere so falsy but valid labels (e.g. 0) work.
    has_hue = hue is not None
    wanted = [x, y] + ([hue] if has_hue else [])
    for col in wanted:
        if col not in df.columns:
            raise KeyError(f"{col} not in df.columns")

    # dict.fromkeys de-duplicates while keeping order, so x == y (or a hue
    # that reuses an axis column) still gives one Series per label.
    data = df[list(dict.fromkeys(wanted))].copy()
    if drop_sentinel:
        for axis_col in dict.fromkeys([x, y]):
            # mask() blanks sentinel entries while keeping a numeric dtype.
            data[axis_col] = data[axis_col].mask(data[axis_col] == sentinel)
        data = data.dropna(subset=[x, y])

    fig, ax = plt.subplots()
    try:
        if not has_hue:
            ax.scatter(data[x], data[y], alpha=alpha)
        else:
            hue_values = data[hue]
            for level in pd.unique(hue_values.dropna()):
                group = data[hue_values == level]
                ax.scatter(group[x], group[y], alpha=alpha, label=str(level))
            # NaN never compares equal to itself, so rows with a missing hue
            # need their own selection or they would silently disappear.
            missing = hue_values.isna()
            if missing.any():
                group = data[missing]
                ax.scatter(group[x], group[y], alpha=alpha, label="(missing)")
            ax.legend(title=hue)

        ax.set_title(f"{y} vs {x}")
        ax.set_xlabel(x)
        ax.set_ylabel(y)

        out_path = None
        if save:
            out_dir = _ensure_dir(save_dir)
            base = f"{y}_vs_{x}_by_{hue}" if has_hue else f"{y}_vs_{x}"
            fname = filename or _sanitize_filename(base + "_scatter.png")
            out_path = out_dir / fname
            fig.savefig(out_path, bbox_inches="tight", dpi=150)
        if show:
            plt.show()
    finally:
        # Close even when saving/showing fails so figures do not pile up in
        # memory when this is called repeatedly.
        plt.close(fig)
    return out_path

In [None]:
# Example usage of the plotting utilities defined above.
# NOTE:
# - `df` must already be loaded, and its column names must match the ones used here
# - every call below saves a figure into figures/eda/ (the default save_dir)

TARGETS = ["CO(GT)", "C6H6(GT)", "NOx(GT)", "NMHC(GT)", "NO2(GT)"]

# 1) box plots for all target variables
plot_box_all(df, TARGETS)

# 2) single-variable box plot
plot_box(df, "CO(GT)")

# 3) simple bivariate scatter plot
plot_scatter(df, x="T", y="CO(GT)")

# 4) scatter plot with hue (colouring by weekday)
# Build `timestamp` from Date + Time when it is not already present.
if "timestamp" not in df.columns and {"Date", "Time"} <= set(df.columns):
    # NOTE(review): no explicit format is passed, so pandas infers one —
    # confirm that day/month order and the time separator parse as intended.
    df["timestamp"] = pd.to_datetime(df["Date"] + " " + df["Time"])

if "timestamp" in df.columns:
    df["weekday"] = pd.to_datetime(df["timestamp"]).dt.day_name()
    plot_scatter(df, x="T", y="NO2(GT)", hue="weekday")
else:
    # Nothing to derive weekdays from: skip this plot with a clear message
    # instead of failing on a bare KeyError for "timestamp".
    print("Skipping weekday scatter plot: df has no 'timestamp' column "
          "and no 'Date'/'Time' columns to build one from.")