**Time series plotting** <br>
This part of the code plots time series of the data.

# Time series of raw data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
from math import ceil
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import MultipleLocator
from tqdm.notebook import tqdm
def plot_pm25_panels(
    df: pd.DataFrame,
    start_date="2023-07-01",
    end_date="2025-06-30",
    days_per_panel=3,
    cols=2,
    rows_per_fig=6,      # => 12 panels (6x2) per PDF page; adjust if you want more/less per page
    tz="Asia/Dhaka",
    output_pdf="pm25_timeseries_3day_panels.pdf",
    home_name = "n/a"

):
    """
    Plot 'pm2_5_atm' and 'pm2_5_atm_b' as a grid of fixed-length time panels.

    Each subplot covers `days_per_panel` days (the last one may be shorter) and
    the panels tile the range [start_date, end_date], end day inclusive.
    Pages of `rows_per_fig` x `cols` panels are written to a multi-page PDF.
    The input frame is never modified; all work happens on a copy.

    Args:
        df: Frame with a 'BDDateTime' column plus 'pm2_5_atm' and 'pm2_5_atm_b'.
            The time column may be strings, naive datetimes (interpreted as
            `tz`) or tz-aware datetimes (converted to `tz`).
        start_date: First day to plot (anything `pd.Timestamp` accepts).
        end_date: Last day to plot, inclusive.
        days_per_panel: Length of one panel in days; must be positive.
        cols: Number of panel columns per page.
        rows_per_fig: Maximum number of panel rows per page.
        tz: Timezone used for the data, the range bounds and the tick labels.
        output_pdf: Destination of the PDF; a missing parent folder is created.
        home_name: Label used in the page title and the progress bar.

    Raises:
        KeyError: If 'BDDateTime' is not a column of `df`.
        ValueError: If a value column is missing, a layout argument is not
            positive, or no rows fall inside the requested range.
    """
    import os  # local import: only needed to prepare the output folder

    time_col = "BDDateTime"
    value_cols = ["pm2_5_atm", "pm2_5_atm_b"]

    # --- Argument validation ---
    if days_per_panel <= 0:
        # A non-positive step would never advance the window loop below.
        raise ValueError("days_per_panel must be positive")
    if cols < 1 or rows_per_fig < 1:
        raise ValueError("cols and rows_per_fig must both be at least 1")
    if time_col not in df.columns:
        raise KeyError(time_col)
    for col in value_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in df")

    # --- Prep (always on a copy, so the caller's frame stays untouched) ---
    data = df[[time_col, *value_cols]].copy()

    # is_datetime64_any_dtype accepts tz-aware columns too (np.issubdtype raises on them).
    if not pd.api.types.is_datetime64_any_dtype(data[time_col]):
        data[time_col] = pd.to_datetime(data[time_col], errors="coerce")

    # Naive timestamps are localized, aware ones are converted.
    stamps = data[time_col]
    if stamps.dt.tz is None:
        data[time_col] = stamps.dt.tz_localize(tz)
    else:
        data[time_col] = stamps.dt.tz_convert(tz)

    for col in value_cols:
        data[col] = pd.to_numeric(data[col], errors="coerce")

    data = data.sort_values(time_col).set_index(time_col)

    # --- Requested range, expressed in `tz` ---
    def _localized(value):
        stamp = pd.Timestamp(value)
        return stamp.tz_localize(tz) if stamp.tzinfo is None else stamp.tz_convert(tz)

    start = _localized(start_date)
    end = _localized(end_date)
    range_end = end + pd.Timedelta(days=1)  # exclusive bound => end day is fully included

    data = data.loc[(data.index >= start) & (data.index < range_end)]
    if data.empty:
        raise ValueError("No data in the specified date range after filtering.")

    # --- Build the panel windows [win_start, win_end) ---
    step = pd.Timedelta(days=days_per_panel)
    panels = []
    cur = start
    while cur <= end:
        # Clamp so the last (possibly partial) window stops at the end of the range.
        panels.append((cur, min(cur + step, range_end)))
        cur = cur + step

    panels_per_fig = rows_per_fig * cols
    n_figs = ceil(len(panels) / panels_per_fig)
    plot_tz = data.index.tz

    # The target folder must exist before the PDF file can be opened.
    if isinstance(output_pdf, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.path.abspath(output_pdf)), exist_ok=True)

    # --- Plot into a multi-page PDF ---
    with PdfPages(output_pdf) as pdf:
        for fig_idx in tqdm(range(n_figs), desc=f"{home_name}"):
            page = panels[fig_idx * panels_per_fig:(fig_idx + 1) * panels_per_fig]

            # Only as many rows as this page needs; squeeze=False always yields a 2D array.
            rows = ceil(len(page) / cols)
            fig, axes = plt.subplots(
                rows, cols, figsize=(12, 3.2 * rows),
                sharex=False, sharey=False, squeeze=False,
            )
            try:
                flat_axes = axes.ravel()

                for ax_idx, (win_start, win_end) in enumerate(page):
                    ax = flat_axes[ax_idx]
                    seg = data.loc[(data.index >= win_start) & (data.index < win_end)]

                    # Plot both series (thin lines due to dense ~2-min sampling)
                    ax.plot(seg.index, seg["pm2_5_atm"], lw=0.8, alpha=0.9, label="pm2_5_atm")
                    ax.plot(seg.index, seg["pm2_5_atm_b"], lw=0.8, alpha=0.9, label="pm2_5_atm_b")

                    # Pin the x-range to the window so panels with gaps (or no data)
                    # still span the full period instead of autoscaling.
                    ax.set_xlim(win_start, win_end)

                    first_day = win_start.strftime('%d %b %Y')
                    last_day = (win_end - pd.Timedelta(seconds=1)).strftime('%d %b %Y')
                    ax.set_title(f"{first_day} → {last_day}", fontsize=10, loc="left")

                    # Matplotlib locators/formatters bind to a single axis, so every
                    # panel gets its own instances; `tz` puts the ticks on local midnight.
                    ax.xaxis.set_major_locator(mdates.DayLocator(interval=1, tz=plot_tz))
                    ax.xaxis.set_major_formatter(
                        mdates.DateFormatter("%d %b %Y\n(%a)", tz=plot_tz)  # e.g., 01 Jul 2023 (Sat)
                    )
                    ax.xaxis.set_minor_locator(mdates.HourLocator(byhour=[0, 6, 12, 18], tz=plot_tz))
                    ax.grid(True, which="major", linewidth=0.6, alpha=0.4)
                    ax.grid(True, which="minor", linewidth=0.3, alpha=0.2)
                    ax.set_ylabel("PM2.5 (µg/m³)")
                    ax.set_ylim(0, 300)  # consistent y-axis
                    ax.yaxis.set_major_locator(MultipleLocator(100))
                    ax.yaxis.set_minor_locator(MultipleLocator(10))

                    # Put a compact legend only on the first axis of each figure
                    if ax_idx == 0:
                        ax.legend(loc="upper right", fontsize=9, frameon=True)

                # Hide any unused axes slots on the last page
                for extra_ax in flat_axes[len(page):]:
                    extra_ax.set_visible(False)

                fig.suptitle(
                    f"{home_name} : PM2.5 (pm2_5_atm & pm2_5_atm_b) — {start.strftime('%d %b %Y')} to {end.strftime('%d %b %Y')} — {days_per_panel}-day panels",
                    fontsize=12
                )
                fig.tight_layout(pad=1.2, rect=[0, 0, 1, 0.96])
                pdf.savefig(fig)
            finally:
                # Release the figure even if rendering or saving failed.
                plt.close(fig)

    print(f"Saved multi-page PDF: {output_pdf}")

# -----------------------------
# Example usage: see the next cell, which calls plot_pm25_panels() once per CSV file in "merged".



In [None]:
import os

# Render one multi-page PDF of raw PM2.5 panels for every CSV file in the input folder.
dir = "merged"  # NOTE: shadows the builtin dir(); the name is kept as-is in case other cells read it
out_dir = "raw_time_series"
skip_existing = False  # set to True to leave already-rendered PDFs untouched

# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
files = sorted(f for f in os.listdir(dir) if f.endswith('.csv'))

# The output folder has to exist before any PDF can be written into it.
os.makedirs(out_dir, exist_ok=True)

for file in files:
    home = os.path.splitext(file)[0]
    pdf_path = f"{out_dir}/{home}.pdf"

    # Decide about skipping before paying for the CSV read.
    if skip_existing and os.path.exists(pdf_path):
        print("skipping existing", file)
        continue

    print("reading", file)
    df = pd.read_csv(os.path.join(dir, file), parse_dates=['BDDateTime'])
    # Pass a copy so `df` keeps the freshly-read data whatever the plotting function does to its input.
    plot_pm25_panels(df.copy(), output_pdf=pdf_path, home_name=home)

#### PM2.5 ATM vs. CF_1 check

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
from math import ceil
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import MultipleLocator
from tqdm.notebook import tqdm

def plot_pm25_panels(
    df: pd.DataFrame,
    start_date="2023-07-01",
    end_date="2025-06-30",
    days_per_panel=3,
    cols=2,
    rows_per_fig=6,      # => 12 panels (6x2) per PDF page; adjust if you want more/less per page
    tz="Asia/Dhaka",
    output_pdf="pm25_timeseries_3day_panels.pdf",
    home_name = "n/a",

    first_col = 'pm2_5_atm',
    second_col = 'pm2_5_atm_b',
    time_col = 'BDDateTime',

    showfig = False,
    savepdf = True,


):
    """
    Plot two columns of `df` as a grid of fixed-length time panels.

    Each subplot covers `days_per_panel` days (the last one may be shorter) and
    the panels tile the range [start_date, end_date], end day inclusive.
    Both `first_col` and `second_col` are drawn on every panel. Pages of
    `rows_per_fig` x `cols` panels are written to a multi-page PDF and/or
    shown on screen. The input frame is never modified; work happens on a copy.

    Args:
        df: Frame containing `time_col`, `first_col` and `second_col`. The time
            column may be strings, naive datetimes (interpreted as `tz`) or
            tz-aware datetimes (converted to `tz`).
        start_date: First day to plot (anything `pd.Timestamp` accepts).
        end_date: Last day to plot, inclusive.
        days_per_panel: Length of one panel in days; must be positive.
        cols: Number of panel columns per page.
        rows_per_fig: Maximum number of panel rows per page.
        tz: Timezone used for the data, the range bounds and the tick labels.
        output_pdf: Destination of the PDF (used only when `savepdf` is true);
            a missing parent folder is created.
        home_name: Label used in the page title and the progress bar.
        first_col: Name of the first series to plot.
        second_col: Name of the second series to plot.
        time_col: Name of the timestamp column.
        showfig: If true, each page is displayed with `plt.show()`.
        savepdf: If true (default), each page is appended to `output_pdf`.

    Raises:
        KeyError: If `time_col` is not a column of `df`.
        ValueError: If a value column is missing, a layout argument is not
            positive, or no rows fall inside the requested range.
    """
    import os  # local imports: only needed for output handling
    from contextlib import nullcontext

    value_cols = [first_col, second_col]

    # --- Argument validation ---
    if days_per_panel <= 0:
        # A non-positive step would never advance the window loop below.
        raise ValueError("days_per_panel must be positive")
    if cols < 1 or rows_per_fig < 1:
        raise ValueError("cols and rows_per_fig must both be at least 1")
    if time_col not in df.columns:
        raise KeyError(time_col)
    for col in value_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in df")

    # --- Prep (always on a copy, so the caller's frame stays untouched) ---
    # dict.fromkeys de-duplicates while keeping order (first_col may equal second_col).
    data = df[list(dict.fromkeys([time_col, *value_cols]))].copy()

    # is_datetime64_any_dtype accepts tz-aware columns too (np.issubdtype raises on them).
    if not pd.api.types.is_datetime64_any_dtype(data[time_col]):
        data[time_col] = pd.to_datetime(data[time_col], errors="coerce")

    # Naive timestamps are localized, aware ones are converted.
    stamps = data[time_col]
    if stamps.dt.tz is None:
        data[time_col] = stamps.dt.tz_localize(tz)
    else:
        data[time_col] = stamps.dt.tz_convert(tz)

    for col in dict.fromkeys(value_cols):
        data[col] = pd.to_numeric(data[col], errors="coerce")

    data = data.sort_values(time_col).set_index(time_col)

    # --- Requested range, expressed in `tz` ---
    def _localized(value):
        stamp = pd.Timestamp(value)
        return stamp.tz_localize(tz) if stamp.tzinfo is None else stamp.tz_convert(tz)

    start = _localized(start_date)
    end = _localized(end_date)
    range_end = end + pd.Timedelta(days=1)  # exclusive bound => end day is fully included

    data = data.loc[(data.index >= start) & (data.index < range_end)]
    if data.empty:
        raise ValueError("No data in the specified date range after filtering.")

    # --- Build the panel windows [win_start, win_end) ---
    step = pd.Timedelta(days=days_per_panel)
    panels = []
    cur = start
    while cur <= end:
        # Clamp so the last (possibly partial) window stops at the end of the range.
        panels.append((cur, min(cur + step, range_end)))
        cur = cur + step

    panels_per_fig = rows_per_fig * cols
    n_figs = ceil(len(panels) / panels_per_fig)
    plot_tz = data.index.tz

    # The target folder must exist before the PDF file can be opened.
    if savepdf and isinstance(output_pdf, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.path.abspath(output_pdf)), exist_ok=True)

    # --- Plot page by page; the PDF is only opened when it is actually wanted ---
    with (PdfPages(output_pdf) if savepdf else nullcontext()) as pdf:
        for fig_idx in tqdm(range(n_figs), desc=f"{home_name}"):
            page = panels[fig_idx * panels_per_fig:(fig_idx + 1) * panels_per_fig]

            # Only as many rows as this page needs; squeeze=False always yields a 2D array.
            rows = ceil(len(page) / cols)
            fig, axes = plt.subplots(
                rows, cols, figsize=(12, 3.2 * rows),
                sharex=False, sharey=False, squeeze=False,
            )
            try:
                flat_axes = axes.ravel()

                for ax_idx, (win_start, win_end) in enumerate(page):
                    ax = flat_axes[ax_idx]
                    seg = data.loc[(data.index >= win_start) & (data.index < win_end)]

                    # Plot both series (thin lines due to dense ~2-min sampling)
                    ax.plot(seg.index, seg[first_col], lw=0.8, alpha=0.9, label=first_col)
                    ax.plot(seg.index, seg[second_col], lw=0.8, alpha=0.9, label=second_col)

                    # Pin the x-range to the window so panels with gaps (or no data)
                    # still span the full period instead of autoscaling.
                    ax.set_xlim(win_start, win_end)

                    first_day = win_start.strftime('%d %b %Y')
                    last_day = (win_end - pd.Timedelta(seconds=1)).strftime('%d %b %Y')
                    ax.set_title(f"{first_day} → {last_day}", fontsize=10, loc="left")

                    # Matplotlib locators/formatters bind to a single axis, so every
                    # panel gets its own instances; `tz` puts the ticks on local midnight.
                    ax.xaxis.set_major_locator(mdates.DayLocator(interval=1, tz=plot_tz))
                    ax.xaxis.set_major_formatter(
                        mdates.DateFormatter("%d %b %Y\n(%a)", tz=plot_tz)  # e.g., 01 Jul 2023 (Sat)
                    )
                    ax.xaxis.set_minor_locator(mdates.HourLocator(byhour=[0, 6, 12, 18], tz=plot_tz))
                    ax.grid(True, which="major", linewidth=0.6, alpha=0.4)
                    ax.grid(True, which="minor", linewidth=0.3, alpha=0.2)
                    ax.set_ylabel("PM2.5 (µg/m³)")
                    ax.set_ylim(0, 300)  # consistent y-axis
                    ax.yaxis.set_major_locator(MultipleLocator(100))
                    ax.yaxis.set_minor_locator(MultipleLocator(10))

                    # Put a compact legend only on the first axis of each figure
                    if ax_idx == 0:
                        ax.legend(loc="upper right", fontsize=9, frameon=True)

                # Hide any unused axes slots on the last page
                for extra_ax in flat_axes[len(page):]:
                    extra_ax.set_visible(False)

                # Name the columns that are really plotted (identical text for the defaults).
                fig.suptitle(
                    f"{home_name} : PM2.5 ({first_col} & {second_col}) — {start.strftime('%d %b %Y')} to {end.strftime('%d %b %Y')} — {days_per_panel}-day panels",
                    fontsize=12
                )
                fig.tight_layout(pad=1.2, rect=[0, 0, 1, 0.96])
                if savepdf:
                    pdf.savefig(fig)
                if showfig:
                    plt.show()
            finally:
                # Release the figure even if rendering or saving failed.
                plt.close(fig)

    if savepdf:
        print(f"Saved multi-page PDF: {output_pdf}")

# -----------------------------
# Example usage: see the next cell, which calls plot_pm25_panels() for each entry of `files`.



In [None]:
import os

# Render ATM vs. CF_1 comparison PDFs for the selected calibrated files.
dir = "calibrated_2min"  # NOTE: shadows the builtin dir(); the name is kept as-is in case other cells read it
out_dir = "cf_atm_check"
skip_existing = False  # set to True to leave already-rendered PDFs untouched

# To process every file in the folder instead of a hand-picked list:
# files = [f for f in os.listdir(dir) if f.endswith('.csv')]
files = ['R1.csv']

# The output folder has to exist before any PDF can be written into it.
os.makedirs(out_dir, exist_ok=True)

for file in files:
    home = os.path.splitext(file)[0]
    pdf_path = f"{out_dir}/{home}.pdf"

    # Decide about skipping before paying for the CSV read.
    if skip_existing and os.path.exists(pdf_path):
        print("skipping existing", file)
        continue

    print("reading", file)
    df = pd.read_csv(os.path.join(dir, file), parse_dates=['BDDateTime'])
    # Pass a copy so `df` keeps the freshly-read data whatever the plotting function does to its input.
    plot_pm25_panels(
        df.copy(),
        output_pdf=pdf_path,
        home_name=home,
        first_col="pm2_5_atm",
        second_col="pm2_5_cf_1",
    )

In [None]:
# Zoomed-in comparison of the 'pm2_5_atm' and 'pm2_5_cf_1' columns of R1 on a single wide figure.
df = pd.read_csv("calibrated_2min/R1.csv", parse_dates=['BDDateTime'])

# Keep only rows whose timestamp lies between the two bounds (both ends inclusive), in time order.
mask = df['BDDateTime'].between("2024-12-12", "2024-12-15")
df = df[mask].sort_values('BDDateTime')

plt.figure(figsize=(20, 7))

# One thin line per column, drawn in the same order as the legend entries.
for column, legend_text in (
    ('pm2_5_atm', r'$PM_{2.5} ATM$'),
    ('pm2_5_cf_1', r'$PM_{2.5} CF_1$'),
):
    plt.plot(df['BDDateTime'], df[column], label=legend_text, linewidth=1)

plt.ylabel(r"$PM_{2.5}$ concentration ($\mu g/m^3$)")
plt.ylim([0, 300])
plt.legend()
