# Event Statistics

This notebook plots basic event statistics for the datasets obtained through the detection pipeline. 

In [None]:
# setup
import glob
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sunpy.map
from matplotlib.ticker import LogLocator, MultipleLocator, NullFormatter, ScalarFormatter
from scipy.optimize import curve_fit
from scipy.stats import linregress
from sunpy.visualization.colormaps import cm

In [None]:
# Input locations.
# `dataset` is the tag later cells use to locate the raw FITS directory and the
# cached map sequence; `csv_dir` holds the detection pipeline's CSV output.
dataset = "20200530"
csv_dir = Path("../results/csv")

# Detection table; presumably one row per detected event (verify against the
# pipeline). Cells below read its lifetime, area, peak-intensity,
# detection-count and centroid columns.
event_df = pd.read_csv(csv_dir.joinpath("detections.csv"))

Time for some plots.

In [None]:
# Campfire lifetime distribution (linear-scale histogram, equal-width bins)

# Data: keep strictly positive lifetimes only (NaN also fails the `> 0` test).
lifetimes = event_df["total_lifetime_seconds"].to_numpy()
x_data = lifetimes[lifetimes > 0]

# Bin edges from the Freedman-Diaconis rule (bin width = 2 * IQR * n**(-1/3)).
# numpy's built-in estimator replaces the hand-written formula because it falls
# back to a single bin when the IQR is zero or the sample is empty; the manual
# `range / (2 * iqr * n**(-1/3))` expression divides by zero in the first case
# and fails on `max()` in the second. Note numpy rounds the bin count up,
# whereas the previous `int(...)` truncated it.
linear_bins = np.histogram_bin_edges(x_data, bins="fd")
n_bins = len(linear_bins) - 1

fig, ax = plt.subplots(figsize=(8, 5), dpi=150)

sns.histplot(
    x_data,
    ax=ax,
    bins=linear_bins,
    stat="count",
    color="skyblue",
    alpha=0.7,
    edgecolor="black",
    linewidth=0.5,
    label="data",
)

# Final formatting (uncomment the scale lines for a log view)
#plt.yscale("log")
#plt.xscale("log")
ax.set_xlabel("Lifetime (s)", fontsize=18)
ax.set_ylabel("Count", fontsize=18)
ax.set_title("Campfire Lifetime Distribution", fontsize=18)
ax.grid(True, ls=":", alpha=0.8)
ax.tick_params(axis="both", which="major", labelsize=14)
plt.show()

In [None]:
# Peak Intensity vs Area Plot (log-log), colored by total lifetime.
# All names used here (np, plt, linregress, ScalarFormatter) come from the
# notebook's import cell; imports are not repeated per cell.

# 1. Prepare and filter data (log axes need strictly positive values)
df = event_df[(event_df["total_area_Mm2"] > 0) & (event_df["peak_intensity"] > 0)]

# Optional: physical cuts (adjust as needed)
#df = df[df["mean_area_km2"] <= 5e7]        # remove extreme outliers
df = df[df["peak_intensity"] <= 1.9e3]

filtered_df = df

print(f"Plotting {len(filtered_df)} events")

# 2. Correlation coefficient quoted in the legend.
# NOTE(review): `r_value` was referenced but never computed in this cell, which
# raised NameError on a fresh kernel. It is taken here as the Pearson r of a
# linear regression in log-log space (matching the log-log axes and the
# `linregress` naming) -- confirm this is the intended statistic.
if len(filtered_df) >= 2 and filtered_df["total_area_Mm2"].nunique() > 1:
    r_value = linregress(
        np.log10(filtered_df["total_area_Mm2"]),
        np.log10(filtered_df["peak_intensity"]),
    ).rvalue
else:
    # Too few distinct points for a regression; the legend will show "nan".
    r_value = np.nan

# 3. Log-log scatterplot with colormap for total lifetime
fig, ax = plt.subplots(figsize=(9, 7), dpi=150)

scatter = ax.scatter(
    filtered_df["total_area_Mm2"],
    filtered_df["peak_intensity"],
    c=filtered_df["total_lifetime_seconds"],
    cmap="plasma",  # alternatives: 'viridis', 'magma'
    alpha=0.6,
    edgecolor="k",
    linewidth=0.4,
    s=50,
    label=rf"Detections ($\rho$ = {r_value:.3f})"
)

# Add colorbar
cb = fig.colorbar(scatter, ax=ax)
cb.set_label("Total lifetime (s)", fontsize=18)
cb.ax.tick_params(labelsize=15)

# 4. Log-log axes and polish
ax.set_xscale("log")
ax.set_yscale("log")

# Explicit tick positions (0 is not a valid position on a log axis)
ax.set_xticks([0.1, 1, 10])
ax.set_yticks([1260, 1280, 1300, 1320, 1340, 1360])

# Force plain (non-scientific) tick labels, e.g. 1300 instead of 1.3e3.
# Each axis gets its own formatter: set_major_formatter binds the instance to
# the axis it is given, so a single shared instance would end up attached to
# whichever axis was configured last.
for axis in (ax.xaxis, ax.yaxis):
    plain_formatter = ScalarFormatter()
    plain_formatter.set_scientific(False)
    plain_formatter.set_useOffset(False)
    axis.set_major_formatter(plain_formatter)

ax.set_xlabel("Total Area (Mm$^2$)", fontsize=21)
ax.set_ylabel("Peak Intensity (DN/s)", fontsize=21, labelpad=15)
ax.set_title("Peak Intensity vs Total Area with lifetime colormap\n(log-log scale)", fontsize=21, pad=18)
ax.tick_params(axis="both", labelsize=15)

# Only the upper x limit is set: a lower limit of 0 is invalid on a log axis
# (matplotlib warns and ignores it), so the lower bound stays autoscaled, which
# is what the previous `xlim(0, 15)` effectively produced.
ax.set_xlim(right=15)
ax.set_ylim(1250, 1370)

ax.grid(True, which="both", ls="--", alpha=0.6, lw=0.8)

ax.legend(fontsize=18, loc="lower right")
fig.tight_layout()
plt.show()

In [None]:
# Area vs Lifetime + Colormap + Fit
# Power-law fit via linear regression in log-log space: log(y) = α log(x) + β

# Keep only events with strictly positive lifetime and area. log10 is undefined
# for the rest, they cannot be drawn on log axes anyway, and a zero lifetime
# previously made the fit-line range start at log10(0) = -inf, which turned the
# whole fitted curve into NaNs.
positive = (event_df["total_lifetime_seconds"] > 0) & (event_df["total_area_Mm2"] > 0)
fit_df = event_df[positive]

logx = np.log10(fit_df["total_lifetime_seconds"])
logy = np.log10(fit_df["total_area_Mm2"])

# Drop any remaining non-finite values (e.g. inf in the input) before fitting
mask = np.isfinite(logx) & np.isfinite(logy)
logx_clean = logx[mask]
logy_clean = logy[mask]

result = linregress(logx_clean, logy_clean)
alpha = result.slope        # power-law index
alpha_err = result.stderr   # standard error of the slope

# Check if the fit is meaningful
print(rf"R$^2$ = {result.rvalue**2:.3}, p-value = {result.pvalue:.3e}")

# Scatter of the events that entered the filter above, so the color scale
# spans exactly the points that are visible on the log axes.
plt.figure(figsize=(10, 7), dpi=150)
sc = plt.scatter(fit_df["total_lifetime_seconds"],
                 fit_df["total_area_Mm2"],
                 c=fit_df["n_detections"],
                 cmap="viridis", alpha=0.7, s=40, edgecolors="k", linewidth=0.3)
cb = plt.colorbar(sc)
cb.set_label(label="Number of detections in event", fontsize=23)
cb.ax.tick_params(labelsize=18)

# Set log scales
plt.xscale("log")
plt.yscale("log")

# Custom x-axis ticks: every 60 s starting at 10 s
ax = plt.gca()
x_ticks = np.arange(10, fit_df["total_lifetime_seconds"].max() + 10, 60)
ax.set_xticks(x_ticks)
ax.set_xticklabels([f"{int(x)}" for x in x_ticks])

# Custom y-axis ticks: every 2.75 Mm^2 starting at 0.25 Mm^2
y_ticks = np.arange(0.25, fit_df["total_area_Mm2"].max() + 0.25, 2.75)
ax.set_yticks(y_ticks)
ax.set_yticklabels([f"{y:.2f}" for y in y_ticks])
ax.tick_params("both", labelsize=18)

# Add the power-law fit line over the range of the fitted (finite, positive) data
logx_fit = np.linspace(logx_clean.min(), logx_clean.max(), 100)
x_fit = 10 ** logx_fit
y_fit = 10 ** (alpha * logx_fit + result.intercept)
plt.plot(x_fit, y_fit, "r--", linewidth=2, label=rf"Area ~ Lifetime$^{{{alpha:.2f} \pm {alpha_err:.2f}}}$"+"\n" +rf"R$^2$ = {result.rvalue**2:.3}")

plt.xlabel("Lifetime (s)", fontsize=23)
plt.ylabel(r"Total area (Mm$^2$)", fontsize=23)
plt.title("Area vs Lifetime with color = number of frames detected", fontsize=23, pad=10)
plt.grid(alpha=0.3, which="both")
plt.legend(fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
# Hexbin heatmap of event centroids drawn over one sample frame
# Settings
sample_frame = 28 # index of the frame used as background image
hex_gridsize = 80 # try 40-100 depending on image size and desired smoothing
hex_mincnt = 1 # min counts per hex bin to show
hex_alpha = 0.7
cmap_hex = plt.cm.YlOrRd_r # color map for hexbin; any that contrasts with greyscale is fine

# set up paths
raw_dir = Path(f"../data/raw/{dataset}")
processed_dir = Path("../data/processed")
cache_file = processed_dir / f"{dataset}_sequence.pkl"
processed_dir.mkdir(parents=True, exist_ok=True)

# Load the map sequence from the pickle cache, or build it from the FITS files
# and cache it. Files are opened with `with` so the handles are closed (and the
# cache fully flushed to disk) even if (un)pickling raises.
# NOTE(security): pickle.load executes arbitrary code from the file; only use a
# cache file that this notebook wrote itself.
if cache_file.exists():
    with cache_file.open("rb") as fh:
        sequence = pickle.load(fh)
    print(f"Cached sequence loaded ({len(sequence)} maps)")
else:
    files = sorted(glob.glob(str(raw_dir / "*.fits")))
    if not files:
        # Fail with a clear message instead of an opaque error from sunpy
        raise FileNotFoundError(f"No FITS files found in {raw_dir}")
    sequence = sunpy.map.Map(files, sequence=True)
    with cache_file.open("wb") as fh:
        pickle.dump(sequence, fh)
    print(f"First load → saved cache ({len(sequence)} maps)")

# base image data (use float)
m = sequence[sample_frame]
data = m.data.astype(float)

# plot in grayscale (black & white)
plt.figure(figsize=(8, 8), dpi=150)
ax = plt.gca()

# Display grayscale image with a 0-99th percentile stretch for good contrast.
# nanpercentile gives the same result as percentile on NaN-free data but keeps
# the limits finite if the frame contains NaN pixels.
vmin, vmax = np.nanpercentile(data, (0, 99))
ax.imshow(data, cmap="binary", origin="lower", vmin=vmin, vmax=vmax)

# Build x,y arrays for hexbin
# NOTE(review): assumes the centroids are in pixel coordinates of this image -- confirm
x = event_df["x_centroid"].values
y = event_df["y_centroid"].values

# extent pins the hex grid to the image's pixel bounds (width = shape[1], height = shape[0])
hb = ax.hexbin(x, y, gridsize=hex_gridsize, cmap=cmap_hex, mincnt=hex_mincnt,
               extent=(0, data.shape[1], 0, data.shape[0]), alpha=hex_alpha)

# Add colorbar for hex counts
cb = plt.colorbar(hb, ax=ax, fraction=0.046, pad=-0.02)
cb.set_label("counts", fontsize=23)
cb.ax.tick_params(labelsize=18)

# set integer tick labels
cb.ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

ax.set_title(f"Sample (frame {sample_frame}) with hexbin heatmap", fontsize=23)
ax.set_axis_off()
plt.tight_layout()
plt.show()

In [None]:
# Histogram: Area distribution (no fit)

# Drop missing areas first: NaN would poison the quartiles and the bin range.
x_data = event_df["total_area_Mm2"].dropna().to_numpy()

# Bin edges from the Freedman-Diaconis rule (bin width = 2 * IQR * n**(-1/3)).
# numpy's built-in estimator is used instead of the hand-written formula, which
# divided by zero when the IQR was 0 and could yield a bin count of 0; numpy
# falls back to a single bin in those cases. Note numpy rounds the bin count
# up, whereas the previous `int(...)` truncated it.
bin_edges = np.histogram_bin_edges(x_data, bins="fd")
n_bins = len(bin_edges) - 1

plt.figure(figsize=(9, 5), dpi=200)
plt.hist(x_data, bins=bin_edges, color="skyblue", alpha=0.8, edgecolor="k", linewidth=0.4)
# Superscript written as mathtext so it renders as Mm² like the other figures
plt.xlabel("Total area (Mm$^2$)", fontsize=20)
plt.ylabel("Number of campfires", fontsize=20)
plt.title("Distribution of campfire areas", fontsize=20)
plt.tick_params("both", labelsize=18)
plt.grid(alpha=0.3)
plt.show()